From bec7babec9c924a0ee7ad27e3f6582bc5bd1fef5 Mon Sep 17 00:00:00 2001
From: "Wei (Will) Feng" <134637289+weifengpy@users.noreply.github.com>
Date: Tue, 23 Apr 2024 08:25:56 -0700
Subject: [PATCH] [FSDP1] reduce GPU memory usage from 78G to 23G (#843)

---
 recipes/lora_finetune_distributed.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index ee5475a201..b1c50ba193 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import os
 import sys
 import time

@@ -600,7 +601,7 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-
+    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl")

     config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg)
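
Note (not part of the patch): TORCH_NCCL_AVOID_RECORD_STREAMS=1 tells PyTorch's NCCL process group to hold references to collective inputs/outputs itself rather than calling recordStream on them; recordStream makes the CUDA caching allocator defer reusing those blocks, which can inflate peak GPU memory under FSDP. The flag is read when the NCCL process group is created, which is why the patch sets it immediately before init_process_group. Below is a minimal standalone sketch of the same idea, assuming a torchrun launch; the main() scaffolding is illustrative and not taken from the recipe.

# Minimal sketch: set TORCH_NCCL_AVOID_RECORD_STREAMS before creating the
# NCCL process group, mirroring the placement used in the patch above.
import os

import torch
import torch.distributed as dist


def main() -> None:
    # Must be set before the NCCL process group is initialized to take effect.
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

    # torchrun provides RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT.
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    # ... build the FSDP-wrapped model and run training here ...

    dist.destroy_process_group()


if __name__ == "__main__":
    main()

Launched, for example, with: torchrun --nproc_per_node=8 train.py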