Update code/inference.py
code/inference.py  CHANGED  (+12 -6)
@@ -7,8 +7,11 @@ import fcntl # For file locking
 import os # For file operations
 import time # For sleep function
 
-# Set
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:
+# Set max_split_size globally to prevent memory fragmentation
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
+
+# Enable detailed distributed logs
+os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
 
 # Print to verify the environment variable is correctly set
 print(f"PYTORCH_CUDA_ALLOC_CONF: {os.environ.get('PYTORCH_CUDA_ALLOC_CONF')}")
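A note on the new environment variables (not part of the diff): PyTorch parses PYTORCH_CUDA_ALLOC_CONF when the CUDA caching allocator is first initialized, and TORCH_DISTRIBUTED_DEBUG is read when torch.distributed sets up its process group, so both assignments must run before anything touches a GPU or initializes distributed state. A minimal sketch of that ordering, using setdefault so a value exported by the serving environment still takes precedence:

import os

# Must be set before the first CUDA allocation; importing torch alone is fine.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:64")
os.environ.setdefault("TORCH_DISTRIBUTED_DEBUG", "DETAIL")

import torch

assert not torch.cuda.is_initialized(), "set PYTORCH_CUDA_ALLOC_CONF before any CUDA call"
x = torch.zeros(1, device="cuda")  # first allocation: the allocator config takes effect here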
@@ -70,15 +73,18 @@ def model_fn(model_dir, context=None):
         model = load_checkpoint_and_dispatch(
             model,
             model_dir,
-            device_map="
-            offload_folder=offload_dir,
-            max_memory={i: "
-            no_split_module_classes=["QwenForCausalLM"] #
+            device_map="balanced", # Evenly distribute across GPUs
+            offload_folder=offload_dir,
+            max_memory={i: "18GiB" for i in range(torch.cuda.device_count())}, # Allocate 18 GiB per GPU
+            no_split_module_classes=["QwenForCausalLM"] # Split model across GPUs
         )
 
         # Load the tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
+        # Free up any unused memory after loading
+        torch.cuda.empty_cache()
+
     except Exception as e:
         print(f"Error loading model and tokenizer: {e}")
         raise
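A note on the dispatch arguments (not part of the diff): in Accelerate, no_split_module_classes names module classes that must not be split across devices, so the inline comment overstates what it does; the splitting itself is driven by device_map and max_memory, and listing the top-level causal-LM class can pin the whole network to a single device with the remainder offloaded to offload_dir. The usual value is the checkpoint's decoder-block class. When the placement needs to be inspected or tuned, the map can be computed explicitly and passed to load_checkpoint_and_dispatch instead of the "balanced" string. A minimal sketch, assuming a standard transformers architecture and an illustrative block class name (Qwen2DecoderLayer; the real name depends on the model):

import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

model_dir = "/opt/ml/model"  # assumption: the model directory passed to model_fn

config = AutoConfig.from_pretrained(model_dir)
with init_empty_weights():  # build the module tree without allocating weights
    meta_model = AutoModelForCausalLM.from_config(config)

device_map = infer_auto_device_map(
    meta_model,
    max_memory={i: "18GiB" for i in range(torch.cuda.device_count())},
    no_split_module_classes=["Qwen2DecoderLayer"],  # illustrative; keeps each block whole on one device
)
print(device_map)  # e.g. {'model.embed_tokens': 0, 'model.layers.0': 0, ..., 'lm_head': 1}
# The resulting dict can then be passed to load_checkpoint_and_dispatch(..., device_map=device_map).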
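On the cleanup step: torch.cuda.empty_cache() returns cached blocks that no live tensor is using back to the driver; it does not shrink the memory occupied by the loaded weights. A small sketch for checking how the 18 GiB per-GPU budget held up once model_fn has returned (device count and figures depend on the actual hardware):

import torch

for i in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(i) / 2**30  # GiB held by live tensors
    reserved = torch.cuda.memory_reserved(i) / 2**30    # GiB held by the caching allocator
    print(f"cuda:{i}: allocated={allocated:.1f} GiB, reserved={reserved:.1f} GiB")

# dispatch_model also records the placement chosen for the sharded model:
# print(model.hf_device_map)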