Tonic committed
Commit fa9560d · 1 Parent(s): d47568c

adds a100 memory optimized

config/train_gpt_oss_openhermes_fr_memory_optimized.py CHANGED
@@ -41,9 +41,9 @@ config = GPTOSSEnhancedCustomConfig(
     # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
     # ============================================================================
     # Batch configuration following memory optimization principles
-    num_train_epochs=1.0,  # Single epoch to reduce memory pressure
-    batch_size=8,  # Reduced from 6 for memory efficiency
-    gradient_accumulation_steps=8,  # Increased to maintain effective batch size 32
+    num_train_epochs=1.0,  # Single epoch to reduce memory pressure
+    batch_size=2,  # A100-safe per-device batch size
+    gradient_accumulation_steps=16,  # Maintain reasonable effective batch size
 
     # Learning rate optimized for single epoch + memory constraints
     learning_rate=2e-4,  # Standard GPT-OSS learning rate
@@ -56,7 +56,7 @@ config = GPTOSSEnhancedCustomConfig(
     # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
     # ============================================================================
     model_name="openai/gpt-oss-20b",
-    max_seq_length=4096,  # Reduced from 3072 for memory optimization
+    max_seq_length=4096,  # Maximize sequence length for A100 VRAM utilization
     use_flash_attention=True,  # Critical for memory efficiency
     use_gradient_checkpointing=True,  # Essential for memory optimization
 
@@ -92,6 +92,7 @@ config = GPTOSSEnhancedCustomConfig(
     # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
     # ============================================================================
     use_quantization=True,
+    # MXFP4 per tutorial: https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers
     quantization_config={
         "dequantize": True,  # Use native MXFP4 as per GPT-OSS specs
         "load_in_4bit": False,  # Don't use BNB 4-bit with MXFP4
@@ -106,40 +107,39 @@ config = GPTOSSEnhancedCustomConfig(
     # ============================================================================
     # Model loading with memory constraints
     model_kwargs={
-        "attn_implementation": "kernels-community/vllm-flash-attn3",  # Much faster attention on A100/H100
+        # Rely on training script to set eager + bf16 for MXFP4
         "torch_dtype": "auto",  # Let model decide (MXFP4 compatible)
         "use_cache": False,  # Disable KV cache for training
         "device_map": "auto",  # Automatic device mapping
         "low_cpu_mem_usage": True,  # Critical for memory optimization
-        "max_memory": {0: "75GB"},  # Reserve memory for other processes
     },
 
     # Data loading optimized for throughput
     dataloader_num_workers=4,  # More workers for faster loading
     dataloader_pin_memory=True,  # Pin memory for faster host->GPU copies
-    dataloader_prefetch_factor=2,
+    dataloader_prefetch_factor=1,  # Lower prefetch to keep VRAM headroom
 
     # Memory management optimizations
-    max_memory_per_gpu="75GB",  # Explicit memory limit
+    max_memory_per_gpu=None,  # No explicit memory limit; use as much VRAM as available
     low_cpu_mem_usage=True,  # Essential for large models
     group_by_length=True,  # Efficient batching for memory
     remove_unused_columns=True,  # Remove unnecessary data
 
     # ============================================================================
-    # EVALUATION & LOGGING - Fast Iterations
+    # EVALUATION & LOGGING - Memory Safe
     # ============================================================================
     eval_strategy="steps",
-    eval_steps=500,  # Less frequent evaluation for memory
-    logging_steps=50,  # Reduced logging frequency
+    eval_steps=200,
+    logging_steps=10,
 
     save_strategy="steps",
-    save_steps=1000,  # Less frequent saves for memory/storage
+    save_steps=500,  # Less frequent saves for memory/storage
     save_total_limit=3,  # Keep only 2 checkpoints for memory
     save_only_model=True,  # Save only model weights
 
     metric_for_best_model="eval_loss",
     greater_is_better=False,
-    load_best_model_at_end=True,
+    load_best_model_at_end=False,  # Skip best model selection to save memory
 
     # Evaluation memory optimization
     eval_accumulation_steps=4,  # Accumulate eval outputs to save memory
@@ -164,7 +164,7 @@ config = GPTOSSEnhancedCustomConfig(
 
     # Generation config optimized for GPT-OSS harmony format (exact template compliance)
     generation_config={
-        "max_new_tokens": 256,  # Reduced for memory efficiency
+        "max_new_tokens": 1024,
         "do_sample": True,
         "temperature": 0.6,  # Slightly lower for more focused training
         "top_p": 0.9,
@@ -214,7 +214,7 @@ config = GPTOSSEnhancedCustomConfig(
 # Configuration validation and optimization tips
 print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
 print("=" * 60)
-print(f"📊 Dataset: {config.dataset_name} (200K samples)")
+print(f"📊 Dataset: {config.dataset_name} (600K samples)")
 print(f"🗣️ Language: French with GPT-OSS Harmony Format")
 print(f"📈 Training: {config.num_train_epochs} epoch (memory optimized)")
 print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
@@ -230,7 +230,7 @@ print(" • Native MXFP4 quantization for GPT-OSS MoE layers")
 print(" • Reduced batch size with increased gradient accumulation")
 print(" • Limited sequence length for memory efficiency")
 print(" • Reduced LoRA rank while maintaining effectiveness")
-print(" • Dataset sampling (200K from 800K) for faster training")
+print(" • Dataset sampling (600K from 800K) for faster training")
 print(" • Gradient checkpointing and efficient data loading")
 print(" • Exact GPT-OSS Harmony format with <|return|> tokens")
 print("=" * 60)
scripts/training/train_gpt_oss.py CHANGED
@@ -28,6 +28,10 @@ config_dir = project_root / "config"
 if str(config_dir) not in sys.path:
     sys.path.insert(0, str(config_dir))
 
+# Reduce tokenizer thread contention and improve CUDA allocator behavior
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+
 def load_gpt_oss_model_and_tokenizer(config):
     """Load GPT-OSS model and tokenizer with proper configuration"""
 
@@ -48,7 +52,13 @@ def load_gpt_oss_model_and_tokenizer(config):
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4"
         )
-    elif config.quantization_config and config.quantization_config.get("dequantize"):
+    elif config.quantization_config and (
+        config.quantization_config.get("dequantize")
+        or (
+            isinstance(config.quantization_config.get("mxfp4_config"), dict)
+            and config.quantization_config["mxfp4_config"].get("enabled", False)
+        )
+    ):
         # Try to use Mxfp4Config if available (as per tutorial)
         try:
             from transformers import Mxfp4Config
@@ -75,11 +85,40 @@ def load_gpt_oss_model_and_tokenizer(config):
         model_kwargs = {**default_model_kwargs, **cfg_model_kwargs}
     else:
         model_kwargs = default_model_kwargs.copy()
+
+    # Normalize torch_dtype if provided as a string in config
+    if isinstance(model_kwargs.get("torch_dtype"), str):
+        dtype_str = str(model_kwargs["torch_dtype"]).lower()
+        if dtype_str in {"bf16", "bfloat16"}:
+            model_kwargs["torch_dtype"] = torch.bfloat16
+        elif dtype_str in {"fp16", "float16", "half"}:
+            model_kwargs["torch_dtype"] = torch.float16
+        elif dtype_str == "auto":
+            # Leave as-is for HF to decide
+            pass
+        else:
+            # Fallback to bfloat16 for safer memory footprint on A100/H100
+            model_kwargs["torch_dtype"] = torch.bfloat16
+
+    # Ensure we have an offload folder for tight-memory setups
+    model_kwargs.setdefault("offload_folder", os.path.join(str(project_root), "offload"))
 
     # Only add quantization_config if it's not None
    if quantization_config is not None:
         model_kwargs["quantization_config"] = quantization_config
 
+    # If using MXFP4, follow tutorial exactly: eager attention + bf16
+    try:
+        from transformers import Mxfp4Config as _Mxfp4Config
+        if isinstance(quantization_config, _Mxfp4Config):
+            model_kwargs["attn_implementation"] = "eager"
+            model_kwargs["torch_dtype"] = torch.bfloat16
+            model_kwargs["use_cache"] = False
+            model_kwargs["device_map"] = model_kwargs.get("device_map", "auto")
+            model_kwargs["quantization_config"] = quantization_config
+    except Exception:
+        pass
+
     model = AutoModelForCausalLM.from_pretrained(config.model_name, **model_kwargs)
 
     return model, tokenizer
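The two MXFP4 paths above (the dict-based quantization_config in the config file and the Mxfp4Config branch in the training script) are meant to converge on the loading recipe from the linked cookbook tutorial: dequantized MXFP4 weights, eager attention, and bf16 compute. A minimal standalone sketch of that target call, assuming a transformers version that ships Mxfp4Config (illustrative, not an excerpt from the script):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config

    model_name = "openai/gpt-oss-20b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Dequantize native MXFP4 weights on load, then train in bf16
    quantization_config = Mxfp4Config(dequantize=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        attn_implementation="eager",   # pairing used by the tutorial for MXFP4
        torch_dtype=torch.bfloat16,    # bf16 compute
        use_cache=False,               # no KV cache during training
        device_map="auto",
        quantization_config=quantization_config,
    )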