Commit: adds a100 memory optimized
config/train_gpt_oss_openhermes_fr_memory_optimized.py (CHANGED)
@@ -41,9 +41,9 @@ config = GPTOSSEnhancedCustomConfig(
     # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
     # ============================================================================
     # Batch configuration following memory optimization principles
-    num_train_epochs=1.0,
-    batch_size=
-    gradient_accumulation_steps=
+    num_train_epochs=1.0,            # Single epoch to reduce memory pressure
+    batch_size=2,                    # A100-safe per-device batch size
+    gradient_accumulation_steps=16,  # Maintain reasonable effective batch size

     # Learning rate optimized for single epoch + memory constraints
     learning_rate=2e-4,              # Standard GPT-OSS learning rate
@@ -56,7 +56,7 @@ config = GPTOSSEnhancedCustomConfig(
     # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
     # ============================================================================
     model_name="openai/gpt-oss-20b",
-    max_seq_length=4096,
+    max_seq_length=4096,             # Maximize sequence length for A100 VRAM utilization
     use_flash_attention=True,        # Critical for memory efficiency
     use_gradient_checkpointing=True, # Essential for memory optimization

@@ -92,6 +92,7 @@ config = GPTOSSEnhancedCustomConfig(
     # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
     # ============================================================================
     use_quantization=True,
+    # MXFP4 per tutorial: https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers
     quantization_config={
         "dequantize": True,          # Use native MXFP4 as per GPT-OSS specs
         "load_in_4bit": False,       # Don't use BNB 4-bit with MXFP4
@@ -106,40 +107,39 @@ config = GPTOSSEnhancedCustomConfig(
     # ============================================================================
     # Model loading with memory constraints
     model_kwargs={
-
+        # Rely on training script to set eager + bf16 for MXFP4
         "torch_dtype": "auto",       # Let model decide (MXFP4 compatible)
         "use_cache": False,          # Disable KV cache for training
         "device_map": "auto",        # Automatic device mapping
         "low_cpu_mem_usage": True,   # Critical for memory optimization
-        "max_memory": {0: "75GB"},   # Reserve memory for other processes
     },

     # Data loading optimized for throughput
     dataloader_num_workers=4,        # More workers for faster loading
     dataloader_pin_memory=True,      # Pin memory for faster host->GPU copies
-    dataloader_prefetch_factor=
+    dataloader_prefetch_factor=1,    # Lower prefetch to keep VRAM headroom

     # Memory management optimizations
-    max_memory_per_gpu=
+    max_memory_per_gpu=None,         # No explicit memory limit; use as much VRAM as available
     low_cpu_mem_usage=True,          # Essential for large models
     group_by_length=True,            # Efficient batching for memory
     remove_unused_columns=True,      # Remove unnecessary data

     # ============================================================================
-    # EVALUATION & LOGGING -
+    # EVALUATION & LOGGING - Memory Safe
     # ============================================================================
     eval_strategy="steps",
-    eval_steps=
-    logging_steps=
+    eval_steps=200,
+    logging_steps=10,

     save_strategy="steps",
-    save_steps=
+    save_steps=500,                  # Less frequent saves for memory/storage
     save_total_limit=3,              # Keep only 2 checkpoints for memory
     save_only_model=True,            # Save only model weights

     metric_for_best_model="eval_loss",
     greater_is_better=False,
-    load_best_model_at_end=
+    load_best_model_at_end=False,    # Skip best model selection to save memory

     # Evaluation memory optimization
     eval_accumulation_steps=4,       # Accumulate eval outputs to save memory
@@ -164,7 +164,7 @@ config = GPTOSSEnhancedCustomConfig(

     # Generation config optimized for GPT-OSS harmony format (exact template compliance)
     generation_config={
-        "max_new_tokens":
+        "max_new_tokens": 1024,
         "do_sample": True,
         "temperature": 0.6,          # Slightly lower for more focused training
         "top_p": 0.9,
@@ -214,7 +214,7 @@ config = GPTOSSEnhancedCustomConfig(
 # Configuration validation and optimization tips
 print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
 print("=" * 60)
-print(f"π Dataset: {config.dataset_name} (
+print(f"π Dataset: {config.dataset_name} (600K samples)")
 print(f"🗣️ Language: French with GPT-OSS Harmony Format")
 print(f"π Training: {config.num_train_epochs} epoch (memory optimized)")
 print(f"π Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
@@ -230,7 +230,7 @@ print(" • Native MXFP4 quantization for GPT-OSS MoE layers")
 print(" • Reduced batch size with increased gradient accumulation")
 print(" • Limited sequence length for memory efficiency")
 print(" • Reduced LoRA rank while maintaining effectiveness")
-print(" • Dataset sampling (
+print(" • Dataset sampling (600K from 800K) for faster training")
 print(" • Gradient checkpointing and efficient data loading")
 print(" • Exact GPT-OSS Harmony format with <|return|> tokens")
 print("=" * 60)
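A quick, self-contained sketch of the arithmetic behind the new batch settings in the first hunk above (the values mirror batch_size=2, gradient_accumulation_steps=16, and max_seq_length=4096 from the diff; this is illustrative, not part of the commit):

    # Effective batch math for the memory-optimized settings above (illustrative).
    batch_size = 2                     # per-device micro-batch (A100-safe)
    gradient_accumulation_steps = 16   # micro-batches accumulated per optimizer step
    max_seq_length = 4096              # tokens per sequence

    effective_batch_size = batch_size * gradient_accumulation_steps  # 32 sequences per optimizer step
    max_tokens_per_step = effective_batch_size * max_seq_length      # up to 131,072 tokens per step

    print(f"Effective batch size: {effective_batch_size}")
    print(f"Max tokens per optimizer step: {max_tokens_per_step:,}")

This matches the "Effective Batch Size" line the config prints in its summary at the bottom of the file.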
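As a usage illustration only (not part of the commit), the generation_config values above map onto model.generate roughly as follows; the French prompt is a placeholder, and the GPT-OSS chat template is assumed to emit the harmony format the config's comments refer to:

    # Illustrative sketch: sampling with the generation_config values from the config above.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "openai/gpt-oss-20b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

    messages = [{"role": "user", "content": "Explique brièvement la photosynthèse."}]  # placeholder prompt
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    output = model.generate(
        input_ids,
        max_new_tokens=1024,  # from generation_config
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
    print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))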
scripts/training/train_gpt_oss.py (CHANGED)
@@ -28,6 +28,10 @@ config_dir = project_root / "config"
 if str(config_dir) not in sys.path:
     sys.path.insert(0, str(config_dir))

+# Reduce tokenizer thread contention and improve CUDA allocator behavior
+os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+
 def load_gpt_oss_model_and_tokenizer(config):
     """Load GPT-OSS model and tokenizer with proper configuration"""

@@ -48,7 +52,13 @@ def load_gpt_oss_model_and_tokenizer(config):
             bnb_4bit_use_double_quant=True,
             bnb_4bit_quant_type="nf4"
         )
-    elif config.quantization_config and
+    elif config.quantization_config and (
+        config.quantization_config.get("dequantize")
+        or (
+            isinstance(config.quantization_config.get("mxfp4_config"), dict)
+            and config.quantization_config["mxfp4_config"].get("enabled", False)
+        )
+    ):
         # Try to use Mxfp4Config if available (as per tutorial)
         try:
             from transformers import Mxfp4Config
@@ -75,11 +85,40 @@ def load_gpt_oss_model_and_tokenizer(config):
         model_kwargs = {**default_model_kwargs, **cfg_model_kwargs}
     else:
         model_kwargs = default_model_kwargs.copy()
+
+    # Normalize torch_dtype if provided as a string in config
+    if isinstance(model_kwargs.get("torch_dtype"), str):
+        dtype_str = str(model_kwargs["torch_dtype"]).lower()
+        if dtype_str in {"bf16", "bfloat16"}:
+            model_kwargs["torch_dtype"] = torch.bfloat16
+        elif dtype_str in {"fp16", "float16", "half"}:
+            model_kwargs["torch_dtype"] = torch.float16
+        elif dtype_str == "auto":
+            # Leave as-is for HF to decide
+            pass
+        else:
+            # Fallback to bfloat16 for safer memory footprint on A100/H100
+            model_kwargs["torch_dtype"] = torch.bfloat16
+
+    # Ensure we have an offload folder for tight-memory setups
+    model_kwargs.setdefault("offload_folder", os.path.join(str(project_root), "offload"))

     # Only add quantization_config if it's not None
     if quantization_config is not None:
         model_kwargs["quantization_config"] = quantization_config

+    # If using MXFP4, follow tutorial exactly: eager attention + bf16
+    try:
+        from transformers import Mxfp4Config as _Mxfp4Config
+        if isinstance(quantization_config, _Mxfp4Config):
+            model_kwargs["attn_implementation"] = "eager"
+            model_kwargs["torch_dtype"] = torch.bfloat16
+            model_kwargs["use_cache"] = False
+            model_kwargs["device_map"] = model_kwargs.get("device_map", "auto")
+            model_kwargs["quantization_config"] = quantization_config
+    except Exception:
+        pass
+
     model = AutoModelForCausalLM.from_pretrained(config.model_name, **model_kwargs)

     return model, tokenizer
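For orientation, a condensed, standalone sketch of the MXFP4 load path this script converges on (eager attention, bf16 compute, KV cache off), following the cookbook tutorial linked in the config. It assumes a transformers build that ships Mxfp4Config (the script already guards that import), and the kwargs approximate what the loader assembles rather than quoting it:

    # Approximate sketch of the MXFP4 load path targeted by the training script.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config

    model_name = "openai/gpt-oss-20b"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    quantization_config = Mxfp4Config(dequantize=True)  # mirrors {"dequantize": True} in the config file

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        attn_implementation="eager",  # paired with MXFP4 for training
        torch_dtype=torch.bfloat16,   # bf16 compute
        use_cache=False,              # KV cache off during training
        device_map="auto",
    )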