Tonic committed
Commit 7181190 · 1 Parent(s): dfcb060

adds memory optimized configuration

config/train_gpt_oss_memory_optimized.py ADDED
@@ -0,0 +1,144 @@
+"""
+GPT-OSS Memory Optimized Training Configuration
+Based on OpenAI's GPT-OSS fine-tuning tutorial
+Optimized for limited GPU memory (40-80GB)
+"""
+import os
+from dataclasses import dataclass
+from typing import Optional
+
+@dataclass
+class GPTOSSMemoryOptimizedConfig:
+    """Memory-optimized configuration for GPT-OSS fine-tuning"""
+    trainer_type: str = "sft"
+    model_name: str = "openai/gpt-oss-20b"
+    max_seq_length: int = 1024  # Reduced from 4096
+    use_flash_attention: bool = True
+    use_gradient_checkpointing: bool = True
+    batch_size: int = 1  # Reduced from 8
+    gradient_accumulation_steps: int = 16  # Increased to maintain effective batch size
+    learning_rate: float = 2e-4
+    weight_decay: float = 0.01
+    warmup_steps: int = 50
+    max_iters: int = 500  # Reduced for faster testing
+    eval_interval: int = 50
+    log_interval: int = 5
+    save_interval: int = 100
+    optimizer: str = "adamw_torch"
+    beta1: float = 0.9
+    beta2: float = 0.95
+    eps: float = 1e-8
+    scheduler: str = "cosine_with_min_lr"
+    min_lr: float = 2e-5
+    lr_scheduler_kwargs: dict = None
+    fp16: bool = False
+    bf16: bool = True
+    ddp_backend: str = "nccl"
+    ddp_find_unused_parameters: bool = False
+    save_steps: int = 100
+    eval_steps: int = 50
+    logging_steps: int = 5
+    save_total_limit: Optional[int] = 2
+    eval_strategy: str = "steps"
+    metric_for_best_model: str = "eval_loss"
+    greater_is_better: bool = False
+    load_best_model_at_end: bool = True
+    dataset_name: str = "HuggingFaceH4/Multilingual-Thinking"
+    dataset_split: str = "train"
+    input_field: str = "messages"
+    target_field: str = None
+    filter_bad_entries: bool = False
+    bad_entry_field: str = "bad_entry"
+    use_chat_template: bool = True
+    chat_template_kwargs: dict = None
+    enable_tracking: bool = True
+    trackio_url: Optional[str] = None
+    trackio_token: Optional[str] = None
+    log_artifacts: bool = True
+    log_metrics: bool = True
+    log_config: bool = True
+    experiment_name: Optional[str] = None
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+    use_lora: bool = True
+    lora_config: dict = None
+    use_quantization: bool = True
+    quantization_config: dict = None
+    model_kwargs: dict = None
+    generation_config: dict = None
+    reasoning_languages: list = None
+
+    def __post_init__(self):
+        """Set default values for complex fields"""
+        if self.lora_config is None:
+            self.lora_config = {
+                "r": 4,  # Reduced from 16
+                "lora_alpha": 8,  # Reduced from 32
+                "target_modules": "all-linear",
+                "target_parameters": [
+                    "7.mlp.experts.gate_up_proj",
+                    "7.mlp.experts.down_proj",
+                    "15.mlp.experts.gate_up_proj",
+                    "15.mlp.experts.down_proj",
+                    "23.mlp.experts.gate_up_proj",
+                    "23.mlp.experts.down_proj",
+                ],
+                "bias": "none",
+                "task_type": "CAUSAL_LM"
+            }
+
+        if self.quantization_config is None:
+            self.quantization_config = {
+                "dequantize": True,
+                "load_in_4bit": True,
+                "bnb_4bit_compute_dtype": "bfloat16",
+                "bnb_4bit_use_double_quant": True,
+                "bnb_4bit_quant_type": "nf4"
+            }
+
+        if self.model_kwargs is None:
+            self.model_kwargs = {
+                "attn_implementation": "eager",
+                "torch_dtype": "auto",
+                "use_cache": False,
+                "device_map": "auto",
+                "low_cpu_mem_usage": True,
+                "max_memory": {0: "75GB"},  # Reserve some memory
+            }
+
+        if self.generation_config is None:
+            self.generation_config = {
+                "max_new_tokens": 256,  # Reduced from 512
+                "do_sample": True,
+                "temperature": 0.6,
+                "top_p": 0.9,
+                "repetition_penalty": 1.1
+            }
+
+        if self.reasoning_languages is None:
+            self.reasoning_languages = [
+                "English", "Spanish", "French", "Italian", "German",
+                "Chinese", "Hindi", "Japanese", "Korean", "Arabic"
+            ]
+
+        if self.lr_scheduler_kwargs is None:
+            self.lr_scheduler_kwargs = {"min_lr_rate": 0.1}
+
+        if self.chat_template_kwargs is None:
+            self.chat_template_kwargs = {
+                "add_generation_prompt": True,
+                "tokenize": False,
+                "auto_insert_role": True
+            }
+
+        # Print memory optimization stats
+        effective_batch_size = self.batch_size * self.gradient_accumulation_steps
+        print("=== GPT-OSS Memory Optimized Configuration ===")
+        print(f"Effective batch size: {effective_batch_size}")
+        print(f"Max sequence length: {self.max_seq_length}")
+        print(f"LoRA rank: {self.lora_config['r']}")
+        print(f"Gradient accumulation steps: {self.gradient_accumulation_steps}")
+        print("Memory optimization: Enabled")
+        print(f"Quantization: {self.quantization_config}")
+        print(f"Max memory per GPU: {self.model_kwargs.get('max_memory', 'Auto')}")
+        print("==================================================")
launch.sh CHANGED
@@ -225,7 +225,16 @@ show_training_configs() {
     echo " - Specialized for reasoning tasks"
     echo " - Supports 10+ languages"
     echo ""
-    echo "8. Custom Configuration"
+    echo "8. GPT-OSS Memory Optimized"
+    echo " - Model: openai/gpt-oss-20b"
+    echo " - Dataset: Multilingual-Thinking"
+    echo " - Epochs: 1"
+    echo " - Batch Size: 1 (effective 16 with accumulation)"
+    echo " - Learning Rate: 2e-4"
+    echo " - 4-bit quantization + reduced LoRA"
+    echo " - Optimized for limited GPU memory"
+    echo ""
+    echo "9. Custom Configuration"
     echo " - User-defined parameters"
     echo ""
 }
@@ -306,6 +315,16 @@ get_training_config() {
             MAX_SEQ_LENGTH=2048
             CONFIG_FILE="config/train_gpt_oss_multilingual_reasoning.py"
             ;;
+        "GPT-OSS Memory Optimized")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="HuggingFaceH4/Multilingual-Thinking"
+            MAX_EPOCHS=1
+            BATCH_SIZE=1
+            GRADIENT_ACCUMULATION_STEPS=16
+            LEARNING_RATE=2e-4
+            MAX_SEQ_LENGTH=1024
+            CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
+            ;;
         "Custom Configuration")
             get_custom_config
             ;;
@@ -478,7 +497,7 @@ print_step "Step 2: Training Configuration"
 echo "=================================="
 
 show_training_configs
-select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "Custom Configuration" TRAINING_CONFIG_TYPE
+select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "Custom Configuration" TRAINING_CONFIG_TYPE
 
 get_training_config "$TRAINING_CONFIG_TYPE"
 
requirements/requirements_core.txt CHANGED
@@ -20,4 +20,5 @@ pynvml>=12.0.0
 
 # GPT-OSS specific dependencies
 # Note: GPT-OSS requires specific versions for optimal performance
-# These are compatible with the tutorial requirements
+# These are compatible with the tutorial requirements
+bitsandbytes>=0.41.0  # For 4-bit quantization
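As a quick sanity check that the new dependency is satisfied before launching a run, a small hedged snippet (the version floor simply mirrors the requirements line; packaging is assumed to be installed, as it typically ships with transformers):

# Sketch: verify the installed bitsandbytes meets the >=0.41.0 pin before training.
import bitsandbytes as bnb
from packaging import version

assert version.parse(bnb.__version__) >= version.parse("0.41.0"), (
    f"bitsandbytes {bnb.__version__} is older than the 0.41.0 required for 4-bit NF4"
)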
scripts/training/train_gpt_oss.py CHANGED
@@ -23,10 +23,20 @@ def load_gpt_oss_model_and_tokenizer(config):
     print("Loading GPT-OSS model with quantization...")
 
     # Import quantization config
-    from transformers import Mxfp4Config
-
-    # Set up quantization config
-    quantization_config = Mxfp4Config(dequantize=True)
+    from transformers import BitsAndBytesConfig
+
+    # Set up quantization config based on config
+    if config.quantization_config and config.quantization_config.get("load_in_4bit"):
+        # Use BitsAndBytesConfig for 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+    else:
+        # No 4-bit quantization requested; load the model unquantized
+        quantization_config = None
 
     # Model kwargs as per tutorial
     model_kwargs = {
@@ -144,7 +154,7 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
         # Try to find a config class
         for attr_name in dir(config_module):
             attr = getattr(config_module, attr_name)
-            if hasattr(attr, 'model_name') and 'gpt_oss' in attr.model_name.lower():
+            if hasattr(attr, 'model_name') and ('gpt_oss' in attr.model_name.lower() or 'GPTOSS' in attr_name):
                 config = attr
                 break
         else:
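To make the intent of the BitsAndBytesConfig branch concrete, here is a hedged sketch of how such a config is typically passed to model loading with transformers; it mirrors the values used above and in the new config file, not the exact call site in train_gpt_oss.py:

# Sketch only: load openai/gpt-oss-20b in 4-bit NF4, matching the new quantization branch.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    quantization_config=bnb_config,
    device_map="auto",          # matches model_kwargs in the memory-optimized config
    low_cpu_mem_usage=True,
)
model.config.use_cache = False  # caching is disabled during training, as in the config's use_cache: False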