Tonic committed on
Commit 598357a · 1 parent: eb9e91f

Improves the launcher with model-family selection and defaults based on the selected options, updates the TRL trainer, removes the TRL config path by switching to the TrainingArguments class, updates the tokenizer parameters passed to SFTTrainer, and resolves the evaluation_strategy error.

Files changed (2):
  1. launch.sh +137 -104
  2. scripts/training/train_gpt_oss.py +5 -7
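
For context, the net effect of the trainer changes in this commit is roughly the following wiring. This is a minimal illustrative sketch, not the project's actual code: the output directory, epoch/batch/learning-rate values, LoRA settings, dataset split, and the "text" column are placeholders, and keyword support for tokenizer, dataset_text_field, and max_seq_length on SFTTrainer depends on the installed trl version (the sketch mirrors the call made in this commit).

# Sketch only: trainer setup after this commit, with placeholder values.
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer

model_name = "openai/gpt-oss-20b"  # model used by the GPT-OSS configs
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Placeholder LoRA settings; the project's actual adapter config may differ.
peft_model = get_peft_model(model, LoraConfig(task_type="CAUSAL_LM", r=16, target_modules="all-linear"))

# Dataset named in the launcher configs; the real script prepares a "text" column.
dataset = load_dataset("legmlai/openhermes-fr", split="train")

# TrainingArguments replaces trl's SFTConfig here, so the TRL-specific config
# path (and the old evaluation_strategy keyword) is no longer involved.
sft_config = TrainingArguments(
    output_dir="./output",            # placeholder
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=2e-4,
)

trainer = SFTTrainer(
    model=peft_model,
    args=sft_config,
    train_dataset=dataset,
    tokenizer=tokenizer,              # was processing_class=tokenizer before this commit
    dataset_text_field="text",        # assumes the data prep builds a "text" column
    max_seq_length=2048,
)
trainer.train()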
launch.sh CHANGED
@@ -158,112 +158,116 @@ except:
  fi
  }
 
- # Function to show training configurations
+ # Function to show training configurations (optionally filtered by family)
  show_training_configs() {
+ local family="$1" # Optional: "SmolLM3" or "GPT-OSS"
  echo ""
  print_header "Available Training Configurations"
  echo "======================================"
  echo ""
- echo "=== SmolLM3 Configurations ==="
- echo "1. Basic Training (Default)"
- echo " - Model: SmolLM3-3B"
- echo " - Dataset: SmolTalk"
- echo " - Epochs: 3"
- echo " - Batch Size: 2"
- echo " - Learning Rate: 5e-6"
- echo ""
- echo "2. H100 Lightweight (Rapid)"
- echo " - Model: SmolLM3-3B"
- echo " - Dataset: OpenHermes-FR (80K samples)"
- echo " - Epochs: 1"
- echo " - Batch Size: 16"
- echo " - Learning Rate: 8e-6"
- echo " - Sequence Length: 8192"
- echo " - Optimized for H100 rapid training"
- echo ""
- echo "3. A100 Large Scale"
- echo " - Model: SmolLM3-3B"
- echo " - Dataset: OpenHermes-FR"
- echo " - Epochs: 1.3 passes"
- echo " - Batch Size: 8"
- echo " - Learning Rate: 5e-6"
- echo " - Sequence Length: 8192"
- echo ""
- echo "4. Multiple Passes"
- echo " - Model: SmolLM3-3B"
- echo " - Dataset: OpenHermes-FR"
- echo " - Epochs: 4 passes"
- echo " - Batch Size: 6"
- echo " - Learning Rate: 3e-6"
- echo " - Sequence Length: 8192"
- echo ""
- echo "=== GPT-OSS Configurations ==="
- echo "5. GPT-OSS Basic Training"
- echo " - Model: openai/gpt-oss-20b"
- echo " - Dataset: Multilingual-Thinking"
- echo " - Epochs: 1"
- echo " - Batch Size: 4"
- echo " - Learning Rate: 2e-4"
- echo " - LoRA + MXFP4 Quantization"
- echo " - Optimized for multilingual reasoning"
- echo ""
- echo "6. GPT-OSS H100 Optimized"
- echo " - Model: openai/gpt-oss-20b"
- echo " - Dataset: Multilingual-Thinking"
- echo " - Epochs: 2"
- echo " - Batch Size: 8"
- echo " - Learning Rate: 3e-4"
- echo " - Enhanced LoRA (rank 16)"
- echo " - Optimized for H100 performance"
- echo ""
- echo "7. GPT-OSS Multilingual Reasoning"
- echo " - Model: openai/gpt-oss-20b"
- echo " - Dataset: Multilingual-Thinking"
- echo " - Epochs: 1"
- echo " - Batch Size: 4"
- echo " - Learning Rate: 2e-4"
- echo " - Specialized for reasoning tasks"
- echo " - Supports 10+ languages"
- echo ""
- echo "8. GPT-OSS Memory Optimized"
- echo " - Model: openai/gpt-oss-20b"
- echo " - Dataset: Multilingual-Thinking"
- echo " - Epochs: 1"
- echo " - Batch Size: 1 (effective 16 with accumulation)"
- echo " - Learning Rate: 2e-4"
- echo " - 4-bit quantization + reduced LoRA"
- echo " - Optimized for limited GPU memory"
- echo ""
- echo "9. GPT-OSS OpenHermes-FR (Recommended)"
- echo " - Model: openai/gpt-oss-20b"
- echo " - Dataset: legmlai/openhermes-fr (800K French examples)"
- echo " - Epochs: 1.5"
- echo " - Batch Size: 6 (effective 36 with accumulation)"
- echo " - Learning Rate: 2.5e-4"
- echo " - Optimized for French language training"
- echo " - Quality filtering enabled"
- echo ""
- echo "10. GPT-OSS OpenHermes-FR Memory Optimized"
- echo " - Model: openai/gpt-oss-20b"
- echo " - Dataset: legmlai/openhermes-fr (200K samples)"
- echo " - Epochs: 1"
- echo " - Batch Size: 2 (effective 32 with accumulation)"
- echo " - Learning Rate: 2e-4"
- echo " - Native MXFP4 quantization"
- echo " - Memory optimized for 40-80GB GPUs"
- echo " - Harmony format compatible"
- echo ""
- echo "10. GPT-OSS Custom Dataset"
- echo " - Model: openai/gpt-oss-20b"
- echo " - Dataset: User-defined (fully customizable)"
- echo " - Epochs: Configurable"
- echo " - Batch Size: Configurable"
- echo " - Learning Rate: Configurable"
- echo " - Maximum flexibility with all parameters"
- echo ""
- echo "11. Custom Configuration"
- echo " - User-defined parameters"
- echo ""
+
+ if [ -z "$family" ] || [ "$family" = "SmolLM3" ]; then
+ echo "=== SmolLM3 Configurations ==="
+ echo "1. Basic Training (Default)"
+ echo " - Model: SmolLM3-3B"
+ echo " - Dataset: SmolTalk"
+ echo " - Epochs: 3"
+ echo " - Batch Size: 2"
+ echo " - Learning Rate: 5e-6"
+ echo ""
+ echo "2. H100 Lightweight (Rapid)"
+ echo " - Model: SmolLM3-3B"
+ echo " - Dataset: OpenHermes-FR (80K samples)"
+ echo " - Epochs: 1"
+ echo " - Batch Size: 16"
+ echo " - Learning Rate: 8e-6"
+ echo " - Sequence Length: 8192"
+ echo " - Optimized for H100 rapid training"
+ echo ""
+ echo "3. A100 Large Scale"
+ echo " - Model: SmolLM3-3B"
+ echo " - Dataset: OpenHermes-FR"
+ echo " - Epochs: 1.3 passes"
+ echo " - Batch Size: 8"
+ echo " - Learning Rate: 5e-6"
+ echo " - Sequence Length: 8192"
+ echo ""
+ echo "4. Multiple Passes"
+ echo " - Model: SmolLM3-3B"
+ echo " - Dataset: OpenHermes-FR"
+ echo " - Epochs: 4 passes"
+ echo " - Batch Size: 6"
+ echo " - Learning Rate: 3e-6"
+ echo " - Sequence Length: 8192"
+ echo ""
+ fi
+
+ if [ -z "$family" ] || [ "$family" = "GPT-OSS" ]; then
+ echo "=== GPT-OSS Configurations ==="
+ echo "1. GPT-OSS Basic Training"
+ echo " - Model: openai/gpt-oss-20b"
+ echo " - Dataset: Multilingual-Thinking"
+ echo " - Epochs: 1"
+ echo " - Batch Size: 4"
+ echo " - Learning Rate: 2e-4"
+ echo " - LoRA + MXFP4 Quantization"
+ echo " - Optimized for multilingual reasoning"
+ echo ""
+ echo "2. GPT-OSS H100 Optimized"
+ echo " - Model: openai/gpt-oss-20b"
+ echo " - Dataset: Multilingual-Thinking"
+ echo " - Epochs: 2"
+ echo " - Batch Size: 8"
+ echo " - Learning Rate: 3e-4"
+ echo " - Enhanced LoRA (rank 16)"
+ echo " - Optimized for H100 performance"
+ echo ""
+ echo "3. GPT-OSS Multilingual Reasoning"
+ echo " - Model: openai/gpt-oss-20b"
+ echo " - Dataset: Multilingual-Thinking"
+ echo " - Epochs: 1"
+ echo " - Batch Size: 4"
+ echo " - Learning Rate: 2e-4"
+ echo " - Specialized for reasoning tasks"
+ echo " - Supports 10+ languages"
+ echo ""
+ echo "4. GPT-OSS Memory Optimized"
+ echo " - Model: openai/gpt-oss-20b"
+ echo " - Dataset: Multilingual-Thinking"
+ echo " - Epochs: 1"
+ echo " - Batch Size: 1 (effective 16 with accumulation)"
+ echo " - Learning Rate: 2e-4"
+ echo " - 4-bit quantization + reduced LoRA"
+ echo " - Optimized for limited GPU memory"
+ echo ""
+ echo "5. GPT-OSS OpenHermes-FR (Recommended)"
+ echo " - Model: openai/gpt-oss-20b"
+ echo " - Dataset: legmlai/openhermes-fr (800K French examples)"
+ echo " - Epochs: 1.5"
+ echo " - Batch Size: 6 (effective 36 with accumulation)"
+ echo " - Learning Rate: 2.5e-4"
+ echo " - Optimized for French language training"
+ echo " - Quality filtering enabled"
+ echo ""
+ echo "6. GPT-OSS OpenHermes-FR Memory Optimized"
+ echo " - Model: openai/gpt-oss-20b"
+ echo " - Dataset: legmlai/openhermes-fr (200K samples)"
+ echo " - Epochs: 1"
+ echo " - Batch Size: 2 (effective 32 with accumulation)"
+ echo " - Learning Rate: 2e-4"
+ echo " - Native MXFP4 quantization"
+ echo " - Memory optimized for 40-80GB GPUs"
+ echo " - Harmony format compatible"
+ echo ""
+ echo "7. GPT-OSS Custom Dataset"
+ echo " - Model: openai/gpt-oss-20b"
+ echo " - Dataset: User-defined (fully customizable)"
+ echo " - Epochs: Configurable"
+ echo " - Batch Size: Configurable"
+ echo " - Learning Rate: Configurable"
+ echo " - Maximum flexibility with all parameters"
+ echo ""
+ fi
  }
 
  # Function to get training configuration
@@ -785,11 +789,40 @@ HF_TOKEN="$HF_WRITE_TOKEN"
  print_step "Step 2: Training Configuration"
  echo "=================================="
 
- show_training_configs
- select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "GPT-OSS OpenHermes-FR (Recommended)" "GPT-OSS OpenHermes-FR Memory Optimized" "GPT-OSS Custom Dataset" "Custom Configuration" TRAINING_CONFIG_TYPE
+ # 2.1 Select model family first
+ select_option "Select model family:" "SmolLM3" "GPT-OSS" MODEL_FAMILY
+
+ # 2.2 Show only the configurations for the selected family and prompt selection
+ show_training_configs "$MODEL_FAMILY"
+ if [ "$MODEL_FAMILY" = "SmolLM3" ]; then
+ select_option "Select training configuration:" \
+ "Basic Training" \
+ "H100 Lightweight (Rapid)" \
+ "A100 Large Scale" \
+ "Multiple Passes" \
+ "Custom Configuration" \
+ TRAINING_CONFIG_TYPE
+ else
+ select_option "Select training configuration:" \
+ "GPT-OSS Basic Training" \
+ "GPT-OSS H100 Optimized" \
+ "GPT-OSS Multilingual Reasoning" \
+ "GPT-OSS Memory Optimized" \
+ "GPT-OSS OpenHermes-FR (Recommended)" \
+ "GPT-OSS OpenHermes-FR Memory Optimized" \
+ "GPT-OSS Custom Dataset" \
+ TRAINING_CONFIG_TYPE
+ fi
 
  get_training_config "$TRAINING_CONFIG_TYPE"
 
+ # 2.3 Set a family-specific default model description for the model card
+ if [ "$MODEL_FAMILY" = "GPT-OSS" ]; then
+ DEFAULT_MODEL_DESCRIPTION="A fine-tuned GPT-OSS-20B model optimized for multilingual reasoning and instruction following."
+ else
+ DEFAULT_MODEL_DESCRIPTION="A fine-tuned SmolLM3-3B model optimized for instruction following and French language tasks."
+ fi
+
  # Step 3: Get experiment details
  print_step "Step 3: Experiment Details"
  echo "=============================="
@@ -1042,7 +1075,7 @@ print_info "This information will be used in the model card and citation."
  get_input "Author name for model card" "$HF_USERNAME" AUTHOR_NAME
 
  print_info "Model description will be used in the model card and repository."
- get_input "Model description" "A fine-tuned version of SmolLM3-3B for improved french language text generation and conversation capabilities." MODEL_DESCRIPTION
+ get_input "Model description" "$DEFAULT_MODEL_DESCRIPTION" MODEL_DESCRIPTION
 
  # Step 9: Deploy Trackio Space (automated)
  print_step "Step 9: Deploying Trackio Space"
scripts/training/train_gpt_oss.py CHANGED
@@ -9,9 +9,9 @@ import os
  import sys
  import argparse
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
  from peft import LoraConfig, get_peft_model
- from trl import SFTTrainer, SFTConfig
+ from trl import SFTTrainer
  from datasets import load_dataset
  from pathlib import Path
 
@@ -353,7 +353,6 @@ def create_sft_config(config, output_dir):
  # Learning rate configuration
  learning_rate = config.learning_rate
  lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
- lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})
 
  # Batch configuration
  per_device_train_batch_size = config.batch_size
@@ -387,7 +386,7 @@ def create_sft_config(config, output_dir):
  print(f" • Gradient accumulation: {gradient_accumulation_steps}")
  print(f" • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")
 
- sft_config = SFTConfig(
+ sft_config = TrainingArguments(
  # Training duration
  num_train_epochs=num_train_epochs,
  max_steps=max_steps,
@@ -395,7 +394,6 @@ def create_sft_config(config, output_dir):
  # Learning rate
  learning_rate=learning_rate,
  lr_scheduler_type=lr_scheduler_type,
- lr_scheduler_kwargs=lr_scheduler_kwargs,
  warmup_ratio=warmup_ratio,
  warmup_steps=warmup_steps,
 
@@ -442,7 +440,7 @@ def create_sft_config(config, output_dir):
  push_to_hub=push_to_hub,
 
  # Monitoring
- report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
+ report_to=("trackio" if getattr(config, 'enable_tracking', False) else None),
  )
 
  return sft_config
@@ -504,7 +502,7 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
  model=peft_model,
  args=sft_config,
  train_dataset=dataset,
- processing_class=tokenizer,
+ tokenizer=tokenizer,
  dataset_text_field="text",
  max_seq_length=getattr(config, 'max_seq_length', 2048),
  )
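
A brief note on the "evaluation_strategy error" mentioned in the commit message: recent transformers releases renamed that TrainingArguments keyword to eval_strategy, so the older spelling can raise an error during argument construction, which is the likely background for this fix. A minimal sketch of the current spelling, assuming a recent transformers version (the values are placeholders, not this project's settings):

# Sketch only: the eval_strategy spelling expected by recent transformers.
# Older code that passed evaluation_strategy="steps" fails on these versions.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./output",   # placeholder
    eval_strategy="steps",   # formerly evaluation_strategy="steps"
    eval_steps=100,
)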