Improves the launcher with model-family selection and family-based defaults, updates the TRL trainer, removes the TRL config path by switching to the TrainingArguments class, passes the tokenizer to SFTTrainer, and resolves the evaluation_strategy error.
Files changed:
- launch.sh (+137 -104)
- scripts/training/train_gpt_oss.py (+5 -7)
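Note: the evaluation_strategy fix itself is not visible in the hunks below. The usual cause of that error is the transformers rename of the TrainingArguments keyword `evaluation_strategy` to `eval_strategy` (deprecated in 4.41 and removed in a later release). A minimal sketch of the current spelling, with illustrative values only:

    # The renamed keyword; "evaluation_strategy" raises a TypeError on
    # transformers versions that have removed it. Values are illustrative.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="./outputs",
        eval_strategy="steps",  # formerly evaluation_strategy="steps"
        eval_steps=100,
    )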
launch.sh (CHANGED)

@@ -158,112 +158,116 @@ except:
 fi
 }

-# Function to show training configurations
+# Function to show training configurations (optionally filtered by family)
 show_training_configs() {
+    local family="$1"  # Optional: "SmolLM3" or "GPT-OSS"
     echo ""
     print_header "Available Training Configurations"
     echo "======================================"
     echo ""
-    ... (old lines 167-266: the previous unfiltered configuration listing; content collapsed in the diff view)
+
+    if [ -z "$family" ] || [ "$family" = "SmolLM3" ]; then
+        echo "=== SmolLM3 Configurations ==="
+        echo "1. Basic Training (Default)"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: SmolTalk"
+        echo "   - Epochs: 3"
+        echo "   - Batch Size: 2"
+        echo "   - Learning Rate: 5e-6"
+        echo ""
+        echo "2. H100 Lightweight (Rapid)"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: OpenHermes-FR (80K samples)"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 16"
+        echo "   - Learning Rate: 8e-6"
+        echo "   - Sequence Length: 8192"
+        echo "   - Optimized for H100 rapid training"
+        echo ""
+        echo "3. A100 Large Scale"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: OpenHermes-FR"
+        echo "   - Epochs: 1.3 passes"
+        echo "   - Batch Size: 8"
+        echo "   - Learning Rate: 5e-6"
+        echo "   - Sequence Length: 8192"
+        echo ""
+        echo "4. Multiple Passes"
+        echo "   - Model: SmolLM3-3B"
+        echo "   - Dataset: OpenHermes-FR"
+        echo "   - Epochs: 4 passes"
+        echo "   - Batch Size: 6"
+        echo "   - Learning Rate: 3e-6"
+        echo "   - Sequence Length: 8192"
+        echo ""
+    fi
+
+    if [ -z "$family" ] || [ "$family" = "GPT-OSS" ]; then
+        echo "=== GPT-OSS Configurations ==="
+        echo "1. GPT-OSS Basic Training"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 4"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - LoRA + MXFP4 Quantization"
+        echo "   - Optimized for multilingual reasoning"
+        echo ""
+        echo "2. GPT-OSS H100 Optimized"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 2"
+        echo "   - Batch Size: 8"
+        echo "   - Learning Rate: 3e-4"
+        echo "   - Enhanced LoRA (rank 16)"
+        echo "   - Optimized for H100 performance"
+        echo ""
+        echo "3. GPT-OSS Multilingual Reasoning"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 4"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - Specialized for reasoning tasks"
+        echo "   - Supports 10+ languages"
+        echo ""
+        echo "4. GPT-OSS Memory Optimized"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: Multilingual-Thinking"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 1 (effective 16 with accumulation)"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - 4-bit quantization + reduced LoRA"
+        echo "   - Optimized for limited GPU memory"
+        echo ""
+        echo "5. GPT-OSS OpenHermes-FR (Recommended)"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: legmlai/openhermes-fr (800K French examples)"
+        echo "   - Epochs: 1.5"
+        echo "   - Batch Size: 6 (effective 36 with accumulation)"
+        echo "   - Learning Rate: 2.5e-4"
+        echo "   - Optimized for French language training"
+        echo "   - Quality filtering enabled"
+        echo ""
+        echo "6. GPT-OSS OpenHermes-FR Memory Optimized"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: legmlai/openhermes-fr (200K samples)"
+        echo "   - Epochs: 1"
+        echo "   - Batch Size: 2 (effective 32 with accumulation)"
+        echo "   - Learning Rate: 2e-4"
+        echo "   - Native MXFP4 quantization"
+        echo "   - Memory optimized for 40-80GB GPUs"
+        echo "   - Harmony format compatible"
+        echo ""
+        echo "7. GPT-OSS Custom Dataset"
+        echo "   - Model: openai/gpt-oss-20b"
+        echo "   - Dataset: User-defined (fully customizable)"
+        echo "   - Epochs: Configurable"
+        echo "   - Batch Size: Configurable"
+        echo "   - Learning Rate: Configurable"
+        echo "   - Maximum flexibility with all parameters"
+        echo ""
+    fi
 }

 # Function to get training configuration
@@ -785,11 +789,40 @@ HF_TOKEN="$HF_WRITE_TOKEN"
 print_step "Step 2: Training Configuration"
 echo "=================================="

-
-select_option "Select ...
+# 2.1 Select model family first
+select_option "Select model family:" "SmolLM3" "GPT-OSS" MODEL_FAMILY
+
+# 2.2 Show only the configurations for the selected family and prompt selection
+show_training_configs "$MODEL_FAMILY"
+if [ "$MODEL_FAMILY" = "SmolLM3" ]; then
+    select_option "Select training configuration:" \
+        "Basic Training" \
+        "H100 Lightweight (Rapid)" \
+        "A100 Large Scale" \
+        "Multiple Passes" \
+        "Custom Configuration" \
+        TRAINING_CONFIG_TYPE
+else
+    select_option "Select training configuration:" \
+        "GPT-OSS Basic Training" \
+        "GPT-OSS H100 Optimized" \
+        "GPT-OSS Multilingual Reasoning" \
+        "GPT-OSS Memory Optimized" \
+        "GPT-OSS OpenHermes-FR (Recommended)" \
+        "GPT-OSS OpenHermes-FR Memory Optimized" \
+        "GPT-OSS Custom Dataset" \
+        TRAINING_CONFIG_TYPE
+fi

 get_training_config "$TRAINING_CONFIG_TYPE"

+# 2.3 Set a family-specific default model description for the model card
+if [ "$MODEL_FAMILY" = "GPT-OSS" ]; then
+    DEFAULT_MODEL_DESCRIPTION="A fine-tuned GPT-OSS-20B model optimized for multilingual reasoning and instruction following."
+else
+    DEFAULT_MODEL_DESCRIPTION="A fine-tuned SmolLM3-3B model optimized for instruction following and French language tasks."
+fi
+
 # Step 3: Get experiment details
 print_step "Step 3: Experiment Details"
 echo "=============================="
@@ -1042,7 +1075,7 @@ print_info "This information will be used in the model card and citation."
 get_input "Author name for model card" "$HF_USERNAME" AUTHOR_NAME

 print_info "Model description will be used in the model card and repository."
-get_input "Model description" "...
+get_input "Model description" "$DEFAULT_MODEL_DESCRIPTION" MODEL_DESCRIPTION

 # Step 9: Deploy Trackio Space (automated)
 print_step "Step 9: Deploying Trackio Space"
scripts/training/train_gpt_oss.py (CHANGED)

@@ -9,9 +9,9 @@ import os
 import sys
 import argparse
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
 from peft import LoraConfig, get_peft_model
-from trl import SFTTrainer
+from trl import SFTTrainer
 from datasets import load_dataset
 from pathlib import Path
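Why this works: trl.SFTConfig subclasses transformers.TrainingArguments, so SFTTrainer's `args` can be a plain TrainingArguments instance; dropping the SFTConfig path removes the TRL-version-specific config keywords. A one-line check (hedged; assumes a TRL release that ships SFTConfig):

    # SFTConfig is a TrainingArguments subclass, so either type works as `args`.
    from transformers import TrainingArguments
    from trl import SFTConfig

    assert issubclass(SFTConfig, TrainingArguments)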
@@ -353,7 +353,6 @@ def create_sft_config(config, output_dir)
     # Learning rate configuration
     learning_rate = config.learning_rate
     lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
-    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})

     # Batch configuration
     per_device_train_batch_size = config.batch_size
@@ -387,7 +386,7 @@ def create_sft_config(config, output_dir)
     print(f" • Gradient accumulation: {gradient_accumulation_steps}")
     print(f" • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")

-    sft_config = ...
+    sft_config = TrainingArguments(
         # Training duration
         num_train_epochs=num_train_epochs,
         max_steps=max_steps,
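For orientation, a self-contained sketch of the construction this hunk switches to; every literal below is illustrative rather than taken from the repo's configs:

    # Hedged sketch of the TrainingArguments assembled by create_sft_config.
    from transformers import TrainingArguments

    enable_tracking = False  # stands in for getattr(config, 'enable_tracking', False)

    sft_config = TrainingArguments(
        output_dir="./outputs",
        # Training duration
        num_train_epochs=1,
        # Learning rate
        learning_rate=2e-4,
        lr_scheduler_type="cosine_with_min_lr",
        # The diff removes the explicit kwargs, but this scheduler reads its
        # minimum LR from lr_scheduler_kwargs, so a transformers version that
        # requires it would need e.g. {"min_lr_rate": 0.1} restored here.
        lr_scheduler_kwargs={"min_lr_rate": 0.1},
        warmup_ratio=0.03,
        # Monitoring ("none" disables reporting; the diff passes None, whose
        # resolution varies by transformers version)
        report_to="none",
    )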
@@ -395,7 +394,6 @@ def create_sft_config(config, output_dir)
         # Learning rate
         learning_rate=learning_rate,
         lr_scheduler_type=lr_scheduler_type,
-        lr_scheduler_kwargs=lr_scheduler_kwargs,
         warmup_ratio=warmup_ratio,
         warmup_steps=warmup_steps,
@@ -442,7 +440,7 @@ def create_sft_config(config, output_dir)
         push_to_hub=push_to_hub,

         # Monitoring
-        report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
+        report_to=("trackio" if getattr(config, 'enable_tracking', False) else None),
     )

     return sft_config
@@ -504,7 +502,7 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
     model=peft_model,
     args=sft_config,
     train_dataset=dataset,
-
+    tokenizer=tokenizer,
     dataset_text_field="text",
     max_seq_length=getattr(config, 'max_seq_length', 2048),
 )
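A hedged, small-scale sketch of the final wiring; a tiny public model and an in-memory dataset stand in for the repo's GPT-OSS setup, and the keyword set matches the TRL generation used here (newer TRL releases move `dataset_text_field`/`max_seq_length` onto SFTConfig and rename `tokenizer` to `processing_class`):

    # Runnable-at-toy-scale stand-in for the SFTTrainer call in this commit.
    from datasets import Dataset
    from transformers import AutoTokenizer, TrainingArguments
    from trl import SFTTrainer

    model_name = "sshleifer/tiny-gpt2"  # placeholder; the repo trains openai/gpt-oss-20b
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # tiny-gpt2 has no pad token

    dataset = Dataset.from_dict({"text": ["hello world", "bonjour le monde"]})
    args = TrainingArguments(output_dir="./out", max_steps=1, report_to="none")

    trainer = SFTTrainer(
        model=model_name,         # the repo passes the LoRA-wrapped peft_model instead
        args=args,                # the TrainingArguments built above
        train_dataset=dataset,
        tokenizer=tokenizer,      # passed explicitly, as added in this commit
        dataset_text_field="text",
        max_seq_length=2048,
    )
    trainer.train()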