Adds Harmony format support, configurable GPT-OSS parameters, launch.sh logic, improved templates for legml GPT-OSS training, a dynamic results directory, and improved model pushing
- config/train_gpt_oss_custom.py +388 -0
- config/train_gpt_oss_openhermes_fr.py +174 -0
- config/train_gpt_oss_openhermes_fr_memory_optimized.py +233 -0
- docs/output.svg +1 -0
- launch.sh +328 -11
- scripts/model_tonic/push_gpt_oss_to_huggingface.py +79 -5
- scripts/model_tonic/push_to_huggingface.py +83 -5
- scripts/training/train_gpt_oss.py +313 -24
- templates/spaces/demo_gpt/README.md +1 -1
config/train_gpt_oss_custom.py
ADDED
@@ -0,0 +1,388 @@
"""
GPT-OSS Custom Training Configuration
Based on OpenAI's GPT-OSS fine-tuning tutorial
Fully customizable configuration for any dataset format

Supports specialized datasets like:
- legmlai/openhermes-fr (French instruction dataset)
- HuggingFaceH4/Multilingual-Thinking
- Custom prompt/completion formats
"""
import os
from dataclasses import dataclass
from typing import Optional, Dict, List, Union

@dataclass
class GPTOSSEnhancedCustomConfig:
    """Enhanced custom configuration for GPT-OSS fine-tuning with maximum flexibility"""

    # ============================================================================
    # CORE MODEL CONFIGURATION
    # ============================================================================
    trainer_type: str = "sft"  # "sft" or "dpo"
    model_name: str = "openai/gpt-oss-20b"
    max_seq_length: int = 2048  # Customizable: 512, 1024, 2048, 4096, 8192
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = True

    # ============================================================================
    # TRAINING HYPERPARAMETERS - FULLY CUSTOMIZABLE
    # ============================================================================
    # Batch Configuration
    batch_size: int = 4  # Per-device batch size (1-32 depending on GPU memory)
    gradient_accumulation_steps: int = 4  # Effective batch = batch_size * accumulation * num_gpus
    eval_batch_size: Optional[int] = None  # If None, uses batch_size

    # Learning Rate Configuration
    learning_rate: float = 2e-4  # Main learning rate (1e-5 to 5e-4 typical range)
    min_lr: float = 2e-5  # Minimum learning rate for scheduler
    warmup_ratio: float = 0.03  # Fraction of steps for warmup (0.01-0.1)
    warmup_steps: Optional[int] = None  # If set, overrides warmup_ratio

    # Training Duration
    num_train_epochs: float = 1.0  # Number of epochs (0.5, 1.0, 2.0, 3.0)
    max_steps: Optional[int] = None  # If set, overrides num_train_epochs
    max_iters: Optional[int] = None  # Legacy compatibility

    # Regularization
    weight_decay: float = 0.01  # L2 regularization (0.0-0.1)
    max_grad_norm: float = 1.0  # Gradient clipping (0.5-2.0)

    # ============================================================================
    # OPTIMIZER CONFIGURATION
    # ============================================================================
    optimizer: str = "adamw_torch"  # "adamw_torch", "adamw_hf", "sgd"
    beta1: float = 0.9  # Adam beta1 parameter
    beta2: float = 0.95  # Adam beta2 parameter (0.95-0.999)
    eps: float = 1e-8  # Adam epsilon

    # ============================================================================
    # SCHEDULER CONFIGURATION
    # ============================================================================
    scheduler: str = "cosine_with_min_lr"  # "linear", "cosine", "cosine_with_min_lr", "constant"
    lr_scheduler_kwargs: Optional[Dict] = None

    # ============================================================================
    # MIXED PRECISION & DISTRIBUTED TRAINING
    # ============================================================================
    fp16: bool = False  # Use FP16 (not recommended for GPT-OSS)
    bf16: bool = True  # Use BF16 (recommended for GPT-OSS)
    tf32: Optional[bool] = None  # Use TF32 on A100/H100
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # ============================================================================
    # LOGGING, EVALUATION & CHECKPOINTING
    # ============================================================================
    # Logging
    logging_steps: int = 10  # Log every N steps
    log_level: str = "info"  # "debug", "info", "warning", "error"

    # Evaluation
    eval_strategy: str = "steps"  # "no", "steps", "epoch"
    eval_steps: int = 100  # Evaluate every N steps
    eval_delay: float = 0  # Delay evaluation for N steps/epochs
    eval_accumulation_steps: Optional[int] = None  # Accumulate eval outputs

    # Checkpointing
    save_strategy: str = "steps"  # "no", "steps", "epoch"
    save_steps: int = 500  # Save checkpoint every N steps
    save_total_limit: Optional[int] = 3  # Keep only N best checkpoints
    save_only_model: bool = False  # Save only model weights

    # Model Selection
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # ============================================================================
    # DATASET CONFIGURATION - ENHANCED FOR CUSTOM FORMATS
    # ============================================================================
    # Dataset Source
    dataset_name: str = "legmlai/openhermes-fr"  # Default to French OpenHermes
    dataset_split: str = "train"  # Dataset split to use
    dataset_config: Optional[str] = None  # Dataset configuration name

    # Field Mapping - Customize for your dataset format
    input_field: str = "prompt"  # Field containing the input/prompt
    target_field: str = "accepted_completion"  # Field containing the target/completion

    # OpenHermes-FR specific fields
    filter_bad_entries: bool = True  # Filter entries marked as bad
    bad_entry_field: str = "bad_entry"  # Field indicating bad entries
    bad_prompt_field: str = "bad_prompt_detected"  # Field for bad prompts
    bad_response_field: str = "bad_response_detected"  # Field for bad responses

    # Data Processing Options
    concatenate_fields: bool = True  # Combine input and target fields for training
    field_separator: str = "\n\n### Response:\n"  # Separator between input and target
    add_eos_token: bool = True  # Add EOS token at the end

    # Dataset Filtering & Sampling
    max_samples: Optional[int] = None  # Limit dataset size (e.g., 100000 for testing)
    min_length: int = 10  # Minimum sequence length
    max_length: Optional[int] = None  # Maximum sequence length (None = use max_seq_length)

    # Custom Dataset Formats Support
    dataset_format: str = "openhermes_fr"  # "openhermes_fr", "messages", "text", "custom"

    # GPT-OSS Harmony Format Configuration
    use_harmony_format: bool = True  # Enable GPT-OSS harmony format
    use_chat_template: bool = False  # Set to True for messages format
    chat_template_kwargs: Optional[Dict] = None

    # ============================================================================
    # TRACKIO MONITORING CONFIGURATION
    # ============================================================================
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    # ============================================================================
    # HUGGING FACE INTEGRATION
    # ============================================================================
    hf_token: Optional[str] = None
    dataset_repo: Optional[str] = None
    push_to_hub: bool = False  # Push model to HF Hub after training
    hub_model_id: Optional[str] = None  # HF Hub model ID
    hub_private_repo: bool = False  # Make HF repo private

    # ============================================================================
    # GPT-OSS SPECIFIC CONFIGURATIONS
    # ============================================================================
    # LoRA Configuration
    use_lora: bool = True
    lora_config: Optional[Dict] = None

    # Quantization Configuration
    use_quantization: bool = True
    quantization_config: Optional[Dict] = None

    # Model Loading Configuration
    model_kwargs: Optional[Dict] = None

    # Generation Configuration (for evaluation/testing)
    generation_config: Optional[Dict] = None

    # ============================================================================
    # MULTILINGUAL & DOMAIN SPECIFIC SETTINGS
    # ============================================================================
    # Language Support (for multilingual datasets)
    primary_language: str = "fr"  # Primary language code
    reasoning_languages: Optional[List[str]] = None  # Supported languages for reasoning

    # Domain-specific settings
    domain_focus: Optional[str] = None  # "reasoning", "conversation", "instruction", "general"

    # ============================================================================
    # PERFORMANCE & MEMORY OPTIMIZATION
    # ============================================================================
    # Data Loading
    dataloader_num_workers: int = 4  # Number of data loading workers
    dataloader_pin_memory: bool = True  # Pin memory for faster GPU transfer
    dataloader_prefetch_factor: int = 2  # Prefetch factor for data loading

    # Memory Management
    max_memory_per_gpu: Optional[str] = None  # e.g., "80GB", "40GB"
    low_cpu_mem_usage: bool = True  # Use low CPU memory loading

    # Performance Optimizations
    group_by_length: bool = True  # Group sequences by length
    length_column_name: str = "length"  # Column name for sequence lengths
    remove_unused_columns: bool = True  # Remove unused dataset columns

    def __post_init__(self):
        """Initialize default values and validate configuration"""

        # ============================================================================
        # LORA CONFIGURATION DEFAULTS
        # ============================================================================
        if self.lora_config is None:
            self.lora_config = {
                "r": 16,  # Rank (4, 8, 16, 32, 64) - higher = more parameters
                "lora_alpha": 32,  # Scaling factor (usually 2*r)
                "target_modules": "all-linear",  # Apply LoRA to all linear layers
                "target_parameters": [
                    "7.mlp.experts.gate_up_proj",
                    "7.mlp.experts.down_proj",
                    "15.mlp.experts.gate_up_proj",
                    "15.mlp.experts.down_proj",
                    "23.mlp.experts.gate_up_proj",
                    "23.mlp.experts.down_proj",
                ],
                "bias": "none",  # "none", "all", "lora_only"
                "task_type": "CAUSAL_LM",
                "lora_dropout": 0.05,  # LoRA dropout rate
            }

        # ============================================================================
        # QUANTIZATION CONFIGURATION DEFAULTS
        # ============================================================================
        if self.quantization_config is None:
            self.quantization_config = {
                "dequantize": True,  # Use Mxfp4Config as per GPT-OSS tutorial
                "load_in_4bit": False,  # Set to True for extreme memory optimization
                "bnb_4bit_compute_dtype": "bfloat16",  # For 4-bit quantization
                "bnb_4bit_use_double_quant": True,  # Double quantization
                "bnb_4bit_quant_type": "nf4"  # Quantization type
            }

        # ============================================================================
        # MODEL LOADING CONFIGURATION DEFAULTS
        # ============================================================================
        if self.model_kwargs is None:
            self.model_kwargs = {
                "attn_implementation": "eager",  # "eager", "flash_attention_2"
                "torch_dtype": "auto",  # "auto", "bfloat16", "float16"
                "use_cache": False,  # Disable KV cache for training
                "device_map": "auto",  # Automatic device mapping
                "low_cpu_mem_usage": self.low_cpu_mem_usage,
            }

        # Add memory constraints if specified
        if self.max_memory_per_gpu:
            self.model_kwargs["max_memory"] = {0: self.max_memory_per_gpu}

        # ============================================================================
        # GENERATION CONFIGURATION DEFAULTS
        # ============================================================================
        if self.generation_config is None:
            self.generation_config = {
                "max_new_tokens": 512,  # Maximum tokens to generate
                "do_sample": True,  # Use sampling
                "temperature": 0.7,  # Sampling temperature
                "top_p": 0.9,  # Nucleus sampling
                "top_k": 50,  # Top-k sampling
                "repetition_penalty": 1.1,  # Repetition penalty
                "pad_token_id": None,  # Will be set from tokenizer
                "eos_token_id": None,  # Will be set from tokenizer
            }

        # ============================================================================
        # LANGUAGE CONFIGURATION DEFAULTS
        # ============================================================================
        if self.reasoning_languages is None:
            if self.primary_language == "fr":
                self.reasoning_languages = [
                    "French", "English", "Spanish", "Italian", "German"
                ]
            else:
                self.reasoning_languages = [
                    "English", "Spanish", "French", "Italian", "German",
                    "Chinese", "Hindi", "Japanese", "Korean", "Arabic"
                ]

        # ============================================================================
        # SCHEDULER CONFIGURATION DEFAULTS
        # ============================================================================
        if self.lr_scheduler_kwargs is None:
            self.lr_scheduler_kwargs = {"min_lr_rate": 0.1}

        # ============================================================================
        # CHAT TEMPLATE CONFIGURATION DEFAULTS (GPT-OSS Harmony Format)
        # ============================================================================
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "add_generation_prompt": True,
                "tokenize": False,
                "auto_insert_role": True,
                # GPT-OSS Harmony Format specific settings
                "reasoning_effort": "medium",  # low, medium, high
                "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
                "builtin_tools": [],  # Can include "browser" and/or "python"
            }

        # ============================================================================
        # VALIDATION AND COMPUTED VALUES
        # ============================================================================
        # Compute effective batch size
        effective_batch_size = self.batch_size * self.gradient_accumulation_steps

        # Set warmup steps if not provided
        if self.warmup_steps is None and self.max_steps:
            self.warmup_steps = int(self.max_steps * self.warmup_ratio)

        # Set max_length for dataset filtering
        if self.max_length is None:
            self.max_length = self.max_seq_length

        # Validate configuration
        self._validate_config()

        # Print comprehensive configuration summary
        self._print_config_summary(effective_batch_size)

    def _validate_config(self):
        """Validate configuration parameters"""

        # Validate batch configuration
        if self.batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if self.gradient_accumulation_steps < 1:
            raise ValueError("gradient_accumulation_steps must be >= 1")

        # Validate learning rate
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be > 0")
        if self.min_lr >= self.learning_rate:
            raise ValueError("min_lr must be < learning_rate")

        # Validate sequence length
        if self.max_seq_length < 1:
            raise ValueError("max_seq_length must be >= 1")

        # Validate dataset format
        valid_formats = ["openhermes_fr", "messages", "text", "custom"]
        if self.dataset_format not in valid_formats:
            raise ValueError(f"dataset_format must be one of {valid_formats}")

    def _print_config_summary(self, effective_batch_size):
        """Print detailed configuration summary"""

        print("\n" + "="*80)
        print("🚀 GPT-OSS ENHANCED CUSTOM CONFIGURATION")
        print("="*80)

        print(f"📊 Model & Training:")
        print(f"   • Model: {self.model_name}")
        print(f"   • Dataset: {self.dataset_name} ({self.dataset_format})")
        print(f"   • Primary Language: {self.primary_language}")
        print(f"   • Sequence Length: {self.max_seq_length}")
        print(f"   • Epochs: {self.num_train_epochs}")

        print(f"\n🔄 Batch Configuration:")
        print(f"   • Per-device Batch Size: {self.batch_size}")
        print(f"   • Gradient Accumulation: {self.gradient_accumulation_steps}")
        print(f"   • Effective Batch Size: {effective_batch_size}")

        print(f"\n📈 Learning Configuration:")
        print(f"   • Learning Rate: {self.learning_rate}")
        print(f"   • Min Learning Rate: {self.min_lr}")
        print(f"   • Weight Decay: {self.weight_decay}")
        print(f"   • Warmup Ratio: {self.warmup_ratio}")

        print(f"\n🎛️ LoRA Configuration:")
        print(f"   • Rank: {self.lora_config['r']}")
        print(f"   • Alpha: {self.lora_config['lora_alpha']}")
        print(f"   • Target Modules: {self.lora_config['target_modules']}")

        print(f"\n📁 Dataset Configuration:")
        print(f"   • Input Field: {self.input_field}")
        print(f"   • Target Field: {self.target_field}")
        print(f"   • Filter Bad Entries: {self.filter_bad_entries}")
        print(f"   • Max Samples: {self.max_samples or 'All'}")

        print(f"\n💾 Memory & Performance:")
        print(f"   • Mixed Precision: {'BF16' if self.bf16 else 'FP32'}")
        print(f"   • Gradient Checkpointing: {self.use_gradient_checkpointing}")
        print(f"   • Data Workers: {self.dataloader_num_workers}")
        print(f"   • Group by Length: {self.group_by_length}")

        print("="*80 + "\n")

# Create the config instance with OpenHermes-FR optimized defaults
config = GPTOSSEnhancedCustomConfig()
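For context, a minimal sketch of how the dict fields above could be materialized into library objects (assuming peft >= 0.17 for LoRA `target_parameters` and a transformers release that ships `Mxfp4Config`; the repo's actual wiring lives in scripts/training/train_gpt_oss.py, so this is illustrative, not the commit's implementation):

from peft import LoraConfig
from transformers import AutoModelForCausalLM, Mxfp4Config

from config.train_gpt_oss_custom import config

# Map the plain-dict LoRA settings onto a peft LoraConfig.
peft_config = LoraConfig(
    r=config.lora_config["r"],
    lora_alpha=config.lora_config["lora_alpha"],
    lora_dropout=config.lora_config["lora_dropout"],
    target_modules=config.lora_config["target_modules"],
    target_parameters=config.lora_config["target_parameters"],  # MoE expert params
    bias=config.lora_config["bias"],
    task_type=config.lora_config["task_type"],
)

# Dequantize the native MXFP4 checkpoint on load, per the GPT-OSS tutorial,
# and forward the remaining loading options from model_kwargs.
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    quantization_config=Mxfp4Config(dequantize=config.quantization_config["dequantize"]),
    **config.model_kwargs,
)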
config/train_gpt_oss_openhermes_fr.py
ADDED
@@ -0,0 +1,174 @@
"""
GPT-OSS OpenHermes-FR Optimized Configuration
Specifically optimized for the legmlai/openhermes-fr dataset
800K French instruction-response pairs with quality filtering
"""

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# OpenHermes-FR optimized configuration
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR Specific
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",

    # OpenHermes-FR field mapping
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions

    # Quality filtering using OpenHermes-FR metadata
    filter_bad_entries=True,  # Use built-in quality flags
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",

    # Data processing optimized for French with GPT-OSS Harmony Format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,
    use_harmony_format=True,  # Enable GPT-OSS harmony format

    # Dataset sampling (use all 800K examples by default)
    max_samples=None,  # Use full dataset
    min_length=20,  # Minimum for meaningful French text
    max_length=None,  # Auto-set to max_seq_length

    # ============================================================================
    # TRAINING HYPERPARAMETERS - French Language Optimized
    # ============================================================================
    num_train_epochs=1.5,  # 1.5 epochs optimal for large dataset
    batch_size=6,  # Balanced for most GPUs
    gradient_accumulation_steps=6,  # Effective batch size: 36

    # Learning rate schedule optimized for French fine-tuning
    learning_rate=2.5e-4,  # Slightly higher for multilingual
    min_lr=2.5e-5,  # 10% of max learning rate
    warmup_ratio=0.05,  # 5% warmup for stability
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping

    # ============================================================================
    # MODEL CONFIGURATION - Optimized for French
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=3072,  # Balanced length for French
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Mixed precision for efficiency
    fp16=False,
    bf16=True,  # Better for GPT-OSS

    # ============================================================================
    # LORA CONFIGURATION - Optimized for French Language Learning
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 24,  # Higher rank for language adaptation
        "lora_alpha": 48,  # 2x rank scaling
        "lora_dropout": 0.05,  # Light regularization
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },

    # ============================================================================
    # QUANTIZATION - Balanced Performance/Memory
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # MXFP4 as per GPT-OSS tutorial
        "load_in_4bit": False,  # Standard precision for quality
    },

    # ============================================================================
    # PERFORMANCE OPTIMIZATION
    # ============================================================================
    # Data loading optimized for large dataset
    dataloader_num_workers=6,  # More workers for large dataset
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=3,  # Higher prefetch for efficiency

    # Memory management
    low_cpu_mem_usage=True,
    group_by_length=True,  # Efficient batching
    remove_unused_columns=True,

    # ============================================================================
    # EVALUATION & LOGGING
    # ============================================================================
    eval_strategy="steps",
    eval_steps=200,  # Evaluate every 200 steps
    logging_steps=20,  # Log every 20 steps

    save_strategy="steps",
    save_steps=500,  # Save every 500 steps
    save_total_limit=3,  # Keep 3 best checkpoints

    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # ============================================================================
    # MULTILINGUAL & FRENCH SPECIFIC SETTINGS
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning
    domain_focus="instruction",  # Instruction following

    # ============================================================================
    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
    # ============================================================================
    generation_config={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS Harmony Format specific settings
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },

    # ============================================================================
    # HF HUB INTEGRATION
    # ============================================================================
    push_to_hub=False,  # Set to True to auto-push
    hub_model_id=None,  # Will be set by launch script
    hub_private_repo=False,

    # ============================================================================
    # MONITORING
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)

# Print configuration summary on import
print("\n🇫🇷 OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"📊 Dataset: {config.dataset_name}")
print(f"🗣️ Language: French (with {config.dataset_format} format)")
print(f"📈 Training: {config.num_train_epochs} epochs")
print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']}")
print(f"📏 Sequence Length: {config.max_seq_length}")
print(f"🔍 Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)
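Since this config enables `use_harmony_format`, here is a minimal sketch of the rendering that implies for one prompt/completion pair. The token layout follows the published GPT-OSS harmony spec (including the `<|return|>` terminator the memory-optimized config mentions below); the helper name `to_harmony` and the exact system-message wording are illustrative assumptions, not the repo's implementation:

MODEL_IDENTITY = "You are GPT-Tonic, a large language model trained by TonicAI."

def to_harmony(prompt: str, completion: str, reasoning_effort: str = "medium") -> str:
    """Render one OpenHermes-FR pair as a single harmony-format training text."""
    return (
        f"<|start|>system<|message|>{MODEL_IDENTITY}\n"
        f"Reasoning: {reasoning_effort}<|end|>"
        f"<|start|>user<|message|>{prompt}<|end|>"
        f"<|start|>assistant<|channel|>final<|message|>{completion}<|return|>"
    )

# Usage with the OpenHermes-FR fields mapped above:
print(to_harmony("Quelle est la capitale de la France ?",
                 "La capitale de la France est Paris."))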
config/train_gpt_oss_openhermes_fr_memory_optimized.py
ADDED
@@ -0,0 +1,233 @@
"""
GPT-OSS OpenHermes-FR Memory-Optimized Configuration
Combines memory optimization best practices with OpenHermes-FR dataset
Optimized for GPT-OSS harmony format and MXFP4 quantization
Based on OpenAI GPT-OSS specifications and memory optimization principles
"""

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# Memory-optimized OpenHermes-FR configuration for GPT-OSS
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR with Harmony Format
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",

    # OpenHermes-FR field mapping optimized for harmony format
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions

    # Enhanced quality filtering for memory-constrained training
    filter_bad_entries=True,  # Critical for memory efficiency
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",

    # Memory-optimized data processing with GPT-OSS Harmony Format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,  # Required for proper training
    use_harmony_format=True,  # Enable GPT-OSS harmony format

    # Dataset sampling optimized for memory constraints
    max_samples=200000,  # Reduced from 800K for memory efficiency
    min_length=15,  # Slightly higher minimum for quality
    max_length=2048,  # Explicit max length for memory control

    # ============================================================================
    # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
    # ============================================================================
    # Batch configuration following memory optimization principles
    num_train_epochs=1.0,  # Single epoch to reduce memory pressure
    batch_size=2,  # Reduced from 6 for memory efficiency
    gradient_accumulation_steps=16,  # Increased to maintain effective batch size 32

    # Learning rate optimized for single epoch + memory constraints
    learning_rate=2e-4,  # Standard GPT-OSS learning rate
    min_lr=2e-5,  # 10% of max learning rate
    warmup_ratio=0.03,  # Reduced warmup for memory efficiency
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping for stability

    # ============================================================================
    # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=1024,  # Reduced from 3072 for memory optimization
    use_flash_attention=True,  # Critical for memory efficiency
    use_gradient_checkpointing=True,  # Essential for memory optimization

    # Mixed precision optimized for GPT-OSS MXFP4
    fp16=False,  # Not recommended for GPT-OSS
    bf16=True,  # Required for GPT-OSS stability
    tf32=True,  # Enable TF32 for A100/H100 efficiency

    # ============================================================================
    # LORA CONFIGURATION - Memory Optimized for GPT-OSS MoE
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 8,  # Reduced rank for memory efficiency
        "lora_alpha": 16,  # 2x rank scaling (memory optimized)
        "lora_dropout": 0.1,  # Higher dropout for better generalization
        "target_modules": "all-linear",  # Apply to all linear layers
        "target_parameters": [
            # GPT-OSS specific MoE expert targeting
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",  # No bias adaptation for memory efficiency
        "task_type": "CAUSAL_LM",
        "modules_to_save": [],  # Don't save additional modules for memory
    },

    # ============================================================================
    # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # Use native MXFP4 as per GPT-OSS specs
        "load_in_4bit": False,  # Don't use BNB 4-bit with MXFP4
        "mxfp4_config": {  # Native GPT-OSS MXFP4 settings
            "enabled": True,
            "block_size": 32,  # Optimized block size for MoE
        }
    },

    # ============================================================================
    # MEMORY OPTIMIZATION CONFIGURATION
    # ============================================================================
    # Model loading with memory constraints
    model_kwargs={
        "attn_implementation": "eager",  # Memory-safe attention
        "torch_dtype": "auto",  # Let model decide (MXFP4 compatible)
        "use_cache": False,  # Disable KV cache for training
        "device_map": "auto",  # Automatic device mapping
        "low_cpu_mem_usage": True,  # Critical for memory optimization
        "max_memory": {0: "75GB"},  # Reserve memory for other processes
    },

    # Data loading optimized for memory efficiency
    dataloader_num_workers=2,  # Reduced workers to save memory
    dataloader_pin_memory=False,  # Disable to save memory
    dataloader_prefetch_factor=1,  # Minimal prefetch for memory

    # Memory management optimizations
    max_memory_per_gpu="75GB",  # Explicit memory limit
    low_cpu_mem_usage=True,  # Essential for large models
    group_by_length=True,  # Efficient batching for memory
    remove_unused_columns=True,  # Remove unnecessary data

    # ============================================================================
    # EVALUATION & LOGGING - Memory Efficient
    # ============================================================================
    eval_strategy="steps",
    eval_steps=500,  # Less frequent evaluation for memory
    logging_steps=50,  # Reduced logging frequency

    save_strategy="steps",
    save_steps=1000,  # Less frequent saves for memory/storage
    save_total_limit=2,  # Keep only 2 checkpoints for memory
    save_only_model=True,  # Save only model weights

    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Evaluation memory optimization
    eval_accumulation_steps=4,  # Accumulate eval outputs to save memory
    eval_batch_size=1,  # Smaller eval batch size

    # ============================================================================
    # GPT-OSS HARMONY FORMAT OPTIMIZATION
    # ============================================================================
    # Chat template for harmony format compatibility (following exact template)
    use_chat_template=False,  # Use custom harmony format instead
    chat_template_kwargs={
        "add_generation_prompt": True,
        "tokenize": False,
        # GPT-OSS Harmony Format specific settings (exact template format)
        "reasoning_effort": "medium",  # low, medium, high
        "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
        "builtin_tools": [],  # Can include "browser" and/or "python"
    },

    # Generation config optimized for GPT-OSS harmony format (exact template compliance)
    generation_config={
        "max_new_tokens": 256,  # Reduced for memory efficiency
        "do_sample": True,
        "temperature": 0.6,  # Slightly lower for more focused training
        "top_p": 0.9,
        "top_k": 40,  # Reduced for memory efficiency
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS Harmony Format specific settings (exact template format)
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },

    # ============================================================================
    # MULTILINGUAL & REASONING OPTIMIZATION
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning capability
    domain_focus="reasoning",  # Align with GPT-OSS reasoning focus

    # ============================================================================
    # OPTIMIZER & SCHEDULER - Memory Optimized
    # ============================================================================
    optimizer="adamw_torch",  # Memory-efficient optimizer
    beta1=0.9,
    beta2=0.95,  # GPT-OSS optimized beta2
    eps=1e-8,

    scheduler="cosine_with_min_lr",  # Stable scheduler for single epoch
    lr_scheduler_kwargs={
        "min_lr_rate": 0.1,
        "warmup_steps": None,  # Use warmup_ratio instead
    },

    # ============================================================================
    # MONITORING & HUB INTEGRATION
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=False,  # Disable to save memory/storage
    log_metrics=True,
    log_config=True,

    push_to_hub=False,  # Set to True after successful training
    hub_model_id=None,
    hub_private_repo=False,
)

# Configuration validation and optimization tips
print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
print("=" * 60)
print(f"📊 Dataset: {config.dataset_name} (200K samples)")
print(f"🗣️ Language: French with GPT-OSS Harmony Format")
print(f"📈 Training: {config.num_train_epochs} epoch (memory optimized)")
print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']} (memory optimized)")
print(f"📏 Sequence Length: {config.max_seq_length} (memory optimized)")
print(f"💾 Memory Limit: {config.max_memory_per_gpu}")
print(f"⚡ Quantization: MXFP4 (GPT-OSS native)")
print(f"🔍 Quality Filtering: Enabled")
print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 60)
print("\n💡 Memory Optimization Features:")
print("   • Native MXFP4 quantization for GPT-OSS MoE layers")
print("   • Reduced batch size with increased gradient accumulation")
print("   • Limited sequence length for memory efficiency")
print("   • Reduced LoRA rank while maintaining effectiveness")
print("   • Dataset sampling (200K from 800K) for faster training")
print("   • Gradient checkpointing and efficient data loading")
print("   • Exact GPT-OSS Harmony format with <|return|> tokens")
print("=" * 60)
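A quick arithmetic check of the trade-off this memory-optimized recipe makes, with values copied from the config above:

# Smaller per-device batches are offset by more accumulation steps, so the
# effective batch stays close to the standard recipe above (6 * 6 = 36).
batch_size, gradient_accumulation_steps = 2, 16
effective_batch = batch_size * gradient_accumulation_steps  # 32
max_samples, num_train_epochs = 200_000, 1.0
optimizer_steps = int(max_samples * num_train_epochs) // effective_batch
print(effective_batch, optimizer_steps)  # 32 effective batch, 6250 optimizer steps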
docs/output.svg
ADDED
launch.sh
CHANGED
@@ -234,7 +234,34 @@ show_training_configs() {
|
|
234 |
echo " - 4-bit quantization + reduced LoRA"
|
235 |
echo " - Optimized for limited GPU memory"
|
236 |
echo ""
|
237 |
-
echo "9.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
echo " - User-defined parameters"
|
239 |
echo ""
|
240 |
}
|
@@ -325,12 +352,142 @@ get_training_config() {
|
|
325 |
MAX_SEQ_LENGTH=1024
|
326 |
CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
|
327 |
;;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
"Custom Configuration")
|
329 |
get_custom_config
|
330 |
;;
|
331 |
esac
|
332 |
}
|
333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
# Function to get custom configuration
|
335 |
get_custom_config() {
|
336 |
print_step "Custom Configuration Setup"
|
@@ -352,6 +509,136 @@ get_custom_config() {
|
|
352 |
fi
|
353 |
}
|
354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
# Function to create training configuration file
|
356 |
create_training_config() {
|
357 |
local config_file="$1"
|
@@ -499,7 +786,7 @@ print_step "Step 2: Training Configuration"
|
|
499 |
echo "=================================="
|
500 |
|
501 |
show_training_configs
|
502 |
-
select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "Custom Configuration" TRAINING_CONFIG_TYPE
|
503 |
|
504 |
get_training_config "$TRAINING_CONFIG_TYPE"
|
505 |
|
@@ -836,13 +1123,25 @@ print_info "Dataset: $DATASET_NAME"
|
|
836 |
print_info "Batch size: $BATCH_SIZE"
|
837 |
print_info "Learning rate: $LEARNING_RATE"
|
838 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
839 |
# Step 15: Start training
|
840 |
print_step "Step 15: Starting Training"
|
841 |
echo "=============================="
|
842 |
|
843 |
print_info "Starting training with configuration: $CONFIG_FILE"
|
844 |
print_info "Experiment: $EXPERIMENT_NAME"
|
845 |
-
print_info "Output:
|
846 |
print_info "Trackio: $TRACKIO_URL"
|
847 |
|
848 |
# Ensure environment variables are available for training
|
@@ -852,6 +1151,7 @@ export HF_TOKEN="$HF_TOKEN"
|
|
852 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
853 |
export HF_USERNAME="$HF_USERNAME"
|
854 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
|
|
855 |
|
856 |
# Run the appropriate training script based on model type
|
857 |
if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
@@ -859,7 +1159,7 @@ if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
|
859 |
python scripts/training/train_gpt_oss.py \
|
860 |
--config "$CONFIG_FILE" \
|
861 |
--experiment-name "$EXPERIMENT_NAME" \
|
862 |
-
--output-dir
|
863 |
--trackio-url "$TRACKIO_URL" \
|
864 |
--trainer-type "$TRAINER_TYPE_LOWER"
|
865 |
else
|
@@ -867,7 +1167,7 @@ else
|
|
867 |
python scripts/training/train.py \
|
868 |
--config "$CONFIG_FILE" \
|
869 |
--experiment-name "$EXPERIMENT_NAME" \
|
870 |
-
--output-dir
|
871 |
--trackio-url "$TRACKIO_URL" \
|
872 |
--trainer-type "$TRAINER_TYPE_LOWER"
|
873 |
fi
|
@@ -877,7 +1177,7 @@ print_step "Step 16: Pushing Model to HF Hub"
|
|
877 |
echo "====================================="
|
878 |
|
879 |
print_info "Pushing model to: $REPO_NAME"
|
880 |
-
print_info "Checkpoint:
|
881 |
|
882 |
# Ensure environment variables are available for model push
|
883 |
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
@@ -886,26 +1186,43 @@ export HF_TOKEN="$HF_TOKEN"
|
|
886 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
887 |
export HF_USERNAME="$HF_USERNAME"
|
888 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
|
|
889 |
|
890 |
# Run the appropriate push script based on model type
|
891 |
if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
892 |
print_info "Using GPT-OSS specialized push script..."
|
893 |
-
python scripts/model_tonic/push_gpt_oss_to_huggingface.py
|
894 |
--token "$HF_TOKEN" \
|
895 |
--trackio-url "$TRACKIO_URL" \
|
896 |
--experiment-name "$EXPERIMENT_NAME" \
|
897 |
--dataset-repo "$TRACKIO_DATASET_REPO" \
|
898 |
--author-name "$AUTHOR_NAME" \
|
899 |
-
--model-description "$MODEL_DESCRIPTION"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
900 |
else
|
901 |
print_info "Using standard SmolLM3 push script..."
|
902 |
-
python scripts/model_tonic/push_to_huggingface.py
|
903 |
--token "$HF_TOKEN" \
|
904 |
--trackio-url "$TRACKIO_URL" \
|
905 |
--experiment-name "$EXPERIMENT_NAME" \
|
906 |
--dataset-repo "$TRACKIO_DATASET_REPO" \
|
907 |
--author-name "$AUTHOR_NAME" \
|
908 |
-
--model-description "$MODEL_DESCRIPTION"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
909 |
fi
|
910 |
|
911 |
# Step 16.5: Switch Trackio Space to Read Token (Security)
|
@@ -1018,7 +1335,7 @@ fi)
|
|
1018 |
|
1019 |
## Files Created
|
1020 |
- Training configuration: \`$CONFIG_FILE\`
|
1021 |
-
- Model checkpoint:
|
1022 |
- Training logs: \`training.log\`
|
1023 |
- Summary report: \`training_summary.md\`
|
1024 |
EOF
|
|
|
234 |
echo " - 4-bit quantization + reduced LoRA"
|
235 |
echo " - Optimized for limited GPU memory"
|
236 |
echo ""
|
237 |
+
echo "9. GPT-OSS OpenHermes-FR (Recommended)"
|
238 |
+
echo " - Model: openai/gpt-oss-20b"
|
239 |
+
echo " - Dataset: legmlai/openhermes-fr (800K French examples)"
|
240 |
+
echo " - Epochs: 1.5"
|
241 |
+
echo " - Batch Size: 6 (effective 36 with accumulation)"
|
242 |
+
echo " - Learning Rate: 2.5e-4"
|
243 |
+
echo " - Optimized for French language training"
|
244 |
+
echo " - Quality filtering enabled"
|
245 |
+
echo ""
|
246 |
+
echo "10. GPT-OSS OpenHermes-FR Memory Optimized"
|
247 |
+
echo " - Model: openai/gpt-oss-20b"
|
248 |
+
echo " - Dataset: legmlai/openhermes-fr (200K samples)"
|
249 |
+
echo " - Epochs: 1"
|
250 |
+
echo " - Batch Size: 2 (effective 32 with accumulation)"
|
251 |
+
echo " - Learning Rate: 2e-4"
|
252 |
+
echo " - Native MXFP4 quantization"
|
253 |
+
echo " - Memory optimized for 40-80GB GPUs"
|
254 |
+
echo " - Harmony format compatible"
|
255 |
+
echo ""
|
256 |
+
echo "10. GPT-OSS Custom Dataset"
|
257 |
+
echo " - Model: openai/gpt-oss-20b"
|
258 |
+
echo " - Dataset: User-defined (fully customizable)"
|
259 |
+
echo " - Epochs: Configurable"
|
260 |
+
echo " - Batch Size: Configurable"
|
261 |
+
echo " - Learning Rate: Configurable"
|
262 |
+
echo " - Maximum flexibility with all parameters"
|
263 |
+
echo ""
|
264 |
+
echo "11. Custom Configuration"
|
265 |
echo " - User-defined parameters"
|
266 |
echo ""
|
267 |
}
|
|
|
352 |
MAX_SEQ_LENGTH=1024
|
353 |
CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
|
354 |
;;
|
355 |
+
"GPT-OSS OpenHermes-FR (Recommended)")
|
356 |
+
MODEL_NAME="openai/gpt-oss-20b"
|
357 |
+
DATASET_NAME="legmlai/openhermes-fr"
|
358 |
+
MAX_EPOCHS=1.5
|
359 |
+
BATCH_SIZE=6
|
360 |
+
GRADIENT_ACCUMULATION_STEPS=6
|
361 |
+
LEARNING_RATE=2.5e-4
|
362 |
+
MAX_SEQ_LENGTH=3072
|
363 |
+
CONFIG_FILE="config/train_gpt_oss_openhermes_fr.py"
|
364 |
+
;;
|
365 |
+
"GPT-OSS OpenHermes-FR Memory Optimized")
|
366 |
+
MODEL_NAME="openai/gpt-oss-20b"
|
367 |
+
DATASET_NAME="legmlai/openhermes-fr"
|
368 |
+
MAX_EPOCHS=1
|
369 |
+
BATCH_SIZE=2
|
370 |
+
GRADIENT_ACCUMULATION_STEPS=16
|
371 |
+
LEARNING_RATE=2e-4
|
372 |
+
MAX_SEQ_LENGTH=1024
|
373 |
+
CONFIG_FILE="config/train_gpt_oss_openhermes_fr_memory_optimized.py"
|
374 |
+
;;
|
375 |
+
"GPT-OSS Custom Dataset")
|
376 |
+
MODEL_NAME="openai/gpt-oss-20b"
|
377 |
+
DATASET_NAME="legmlai/openhermes-fr" # Will be customizable
|
378 |
+
MAX_EPOCHS=1
|
379 |
+
BATCH_SIZE=4
|
380 |
+
GRADIENT_ACCUMULATION_STEPS=4
|
381 |
+
LEARNING_RATE=2e-4
|
382 |
+
MAX_SEQ_LENGTH=2048
|
383 |
+
CONFIG_FILE="config/train_gpt_oss_custom.py"
|
384 |
+
get_custom_dataset_config
|
385 |
+
;;
|
386 |
"Custom Configuration")
|
387 |
get_custom_config
|
388 |
;;
|
389 |
esac
|
390 |
}
|
391 |
|
392 |
+
# Function to get custom dataset configuration
|
393 |
+
get_custom_dataset_config() {
|
394 |
+
print_step "GPT-OSS Custom Configuration"
|
395 |
+
echo "======================================"
|
396 |
+
|
397 |
+
echo "Configure your GPT-OSS training:"
|
398 |
+
echo ""
|
399 |
+
|
400 |
+
# Dataset Configuration
|
401 |
+
print_info "📊 Dataset Configuration"
|
402 |
+
get_input "Dataset name (HuggingFace format: username/dataset)" "legmlai/openhermes-fr" DATASET_NAME
|
403 |
+
get_input "Dataset split" "train" DATASET_SPLIT
|
404 |
+
|
405 |
+
echo ""
|
406 |
+
echo "Dataset format options:"
|
407 |
+
echo "1. OpenHermes-FR (prompt + accepted_completion fields)"
|
408 |
+
echo "2. Messages format (chat conversations)"
|
409 |
+
echo "3. Text format (plain text field)"
|
410 |
+
echo "4. Custom format (specify field names)"
|
411 |
+
echo ""
|
412 |
+
|
413 |
+
select_option "Select dataset format:" "OpenHermes-FR" "Messages format" "Text format" "Custom format" DATASET_FORMAT
|
414 |
+
|
415 |
+
case "$DATASET_FORMAT" in
|
416 |
+
"OpenHermes-FR")
|
417 |
+
INPUT_FIELD="prompt"
|
418 |
+
TARGET_FIELD="accepted_completion"
|
419 |
+
DATASET_FORMAT_CODE="openhermes_fr"
|
420 |
+
FILTER_BAD_ENTRIES="true"
|
421 |
+
;;
|
422 |
+
"Messages format")
|
423 |
+
INPUT_FIELD="messages"
|
424 |
+
TARGET_FIELD=""
|
425 |
+
DATASET_FORMAT_CODE="messages"
|
426 |
+
FILTER_BAD_ENTRIES="false"
|
427 |
+
;;
|
428 |
+
"Text format")
|
429 |
+
INPUT_FIELD="text"
|
430 |
+
TARGET_FIELD=""
|
431 |
+
DATASET_FORMAT_CODE="text"
|
432 |
+
FILTER_BAD_ENTRIES="false"
|
433 |
+
;;
|
434 |
+
"Custom format")
|
435 |
+
get_input "Input field name" "prompt" INPUT_FIELD
|
436 |
+
get_input "Target field name (leave empty if not needed)" "accepted_completion" TARGET_FIELD
|
437 |
+
DATASET_FORMAT_CODE="custom"
|
438 |
+
get_input "Filter bad entries? (true/false)" "false" FILTER_BAD_ENTRIES
|
439 |
+
;;
|
440 |
+
esac
|
441 |
+
|
442 |
+
# Dataset Filtering Options
|
443 |
+
echo ""
|
444 |
+
print_info "🔍 Dataset Filtering Options"
|
445 |
+
get_input "Maximum samples to use (leave empty for all)" "" MAX_SAMPLES
|
446 |
+
get_input "Minimum sequence length" "10" MIN_LENGTH
|
447 |
+
get_input "Maximum sequence length (leave empty for auto)" "" MAX_LENGTH
|
448 |
+
|
449 |
+
# Training Hyperparameters
|
450 |
+
echo ""
|
451 |
+
print_info "⚙️ Training Hyperparameters"
|
452 |
+
get_input "Number of epochs" "1.0" NUM_EPOCHS
|
453 |
+
get_input "Batch size per device" "4" BATCH_SIZE
|
454 |
+
get_input "Gradient accumulation steps" "4" GRAD_ACCUM_STEPS
|
455 |
+
get_input "Learning rate" "2e-4" LEARNING_RATE
|
456 |
+
get_input "Minimum learning rate" "2e-5" MIN_LR
|
457 |
+
get_input "Weight decay" "0.01" WEIGHT_DECAY
|
458 |
+
get_input "Warmup ratio" "0.03" WARMUP_RATIO
|
459 |
+
|
460 |
+
# Sequence Length
|
461 |
+
echo ""
|
462 |
+
print_info "📏 Sequence Configuration"
|
463 |
+
get_input "Maximum sequence length" "2048" MAX_SEQ_LENGTH
|
464 |
+
|
465 |
+
# LoRA Configuration
|
466 |
+
echo ""
|
467 |
+
print_info "🎛️ LoRA Configuration"
|
468 |
+
get_input "LoRA rank" "16" LORA_RANK
|
469 |
+
get_input "LoRA alpha" "32" LORA_ALPHA
|
470 |
+
get_input "LoRA dropout" "0.05" LORA_DROPOUT
|
471 |
+
|
472 |
+
# Memory & Performance
|
473 |
+
echo ""
|
474 |
+
print_info "💾 Memory & Performance"
|
475 |
+
select_option "Mixed precision:" "BF16 (recommended)" "FP16" "FP32" MIXED_PRECISION
|
476 |
+
get_input "Data loading workers" "4" NUM_WORKERS
|
477 |
+
select_option "Quantization:" "MXFP4 (default)" "4-bit BNB" "None" QUANTIZATION_TYPE
|
478 |
+
|
479 |
+
# Advanced Options
|
480 |
+
echo ""
|
481 |
+
echo "Advanced options (press Enter for defaults):"
|
482 |
+
get_input "Max gradient norm" "1.0" MAX_GRAD_NORM
|
483 |
+
get_input "Logging steps" "10" LOGGING_STEPS
|
484 |
+
get_input "Evaluation steps" "100" EVAL_STEPS
|
485 |
+
get_input "Save steps" "500" SAVE_STEPS
|
486 |
+
|
487 |
+
# Update the custom config file with user's choices
|
488 |
+
update_enhanced_gpt_oss_config
|
489 |
+
}
|
490 |
+
|
491 |
# Function to get custom configuration
|
492 |
get_custom_config() {
|
493 |
print_step "Custom Configuration Setup"
|
|
|
509 |
fi
|
510 |
}
|
511 |
|
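For readers mapping these prompts onto the dataset itself: the OpenHermes-FR branch assumes the legmlai/openhermes-fr schema, where each record carries a `prompt`, an `accepted_completion`, and boolean quality flags that the training script later uses for filtering. A minimal sketch of one record (field names from the dataset; the values are invented):

    # Illustrative legmlai/openhermes-fr record; values are made up.
    record = {
        "prompt": "Explique la photosynthèse en une phrase.",
        "accepted_completion": "La photosynthèse convertit la lumière en énergie chimique.",
        "bad_entry": False,             # FILTER_BAD_ENTRIES="true" drops records flagged True
        "bad_prompt_detected": False,
        "bad_response_detected": False,
    }

    # INPUT_FIELD / TARGET_FIELD chosen above select this training pair:
    pair = (record["prompt"], record["accepted_completion"])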
+# Function to update the enhanced GPT-OSS config with user choices
+update_enhanced_gpt_oss_config() {
+    print_info "Generating enhanced custom GPT-OSS configuration..."
+
+    # Process mixed precision setting
+    case "$MIXED_PRECISION" in
+        "BF16 (recommended)")
+            FP16="False"
+            BF16="True"
+            ;;
+        "FP16")
+            FP16="True"
+            BF16="False"
+            ;;
+        "FP32")
+            FP16="False"
+            BF16="False"
+            ;;
+    esac
+
+    # Process quantization setting
+    case "$QUANTIZATION_TYPE" in
+        "MXFP4 (default)")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": True, "load_in_4bit": False}'
+            ;;
+        "4-bit BNB")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": True, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}'
+            ;;
+        "None")
+            USE_QUANTIZATION="False"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": False}'
+            ;;
+    esac
+
+    # Create the enhanced config file with all user choices
+    cat > "$CONFIG_FILE" << EOF
+"""
+GPT-OSS Enhanced Custom Training Configuration - Generated by launch.sh
+Dataset: $DATASET_NAME ($DATASET_FORMAT)
+Optimized for: ${DATASET_FORMAT} format with full customization
+"""
+
+from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
+
+# Create enhanced config with all customizations
+config = GPTOSSEnhancedCustomConfig(
+    # ============================================================================
+    # DATASET CONFIGURATION
+    # ============================================================================
+    dataset_name="$DATASET_NAME",
+    dataset_split="$DATASET_SPLIT",
+    dataset_format="$DATASET_FORMAT_CODE",
+    input_field="$INPUT_FIELD",
+    target_field=$(if [ -n "$TARGET_FIELD" ]; then echo "\"$TARGET_FIELD\""; else echo "None"; fi),
+    filter_bad_entries=$FILTER_BAD_ENTRIES,
+    max_samples=$(if [ -n "$MAX_SAMPLES" ]; then echo "$MAX_SAMPLES"; else echo "None"; fi),
+    min_length=$MIN_LENGTH,
+    max_length=$(if [ -n "$MAX_LENGTH" ]; then echo "$MAX_LENGTH"; else echo "None"; fi),
+
+    # ============================================================================
+    # TRAINING HYPERPARAMETERS
+    # ============================================================================
+    num_train_epochs=$NUM_EPOCHS,
+    batch_size=$BATCH_SIZE,
+    gradient_accumulation_steps=$GRAD_ACCUM_STEPS,
+    learning_rate=$LEARNING_RATE,
+    min_lr=$MIN_LR,
+    weight_decay=$WEIGHT_DECAY,
+    warmup_ratio=$WARMUP_RATIO,
+    max_grad_norm=$MAX_GRAD_NORM,
+
+    # ============================================================================
+    # MODEL CONFIGURATION
+    # ============================================================================
+    max_seq_length=$MAX_SEQ_LENGTH,
+
+    # ============================================================================
+    # MIXED PRECISION
+    # ============================================================================
+    fp16=$FP16,
+    bf16=$BF16,
+
+    # ============================================================================
+    # LORA CONFIGURATION
+    # ============================================================================
+    lora_config={
+        "r": $LORA_RANK,
+        "lora_alpha": $LORA_ALPHA,
+        "lora_dropout": $LORA_DROPOUT,
+        "target_modules": "all-linear",
+        "bias": "none",
+        "task_type": "CAUSAL_LM",
+    },
+
+    # ============================================================================
+    # QUANTIZATION CONFIGURATION
+    # ============================================================================
+    use_quantization=$USE_QUANTIZATION,
+    quantization_config=$QUANTIZATION_CONFIG,
+
+    # ============================================================================
+    # PERFORMANCE CONFIGURATION
+    # ============================================================================
+    dataloader_num_workers=$NUM_WORKERS,
+    dataloader_pin_memory=True,
+    group_by_length=True,
+
+    # ============================================================================
+    # LOGGING & EVALUATION
+    # ============================================================================
+    logging_steps=$LOGGING_STEPS,
+    eval_steps=$EVAL_STEPS,
+    save_steps=$SAVE_STEPS,
+
+    # ============================================================================
+    # RUNTIME CONFIGURATION
+    # ============================================================================
+    experiment_name="$EXPERIMENT_NAME",
+    trackio_url="$TRACKIO_URL",
+    dataset_repo="$TRACKIO_DATASET_REPO",
+    enable_tracking=True,
+)
+EOF
+
+    print_status "Enhanced GPT-OSS configuration generated successfully!"
+    print_info "Configuration saved to: $CONFIG_FILE"
+}
+
# Function to create training configuration file
create_training_config() {
    local config_file="$1"
...
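The `QUANTIZATION_CONFIG` strings generated above are consumed when the training script loads the base model. As a rough sketch of what the "4-bit BNB" branch implies at load time (the exact loading code lives in `scripts/training/train_gpt_oss.py`; the `BitsAndBytesConfig` mapping below is an assumption, not a quote from that script):

    import torch
    from transformers import BitsAndBytesConfig

    # Hypothetical translation of the "4-bit BNB" settings into bitsandbytes terms.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    # The "MXFP4 (default)" branch instead dequantizes the checkpoint's native
    # MXFP4 weights on load, and "None" loads unquantized weights.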
    echo "=================================="

    show_training_configs
+    select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "GPT-OSS OpenHermes-FR (Recommended)" "GPT-OSS OpenHermes-FR Memory Optimized" "GPT-OSS Custom Dataset" "Custom Configuration" TRAINING_CONFIG_TYPE

    get_training_config "$TRAINING_CONFIG_TYPE"
...
    print_info "Batch size: $BATCH_SIZE"
    print_info "Learning rate: $LEARNING_RATE"

+    # Step 14.5: Define output directory
+    print_step "Step 14.5: Output Directory Configuration"
+    echo "============================================="
+
+    # Define the output directory for training results
+    OUTPUT_DIR="./outputs/${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"
+    print_info "Training output directory: $OUTPUT_DIR"
+
+    # Create the output directory
+    mkdir -p "$OUTPUT_DIR"
+    print_status "Output directory created: $OUTPUT_DIR"
+
    # Step 15: Start training
    print_step "Step 15: Starting Training"
    echo "=============================="

    print_info "Starting training with configuration: $CONFIG_FILE"
    print_info "Experiment: $EXPERIMENT_NAME"
+    print_info "Output: $OUTPUT_DIR"
    print_info "Trackio: $TRACKIO_URL"

    # Ensure environment variables are available for training
...
    export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
    export HF_USERNAME="$HF_USERNAME"
    export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+    export OUTPUT_DIR="$OUTPUT_DIR"

    # Run the appropriate training script based on model type
    if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
...
        python scripts/training/train_gpt_oss.py \
            --config "$CONFIG_FILE" \
            --experiment-name "$EXPERIMENT_NAME" \
+            --output-dir "$OUTPUT_DIR" \
            --trackio-url "$TRACKIO_URL" \
            --trainer-type "$TRAINER_TYPE_LOWER"
    else
...
        python scripts/training/train.py \
            --config "$CONFIG_FILE" \
            --experiment-name "$EXPERIMENT_NAME" \
+            --output-dir "$OUTPUT_DIR" \
            --trackio-url "$TRACKIO_URL" \
            --trainer-type "$TRAINER_TYPE_LOWER"
    fi
...
    echo "====================================="

    print_info "Pushing model to: $REPO_NAME"
+    print_info "Checkpoint: $OUTPUT_DIR"

    # Ensure environment variables are available for model push
    export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
...
    export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
    export HF_USERNAME="$HF_USERNAME"
    export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+    export OUTPUT_DIR="$OUTPUT_DIR"

    # Run the appropriate push script based on model type
    if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
        print_info "Using GPT-OSS specialized push script..."
+        python scripts/model_tonic/push_gpt_oss_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
            --token "$HF_TOKEN" \
            --trackio-url "$TRACKIO_URL" \
            --experiment-name "$EXPERIMENT_NAME" \
            --dataset-repo "$TRACKIO_DATASET_REPO" \
            --author-name "$AUTHOR_NAME" \
+            --model-description "$MODEL_DESCRIPTION" \
+            --training-config-type "$TRAINING_CONFIG_TYPE" \
+            --model-name "$MODEL_NAME" \
+            --dataset-name "$DATASET_NAME" \
+            --batch-size "$BATCH_SIZE" \
+            --learning-rate "$LEARNING_RATE" \
+            --max-epochs "$MAX_EPOCHS" \
+            --max-seq-length "$MAX_SEQ_LENGTH" \
+            --trainer-type "$TRAINER_TYPE"
    else
        print_info "Using standard SmolLM3 push script..."
+        python scripts/model_tonic/push_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
            --token "$HF_TOKEN" \
            --trackio-url "$TRACKIO_URL" \
            --experiment-name "$EXPERIMENT_NAME" \
            --dataset-repo "$TRACKIO_DATASET_REPO" \
            --author-name "$AUTHOR_NAME" \
+            --model-description "$MODEL_DESCRIPTION" \
+            --training-config-type "$TRAINING_CONFIG_TYPE" \
+            --model-name "$MODEL_NAME" \
+            --dataset-name "$DATASET_NAME" \
+            --batch-size "$BATCH_SIZE" \
+            --learning-rate "$LEARNING_RATE" \
+            --max-epochs "$MAX_EPOCHS" \
+            --max-seq-length "$MAX_SEQ_LENGTH" \
+            --trainer-type "$TRAINER_TYPE"
    fi

    # Step 16.5: Switch Trackio Space to Read Token (Security)
...

    ## Files Created
    - Training configuration: \`$CONFIG_FILE\`
+    - Model checkpoint: \`$OUTPUT_DIR/\`
    - Training logs: \`training.log\`
    - Summary report: \`training_summary.md\`
EOF
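The timestamp suffix in `OUTPUT_DIR` is what makes the results directory dynamic: every run of the same experiment gets its own checkpoint folder instead of overwriting the last one. A Python sketch of the same naming scheme, for scripts that need to reproduce it (the training scripts themselves receive the path through `--output-dir` and the `OUTPUT_DIR` environment variable; the experiment name is a placeholder):

    import os
    from datetime import datetime

    experiment_name = "my_experiment"  # placeholder
    output_dir = f"./outputs/{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    os.makedirs(output_dir, exist_ok=True)
    # e.g. ./outputs/my_experiment_20250807_142301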
scripts/model_tonic/push_gpt_oss_to_huggingface.py
CHANGED
@@ -43,8 +43,59 @@ def merge_lora_weights(checkpoint_path, base_model_name, output_path):
 
     return model, tokenizer
 
-def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
-    """Create a comprehensive model card for GPT-OSS models"""
+def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description, training_config_type=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
+    """Create a comprehensive model card for GPT-OSS models using generate_model_card.py"""
+
+    try:
+        # Import the model card generator
+        import sys
+        import os
+        sys.path.append(os.path.join(os.path.dirname(__file__)))
+        from generate_model_card import ModelCardGenerator, create_default_variables
+
+        # Create generator
+        generator = ModelCardGenerator()
+
+        # Create variables for the model card
+        variables = create_default_variables()
+
+        # Update with GPT-OSS specific values
+        variables.update({
+            "repo_name": model_name,
+            "model_name": model_name.split('/')[-1],
+            "experiment_name": experiment_name or "gpt_oss_finetune",
+            "dataset_repo": dataset_repo,
+            "author_name": author_name or "GPT-OSS Fine-tuner",
+            "model_description": model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks.",
+            "training_config_type": training_config_type or "GPT-OSS Configuration",
+            "base_model": "openai/gpt-oss-20b",
+            "dataset_name": dataset_name or "HuggingFaceH4/Multilingual-Thinking",
+            "trainer_type": trainer_type or "SFTTrainer",
+            "batch_size": str(batch_size) if batch_size else "4",
+            "learning_rate": str(learning_rate) if learning_rate else "2e-4",
+            "max_epochs": str(max_epochs) if max_epochs else "1",
+            "max_seq_length": str(max_seq_length) if max_seq_length else "2048",
+            "hardware_info": "GPU (H100/A100)",
+            "trackio_url": trackio_url or "N/A",
+            "training_loss": "N/A",
+            "validation_loss": "N/A",
+            "perplexity": "N/A",
+            "quantized_models": False
+        })
+
+        # Generate the model card
+        model_card_content = generator.generate_model_card(variables)
+
+        print("✅ Model card generated using generate_model_card.py")
+        return model_card_content
+
+    except Exception as e:
+        print(f"❌ Failed to generate model card with generator: {e}")
+        print("🔄 Falling back to original GPT-OSS model card")
+        return _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description)
+
+def _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
+    """Create the original GPT-OSS model card as a fallback"""
 
     card_content = f"""---
 language:

@@ -196,7 +247,7 @@ This model is licensed under the MIT License.
 
     return card_content
 
-def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description):
+def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description, training_config_type=None, model_name=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
     """Push GPT-OSS model to Hugging Face Hub"""
 
     print("=== GPT-OSS Model Push Pipeline ===")

@@ -230,7 +281,14 @@ def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description):
         trackio_url=trackio_url,
         dataset_repo=dataset_repo,
         author_name=author_name,
-        model_description=model_description
+        model_description=model_description,
+        training_config_type=training_config_type,
+        dataset_name=dataset_name,
+        batch_size=batch_size,
+        learning_rate=learning_rate,
+        max_epochs=max_epochs,
+        max_seq_length=max_seq_length,
+        trainer_type=trainer_type
     )
 
     # Save model card

@@ -291,6 +349,14 @@ def main():
     parser.add_argument("--dataset-repo", help="Dataset repository")
     parser.add_argument("--author-name", help="Author name")
     parser.add_argument("--model-description", help="Model description")
+    parser.add_argument("--training-config-type", help="Training configuration type")
+    parser.add_argument("--model-name", help="Base model name")
+    parser.add_argument("--dataset-name", help="Dataset name")
+    parser.add_argument("--batch-size", help="Batch size")
+    parser.add_argument("--learning-rate", help="Learning rate")
+    parser.add_argument("--max-epochs", help="Maximum epochs")
+    parser.add_argument("--max-seq-length", help="Maximum sequence length")
+    parser.add_argument("--trainer-type", help="Trainer type")
 
     args = parser.parse_args()

@@ -308,7 +374,15 @@ def main():
         experiment_name=experiment_name,
         dataset_repo=dataset_repo,
         author_name=author_name,
-        model_description=model_description
+        model_description=model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     sys.exit(0 if success else 1)
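With the metadata flags wired through `main()`, the push pipeline can also be driven directly from Python. A hedged sketch of the equivalent call (assumes the repository root is on `PYTHONPATH`; every argument value below is a placeholder):

    from scripts.model_tonic.push_gpt_oss_to_huggingface import push_gpt_oss_model

    success = push_gpt_oss_model(
        checkpoint_path="./outputs/my_experiment_20250807_142301",   # placeholder path
        repo_name="your-username/gpt-oss-20b-openhermes-fr",         # placeholder repo
        hf_token="hf_...",
        trackio_url="https://your-trackio-space.hf.space",
        experiment_name="gpt_oss_openhermes_fr",
        dataset_repo="your-username/trackio-experiments",
        author_name="Your Name",
        model_description="GPT-OSS-20B fine-tuned on legmlai/openhermes-fr",
        training_config_type="GPT-OSS OpenHermes-FR (Recommended)",
        dataset_name="legmlai/openhermes-fr",
        batch_size="4",
        learning_rate="2e-4",
        max_epochs="1",
        max_seq_length="2048",
        trainer_type="SFTTrainer",
    )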
scripts/model_tonic/push_to_huggingface.py
CHANGED
@@ -62,7 +62,15 @@ class HuggingFacePusher:
         dataset_repo: Optional[str] = None,
         hf_token: Optional[str] = None,
         author_name: Optional[str] = None,
-        model_description: Optional[str] = None
+        model_description: Optional[str] = None,
+        training_config_type: Optional[str] = None,
+        model_name: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        batch_size: Optional[str] = None,
+        learning_rate: Optional[str] = None,
+        max_epochs: Optional[str] = None,
+        max_seq_length: Optional[str] = None,
+        trainer_type: Optional[str] = None
     ):
         self.model_path = Path(model_path)
         self.repo_name = repo_name

@@ -73,6 +81,16 @@
         self.author_name = author_name
         self.model_description = model_description
 
+        # Training configuration details for model card generation
+        self.training_config_type = training_config_type
+        self.model_name = model_name
+        self.dataset_name = dataset_name
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.max_epochs = max_epochs
+        self.max_seq_length = max_seq_length
+        self.trainer_type = trainer_type
+
         # HF Datasets configuration
         self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
         self.hf_token = hf_token or os.getenv('HF_TOKEN')

@@ -156,9 +174,53 @@
         return True
 
     def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
-        """Create a comprehensive model card using the
+        """Create a comprehensive model card using the generate_model_card.py script"""
+        try:
+            # Import the model card generator
+            import sys
+            sys.path.append(os.path.join(os.path.dirname(__file__)))
+            from generate_model_card import ModelCardGenerator, create_default_variables
+
+            # Create generator
+            generator = ModelCardGenerator()
+
+            # Create variables for the model card
+            variables = create_default_variables()
+
+            # Update with actual values
+            variables.update({
+                "repo_name": self.repo_name,
+                "model_name": self.repo_name.split('/')[-1],
+                "experiment_name": self.experiment_name or "model_push",
+                "dataset_repo": self.dataset_repo,
+                "author_name": self.author_name or "Model Author",
+                "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
+                "training_config_type": self.training_config_type or "Custom Configuration",
+                "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
+                "dataset_name": self.dataset_name or "Custom Dataset",
+                "trainer_type": self.trainer_type or "SFTTrainer",
+                "batch_size": str(self.batch_size) if self.batch_size else "8",
+                "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
+                "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
+                "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
+                "hardware_info": self._get_hardware_info(),
+                "trackio_url": self.trackio_url or "N/A",
+                "training_loss": str(results.get('train_loss', 'N/A')),
+                "validation_loss": str(results.get('eval_loss', 'N/A')),
+                "perplexity": str(results.get('perplexity', 'N/A')),
+                "quantized_models": False  # Set to True if quantized models are available
+            })
+
+            # Generate the model card
+            model_card_content = generator.generate_model_card(variables)
+
+            logger.info("✅ Model card generated using generate_model_card.py")
+            return model_card_content
+
+        except Exception as e:
+            logger.error(f"❌ Failed to generate model card with generator: {e}")
+            logger.info("🔄 Falling back to simple model card")
+            return self._create_simple_model_card(training_config, results)
 
     def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
         """Create a simple model card without complex YAML to avoid formatting issues"""

@@ -531,6 +593,14 @@ def parse_args():
     parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
     parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
    parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
+    parser.add_argument('--training-config-type', type=str, default=None, help='Training configuration type')
+    parser.add_argument('--model-name', type=str, default=None, help='Base model name')
+    parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
+    parser.add_argument('--batch-size', type=str, default=None, help='Batch size')
+    parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate')
+    parser.add_argument('--max-epochs', type=str, default=None, help='Maximum epochs')
+    parser.add_argument('--max-seq-length', type=str, default=None, help='Maximum sequence length')
+    parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
 
     return parser.parse_args()

@@ -558,7 +628,15 @@ def main():
         dataset_repo=args.dataset_repo,
         hf_token=args.hf_token,
         author_name=args.author_name,
-        model_description=args.model_description
+        model_description=args.model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     # Push model
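The same metadata now flows into `HuggingFacePusher`, so model cards for standard (non-GPT-OSS) runs pick up the real training settings instead of generic defaults. A sketch of constructing the pusher with the new keyword arguments (placeholder values; assumes the repository root is on `PYTHONPATH` and that any constructor parameters not shown keep their defaults):

    from scripts.model_tonic.push_to_huggingface import HuggingFacePusher

    pusher = HuggingFacePusher(
        model_path="./outputs/my_experiment_20250807_142301",  # placeholder
        repo_name="your-username/smollm3-finetune",            # placeholder
        author_name="Your Name",
        model_description="SmolLM3-3B fine-tuned on a custom dataset",
        training_config_type="Basic Training",
        model_name="HuggingFaceTB/SmolLM3-3B",
        dataset_name="Custom Dataset",
        batch_size="8",
        learning_rate="5e-6",
        max_epochs="3",
        max_seq_length="2048",
        trainer_type="SFTTrainer",
    )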
scripts/training/train_gpt_oss.py
CHANGED
@@ -95,12 +95,215 @@ def setup_lora_for_gpt_oss(model, config):
 
     return peft_model
 
-def 
-    """Load
+def load_dataset_from_config(config):
+    """Load dataset based on configuration"""
+
+    dataset_name = getattr(config, 'dataset_name', 'HuggingFaceH4/Multilingual-Thinking')
+    dataset_split = getattr(config, 'dataset_split', 'train')
+    dataset_config = getattr(config, 'dataset_config', None)
+
+    print(f"Loading dataset: {dataset_name}")
+    print(f"Dataset split: {dataset_split}")
+    if dataset_config:
+        print(f"Dataset config: {dataset_config}")
+
+    # Load the dataset
+    if dataset_config:
+        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
+    else:
+        dataset = load_dataset(dataset_name, split=dataset_split)
+
+    print(f"Original dataset size: {len(dataset)} examples")
+
+    # Apply filtering based on configuration
+    dataset = apply_dataset_filtering(dataset, config)
+
+    # Apply dataset processing based on format
+    dataset = process_dataset_format(dataset, config)
+
+    print(f"Final dataset size: {len(dataset)} examples")
+
+    return dataset
+
+def apply_dataset_filtering(dataset, config):
+    """Apply filtering based on configuration"""
+
+    # Filter bad entries if specified
+    if getattr(config, 'filter_bad_entries', False):
+        bad_entry_field = getattr(config, 'bad_entry_field', 'bad_entry')
+        bad_prompt_field = getattr(config, 'bad_prompt_field', 'bad_prompt_detected')
+        bad_response_field = getattr(config, 'bad_response_field', 'bad_response_detected')
+
+        original_size = len(dataset)
+
+        # Filter out bad entries
+        if bad_entry_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_entry_field, False))
+            print(f"Filtered {original_size - len(dataset)} bad entries")
+
+        # Filter out bad prompts
+        if bad_prompt_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_prompt_field, False))
+            print(f"Filtered bad prompts, remaining: {len(dataset)} examples")
+
+        # Filter out bad responses
+        if bad_response_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_response_field, False))
+            print(f"Filtered bad responses, remaining: {len(dataset)} examples")
+
+    # Apply length filtering
+    min_length = getattr(config, 'min_length', 10)
+    max_length = getattr(config, 'max_length', None)
+
+    input_field = getattr(config, 'input_field', 'prompt')
+    target_field = getattr(config, 'target_field', 'accepted_completion')
+
+    if min_length > 0 or max_length:
+        def length_filter(example):
+            input_len = len(example.get(input_field, ''))
+            target_len = len(example.get(target_field, ''))
+            total_len = input_len + target_len
+
+            if total_len < min_length:
+                return False
+            if max_length and total_len > max_length:
+                return False
+            return True
+
+        original_size = len(dataset)
+        dataset = dataset.filter(length_filter)
+        print(f"Length filtering: {original_size} -> {len(dataset)} examples")
+
+    # Apply sampling if specified
+    max_samples = getattr(config, 'max_samples', None)
+    if max_samples and len(dataset) > max_samples:
+        dataset = dataset.shuffle(seed=42).select(range(max_samples))
+        print(f"Sampled {max_samples} examples from dataset")
+
+    return dataset
+
+def format_gpt_oss_harmony(prompt, completion, add_eos_token=True):
+    """
+    Format data for the GPT-OSS Harmony format, following the exact template structure.
+    Based on: https://huggingface.co/openai/gpt-oss-20b/raw/main/chat_template.jinja
+    """
+    # GPT-OSS Harmony format structure (exact template compliance)
+    # User message: <|start|>user<|message|>content<|end|>
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|end|> (inference)
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|return|> (training)
+
+    harmony_text = f"<|start|>user<|message|>{prompt}<|end|><|start|>assistant<|channel|>final<|message|>{completion}"
+
+    if add_eos_token:
+        # Use <|return|> for training, as per the template specification;
+        # it marks the end of generation during training
+        harmony_text += "<|return|>"
+    else:
+        # Use <|end|> for inference
+        harmony_text += "<|end|>"
+
+    return harmony_text
+
+def process_dataset_format(dataset, config):
+    """Process dataset based on format configuration with exact GPT-OSS Harmony compliance"""
+
+    dataset_format = getattr(config, 'dataset_format', 'openhermes_fr')
+    input_field = getattr(config, 'input_field', 'prompt')
+    target_field = getattr(config, 'target_field', 'accepted_completion')
+    concatenate_fields = getattr(config, 'concatenate_fields', True)
+    field_separator = getattr(config, 'field_separator', '\n\n### Response:\n')
+    add_eos_token = getattr(config, 'add_eos_token', True)
+    use_harmony_format = getattr(config, 'use_harmony_format', True)
+
+    print(f"Processing dataset format: {dataset_format}")
+    print(f"Input field: {input_field}, Target field: {target_field}")
+    print(f"GPT-OSS Harmony Format: {'Enabled' if use_harmony_format else 'Disabled'}")
+
+    if dataset_format == "openhermes_fr":
+        # Process OpenHermes-FR format: prompt + accepted_completion
+        def format_openhermes_fr(example):
+            prompt = example.get(input_field, '')
+            completion = example.get(target_field, '')
+
+            if concatenate_fields:
+                if use_harmony_format:
+                    # Use the exact GPT-OSS Harmony format from the template
+                    text = format_gpt_oss_harmony(prompt, completion, add_eos_token)
+                else:
+                    # Fall back to the standard format with a separator
+                    text = prompt + field_separator + completion
+                    if add_eos_token:
+                        text += "</s>"
+
+                return {"text": text}
+            else:
+                # Keep fields separate for more advanced training setups
+                return {
+                    "input": prompt,
+                    "output": completion
+                }
+
+        dataset = dataset.map(format_openhermes_fr, remove_columns=dataset.column_names)
+
+    elif dataset_format == "messages":
+        # Process messages format (like HuggingFaceH4/Multilingual-Thinking)
+        def format_messages(example):
+            messages = example.get(input_field, [])
+
+            if use_harmony_format and len(messages) >= 2:
+                # Extract user and assistant messages for the harmony format
+                user_message = ""
+                assistant_message = ""
+
+                for message in messages:
+                    role = message.get("role", "")
+                    content = message.get("content", "")
+
+                    if role == "user":
+                        user_message = content
+                    elif role == "assistant":
+                        assistant_message = content
+
+                if user_message and assistant_message:
+                    # Use the GPT-OSS Harmony format
+                    text = format_gpt_oss_harmony(user_message, assistant_message, add_eos_token)
+                else:
+                    # Fall back to simple concatenation
+                    text = ""
+                    for message in messages:
+                        role = message.get("role", "")
+                        content = message.get("content", "")
+                        text += f"{role}: {content}\n"
+                    if add_eos_token:
+                        text += "</s>"
+            else:
+                # Standard format - convert messages to simple text
+                text = ""
+                for message in messages:
+                    role = message.get("role", "")
+                    content = message.get("content", "")
+                    text += f"{role}: {content}\n"
+                if add_eos_token:
+                    text += "</s>"
+
+            return {"text": text}
+
+        dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
+
+    elif dataset_format == "text":
+        # Process plain text format
+        text_field = input_field
+        def format_text(example):
+            text = example.get(text_field, '')
+            if add_eos_token:
+                text += "</s>"
+            return {"text": text}
+
+        dataset = dataset.map(format_text, remove_columns=dataset.column_names)
+
+    elif dataset_format == "custom":
+        # Custom format - the user handles this in their config
+        print("Using custom dataset format - no automatic processing")
 
     return dataset
 

@@ -127,25 +330,111 @@ def setup_trackio_tracking(config):
 
     return trackio_client
 
-def create_sft_config(config):
-    """Create SFTConfig for GPT-OSS training"""
-
-    print("Creating SFT configuration...")
+def create_sft_config(config, output_dir):
+    """Create an enhanced SFTConfig for GPT-OSS training"""
+
+    print("Creating enhanced SFT configuration...")
+
+    # Extract training parameters from the config, with enhanced defaults
+    num_train_epochs = getattr(config, 'num_train_epochs', 1.0)
+    max_steps = getattr(config, 'max_steps', None)
+    warmup_ratio = getattr(config, 'warmup_ratio', 0.03)
+    warmup_steps = getattr(config, 'warmup_steps', None)
+
+    # Learning rate configuration
+    learning_rate = config.learning_rate
+    lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
+    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})
+
+    # Batch configuration
+    per_device_train_batch_size = config.batch_size
+    per_device_eval_batch_size = getattr(config, 'eval_batch_size', config.batch_size)
+    gradient_accumulation_steps = config.gradient_accumulation_steps
+
+    # Evaluation and logging
+    eval_strategy = getattr(config, 'eval_strategy', 'steps')
+    eval_steps = getattr(config, 'eval_steps', 100)
+    logging_steps = getattr(config, 'logging_steps', 10)
+
+    # Saving configuration
+    save_strategy = getattr(config, 'save_strategy', 'steps')
+    save_steps = getattr(config, 'save_steps', 500)
+    save_total_limit = getattr(config, 'save_total_limit', 3)
+
+    # Mixed precision
+    fp16 = getattr(config, 'fp16', False)
+    bf16 = getattr(config, 'bf16', True)
+
+    # Regularization
+    weight_decay = getattr(config, 'weight_decay', 0.01)
+    max_grad_norm = getattr(config, 'max_grad_norm', 1.0)
+
+    # HuggingFace Hub integration
+    push_to_hub = getattr(config, 'push_to_hub', False)
+
+    print(f" • Epochs: {num_train_epochs}")
+    print(f" • Learning rate: {learning_rate}")
+    print(f" • Batch size: {per_device_train_batch_size}")
+    print(f" • Gradient accumulation: {gradient_accumulation_steps}")
+    print(f" • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")
 
     sft_config = SFTConfig(
-        ...
+        # Training duration
+        num_train_epochs=num_train_epochs,
+        max_steps=max_steps,
+
+        # Learning rate
+        learning_rate=learning_rate,
+        lr_scheduler_type=lr_scheduler_type,
+        lr_scheduler_kwargs=lr_scheduler_kwargs,
+        warmup_ratio=warmup_ratio,
+        warmup_steps=warmup_steps,
+
+        # Batch configuration
+        per_device_train_batch_size=per_device_train_batch_size,
+        per_device_eval_batch_size=per_device_eval_batch_size,
+        gradient_accumulation_steps=gradient_accumulation_steps,
+
+        # Model configuration
+        max_seq_length=config.max_seq_length,
+        gradient_checkpointing=getattr(config, 'use_gradient_checkpointing', True),
+
+        # Mixed precision
+        fp16=fp16,
+        bf16=bf16,
+
+        # Regularization
+        weight_decay=weight_decay,
+        max_grad_norm=max_grad_norm,
+
+        # Evaluation
+        evaluation_strategy=eval_strategy,
+        eval_steps=eval_steps,
+
+        # Logging
+        logging_steps=logging_steps,
+
+        # Saving
+        save_strategy=save_strategy,
+        save_steps=save_steps,
+        save_total_limit=save_total_limit,
+
+        # Output
+        output_dir=output_dir,
+
+        # Data loading
+        dataloader_num_workers=getattr(config, 'dataloader_num_workers', 4),
+        dataloader_pin_memory=getattr(config, 'dataloader_pin_memory', True),
+
+        # Performance
+        group_by_length=getattr(config, 'group_by_length', True),
+        remove_unused_columns=getattr(config, 'remove_unused_columns', True),
+
+        # HuggingFace Hub
+        push_to_hub=push_to_hub,
+
+        # Monitoring
+        report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
     )
 
     return sft_config

@@ -193,13 +482,13 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer_type):
     peft_model = setup_lora_for_gpt_oss(model, config)
 
     # Load dataset
-    dataset = 
+    dataset = load_dataset_from_config(config)
 
     # Setup Trackio tracking
     trackio_client = setup_trackio_tracking(config)
 
     # Create SFT configuration
-    sft_config = create_sft_config(config)
+    sft_config = create_sft_config(config, output_dir)
 
     # Create trainer
     print("Creating SFT trainer...")
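To make the Harmony formatting concrete, here is the exact string `format_gpt_oss_harmony` builds for a toy prompt/completion pair in training mode (`add_eos_token=True`):

    prompt = "What is 2 + 2?"
    completion = "2 + 2 = 4."

    # Built exactly as in format_gpt_oss_harmony above:
    text = (
        f"<|start|>user<|message|>{prompt}<|end|>"
        f"<|start|>assistant<|channel|>final<|message|>{completion}<|return|>"
    )
    print(text)
    # <|start|>user<|message|>What is 2 + 2?<|end|><|start|>assistant<|channel|>final<|message|>2 + 2 = 4.<|return|>

With `add_eos_token=False`, the trailing token is `<|end|>` instead, matching the inference-time template.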
templates/spaces/demo_gpt/README.md
CHANGED
@@ -6,7 +6,7 @@ colorTo: pink
 sdk: gradio
 sdk_version: 5.40.0
 app_file: app.py
-pinned: 
+pinned: false
 short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
 ---
 