Tonic committed on
Commit
59e57ff
·
1 Parent(s): 75bcdb3

Adds harmony format, configurable GPT-OSS parameters, launch.sh logic, improved templates for legml GPT-OSS training, a dynamic results directory, and improved model pushing

Browse files
config/train_gpt_oss_custom.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GPT-OSS Custom Training Configuration
3
+ Based on OpenAI's GPT-OSS fine-tuning tutorial
4
+ Fully customizable configuration for any dataset format
5
+
6
+ Supports specialized datasets like:
7
+ - legmlai/openhermes-fr (French instruction dataset)
8
+ - HuggingFaceH4/Multilingual-Thinking
9
+ - Custom prompt/completion formats
10
+ """
11
+ import os
12
+ from dataclasses import dataclass
13
+ from typing import Optional, Dict, List, Union
14
+
15
@dataclass
class GPTOSSEnhancedCustomConfig:
    """Enhanced custom configuration for GPT-OSS fine-tuning with maximum flexibility.

    Every field has a default, so ``GPTOSSEnhancedCustomConfig()`` is a valid
    configuration.  ``__post_init__`` fills in the dictionary-valued options
    that were left as ``None``, derives ``warmup_steps`` and ``max_length``
    when they are not given, validates the values and prints a summary.

    Raises:
        ValueError: if a batch, learning-rate, sequence-length, precision or
            dataset-format setting is invalid (see ``_validate_config``).
    """

    # ============================================================================
    # CORE MODEL CONFIGURATION
    # ============================================================================
    trainer_type: str = "sft"  # "sft" or "dpo"
    model_name: str = "openai/gpt-oss-20b"
    max_seq_length: int = 2048  # Customizable: 512, 1024, 2048, 4096, 8192
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = True

    # ============================================================================
    # TRAINING HYPERPARAMETERS - FULLY CUSTOMIZABLE
    # ============================================================================
    # Batch Configuration
    batch_size: int = 4  # Per-device batch size (1-32 depending on GPU memory)
    gradient_accumulation_steps: int = 4  # Effective batch = batch_size * accumulation * num_gpus
    eval_batch_size: Optional[int] = None  # If None, uses batch_size

    # Learning Rate Configuration
    learning_rate: float = 2e-4  # Main learning rate (1e-5 to 5e-4 typical range)
    min_lr: float = 2e-5  # Minimum learning rate for scheduler
    warmup_ratio: float = 0.03  # Fraction of steps for warmup (0.01-0.1)
    warmup_steps: Optional[int] = None  # If set, overrides warmup_ratio

    # Training Duration
    num_train_epochs: float = 1.0  # Number of epochs (0.5, 1.0, 2.0, 3.0)
    max_steps: Optional[int] = None  # If set, overrides num_train_epochs
    max_iters: Optional[int] = None  # Legacy compatibility

    # Regularization
    weight_decay: float = 0.01  # L2 regularization (0.0-0.1)
    max_grad_norm: float = 1.0  # Gradient clipping (0.5-2.0)

    # ============================================================================
    # OPTIMIZER CONFIGURATION
    # ============================================================================
    optimizer: str = "adamw_torch"  # "adamw_torch", "adamw_hf", "sgd"
    beta1: float = 0.9  # Adam beta1 parameter
    beta2: float = 0.95  # Adam beta2 parameter (0.95-0.999)
    eps: float = 1e-8  # Adam epsilon

    # ============================================================================
    # SCHEDULER CONFIGURATION
    # ============================================================================
    scheduler: str = "cosine_with_min_lr"  # "linear", "cosine", "cosine_with_min_lr", "constant"
    lr_scheduler_kwargs: Optional[Dict] = None

    # ============================================================================
    # MIXED PRECISION & DISTRIBUTED TRAINING
    # ============================================================================
    fp16: bool = False  # Use FP16 (not recommended for GPT-OSS)
    bf16: bool = True  # Use BF16 (recommended for GPT-OSS)
    tf32: Optional[bool] = None  # Use TF32 on A100/H100
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # ============================================================================
    # LOGGING, EVALUATION & CHECKPOINTING
    # ============================================================================
    # Logging
    logging_steps: int = 10  # Log every N steps
    log_level: str = "info"  # "debug", "info", "warning", "error"

    # Evaluation
    eval_strategy: str = "steps"  # "no", "steps", "epoch"
    eval_steps: int = 100  # Evaluate every N steps
    eval_delay: float = 0  # Delay evaluation for N steps/epochs
    eval_accumulation_steps: Optional[int] = None  # Accumulate eval outputs

    # Checkpointing
    save_strategy: str = "steps"  # "no", "steps", "epoch"
    save_steps: int = 500  # Save checkpoint every N steps
    save_total_limit: Optional[int] = 3  # Keep only N best checkpoints
    save_only_model: bool = False  # Save only model weights

    # Model Selection
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # ============================================================================
    # DATASET CONFIGURATION - ENHANCED FOR CUSTOM FORMATS
    # ============================================================================
    # Dataset Source
    dataset_name: str = "legmlai/openhermes-fr"  # Default to French OpenHermes
    dataset_split: str = "train"  # Dataset split to use
    dataset_config: Optional[str] = None  # Dataset configuration name

    # Field Mapping - Customize for your dataset format
    input_field: str = "prompt"  # Field containing the input/prompt
    target_field: str = "accepted_completion"  # Field containing the target/completion

    # OpenHermes-FR specific fields
    filter_bad_entries: bool = True  # Filter entries marked as bad
    bad_entry_field: str = "bad_entry"  # Field indicating bad entries
    bad_prompt_field: str = "bad_prompt_detected"  # Field for bad prompts
    bad_response_field: str = "bad_response_detected"  # Field for bad responses

    # Data Processing Options
    concatenate_fields: bool = True  # Combine input and target fields for training
    field_separator: str = "\n\n### Response:\n"  # Separator between input and target
    add_eos_token: bool = True  # Add EOS token at the end

    # Dataset Filtering & Sampling
    max_samples: Optional[int] = None  # Limit dataset size (e.g., 100000 for testing)
    min_length: int = 10  # Minimum sequence length
    max_length: Optional[int] = None  # Maximum sequence length (None = use max_seq_length)

    # Custom Dataset Formats Support
    dataset_format: str = "openhermes_fr"  # "openhermes_fr", "messages", "text", "custom"

    # GPT-OSS Harmony Format Configuration
    use_harmony_format: bool = True  # Enable GPT-OSS harmony format
    use_chat_template: bool = False  # Set to True for messages format
    chat_template_kwargs: Optional[Dict] = None

    # ============================================================================
    # TRACKIO MONITORING CONFIGURATION
    # ============================================================================
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    # ============================================================================
    # HUGGING FACE INTEGRATION
    # ============================================================================
    hf_token: Optional[str] = None
    dataset_repo: Optional[str] = None
    push_to_hub: bool = False  # Push model to HF Hub after training
    hub_model_id: Optional[str] = None  # HF Hub model ID
    hub_private_repo: bool = False  # Make HF repo private

    # ============================================================================
    # GPT-OSS SPECIFIC CONFIGURATIONS
    # ============================================================================
    # LoRA Configuration
    use_lora: bool = True
    lora_config: Optional[Dict] = None

    # Quantization Configuration
    use_quantization: bool = True
    quantization_config: Optional[Dict] = None

    # Model Loading Configuration
    model_kwargs: Optional[Dict] = None

    # Generation Configuration (for evaluation/testing)
    generation_config: Optional[Dict] = None

    # ============================================================================
    # MULTILINGUAL & DOMAIN SPECIFIC SETTINGS
    # ============================================================================
    # Language Support (for multilingual datasets)
    primary_language: str = "fr"  # Primary language code
    reasoning_languages: Optional[List[str]] = None  # Supported languages for reasoning

    # Domain-specific settings
    domain_focus: Optional[str] = None  # "reasoning", "conversation", "instruction", "general"

    # ============================================================================
    # PERFORMANCE & MEMORY OPTIMIZATION
    # ============================================================================
    # Data Loading
    dataloader_num_workers: int = 4  # Number of data loading workers
    dataloader_pin_memory: bool = True  # Pin memory for faster GPU transfer
    dataloader_prefetch_factor: int = 2  # Prefetch factor for data loading

    # Memory Management
    max_memory_per_gpu: Optional[str] = None  # e.g., "80GB", "40GB"
    low_cpu_mem_usage: bool = True  # Use low CPU memory loading

    # Performance Optimizations
    group_by_length: bool = True  # Group sequences by length
    length_column_name: str = "length"  # Column name for sequence lengths
    remove_unused_columns: bool = True  # Remove unused dataset columns

    def __post_init__(self):
        """Fill in ``None`` defaults, derive computed values, validate and print.

        Raises:
            ValueError: propagated from ``_validate_config``.
        """

        # ============================================================================
        # LORA CONFIGURATION DEFAULTS
        # ============================================================================
        if self.lora_config is None:
            self.lora_config = {
                "r": 16,  # Rank (4, 8, 16, 32, 64) - higher = more parameters
                "lora_alpha": 32,  # Scaling factor (usually 2*r)
                "target_modules": "all-linear",  # Apply LoRA to all linear layers
                "target_parameters": [
                    "7.mlp.experts.gate_up_proj",
                    "7.mlp.experts.down_proj",
                    "15.mlp.experts.gate_up_proj",
                    "15.mlp.experts.down_proj",
                    "23.mlp.experts.gate_up_proj",
                    "23.mlp.experts.down_proj",
                ],
                "bias": "none",  # "none", "all", "lora_only"
                "task_type": "CAUSAL_LM",
                "lora_dropout": 0.05,  # LoRA dropout rate
            }

        # ============================================================================
        # QUANTIZATION CONFIGURATION DEFAULTS
        # ============================================================================
        if self.quantization_config is None:
            self.quantization_config = {
                "dequantize": True,  # Use Mxfp4Config as per GPT-OSS tutorial
                "load_in_4bit": False,  # Set to True for extreme memory optimization
                "bnb_4bit_compute_dtype": "bfloat16",  # For 4-bit quantization
                "bnb_4bit_use_double_quant": True,  # Double quantization
                "bnb_4bit_quant_type": "nf4",  # Quantization type
            }

        # ============================================================================
        # MODEL LOADING CONFIGURATION DEFAULTS
        # ============================================================================
        if self.model_kwargs is None:
            self.model_kwargs = {
                "attn_implementation": "eager",  # "eager", "flash_attention_2"
                "torch_dtype": "auto",  # "auto", "bfloat16", "float16"
                "use_cache": False,  # Disable KV cache for training
                "device_map": "auto",  # Automatic device mapping
                "low_cpu_mem_usage": self.low_cpu_mem_usage,
            }

        # Add memory constraints if specified.  An explicit "max_memory" entry in
        # caller-supplied model_kwargs wins over max_memory_per_gpu, and the dict
        # is copied so the caller's own object is never mutated.
        if self.max_memory_per_gpu and "max_memory" not in self.model_kwargs:
            self.model_kwargs = {
                **self.model_kwargs,
                "max_memory": {0: self.max_memory_per_gpu},
            }

        # ============================================================================
        # GENERATION CONFIGURATION DEFAULTS
        # ============================================================================
        if self.generation_config is None:
            self.generation_config = {
                "max_new_tokens": 512,  # Maximum tokens to generate
                "do_sample": True,  # Use sampling
                "temperature": 0.7,  # Sampling temperature
                "top_p": 0.9,  # Nucleus sampling
                "top_k": 50,  # Top-k sampling
                "repetition_penalty": 1.1,  # Repetition penalty
                "pad_token_id": None,  # Will be set from tokenizer
                "eos_token_id": None,  # Will be set from tokenizer
            }

        # ============================================================================
        # LANGUAGE CONFIGURATION DEFAULTS
        # ============================================================================
        if self.reasoning_languages is None:
            if self.primary_language == "fr":
                self.reasoning_languages = [
                    "French", "English", "Spanish", "Italian", "German"
                ]
            else:
                self.reasoning_languages = [
                    "English", "Spanish", "French", "Italian", "German",
                    "Chinese", "Hindi", "Japanese", "Korean", "Arabic"
                ]

        # ============================================================================
        # SCHEDULER CONFIGURATION DEFAULTS
        # ============================================================================
        if self.lr_scheduler_kwargs is None:
            self.lr_scheduler_kwargs = {"min_lr_rate": 0.1}

        # ============================================================================
        # CHAT TEMPLATE CONFIGURATION DEFAULTS (GPT-OSS Harmony Format)
        # ============================================================================
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "add_generation_prompt": True,
                "tokenize": False,
                "auto_insert_role": True,
                # GPT-OSS Harmony Format specific settings
                "reasoning_effort": "medium",  # low, medium, high
                "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
                "builtin_tools": [],  # Can include "browser" and/or "python"
            }

        # ============================================================================
        # VALIDATION AND COMPUTED VALUES
        # ============================================================================
        # Per-process effective batch size (the GPU count is not known here)
        effective_batch_size = self.batch_size * self.gradient_accumulation_steps

        # Derive warmup steps from the ratio only when a step budget is known
        if self.warmup_steps is None and self.max_steps:
            self.warmup_steps = int(self.max_steps * self.warmup_ratio)

        # Set max_length for dataset filtering
        if self.max_length is None:
            self.max_length = self.max_seq_length

        # Validate configuration
        self._validate_config()

        # Print comprehensive configuration summary
        self._print_config_summary(effective_batch_size)

    def _validate_config(self):
        """Validate configuration parameters.

        Raises:
            ValueError: if ``batch_size`` or ``gradient_accumulation_steps`` is
                below 1, ``learning_rate`` is not positive, ``min_lr`` is not
                below ``learning_rate``, ``max_seq_length`` is below 1, both
                ``fp16`` and ``bf16`` are enabled, or ``dataset_format`` is
                not a supported value.
        """

        # Validate batch configuration
        if self.batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if self.gradient_accumulation_steps < 1:
            raise ValueError("gradient_accumulation_steps must be >= 1")

        # Validate learning rate
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be > 0")
        if self.min_lr >= self.learning_rate:
            raise ValueError("min_lr must be < learning_rate")

        # Validate sequence length
        if self.max_seq_length < 1:
            raise ValueError("max_seq_length must be >= 1")

        # Only one reduced-precision mode can be active at a time; fail here
        # rather than later, once the model has already been loaded.
        if self.fp16 and self.bf16:
            raise ValueError("fp16 and bf16 are mutually exclusive; enable at most one")

        # Validate dataset format
        valid_formats = ["openhermes_fr", "messages", "text", "custom"]
        if self.dataset_format not in valid_formats:
            raise ValueError(f"dataset_format must be one of {valid_formats}")

    def _print_config_summary(self, effective_batch_size):
        """Print a human-readable summary of the main settings to stdout.

        Args:
            effective_batch_size: ``batch_size * gradient_accumulation_steps``
                as computed by ``__post_init__``.
        """

        # A caller-supplied LoRA dict may omit keys; never let the summary crash
        lora = self.lora_config or {}

        # Report the precision that is actually enabled
        if self.bf16:
            precision = "BF16"
        elif self.fp16:
            precision = "FP16"
        else:
            precision = "FP32"

        print("\n" + "="*80)
        print("🚀 GPT-OSS ENHANCED CUSTOM CONFIGURATION")
        print("="*80)

        print("📊 Model & Training:")
        print(f" • Model: {self.model_name}")
        print(f" • Dataset: {self.dataset_name} ({self.dataset_format})")
        print(f" • Primary Language: {self.primary_language}")
        print(f" • Sequence Length: {self.max_seq_length}")
        print(f" • Epochs: {self.num_train_epochs}")

        print("\n🔄 Batch Configuration:")
        print(f" • Per-device Batch Size: {self.batch_size}")
        print(f" • Gradient Accumulation: {self.gradient_accumulation_steps}")
        print(f" • Effective Batch Size: {effective_batch_size}")

        print("\n📈 Learning Configuration:")
        print(f" • Learning Rate: {self.learning_rate}")
        print(f" • Min Learning Rate: {self.min_lr}")
        print(f" • Weight Decay: {self.weight_decay}")
        print(f" • Warmup Ratio: {self.warmup_ratio}")

        print("\n🎛️ LoRA Configuration:")
        print(f" • Rank: {lora.get('r', 'n/a')}")
        print(f" • Alpha: {lora.get('lora_alpha', 'n/a')}")
        print(f" • Target Modules: {lora.get('target_modules', 'n/a')}")

        print("\n📁 Dataset Configuration:")
        print(f" • Input Field: {self.input_field}")
        print(f" • Target Field: {self.target_field}")
        print(f" • Filter Bad Entries: {self.filter_bad_entries}")
        print(f" • Max Samples: {self.max_samples or 'All'}")

        print("\n💾 Memory & Performance:")
        print(f" • Mixed Precision: {precision}")
        print(f" • Gradient Checkpointing: {self.use_gradient_checkpointing}")
        print(f" • Data Workers: {self.dataloader_num_workers}")
        print(f" • Group by Length: {self.group_by_length}")

        print("="*80 + "\n")
386
+
387
# Module-level default configuration (OpenHermes-FR oriented defaults).
# Instantiating the dataclass runs __post_init__, which validates the
# values and prints the configuration summary.
config: GPTOSSEnhancedCustomConfig = GPTOSSEnhancedCustomConfig()
config/train_gpt_oss_openhermes_fr.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GPT-OSS OpenHermes-FR Optimized Configuration
3
+ Specifically optimized for the legmlai/openhermes-fr dataset
4
+ 800K French instruction-response pairs with quality filtering
5
+ """
6
+
7
+ from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
8
+
9
# OpenHermes-FR optimized configuration.
# Every value below is passed to GPTOSSEnhancedCustomConfig, whose
# __post_init__ validates it and prints its own summary on construction.
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR Specific
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",

    # OpenHermes-FR field mapping
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions

    # Quality filtering using OpenHermes-FR metadata
    filter_bad_entries=True,  # Use built-in quality flags
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",

    # Data processing optimized for French with GPT-OSS Harmony Format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,
    use_harmony_format=True,  # Enable GPT-OSS harmony format

    # Dataset sampling (use all 800K examples by default)
    max_samples=None,  # Use full dataset
    min_length=20,  # Minimum for meaningful French text
    max_length=None,  # Auto-set to max_seq_length

    # ============================================================================
    # TRAINING HYPERPARAMETERS - French Language Optimized
    # ============================================================================
    num_train_epochs=1.5,  # 1.5 epochs optimal for large dataset
    batch_size=6,  # Balanced for most GPUs
    gradient_accumulation_steps=6,  # Effective batch size: 36

    # Learning rate schedule optimized for French fine-tuning
    learning_rate=2.5e-4,  # Slightly higher for multilingual
    min_lr=2.5e-5,  # 10% of max learning rate
    warmup_ratio=0.05,  # 5% warmup for stability
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping

    # ============================================================================
    # MODEL CONFIGURATION - Optimized for French
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=3072,  # Balanced length for French
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Mixed precision for efficiency
    fp16=False,
    bf16=True,  # Better for GPT-OSS

    # ============================================================================
    # LORA CONFIGURATION - Optimized for French Language Learning
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 24,  # Higher rank for language adaptation
        "lora_alpha": 48,  # 2x rank scaling
        "lora_dropout": 0.05,  # Light regularization
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },

    # ============================================================================
    # QUANTIZATION - Balanced Performance/Memory
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # MXFP4 as per GPT-OSS tutorial
        "load_in_4bit": False,  # Standard precision for quality
    },

    # ============================================================================
    # PERFORMANCE OPTIMIZATION
    # ============================================================================
    # Data loading optimized for large dataset
    dataloader_num_workers=6,  # More workers for large dataset
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=3,  # Higher prefetch for efficiency

    # Memory management
    low_cpu_mem_usage=True,
    group_by_length=True,  # Efficient batching
    remove_unused_columns=True,

    # ============================================================================
    # EVALUATION & LOGGING
    # ============================================================================
    # NOTE(review): with load_best_model_at_end=True, transformers'
    # TrainingArguments requires save_steps to be a round multiple of
    # eval_steps; the previous 200/500 pairing violates that.  This assumes
    # the training script forwards these values to TrainingArguments - confirm.
    eval_strategy="steps",
    eval_steps=250,  # Evaluate every 250 steps (divides save_steps evenly)
    logging_steps=20,  # Log every 20 steps

    save_strategy="steps",
    save_steps=500,  # Save every 500 steps
    save_total_limit=3,  # Keep 3 best checkpoints

    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # ============================================================================
    # MULTILINGUAL & FRENCH SPECIFIC SETTINGS
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning
    domain_focus="instruction",  # Instruction following

    # ============================================================================
    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
    # ============================================================================
    generation_config={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS Harmony Format specific settings
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },

    # ============================================================================
    # HF HUB INTEGRATION
    # ============================================================================
    push_to_hub=False,  # Set to True to auto-push
    hub_model_id=None,  # Will be set by launch script
    hub_private_repo=False,

    # ============================================================================
    # MONITORING
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)

# Print configuration summary on import
print("\n🇫🇷 OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"📊 Dataset: {config.dataset_name}")
print(f"🗣️ Language: French (with {config.dataset_format} format)")
print(f"📈 Training: {config.num_train_epochs} epochs")
print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']}")
print(f"📏 Sequence Length: {config.max_seq_length}")
print(f"🔍 Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)
config/train_gpt_oss_openhermes_fr_memory_optimized.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GPT-OSS OpenHermes-FR Memory-Optimized Configuration
3
+ Combines memory optimization best practices with OpenHermes-FR dataset
4
+ Optimized for GPT-OSS harmony format and MXFP4 quantization
5
+ Based on OpenAI GPT-OSS specifications and memory optimization principles
6
+ """
7
+
8
+ from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
9
+
10
+ # Memory-optimized OpenHermes-FR configuration for GPT-OSS
11
+ config = GPTOSSEnhancedCustomConfig(
12
+ # ============================================================================
13
+ # DATASET CONFIGURATION - OpenHermes-FR with Harmony Format
14
+ # ============================================================================
15
+ dataset_name="legmlai/openhermes-fr",
16
+ dataset_split="train",
17
+ dataset_format="openhermes_fr",
18
+
19
+ # OpenHermes-FR field mapping optimized for harmony format
20
+ input_field="prompt", # French prompts
21
+ target_field="accepted_completion", # GPT-4o generated completions
22
+
23
+ # Enhanced quality filtering for memory-constrained training
24
+ filter_bad_entries=True, # Critical for memory efficiency
25
+ bad_entry_field="bad_entry",
26
+ bad_prompt_field="bad_prompt_detected",
27
+ bad_response_field="bad_response_detected",
28
+
29
+ # Memory-optimized data processing with GPT-OSS Harmony Format
30
+ concatenate_fields=True,
31
+ field_separator="\n\n### Réponse:\n", # Fallback separator (harmony format takes precedence)
32
+ add_eos_token=True, # Required for proper training
33
+ use_harmony_format=True, # Enable GPT-OSS harmony format
34
+
35
+ # Dataset sampling optimized for memory constraints
36
+ max_samples=200000, # Reduced from 800K for memory efficiency
37
+ min_length=15, # Slightly higher minimum for quality
38
+ max_length=2048, # Explicit max length for memory control
39
+
40
+ # ============================================================================
41
+ # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
42
+ # ============================================================================
43
+ # Batch configuration following memory optimization principles
44
+ num_train_epochs=1.0, # Single epoch to reduce memory pressure
45
+ batch_size=2, # Reduced from 6 for memory efficiency
46
+ gradient_accumulation_steps=16, # Increased to maintain effective batch size 32
47
+
48
+ # Learning rate optimized for single epoch + memory constraints
49
+ learning_rate=2e-4, # Standard GPT-OSS learning rate
50
+ min_lr=2e-5, # 10% of max learning rate
51
+ warmup_ratio=0.03, # Reduced warmup for memory efficiency
52
+ weight_decay=0.01, # Standard L2 regularization
53
+ max_grad_norm=1.0, # Gradient clipping for stability
54
+
55
+ # ============================================================================
56
+ # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
57
+ # ============================================================================
58
+ model_name="openai/gpt-oss-20b",
59
+ max_seq_length=1024, # Reduced from 3072 for memory optimization
60
+ use_flash_attention=True, # Critical for memory efficiency
61
+ use_gradient_checkpointing=True, # Essential for memory optimization
62
+
63
+ # Mixed precision optimized for GPT-OSS MXFP4
64
+ fp16=False, # Not recommended for GPT-OSS
65
+ bf16=True, # Required for GPT-OSS stability
66
+ tf32=True, # Enable TF32 for A100/H100 efficiency
67
+
68
+ # ============================================================================
69
+ # LORA CONFIGURATION - Memory Optimized for GPT-OSS MoE
70
+ # ============================================================================
71
+ use_lora=True,
72
+ lora_config={
73
+ "r": 8, # Reduced rank for memory efficiency
74
+ "lora_alpha": 16, # 2x rank scaling (memory optimized)
75
+ "lora_dropout": 0.1, # Higher dropout for better generalization
76
+ "target_modules": "all-linear", # Apply to all linear layers
77
+ "target_parameters": [
78
+ # GPT-OSS specific MoE expert targeting
79
+ "7.mlp.experts.gate_up_proj",
80
+ "7.mlp.experts.down_proj",
81
+ "15.mlp.experts.gate_up_proj",
82
+ "15.mlp.experts.down_proj",
83
+ "23.mlp.experts.gate_up_proj",
84
+ "23.mlp.experts.down_proj",
85
+ ],
86
+ "bias": "none", # No bias adaptation for memory efficiency
87
+ "task_type": "CAUSAL_LM",
88
+ "modules_to_save": [], # Don't save additional modules for memory
89
+ },
90
+
91
+ # ============================================================================
92
+ # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
93
+ # ============================================================================
94
+ use_quantization=True,
95
+ quantization_config={
96
+ "dequantize": True, # Dequantize the MXFP4 checkpoint weights (to bf16) at load time so they can be fine-tuned
97
+ "load_in_4bit": False, # Don't use BNB 4-bit with MXFP4
98
+ "mxfp4_config": { # Native GPT-OSS MXFP4 settings
99
+ "enabled": True,
100
+ "block_size": 32, # Optimized block size for MoE
101
+ }
102
+ },
103
+
104
+ # ============================================================================
105
+ # MEMORY OPTIMIZATION CONFIGURATION
106
+ # ============================================================================
107
+ # Model loading with memory constraints
108
+ model_kwargs={
109
+ "attn_implementation": "eager", # Eager attention for compatibility; it materializes full attention scores and is not FlashAttention
110
+ "torch_dtype": "auto", # Let model decide (MXFP4 compatible)
111
+ "use_cache": False, # Disable KV cache for training
112
+ "device_map": "auto", # Automatic device mapping
113
+ "low_cpu_mem_usage": True, # Critical for memory optimization
114
+ "max_memory": {0: "75GB"}, # Reserve memory for other processes
115
+ },
116
+
117
+ # Data loading optimized for memory efficiency
118
+ dataloader_num_workers=2, # Reduced workers to save memory
119
+ dataloader_pin_memory=False, # Disable to save memory
120
+ dataloader_prefetch_factor=1, # Minimal prefetch for memory
121
+
122
+ # Memory management optimizations
123
+ max_memory_per_gpu="75GB", # Explicit memory limit
124
+ low_cpu_mem_usage=True, # Essential for large models
125
+ group_by_length=True, # Efficient batching for memory
126
+ remove_unused_columns=True, # Remove unnecessary data
127
+
128
+ # ============================================================================
129
+ # EVALUATION & LOGGING - Memory Efficient
130
+ # ============================================================================
131
+ eval_strategy="steps",
132
+ eval_steps=500, # Less frequent evaluation for memory
133
+ logging_steps=50, # Reduced logging frequency
134
+
135
+ save_strategy="steps",
136
+ save_steps=1000, # Less frequent saves for memory/storage
137
+ save_total_limit=2, # Keep only 2 checkpoints to limit disk usage
138
+ save_only_model=True, # Save only model weights
139
+
140
+ metric_for_best_model="eval_loss",
141
+ greater_is_better=False,
142
+ load_best_model_at_end=True,
143
+
144
+ # Evaluation memory optimization
145
+ eval_accumulation_steps=4, # Accumulate eval outputs to save memory
146
+ eval_batch_size=1, # Smaller eval batch size
147
+
148
+ # ============================================================================
149
+ # GPT-OSS HARMONY FORMAT OPTIMIZATION
150
+ # ============================================================================
151
+ # Chat template for harmony format compatibility (following exact template)
152
+ use_chat_template=False, # Use custom harmony format instead
153
+ chat_template_kwargs={
154
+ "add_generation_prompt": True,
155
+ "tokenize": False,
156
+ # GPT-OSS Harmony Format specific settings (exact template format)
157
+ "reasoning_effort": "medium", # low, medium, high
158
+ "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
159
+ "builtin_tools": [], # Can include "browser" and/or "python"
160
+ },
161
+
162
+ # Generation config optimized for GPT-OSS harmony format (exact template compliance)
163
+ generation_config={
164
+ "max_new_tokens": 256, # Reduced for memory efficiency
165
+ "do_sample": True,
166
+ "temperature": 0.6, # Slightly lower for more focused training
167
+ "top_p": 0.9,
168
+ "top_k": 40, # Reduced for memory efficiency
169
+ "repetition_penalty": 1.1,
170
+ "pad_token_id": None,
171
+ "eos_token_id": None,
172
+ # GPT-OSS Harmony Format specific settings (exact template format)
173
+ "reasoning_effort": "medium", # Configurable reasoning level
174
+ "use_harmony_format": True, # Ensure harmony format in generation
175
+ },
176
+
177
+ # ============================================================================
178
+ # MULTILINGUAL & REASONING OPTIMIZATION
179
+ # ============================================================================
180
+ primary_language="fr", # French as primary language
181
+ reasoning_languages=["French", "English"], # Bilingual reasoning capability
182
+ domain_focus="reasoning", # Align with GPT-OSS reasoning focus
183
+
184
+ # ============================================================================
185
+ # OPTIMIZER & SCHEDULER - Memory Optimized
186
+ # ============================================================================
187
+ optimizer="adamw_torch", # Standard PyTorch AdamW (keeps two moment buffers per trainable parameter)
188
+ beta1=0.9,
189
+ beta2=0.95, # GPT-OSS optimized beta2
190
+ eps=1e-8,
191
+
192
+ scheduler="cosine_with_min_lr", # Stable scheduler for single epoch
193
+ lr_scheduler_kwargs={
194
+ "min_lr_rate": 0.1,
195
+ "warmup_steps": None, # Use warmup_ratio instead
196
+ },
197
+
198
+ # ============================================================================
199
+ # MONITORING & HUB INTEGRATION
200
+ # ============================================================================
201
+ enable_tracking=True, # Trackio monitoring
202
+ log_artifacts=False, # Disable to save memory/storage
203
+ log_metrics=True,
204
+ log_config=True,
205
+
206
+ push_to_hub=False, # Set to True after successful training
207
+ hub_model_id=None,
208
+ hub_private_repo=False,
209
+ )
210
+
211
+ # Configuration validation and optimization tips
212
+ print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
213
+ print("=" * 60)
214
+ print(f"📊 Dataset: {config.dataset_name} (200K samples)")
215
+ print(f"🗣️ Language: French with GPT-OSS Harmony Format")
216
+ print(f"📈 Training: {config.num_train_epochs} epoch (memory optimized)")
217
+ print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
218
+ print(f"🧠 LoRA Rank: {config.lora_config['r']} (memory optimized)")
219
+ print(f"📏 Sequence Length: {config.max_seq_length} (memory optimized)")
220
+ print(f"💾 Memory Limit: {config.max_memory_per_gpu}")
221
+ print(f"⚡ Quantization: MXFP4 (GPT-OSS native)")
222
+ print(f"🔍 Quality Filtering: Enabled")
223
+ print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
224
+ print("=" * 60)
225
+ print("\n💡 Memory Optimization Features:")
226
+ print(" • Native MXFP4 quantization for GPT-OSS MoE layers")
227
+ print(" • Reduced batch size with increased gradient accumulation")
228
+ print(" • Limited sequence length for memory efficiency")
229
+ print(" • Reduced LoRA rank while maintaining effectiveness")
230
+ print(" • Dataset sampling (200K from 800K) for faster training")
231
+ print(" • Gradient checkpointing and efficient data loading")
232
+ print(" • Exact GPT-OSS Harmony format with <|return|> tokens")
233
+ print("=" * 60)
docs/output.svg ADDED
launch.sh CHANGED
@@ -234,7 +234,34 @@ show_training_configs() {
234
  echo " - 4-bit quantization + reduced LoRA"
235
  echo " - Optimized for limited GPU memory"
236
  echo ""
237
- echo "9. Custom Configuration"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  echo " - User-defined parameters"
239
  echo ""
240
  }
@@ -325,12 +352,142 @@ get_training_config() {
325
  MAX_SEQ_LENGTH=1024
326
  CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
327
  ;;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
328
  "Custom Configuration")
329
  get_custom_config
330
  ;;
331
  esac
332
  }
333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  # Function to get custom configuration
335
  get_custom_config() {
336
  print_step "Custom Configuration Setup"
@@ -352,6 +509,136 @@ get_custom_config() {
352
  fi
353
  }
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
  # Function to create training configuration file
356
  create_training_config() {
357
  local config_file="$1"
@@ -499,7 +786,7 @@ print_step "Step 2: Training Configuration"
499
  echo "=================================="
500
 
501
  show_training_configs
502
- select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "Custom Configuration" TRAINING_CONFIG_TYPE
503
 
504
  get_training_config "$TRAINING_CONFIG_TYPE"
505
 
@@ -836,13 +1123,25 @@ print_info "Dataset: $DATASET_NAME"
836
  print_info "Batch size: $BATCH_SIZE"
837
  print_info "Learning rate: $LEARNING_RATE"
838
 
 
 
 
 
 
 
 
 
 
 
 
 
839
  # Step 15: Start training
840
  print_step "Step 15: Starting Training"
841
  echo "=============================="
842
 
843
  print_info "Starting training with configuration: $CONFIG_FILE"
844
  print_info "Experiment: $EXPERIMENT_NAME"
845
- print_info "Output: /output-checkpoint"
846
  print_info "Trackio: $TRACKIO_URL"
847
 
848
  # Ensure environment variables are available for training
@@ -852,6 +1151,7 @@ export HF_TOKEN="$HF_TOKEN"
852
  export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
853
  export HF_USERNAME="$HF_USERNAME"
854
  export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
 
855
 
856
  # Run the appropriate training script based on model type
857
  if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
@@ -859,7 +1159,7 @@ if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
859
  python scripts/training/train_gpt_oss.py \
860
  --config "$CONFIG_FILE" \
861
  --experiment-name "$EXPERIMENT_NAME" \
862
- --output-dir /output-checkpoint \
863
  --trackio-url "$TRACKIO_URL" \
864
  --trainer-type "$TRAINER_TYPE_LOWER"
865
  else
@@ -867,7 +1167,7 @@ else
867
  python scripts/training/train.py \
868
  --config "$CONFIG_FILE" \
869
  --experiment-name "$EXPERIMENT_NAME" \
870
- --output-dir /output-checkpoint \
871
  --trackio-url "$TRACKIO_URL" \
872
  --trainer-type "$TRAINER_TYPE_LOWER"
873
  fi
@@ -877,7 +1177,7 @@ print_step "Step 16: Pushing Model to HF Hub"
877
  echo "====================================="
878
 
879
  print_info "Pushing model to: $REPO_NAME"
880
- print_info "Checkpoint: /output-checkpoint"
881
 
882
  # Ensure environment variables are available for model push
883
  export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
@@ -886,26 +1186,43 @@ export HF_TOKEN="$HF_TOKEN"
886
  export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
887
  export HF_USERNAME="$HF_USERNAME"
888
  export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
 
889
 
890
  # Run the appropriate push script based on model type
891
  if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
892
  print_info "Using GPT-OSS specialized push script..."
893
- python scripts/model_tonic/push_gpt_oss_to_huggingface.py /output-checkpoint "$REPO_NAME" \
894
  --token "$HF_TOKEN" \
895
  --trackio-url "$TRACKIO_URL" \
896
  --experiment-name "$EXPERIMENT_NAME" \
897
  --dataset-repo "$TRACKIO_DATASET_REPO" \
898
  --author-name "$AUTHOR_NAME" \
899
- --model-description "$MODEL_DESCRIPTION"
 
 
 
 
 
 
 
 
900
  else
901
  print_info "Using standard SmolLM3 push script..."
902
- python scripts/model_tonic/push_to_huggingface.py /output-checkpoint "$REPO_NAME" \
903
  --token "$HF_TOKEN" \
904
  --trackio-url "$TRACKIO_URL" \
905
  --experiment-name "$EXPERIMENT_NAME" \
906
  --dataset-repo "$TRACKIO_DATASET_REPO" \
907
  --author-name "$AUTHOR_NAME" \
908
- --model-description "$MODEL_DESCRIPTION"
 
 
 
 
 
 
 
 
909
  fi
910
 
911
  # Step 16.5: Switch Trackio Space to Read Token (Security)
@@ -1018,7 +1335,7 @@ fi)
1018
 
1019
  ## Files Created
1020
  - Training configuration: \`$CONFIG_FILE\`
1021
- - Model checkpoint: \`/output-checkpoint/\`
1022
  - Training logs: \`training.log\`
1023
  - Summary report: \`training_summary.md\`
1024
  EOF
 
234
  echo " - 4-bit quantization + reduced LoRA"
235
  echo " - Optimized for limited GPU memory"
236
  echo ""
237
+ echo "9. GPT-OSS OpenHermes-FR (Recommended)"
238
+ echo " - Model: openai/gpt-oss-20b"
239
+ echo " - Dataset: legmlai/openhermes-fr (800K French examples)"
240
+ echo " - Epochs: 1.5"
241
+ echo " - Batch Size: 6 (effective 36 with accumulation)"
242
+ echo " - Learning Rate: 2.5e-4"
243
+ echo " - Optimized for French language training"
244
+ echo " - Quality filtering enabled"
245
+ echo ""
246
+ echo "10. GPT-OSS OpenHermes-FR Memory Optimized"
247
+ echo " - Model: openai/gpt-oss-20b"
248
+ echo " - Dataset: legmlai/openhermes-fr (200K samples)"
249
+ echo " - Epochs: 1"
250
+ echo " - Batch Size: 2 (effective 32 with accumulation)"
251
+ echo " - Learning Rate: 2e-4"
252
+ echo " - Native MXFP4 quantization"
253
+ echo " - Memory optimized for 40-80GB GPUs"
254
+ echo " - Harmony format compatible"
255
+ echo ""
256
+ echo "11. GPT-OSS Custom Dataset"
257
+ echo " - Model: openai/gpt-oss-20b"
258
+ echo " - Dataset: User-defined (fully customizable)"
259
+ echo " - Epochs: Configurable"
260
+ echo " - Batch Size: Configurable"
261
+ echo " - Learning Rate: Configurable"
262
+ echo " - Maximum flexibility with all parameters"
263
+ echo ""
264
+ echo "12. Custom Configuration"
265
  echo " - User-defined parameters"
266
  echo ""
267
  }
 
352
  MAX_SEQ_LENGTH=1024
353
  CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
354
  ;;
355
+ "GPT-OSS OpenHermes-FR (Recommended)")
356
+ MODEL_NAME="openai/gpt-oss-20b"
357
+ DATASET_NAME="legmlai/openhermes-fr"
358
+ MAX_EPOCHS=1.5
359
+ BATCH_SIZE=6
360
+ GRADIENT_ACCUMULATION_STEPS=6
361
+ LEARNING_RATE=2.5e-4
362
+ MAX_SEQ_LENGTH=3072
363
+ CONFIG_FILE="config/train_gpt_oss_openhermes_fr.py"
364
+ ;;
365
+ "GPT-OSS OpenHermes-FR Memory Optimized")
366
+ MODEL_NAME="openai/gpt-oss-20b"
367
+ DATASET_NAME="legmlai/openhermes-fr"
368
+ MAX_EPOCHS=1
369
+ BATCH_SIZE=2
370
+ GRADIENT_ACCUMULATION_STEPS=16
371
+ LEARNING_RATE=2e-4
372
+ MAX_SEQ_LENGTH=1024
373
+ CONFIG_FILE="config/train_gpt_oss_openhermes_fr_memory_optimized.py"
374
+ ;;
375
+ "GPT-OSS Custom Dataset")
376
+ MODEL_NAME="openai/gpt-oss-20b"
377
+ DATASET_NAME="legmlai/openhermes-fr" # Will be customizable
378
+ MAX_EPOCHS=1
379
+ BATCH_SIZE=4
380
+ GRADIENT_ACCUMULATION_STEPS=4
381
+ LEARNING_RATE=2e-4
382
+ MAX_SEQ_LENGTH=2048
383
+ CONFIG_FILE="config/train_gpt_oss_custom.py"
384
+ get_custom_dataset_config
385
+ ;;
386
  "Custom Configuration")
387
  get_custom_config
388
  ;;
389
  esac
390
  }
391
 
392
+ # Function to get custom dataset configuration
393
+ get_custom_dataset_config() {
394
+ print_step "GPT-OSS Custom Configuration"
395
+ echo "======================================"
396
+
397
+ echo "Configure your GPT-OSS training:"
398
+ echo ""
399
+
400
+ # Dataset Configuration
401
+ print_info "📊 Dataset Configuration"
402
+ get_input "Dataset name (HuggingFace format: username/dataset)" "legmlai/openhermes-fr" DATASET_NAME
403
+ get_input "Dataset split" "train" DATASET_SPLIT
404
+
405
+ echo ""
406
+ echo "Dataset format options:"
407
+ echo "1. OpenHermes-FR (prompt + accepted_completion fields)"
408
+ echo "2. Messages format (chat conversations)"
409
+ echo "3. Text format (plain text field)"
410
+ echo "4. Custom format (specify field names)"
411
+ echo ""
412
+
413
+ select_option "Select dataset format:" "OpenHermes-FR" "Messages format" "Text format" "Custom format" DATASET_FORMAT
414
+
415
+ case "$DATASET_FORMAT" in
416
+ "OpenHermes-FR")
417
+ INPUT_FIELD="prompt"
418
+ TARGET_FIELD="accepted_completion"
419
+ DATASET_FORMAT_CODE="openhermes_fr"
420
+ FILTER_BAD_ENTRIES="true"
421
+ ;;
422
+ "Messages format")
423
+ INPUT_FIELD="messages"
424
+ TARGET_FIELD=""
425
+ DATASET_FORMAT_CODE="messages"
426
+ FILTER_BAD_ENTRIES="false"
427
+ ;;
428
+ "Text format")
429
+ INPUT_FIELD="text"
430
+ TARGET_FIELD=""
431
+ DATASET_FORMAT_CODE="text"
432
+ FILTER_BAD_ENTRIES="false"
433
+ ;;
434
+ "Custom format")
435
+ get_input "Input field name" "prompt" INPUT_FIELD
436
+ get_input "Target field name (leave empty if not needed)" "accepted_completion" TARGET_FIELD
437
+ DATASET_FORMAT_CODE="custom"
438
+ get_input "Filter bad entries? (true/false)" "false" FILTER_BAD_ENTRIES
439
+ ;;
440
+ esac
441
+
442
+ # Dataset Filtering Options
443
+ echo ""
444
+ print_info "🔍 Dataset Filtering Options"
445
+ get_input "Maximum samples to use (leave empty for all)" "" MAX_SAMPLES
446
+ get_input "Minimum sequence length" "10" MIN_LENGTH
447
+ get_input "Maximum sequence length (leave empty for auto)" "" MAX_LENGTH
448
+
449
+ # Training Hyperparameters
450
+ echo ""
451
+ print_info "⚙️ Training Hyperparameters"
452
+ get_input "Number of epochs" "1.0" NUM_EPOCHS
453
+ get_input "Batch size per device" "4" BATCH_SIZE
454
+ get_input "Gradient accumulation steps" "4" GRAD_ACCUM_STEPS
455
+ get_input "Learning rate" "2e-4" LEARNING_RATE
456
+ get_input "Minimum learning rate" "2e-5" MIN_LR
457
+ get_input "Weight decay" "0.01" WEIGHT_DECAY
458
+ get_input "Warmup ratio" "0.03" WARMUP_RATIO
459
+
460
+ # Sequence Length
461
+ echo ""
462
+ print_info "📏 Sequence Configuration"
463
+ get_input "Maximum sequence length" "2048" MAX_SEQ_LENGTH
464
+
465
+ # LoRA Configuration
466
+ echo ""
467
+ print_info "🎛️ LoRA Configuration"
468
+ get_input "LoRA rank" "16" LORA_RANK
469
+ get_input "LoRA alpha" "32" LORA_ALPHA
470
+ get_input "LoRA dropout" "0.05" LORA_DROPOUT
471
+
472
+ # Memory & Performance
473
+ echo ""
474
+ print_info "💾 Memory & Performance"
475
+ select_option "Mixed precision:" "BF16 (recommended)" "FP16" "FP32" MIXED_PRECISION
476
+ get_input "Data loading workers" "4" NUM_WORKERS
477
+ select_option "Quantization:" "MXFP4 (default)" "4-bit BNB" "None" QUANTIZATION_TYPE
478
+
479
+ # Advanced Options
480
+ echo ""
481
+ echo "Advanced options (press Enter for defaults):"
482
+ get_input "Max gradient norm" "1.0" MAX_GRAD_NORM
483
+ get_input "Logging steps" "10" LOGGING_STEPS
484
+ get_input "Evaluation steps" "100" EVAL_STEPS
485
+ get_input "Save steps" "500" SAVE_STEPS
486
+
487
+ # Update the custom config file with user's choices
488
+ update_enhanced_gpt_oss_config
489
+ }
490
+
491
  # Function to get custom configuration
492
  get_custom_config() {
493
  print_step "Custom Configuration Setup"
 
509
  fi
510
  }
511
 
512
+ # Function to update enhanced GPT-OSS config with user choices
513
+ update_enhanced_gpt_oss_config() {
514
+ print_info "Generating enhanced custom GPT-OSS configuration..."
515
+
516
+ # Process mixed precision setting
517
+ case "$MIXED_PRECISION" in
518
+ "BF16 (recommended)")
519
+ FP16="False"
520
+ BF16="True"
521
+ ;;
522
+ "FP16")
523
+ FP16="True"
524
+ BF16="False"
525
+ ;;
526
+ "FP32")
527
+ FP16="False"
528
+ BF16="False"
529
+ ;;
530
+ esac
531
+
532
+ # Process quantization setting
533
+ case "$QUANTIZATION_TYPE" in
534
+ "MXFP4 (default)")
535
+ USE_QUANTIZATION="True"
536
+ QUANTIZATION_CONFIG='{"dequantize": True, "load_in_4bit": False}'
537
+ ;;
538
+ "4-bit BNB")
539
+ USE_QUANTIZATION="True"
540
+ QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": True, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}'
541
+ ;;
542
+ "None")
543
+ USE_QUANTIZATION="False"
544
+ QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": False}'
545
+ ;;
546
+ esac
547
+
548
+ # Create enhanced config file with all user choices
549
+ cat > "$CONFIG_FILE" << EOF
550
+ """
551
+ GPT-OSS Enhanced Custom Training Configuration - Generated by launch.sh
552
+ Dataset: $DATASET_NAME ($DATASET_FORMAT)
553
+ Optimized for: ${DATASET_FORMAT} format with full customization
554
+ """
555
+
556
+ from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
557
+
558
+ # Create enhanced config with all customizations
559
+ config = GPTOSSEnhancedCustomConfig(
560
+ # ============================================================================
561
+ # DATASET CONFIGURATION
562
+ # ============================================================================
563
+ dataset_name="$DATASET_NAME",
564
+ dataset_split="$DATASET_SPLIT",
565
+ dataset_format="$DATASET_FORMAT_CODE",
566
+ input_field="$INPUT_FIELD",
567
+ target_field=$(if [ -n "$TARGET_FIELD" ]; then echo "\"$TARGET_FIELD\""; else echo "None"; fi),
568
+ filter_bad_entries=$FILTER_BAD_ENTRIES,
569
+ max_samples=$(if [ -n "$MAX_SAMPLES" ]; then echo "$MAX_SAMPLES"; else echo "None"; fi),
570
+ min_length=$MIN_LENGTH,
571
+ max_length=$(if [ -n "$MAX_LENGTH" ]; then echo "$MAX_LENGTH"; else echo "None"; fi),
572
+
573
+ # ============================================================================
574
+ # TRAINING HYPERPARAMETERS
575
+ # ============================================================================
576
+ num_train_epochs=$NUM_EPOCHS,
577
+ batch_size=$BATCH_SIZE,
578
+ gradient_accumulation_steps=$GRAD_ACCUM_STEPS,
579
+ learning_rate=$LEARNING_RATE,
580
+ min_lr=$MIN_LR,
581
+ weight_decay=$WEIGHT_DECAY,
582
+ warmup_ratio=$WARMUP_RATIO,
583
+ max_grad_norm=$MAX_GRAD_NORM,
584
+
585
+ # ============================================================================
586
+ # MODEL CONFIGURATION
587
+ # ============================================================================
588
+ max_seq_length=$MAX_SEQ_LENGTH,
589
+
590
+ # ============================================================================
591
+ # MIXED PRECISION
592
+ # ============================================================================
593
+ fp16=$FP16,
594
+ bf16=$BF16,
595
+
596
+ # ============================================================================
597
+ # LORA CONFIGURATION
598
+ # ============================================================================
599
+ lora_config={
600
+ "r": $LORA_RANK,
601
+ "lora_alpha": $LORA_ALPHA,
602
+ "lora_dropout": $LORA_DROPOUT,
603
+ "target_modules": "all-linear",
604
+ "bias": "none",
605
+ "task_type": "CAUSAL_LM",
606
+ },
607
+
608
+ # ============================================================================
609
+ # QUANTIZATION CONFIGURATION
610
+ # ============================================================================
611
+ use_quantization=$USE_QUANTIZATION,
612
+ quantization_config=$QUANTIZATION_CONFIG,
613
+
614
+ # ============================================================================
615
+ # PERFORMANCE CONFIGURATION
616
+ # ============================================================================
617
+ dataloader_num_workers=$NUM_WORKERS,
618
+ dataloader_pin_memory=True,
619
+ group_by_length=True,
620
+
621
+ # ============================================================================
622
+ # LOGGING & EVALUATION
623
+ # ============================================================================
624
+ logging_steps=$LOGGING_STEPS,
625
+ eval_steps=$EVAL_STEPS,
626
+ save_steps=$SAVE_STEPS,
627
+
628
+ # ============================================================================
629
+ # RUNTIME CONFIGURATION
630
+ # ============================================================================
631
+ experiment_name="$EXPERIMENT_NAME",
632
+ trackio_url="$TRACKIO_URL",
633
+ dataset_repo="$TRACKIO_DATASET_REPO",
634
+ enable_tracking=True,
635
+ )
636
+ EOF
637
+
638
+ print_status "Enhanced GPT-OSS configuration generated successfully!"
639
+ print_info "Configuration saved to: $CONFIG_FILE"
640
+ }
641
+
642
  # Function to create training configuration file
643
  create_training_config() {
644
  local config_file="$1"
 
786
  echo "=================================="
787
 
788
  show_training_configs
789
+ select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "GPT-OSS OpenHermes-FR (Recommended)" "GPT-OSS OpenHermes-FR Memory Optimized" "GPT-OSS Custom Dataset" "Custom Configuration" TRAINING_CONFIG_TYPE
790
 
791
  get_training_config "$TRAINING_CONFIG_TYPE"
792
 
 
1123
  print_info "Batch size: $BATCH_SIZE"
1124
  print_info "Learning rate: $LEARNING_RATE"
1125
 
1126
+ # Step 14.5: Define Output Directory
1127
+ print_step "Step 14.5: Output Directory Configuration"
1128
+ echo "============================================="
1129
+
1130
+ # Define the output directory for training results
1131
+ OUTPUT_DIR="./outputs/${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"
1132
+ print_info "Training output directory: $OUTPUT_DIR"
1133
+
1134
+ # Create output directory
1135
+ mkdir -p "$OUTPUT_DIR"
1136
+ print_status "Output directory created: $OUTPUT_DIR"
1137
+
1138
  # Step 15: Start training
1139
  print_step "Step 15: Starting Training"
1140
  echo "=============================="
1141
 
1142
  print_info "Starting training with configuration: $CONFIG_FILE"
1143
  print_info "Experiment: $EXPERIMENT_NAME"
1144
+ print_info "Output: $OUTPUT_DIR"
1145
  print_info "Trackio: $TRACKIO_URL"
1146
 
1147
  # Ensure environment variables are available for training
 
1151
  export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
1152
  export HF_USERNAME="$HF_USERNAME"
1153
  export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
1154
+ export OUTPUT_DIR="$OUTPUT_DIR"
1155
 
1156
  # Run the appropriate training script based on model type
1157
  if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
 
1159
  python scripts/training/train_gpt_oss.py \
1160
  --config "$CONFIG_FILE" \
1161
  --experiment-name "$EXPERIMENT_NAME" \
1162
+ --output-dir "$OUTPUT_DIR" \
1163
  --trackio-url "$TRACKIO_URL" \
1164
  --trainer-type "$TRAINER_TYPE_LOWER"
1165
  else
 
1167
  python scripts/training/train.py \
1168
  --config "$CONFIG_FILE" \
1169
  --experiment-name "$EXPERIMENT_NAME" \
1170
+ --output-dir "$OUTPUT_DIR" \
1171
  --trackio-url "$TRACKIO_URL" \
1172
  --trainer-type "$TRAINER_TYPE_LOWER"
1173
  fi
 
1177
  echo "====================================="
1178
 
1179
  print_info "Pushing model to: $REPO_NAME"
1180
+ print_info "Checkpoint: $OUTPUT_DIR"
1181
 
1182
  # Ensure environment variables are available for model push
1183
  export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
 
1186
  export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
1187
  export HF_USERNAME="$HF_USERNAME"
1188
  export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
1189
+ export OUTPUT_DIR="$OUTPUT_DIR"
1190
 
1191
  # Run the appropriate push script based on model type
1192
  if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
1193
  print_info "Using GPT-OSS specialized push script..."
1194
+ python scripts/model_tonic/push_gpt_oss_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
1195
  --token "$HF_TOKEN" \
1196
  --trackio-url "$TRACKIO_URL" \
1197
  --experiment-name "$EXPERIMENT_NAME" \
1198
  --dataset-repo "$TRACKIO_DATASET_REPO" \
1199
  --author-name "$AUTHOR_NAME" \
1200
+ --model-description "$MODEL_DESCRIPTION" \
1201
+ --training-config-type "$TRAINING_CONFIG_TYPE" \
1202
+ --model-name "$MODEL_NAME" \
1203
+ --dataset-name "$DATASET_NAME" \
1204
+ --batch-size "$BATCH_SIZE" \
1205
+ --learning-rate "$LEARNING_RATE" \
1206
+ --max-epochs "$MAX_EPOCHS" \
1207
+ --max-seq-length "$MAX_SEQ_LENGTH" \
1208
+ --trainer-type "$TRAINER_TYPE"
1209
  else
1210
  print_info "Using standard SmolLM3 push script..."
1211
+ python scripts/model_tonic/push_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
1212
  --token "$HF_TOKEN" \
1213
  --trackio-url "$TRACKIO_URL" \
1214
  --experiment-name "$EXPERIMENT_NAME" \
1215
  --dataset-repo "$TRACKIO_DATASET_REPO" \
1216
  --author-name "$AUTHOR_NAME" \
1217
+ --model-description "$MODEL_DESCRIPTION" \
1218
+ --training-config-type "$TRAINING_CONFIG_TYPE" \
1219
+ --model-name "$MODEL_NAME" \
1220
+ --dataset-name "$DATASET_NAME" \
1221
+ --batch-size "$BATCH_SIZE" \
1222
+ --learning-rate "$LEARNING_RATE" \
1223
+ --max-epochs "$MAX_EPOCHS" \
1224
+ --max-seq-length "$MAX_SEQ_LENGTH" \
1225
+ --trainer-type "$TRAINER_TYPE"
1226
  fi
1227
 
1228
  # Step 16.5: Switch Trackio Space to Read Token (Security)
 
1335
 
1336
  ## Files Created
1337
  - Training configuration: \`$CONFIG_FILE\`
1338
+ - Model checkpoint: \`$OUTPUT_DIR/\`
1339
  - Training logs: \`training.log\`
1340
  - Summary report: \`training_summary.md\`
1341
  EOF
scripts/model_tonic/push_gpt_oss_to_huggingface.py CHANGED
@@ -43,8 +43,59 @@ def merge_lora_weights(checkpoint_path, base_model_name, output_path):
43
 
44
  return model, tokenizer
45
 
46
- def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
47
- """Create a comprehensive model card for GPT-OSS models"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  card_content = f"""---
50
  language:
@@ -196,7 +247,7 @@ This model is licensed under the MIT License.
196
 
197
  return card_content
198
 
199
- def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description):
200
  """Push GPT-OSS model to Hugging Face Hub"""
201
 
202
  print("=== GPT-OSS Model Push Pipeline ===")
@@ -230,7 +281,14 @@ def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experi
230
  trackio_url=trackio_url,
231
  dataset_repo=dataset_repo,
232
  author_name=author_name,
233
- model_description=model_description
 
 
 
 
 
 
 
234
  )
235
 
236
  # Save model card
@@ -291,6 +349,14 @@ def main():
291
  parser.add_argument("--dataset-repo", help="Dataset repository")
292
  parser.add_argument("--author-name", help="Author name")
293
  parser.add_argument("--model-description", help="Model description")
 
 
 
 
 
 
 
 
294
 
295
  args = parser.parse_args()
296
 
@@ -308,7 +374,15 @@ def main():
308
  experiment_name=experiment_name,
309
  dataset_repo=dataset_repo,
310
  author_name=author_name,
311
- model_description=model_description
 
 
 
 
 
 
 
 
312
  )
313
 
314
  sys.exit(0 if success else 1)
 
43
 
44
  return model, tokenizer
45
 
46
def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description, training_config_type=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None, base_model=None):
    """Create a model card for a GPT-OSS model using generate_model_card.py.

    The card is rendered by ``ModelCardGenerator`` from a variables dict.
    Any exception raised while importing or running the generator is
    reported on stdout and the legacy hand-written card is returned instead.

    Args:
        model_name: Target repository id; the part after the last ``/`` is
            used as the display name.
        experiment_name: Experiment label (default ``"gpt_oss_finetune"``).
        trackio_url: Tracking URL (default ``"N/A"``).
        dataset_repo: Dataset repository, passed through unchanged.
        author_name: Author shown on the card.
        model_description: Free-text description of the model.
        training_config_type: Label of the training configuration.
        dataset_name: Name of the training dataset.
        batch_size, learning_rate, max_epochs, max_seq_length: Training
            hyper-parameters; rendered with ``str()``, falsy values fall
            back to the built-in defaults below.
        trainer_type: Trainer label (default ``"SFTTrainer"``).
        base_model: Base model id to report; defaults to
            ``"openai/gpt-oss-20b"`` so existing callers are unaffected.

    Returns:
        The model card content as a string.
    """

    try:
        # Import the model card generator
        import os
        import sys

        # Make this script's directory importable, but only once: appending
        # unconditionally grows sys.path on every call.
        script_dir = os.path.dirname(__file__)
        if script_dir not in sys.path:
            sys.path.append(script_dir)
        from generate_model_card import ModelCardGenerator, create_default_variables

        generator = ModelCardGenerator()

        # Start from the generator's defaults and override the GPT-OSS values
        variables = create_default_variables()
        variables.update({
            "repo_name": model_name,
            "model_name": model_name.split('/')[-1],
            "experiment_name": experiment_name or "gpt_oss_finetune",
            "dataset_repo": dataset_repo,
            "author_name": author_name or "GPT-OSS Fine-tuner",
            "model_description": model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks.",
            "training_config_type": training_config_type or "GPT-OSS Configuration",
            "base_model": base_model or "openai/gpt-oss-20b",
            "dataset_name": dataset_name or "HuggingFaceH4/Multilingual-Thinking",
            "trainer_type": trainer_type or "SFTTrainer",
            "batch_size": str(batch_size) if batch_size else "4",
            "learning_rate": str(learning_rate) if learning_rate else "2e-4",
            "max_epochs": str(max_epochs) if max_epochs else "1",
            "max_seq_length": str(max_seq_length) if max_seq_length else "2048",
            "hardware_info": "GPU (H100/A100)",
            "trackio_url": trackio_url or "N/A",
            "training_loss": "N/A",
            "validation_loss": "N/A",
            "perplexity": "N/A",
            "quantized_models": False
        })

        # Generate the model card
        model_card_content = generator.generate_model_card(variables)

        print("✅ Model card generated using generate_model_card.py")
        return model_card_content

    except Exception as e:
        # Deliberate best effort: a broken generator must not block the push.
        print(f"❌ Failed to generate model card with generator: {e}")
        print("🔄 Falling back to original GPT-OSS model card")
        return _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description)
96
+
97
+ def _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
98
+ """Create the original GPT-OSS model card as fallback"""
99
 
100
  card_content = f"""---
101
  language:
 
247
 
248
  return card_content
249
 
250
+ def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description, training_config_type=None, model_name=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
251
  """Push GPT-OSS model to Hugging Face Hub"""
252
 
253
  print("=== GPT-OSS Model Push Pipeline ===")
 
281
  trackio_url=trackio_url,
282
  dataset_repo=dataset_repo,
283
  author_name=author_name,
284
+ model_description=model_description,
285
+ training_config_type=training_config_type,
286
+ dataset_name=dataset_name,
287
+ batch_size=batch_size,
288
+ learning_rate=learning_rate,
289
+ max_epochs=max_epochs,
290
+ max_seq_length=max_seq_length,
291
+ trainer_type=trainer_type
292
  )
293
 
294
  # Save model card
 
349
  parser.add_argument("--dataset-repo", help="Dataset repository")
350
  parser.add_argument("--author-name", help="Author name")
351
  parser.add_argument("--model-description", help="Model description")
352
+ parser.add_argument("--training-config-type", help="Training configuration type")
353
+ parser.add_argument("--model-name", help="Base model name")
354
+ parser.add_argument("--dataset-name", help="Dataset name")
355
+ parser.add_argument("--batch-size", help="Batch size")
356
+ parser.add_argument("--learning-rate", help="Learning rate")
357
+ parser.add_argument("--max-epochs", help="Maximum epochs")
358
+ parser.add_argument("--max-seq-length", help="Maximum sequence length")
359
+ parser.add_argument("--trainer-type", help="Trainer type")
360
 
361
  args = parser.parse_args()
362
 
 
374
  experiment_name=experiment_name,
375
  dataset_repo=dataset_repo,
376
  author_name=author_name,
377
+ model_description=model_description,
378
+ training_config_type=args.training_config_type,
379
+ model_name=args.model_name,
380
+ dataset_name=args.dataset_name,
381
+ batch_size=args.batch_size,
382
+ learning_rate=args.learning_rate,
383
+ max_epochs=args.max_epochs,
384
+ max_seq_length=args.max_seq_length,
385
+ trainer_type=args.trainer_type
386
  )
387
 
388
  sys.exit(0 if success else 1)
scripts/model_tonic/push_to_huggingface.py CHANGED
@@ -62,7 +62,15 @@ class HuggingFacePusher:
62
  dataset_repo: Optional[str] = None,
63
  hf_token: Optional[str] = None,
64
  author_name: Optional[str] = None,
65
- model_description: Optional[str] = None
 
 
 
 
 
 
 
 
66
  ):
67
  self.model_path = Path(model_path)
68
  self.repo_name = repo_name
@@ -73,6 +81,16 @@ class HuggingFacePusher:
73
  self.author_name = author_name
74
  self.model_description = model_description
75
 
 
 
 
 
 
 
 
 
 
 
76
  # HF Datasets configuration
77
  self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
78
  self.hf_token = hf_token or os.getenv('HF_TOKEN')
@@ -156,9 +174,53 @@ class HuggingFacePusher:
156
  return True
157
 
158
  def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
159
- """Create a comprehensive model card using the simple method to avoid YAML issues"""
160
- # Always use the simple model card to avoid YAML formatting issues
161
- return self._create_simple_model_card(training_config, results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
  def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
164
  """Create a simple model card without complex YAML to avoid formatting issues"""
@@ -531,6 +593,14 @@ def parse_args():
531
  parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
532
  parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
533
  parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
 
 
 
 
 
 
 
 
534
 
535
  return parser.parse_args()
536
 
@@ -558,7 +628,15 @@ def main():
558
  dataset_repo=args.dataset_repo,
559
  hf_token=args.hf_token,
560
  author_name=args.author_name,
561
- model_description=args.model_description
 
 
 
 
 
 
 
 
562
  )
563
 
564
  # Push model
 
62
  dataset_repo: Optional[str] = None,
63
  hf_token: Optional[str] = None,
64
  author_name: Optional[str] = None,
65
+ model_description: Optional[str] = None,
66
+ training_config_type: Optional[str] = None,
67
+ model_name: Optional[str] = None,
68
+ dataset_name: Optional[str] = None,
69
+ batch_size: Optional[str] = None,
70
+ learning_rate: Optional[str] = None,
71
+ max_epochs: Optional[str] = None,
72
+ max_seq_length: Optional[str] = None,
73
+ trainer_type: Optional[str] = None
74
  ):
75
  self.model_path = Path(model_path)
76
  self.repo_name = repo_name
 
81
  self.author_name = author_name
82
  self.model_description = model_description
83
 
84
+ # Training configuration details for model card generation
85
+ self.training_config_type = training_config_type
86
+ self.model_name = model_name
87
+ self.dataset_name = dataset_name
88
+ self.batch_size = batch_size
89
+ self.learning_rate = learning_rate
90
+ self.max_epochs = max_epochs
91
+ self.max_seq_length = max_seq_length
92
+ self.trainer_type = trainer_type
93
+
94
  # HF Datasets configuration
95
  self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
96
  self.hf_token = hf_token or os.getenv('HF_TOKEN')
 
174
  return True
175
 
176
  def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
177
+ """Create a comprehensive model card using the generate_model_card.py script"""
178
+ try:
179
+ # Import the model card generator
180
+ import sys
181
+ sys.path.append(os.path.join(os.path.dirname(__file__)))
182
+ from generate_model_card import ModelCardGenerator, create_default_variables
183
+
184
+ # Create generator
185
+ generator = ModelCardGenerator()
186
+
187
+ # Create variables for the model card
188
+ variables = create_default_variables()
189
+
190
+ # Update with actual values
191
+ variables.update({
192
+ "repo_name": self.repo_name,
193
+ "model_name": self.repo_name.split('/')[-1],
194
+ "experiment_name": self.experiment_name or "model_push",
195
+ "dataset_repo": self.dataset_repo,
196
+ "author_name": self.author_name or "Model Author",
197
+ "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
198
+ "training_config_type": self.training_config_type or "Custom Configuration",
199
+ "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
200
+ "dataset_name": self.dataset_name or "Custom Dataset",
201
+ "trainer_type": self.trainer_type or "SFTTrainer",
202
+ "batch_size": str(self.batch_size) if self.batch_size else "8",
203
+ "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
204
+ "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
205
+ "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
206
+ "hardware_info": self._get_hardware_info(),
207
+ "trackio_url": self.trackio_url or "N/A",
208
+ "training_loss": str(results.get('train_loss', 'N/A')),
209
+ "validation_loss": str(results.get('eval_loss', 'N/A')),
210
+ "perplexity": str(results.get('perplexity', 'N/A')),
211
+ "quantized_models": False # Set to True if quantized models are available
212
+ })
213
+
214
+ # Generate the model card
215
+ model_card_content = generator.generate_model_card(variables)
216
+
217
+ logger.info("✅ Model card generated using generate_model_card.py")
218
+ return model_card_content
219
+
220
+ except Exception as e:
221
+ logger.error(f"❌ Failed to generate model card with generator: {e}")
222
+ logger.info("🔄 Falling back to simple model card")
223
+ return self._create_simple_model_card(training_config, results)
224
 
225
  def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
226
  """Create a simple model card without complex YAML to avoid formatting issues"""
 
593
  parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
594
  parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
595
  parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
596
+ parser.add_argument('--training-config-type', type=str, default=None, help='Training configuration type')
597
+ parser.add_argument('--model-name', type=str, default=None, help='Base model name')
598
+ parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
599
+ parser.add_argument('--batch-size', type=str, default=None, help='Batch size')
600
+ parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate')
601
+ parser.add_argument('--max-epochs', type=str, default=None, help='Maximum epochs')
602
+ parser.add_argument('--max-seq-length', type=str, default=None, help='Maximum sequence length')
603
+ parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
604
 
605
  return parser.parse_args()
606
 
 
628
  dataset_repo=args.dataset_repo,
629
  hf_token=args.hf_token,
630
  author_name=args.author_name,
631
+ model_description=args.model_description,
632
+ training_config_type=args.training_config_type,
633
+ model_name=args.model_name,
634
+ dataset_name=args.dataset_name,
635
+ batch_size=args.batch_size,
636
+ learning_rate=args.learning_rate,
637
+ max_epochs=args.max_epochs,
638
+ max_seq_length=args.max_seq_length,
639
+ trainer_type=args.trainer_type
640
  )
641
 
642
  # Push model
scripts/training/train_gpt_oss.py CHANGED
@@ -95,12 +95,215 @@ def setup_lora_for_gpt_oss(model, config):
95
 
96
  return peft_model
97
 
98
- def load_multilingual_thinking_dataset():
99
- """Load the Multilingual-Thinking dataset"""
100
 
101
- print("Loading Multilingual-Thinking dataset...")
102
- dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")
103
- print(f"Dataset loaded: {len(dataset)} examples")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  return dataset
106
 
@@ -127,25 +330,111 @@ def setup_trackio_tracking(config):
127
 
128
  return trackio_client
129
 
130
- def create_sft_config(config):
131
- """Create SFTConfig for GPT-OSS training"""
132
-
133
- print("Creating SFT configuration...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
  sft_config = SFTConfig(
136
- learning_rate=config.learning_rate,
137
- gradient_checkpointing=True,
138
- num_train_epochs=1, # Single epoch as per tutorial
139
- logging_steps=config.logging_steps,
140
- per_device_train_batch_size=config.batch_size,
141
- gradient_accumulation_steps=config.gradient_accumulation_steps,
142
- max_length=config.max_seq_length,
143
- warmup_ratio=0.03,
144
- lr_scheduler_type="cosine_with_min_lr",
145
- lr_scheduler_kwargs={"min_lr_rate": 0.1},
146
- output_dir="gpt-oss-20b-multilingual-reasoner",
147
- report_to="trackio" if config.enable_tracking else None,
148
- push_to_hub=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  )
150
 
151
  return sft_config
@@ -193,13 +482,13 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
193
  peft_model = setup_lora_for_gpt_oss(model, config)
194
 
195
  # Load dataset
196
- dataset = load_multilingual_thinking_dataset()
197
 
198
  # Setup Trackio tracking
199
  trackio_client = setup_trackio_tracking(config)
200
 
201
  # Create SFT configuration
202
- sft_config = create_sft_config(config)
203
 
204
  # Create trainer
205
  print("Creating SFT trainer...")
 
95
 
96
  return peft_model
97
 
98
def load_dataset_from_config(config):
    """Load, filter and format the training dataset described by ``config``.

    Reads ``dataset_name`` (default ``HuggingFaceH4/Multilingual-Thinking``),
    ``dataset_split`` (default ``train``) and the optional ``dataset_config``
    from ``config``, loads the dataset, then runs it through
    ``apply_dataset_filtering`` and ``process_dataset_format``.

    Returns:
        The processed dataset.
    """
    name = getattr(config, 'dataset_name', 'HuggingFaceH4/Multilingual-Thinking')
    split = getattr(config, 'dataset_split', 'train')
    subset = getattr(config, 'dataset_config', None)

    print(f"Loading dataset: {name}")
    print(f"Dataset split: {split}")

    # The subset name is only passed positionally when one is configured.
    source_args = (name,)
    if subset:
        print(f"Dataset config: {subset}")
        source_args = (name, subset)

    loaded = load_dataset(*source_args, split=split)
    print(f"Original dataset size: {len(loaded)} examples")

    # Filtering runs first, then format conversion.
    filtered = apply_dataset_filtering(loaded, config)
    processed = process_dataset_format(filtered, config)

    print(f"Final dataset size: {len(processed)} examples")
    return processed
127
+
128
def _field_text_length(value):
    """Return the number of text characters held by a dataset field."""
    if value is None:
        return 0
    if isinstance(value, str):
        return len(value)
    if isinstance(value, (list, tuple)):
        # Message lists: count the characters of each message's content
        # rather than the number of messages.
        total = 0
        for item in value:
            if isinstance(item, dict):
                total += _field_text_length(item.get("content"))
            else:
                total += _field_text_length(item)
        return total
    return len(str(value))


def apply_dataset_filtering(dataset, config):
    """Apply quality, length and sampling filters driven by ``config``.

    Steps, in order:
      1. If ``filter_bad_entries`` is set, drop rows whose ``bad_entry_field``,
         ``bad_prompt_field`` or ``bad_response_field`` flag is truthy (each
         flag is only used when the column exists).
      2. Drop rows whose combined ``input_field`` + ``target_field`` text
         length is below ``min_length`` (default 10) or above ``max_length``
         (default: no upper bound). Null/missing fields count as length 0 and
         message lists are measured by the characters of their contents.
      3. If ``max_samples`` is set and smaller than the dataset, shuffle with
         seed 42 and keep the first ``max_samples`` rows.

    Args:
        dataset: Object exposing ``column_names``, ``filter``, ``shuffle``,
            ``select`` and ``len()`` (as used below).
        config: Configuration object; every option is read with ``getattr``
            so missing attributes fall back to the defaults above.

    Returns:
        The filtered dataset.
    """

    # Filter bad entries if specified
    if getattr(config, 'filter_bad_entries', False):
        bad_entry_field = getattr(config, 'bad_entry_field', 'bad_entry')
        bad_prompt_field = getattr(config, 'bad_prompt_field', 'bad_prompt_detected')
        bad_response_field = getattr(config, 'bad_response_field', 'bad_response_detected')

        original_size = len(dataset)

        # Filter out bad entries
        if bad_entry_field in dataset.column_names:
            dataset = dataset.filter(lambda x: not x.get(bad_entry_field, False))
            print(f"Filtered {original_size - len(dataset)} bad entries")

        # Filter out bad prompts
        if bad_prompt_field in dataset.column_names:
            dataset = dataset.filter(lambda x: not x.get(bad_prompt_field, False))
            print(f"Filtered bad prompts, remaining: {len(dataset)} examples")

        # Filter out bad responses
        if bad_response_field in dataset.column_names:
            dataset = dataset.filter(lambda x: not x.get(bad_response_field, False))
            print(f"Filtered bad responses, remaining: {len(dataset)} examples")

    # Apply length filtering; an explicit None disables the lower bound
    min_length = getattr(config, 'min_length', 10) or 0
    max_length = getattr(config, 'max_length', None)

    input_field = getattr(config, 'input_field', 'prompt')
    target_field = getattr(config, 'target_field', 'accepted_completion')

    if min_length > 0 or max_length:
        def length_filter(example):
            total_len = (
                _field_text_length(example.get(input_field))
                + _field_text_length(example.get(target_field))
            )
            if total_len < min_length:
                return False
            if max_length and total_len > max_length:
                return False
            return True

        original_size = len(dataset)
        dataset = dataset.filter(length_filter)
        print(f"Length filtering: {original_size} -> {len(dataset)} examples")

    # Apply sampling if specified (fixed seed keeps the subset reproducible)
    max_samples = getattr(config, 'max_samples', None)
    if max_samples and len(dataset) > max_samples:
        dataset = dataset.shuffle(seed=42).select(range(max_samples))
        print(f"Sampled {max_samples} examples from dataset")

    return dataset
184
+
185
def format_gpt_oss_harmony(prompt, completion, add_eos_token=True):
    """
    Format one prompt/completion pair as a GPT-OSS Harmony conversation.
    Based on: https://huggingface.co/openai/gpt-oss-20b/raw/main/chat_template.jinja

    Layout produced:
        <|start|>user<|message|>{prompt}<|end|>
        <|start|>assistant<|channel|>final<|message|>{completion}{terminator}

    Args:
        prompt: User message text; ``None`` is treated as an empty string.
        completion: Assistant reply text; ``None`` is treated as an empty string.
        add_eos_token: When true the sample ends with ``<|return|>`` (the
            training terminator); otherwise it ends with ``<|end|>``.

    Returns:
        The formatted conversation as a single string.
    """
    # A null field would otherwise be interpolated as the literal text "None".
    prompt = "" if prompt is None else prompt
    completion = "" if completion is None else completion

    # <|return|> marks the end of generation for training samples,
    # <|end|> closes the message for inference-style text.
    terminator = "<|return|>" if add_eos_token else "<|end|>"

    return (
        f"<|start|>user<|message|>{prompt}<|end|>"
        f"<|start|>assistant<|channel|>final<|message|>{completion}"
        f"{terminator}"
    )
206
+
207
def process_dataset_format(dataset, config):
    """Convert dataset rows to training text according to ``dataset_format``.

    Supported formats (``config.dataset_format``, default ``openhermes_fr``):
      * ``openhermes_fr`` - ``input_field`` + ``target_field`` become one
        ``text`` column (Harmony format, or joined with ``field_separator``),
        or separate ``input``/``output`` columns when ``concatenate_fields``
        is false.
      * ``messages`` - ``input_field`` holds a list of ``{"role", "content"}``
        dicts, rendered as Harmony text or as ``"role: content"`` lines.
      * ``text`` - ``input_field`` is copied to ``text``.
      * ``custom`` - the dataset is returned untouched.

    Null field values are treated as empty strings / empty message lists.
    Outside the Harmony path, ``"</s>"`` is appended when ``add_eos_token``
    is set.

    Args:
        dataset: Object exposing ``map(fn, remove_columns=...)`` and
            ``column_names``.
        config: Configuration object; options are read with ``getattr``.

    Returns:
        The mapped dataset (or the input dataset for ``custom``/unknown formats).
    """

    dataset_format = getattr(config, 'dataset_format', 'openhermes_fr')
    input_field = getattr(config, 'input_field', 'prompt')
    target_field = getattr(config, 'target_field', 'accepted_completion')
    concatenate_fields = getattr(config, 'concatenate_fields', True)
    field_separator = getattr(config, 'field_separator', '\n\n### Response:\n')
    add_eos_token = getattr(config, 'add_eos_token', True)
    use_harmony_format = getattr(config, 'use_harmony_format', True)

    print(f"Processing dataset format: {dataset_format}")
    print(f"Input field: {input_field}, Target field: {target_field}")
    print(f"GPT-OSS Harmony Format: {'Enabled' if use_harmony_format else 'Disabled'}")

    def text_of(example, field):
        # A column that exists but holds null would break string concatenation.
        value = example.get(field, '')
        return '' if value is None else value

    def with_eos(text):
        return text + "</s>" if add_eos_token else text

    def render_plain(messages):
        # Simple "role: content" transcript, one message per line.
        return "".join(
            f"{message.get('role', '')}: {message.get('content', '')}\n"
            for message in messages
        )

    if dataset_format == "openhermes_fr":
        # Process OpenHermes-FR format: prompt + accepted_completion
        def format_openhermes_fr(example):
            prompt = text_of(example, input_field)
            completion = text_of(example, target_field)

            if not concatenate_fields:
                # Keep separate for more advanced training setups
                return {"input": prompt, "output": completion}

            if use_harmony_format:
                # Use exact GPT-OSS Harmony format from template
                text = format_gpt_oss_harmony(prompt, completion, add_eos_token)
            else:
                # Fallback to standard format with separator
                text = with_eos(prompt + field_separator + completion)

            return {"text": text}

        dataset = dataset.map(format_openhermes_fr, remove_columns=dataset.column_names)

    elif dataset_format == "messages":
        # Process messages format (like HuggingFaceH4/Multilingual-Thinking)
        def format_messages(example):
            messages = example.get(input_field) or []

            if use_harmony_format and len(messages) >= 2:
                # NOTE(review): only the LAST user and LAST assistant message
                # survive this loop; earlier turns and other roles are dropped
                # from the Harmony text - confirm this is intended.
                user_message = ""
                assistant_message = ""
                for message in messages:
                    role = message.get("role", "")
                    content = message.get("content", "")
                    if role == "user":
                        user_message = content
                    elif role == "assistant":
                        assistant_message = content

                if user_message and assistant_message:
                    # Use GPT-OSS Harmony format
                    return {"text": format_gpt_oss_harmony(user_message, assistant_message, add_eos_token)}

            # Harmony disabled or not applicable: plain transcript
            return {"text": with_eos(render_plain(messages))}

        dataset = dataset.map(format_messages, remove_columns=dataset.column_names)

    elif dataset_format == "text":
        # Process plain text format
        def format_text(example):
            return {"text": with_eos(text_of(example, input_field))}

        dataset = dataset.map(format_text, remove_columns=dataset.column_names)

    elif dataset_format == "custom":
        # Custom format - user handles this in their config
        print("Using custom dataset format - no automatic processing")

    else:
        # Previously silent: make a misspelled format name visible.
        print(f"Unknown dataset format '{dataset_format}' - no automatic processing")

    return dataset
309
 
 
330
 
331
  return trackio_client
332
 
333
def create_sft_config(config, output_dir):
    """Create the SFTConfig for GPT-OSS training from ``config``.

    ``learning_rate``, ``batch_size``, ``gradient_accumulation_steps`` and
    ``max_seq_length`` are required attributes of ``config``; every other
    option is read with ``getattr`` and falls back to the default shown in
    the code below.

    Args:
        config: Training configuration object.
        output_dir: Directory where checkpoints and outputs are written.

    Returns:
        The constructed ``SFTConfig``.
    """
    import inspect

    print("Creating enhanced SFT configuration...")

    # Extract training parameters from config with enhanced defaults
    num_train_epochs = getattr(config, 'num_train_epochs', 1.0)
    max_steps = getattr(config, 'max_steps', None)
    warmup_ratio = getattr(config, 'warmup_ratio', 0.03)
    warmup_steps = getattr(config, 'warmup_steps', None)

    # Learning rate configuration
    learning_rate = config.learning_rate
    lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})

    # Batch configuration
    per_device_train_batch_size = config.batch_size
    per_device_eval_batch_size = getattr(config, 'eval_batch_size', config.batch_size)
    gradient_accumulation_steps = config.gradient_accumulation_steps

    # Evaluation and logging
    # NOTE(review): the default 'steps' presumably requires the trainer to be
    # given an eval dataset - confirm against the trainer construction.
    eval_strategy = getattr(config, 'eval_strategy', 'steps')
    eval_steps = getattr(config, 'eval_steps', 100)
    logging_steps = getattr(config, 'logging_steps', 10)

    # Saving configuration
    save_strategy = getattr(config, 'save_strategy', 'steps')
    save_steps = getattr(config, 'save_steps', 500)
    save_total_limit = getattr(config, 'save_total_limit', 3)

    # Mixed precision
    fp16 = getattr(config, 'fp16', False)
    bf16 = getattr(config, 'bf16', True)

    # Regularization
    weight_decay = getattr(config, 'weight_decay', 0.01)
    max_grad_norm = getattr(config, 'max_grad_norm', 1.0)

    # HuggingFace Hub integration
    push_to_hub = getattr(config, 'push_to_hub', False)

    print(f" • Epochs: {num_train_epochs}")
    print(f" • Learning rate: {learning_rate}")
    print(f" • Batch size: {per_device_train_batch_size}")
    print(f" • Gradient accumulation: {gradient_accumulation_steps}")
    print(f" • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")

    # Two keyword names differ between library releases; pick whichever the
    # installed SFTConfig accepts, preferring the current spelling.
    supported = set(inspect.signature(SFTConfig.__init__).parameters)
    eval_key = "eval_strategy" if "eval_strategy" in supported else "evaluation_strategy"
    length_key = "max_length" if "max_length" in supported else "max_seq_length"

    sft_kwargs = {
        # Training duration
        "num_train_epochs": num_train_epochs,

        # Learning rate
        "learning_rate": learning_rate,
        "lr_scheduler_type": lr_scheduler_type,
        "lr_scheduler_kwargs": lr_scheduler_kwargs,
        "warmup_ratio": warmup_ratio,

        # Batch configuration
        "per_device_train_batch_size": per_device_train_batch_size,
        "per_device_eval_batch_size": per_device_eval_batch_size,
        "gradient_accumulation_steps": gradient_accumulation_steps,

        # Model configuration
        length_key: config.max_seq_length,
        "gradient_checkpointing": getattr(config, 'use_gradient_checkpointing', True),

        # Mixed precision
        "fp16": fp16,
        "bf16": bf16,

        # Regularization
        "weight_decay": weight_decay,
        "max_grad_norm": max_grad_norm,

        # Evaluation
        eval_key: eval_strategy,
        "eval_steps": eval_steps,

        # Logging
        "logging_steps": logging_steps,

        # Saving
        "save_strategy": save_strategy,
        "save_steps": save_steps,
        "save_total_limit": save_total_limit,

        # Output
        "output_dir": output_dir,

        # Data loading
        "dataloader_num_workers": getattr(config, 'dataloader_num_workers', 4),
        "dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),

        # Performance
        "group_by_length": getattr(config, 'group_by_length', True),
        "remove_unused_columns": getattr(config, 'remove_unused_columns', True),

        # HuggingFace Hub
        "push_to_hub": push_to_hub,

        # Monitoring: the string "none" disables reporting, whereas None is
        # interpreted by the library as "use every installed integration".
        "report_to": "trackio" if getattr(config, 'enable_tracking', False) else "none",
    }

    # These are integer fields; leave the library defaults in place rather
    # than passing None when the config does not set them.
    if max_steps is not None:
        sft_kwargs["max_steps"] = max_steps
    if warmup_steps is not None:
        sft_kwargs["warmup_steps"] = warmup_steps

    sft_config = SFTConfig(**sft_kwargs)

    return sft_config
 
482
  peft_model = setup_lora_for_gpt_oss(model, config)
483
 
484
  # Load dataset
485
+ dataset = load_dataset_from_config(config)
486
 
487
  # Setup Trackio tracking
488
  trackio_client = setup_trackio_tracking(config)
489
 
490
  # Create SFT configuration
491
+ sft_config = create_sft_config(config, output_dir)
492
 
493
  # Create trainer
494
  print("Creating SFT trainer...")
templates/spaces/demo_gpt/README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.40.0
8
  app_file: app.py
9
- pinned: true
10
  short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
11
  ---
12
 
 
6
  sdk: gradio
7
  sdk_version: 5.40.0
8
  app_file: app.py
9
+ pinned: false
10
  short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
11
  ---
12