Adds Harmony format support, configurable GPT-OSS parameters, launch.sh logic, improved templates for legml GPT-OSS training, a dynamic results directory, and improved model pushing
- config/train_gpt_oss_custom.py +388 -0
- config/train_gpt_oss_openhermes_fr.py +174 -0
- config/train_gpt_oss_openhermes_fr_memory_optimized.py +233 -0
- docs/output.svg +1 -0
- launch.sh +328 -11
- scripts/model_tonic/push_gpt_oss_to_huggingface.py +79 -5
- scripts/model_tonic/push_to_huggingface.py +83 -5
- scripts/training/train_gpt_oss.py +313 -24
- templates/spaces/demo_gpt/README.md +1 -1
config/train_gpt_oss_custom.py
ADDED
@@ -0,0 +1,388 @@
"""
GPT-OSS Custom Training Configuration
Based on OpenAI's GPT-OSS fine-tuning tutorial
Fully customizable configuration for any dataset format

Supports specialized datasets like:
- legmlai/openhermes-fr (French instruction dataset)
- HuggingFaceH4/Multilingual-Thinking
- Custom prompt/completion formats
"""
import os
from dataclasses import dataclass
from typing import Optional, Dict, List, Union

@dataclass
class GPTOSSEnhancedCustomConfig:
    """Enhanced custom configuration for GPT-OSS fine-tuning with maximum flexibility"""

    # ============================================================================
    # CORE MODEL CONFIGURATION
    # ============================================================================
    trainer_type: str = "sft"  # "sft" or "dpo"
    model_name: str = "openai/gpt-oss-20b"
    max_seq_length: int = 2048  # Customizable: 512, 1024, 2048, 4096, 8192
    use_flash_attention: bool = True
    use_gradient_checkpointing: bool = True

    # ============================================================================
    # TRAINING HYPERPARAMETERS - FULLY CUSTOMIZABLE
    # ============================================================================
    # Batch Configuration
    batch_size: int = 4  # Per-device batch size (1-32 depending on GPU memory)
    gradient_accumulation_steps: int = 4  # Effective batch = batch_size * accumulation * num_gpus
    eval_batch_size: Optional[int] = None  # If None, uses batch_size

    # Learning Rate Configuration
    learning_rate: float = 2e-4  # Main learning rate (1e-5 to 5e-4 typical range)
    min_lr: float = 2e-5  # Minimum learning rate for scheduler
    warmup_ratio: float = 0.03  # Fraction of steps for warmup (0.01-0.1)
    warmup_steps: Optional[int] = None  # If set, overrides warmup_ratio

    # Training Duration
    num_train_epochs: float = 1.0  # Number of epochs (0.5, 1.0, 2.0, 3.0)
    max_steps: Optional[int] = None  # If set, overrides num_train_epochs
    max_iters: Optional[int] = None  # Legacy compatibility

    # Regularization
    weight_decay: float = 0.01  # L2 regularization (0.0-0.1)
    max_grad_norm: float = 1.0  # Gradient clipping (0.5-2.0)

    # ============================================================================
    # OPTIMIZER CONFIGURATION
    # ============================================================================
    optimizer: str = "adamw_torch"  # "adamw_torch", "adamw_hf", "sgd"
    beta1: float = 0.9  # Adam beta1 parameter
    beta2: float = 0.95  # Adam beta2 parameter (0.95-0.999)
    eps: float = 1e-8  # Adam epsilon

    # ============================================================================
    # SCHEDULER CONFIGURATION
    # ============================================================================
    scheduler: str = "cosine_with_min_lr"  # "linear", "cosine", "cosine_with_min_lr", "constant"
    lr_scheduler_kwargs: Optional[Dict] = None

    # ============================================================================
    # MIXED PRECISION & DISTRIBUTED TRAINING
    # ============================================================================
    fp16: bool = False  # Use FP16 (not recommended for GPT-OSS)
    bf16: bool = True  # Use BF16 (recommended for GPT-OSS)
    tf32: Optional[bool] = None  # Use TF32 on A100/H100
    ddp_backend: str = "nccl"
    ddp_find_unused_parameters: bool = False

    # ============================================================================
    # LOGGING, EVALUATION & CHECKPOINTING
    # ============================================================================
    # Logging
    logging_steps: int = 10  # Log every N steps
    log_level: str = "info"  # "debug", "info", "warning", "error"

    # Evaluation
    eval_strategy: str = "steps"  # "no", "steps", "epoch"
    eval_steps: int = 100  # Evaluate every N steps
    eval_delay: float = 0  # Delay evaluation for N steps/epochs
    eval_accumulation_steps: Optional[int] = None  # Accumulate eval outputs

    # Checkpointing
    save_strategy: str = "steps"  # "no", "steps", "epoch"
    save_steps: int = 500  # Save checkpoint every N steps
    save_total_limit: Optional[int] = 3  # Keep only N best checkpoints
    save_only_model: bool = False  # Save only model weights

    # Model Selection
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False
    load_best_model_at_end: bool = True

    # ============================================================================
    # DATASET CONFIGURATION - ENHANCED FOR CUSTOM FORMATS
    # ============================================================================
    # Dataset Source
    dataset_name: str = "legmlai/openhermes-fr"  # Default to French OpenHermes
    dataset_split: str = "train"  # Dataset split to use
    dataset_config: Optional[str] = None  # Dataset configuration name

    # Field Mapping - Customize for your dataset format
    input_field: str = "prompt"  # Field containing the input/prompt
    target_field: str = "accepted_completion"  # Field containing the target/completion

    # OpenHermes-FR specific fields
    filter_bad_entries: bool = True  # Filter entries marked as bad
    bad_entry_field: str = "bad_entry"  # Field indicating bad entries
    bad_prompt_field: str = "bad_prompt_detected"  # Field for bad prompts
    bad_response_field: str = "bad_response_detected"  # Field for bad responses

    # Data Processing Options
    concatenate_fields: bool = True  # Combine input and target fields for training
    field_separator: str = "\n\n### Response:\n"  # Separator between input and target
    add_eos_token: bool = True  # Add EOS token at the end

    # Dataset Filtering & Sampling
    max_samples: Optional[int] = None  # Limit dataset size (e.g., 100000 for testing)
    min_length: int = 10  # Minimum sequence length
    max_length: Optional[int] = None  # Maximum sequence length (None = use max_seq_length)

    # Custom Dataset Formats Support
    dataset_format: str = "openhermes_fr"  # "openhermes_fr", "messages", "text", "custom"

    # GPT-OSS Harmony Format Configuration
    use_harmony_format: bool = True  # Enable GPT-OSS harmony format
    use_chat_template: bool = False  # Set to True for messages format
    chat_template_kwargs: Optional[Dict] = None

    # ============================================================================
    # TRACKIO MONITORING CONFIGURATION
    # ============================================================================
    enable_tracking: bool = True
    trackio_url: Optional[str] = None
    trackio_token: Optional[str] = None
    log_artifacts: bool = True
    log_metrics: bool = True
    log_config: bool = True
    experiment_name: Optional[str] = None

    # ============================================================================
    # HUGGING FACE INTEGRATION
    # ============================================================================
    hf_token: Optional[str] = None
    dataset_repo: Optional[str] = None
    push_to_hub: bool = False  # Push model to HF Hub after training
    hub_model_id: Optional[str] = None  # HF Hub model ID
    hub_private_repo: bool = False  # Make HF repo private

    # ============================================================================
    # GPT-OSS SPECIFIC CONFIGURATIONS
    # ============================================================================
    # LoRA Configuration
    use_lora: bool = True
    lora_config: Optional[Dict] = None

    # Quantization Configuration
    use_quantization: bool = True
    quantization_config: Optional[Dict] = None

    # Model Loading Configuration
    model_kwargs: Optional[Dict] = None

    # Generation Configuration (for evaluation/testing)
    generation_config: Optional[Dict] = None

    # ============================================================================
    # MULTILINGUAL & DOMAIN SPECIFIC SETTINGS
    # ============================================================================
    # Language Support (for multilingual datasets)
    primary_language: str = "fr"  # Primary language code
    reasoning_languages: Optional[List[str]] = None  # Supported languages for reasoning

    # Domain-specific settings
    domain_focus: Optional[str] = None  # "reasoning", "conversation", "instruction", "general"

    # ============================================================================
    # PERFORMANCE & MEMORY OPTIMIZATION
    # ============================================================================
    # Data Loading
    dataloader_num_workers: int = 4  # Number of data loading workers
    dataloader_pin_memory: bool = True  # Pin memory for faster GPU transfer
    dataloader_prefetch_factor: int = 2  # Prefetch factor for data loading

    # Memory Management
    max_memory_per_gpu: Optional[str] = None  # e.g., "80GB", "40GB"
    low_cpu_mem_usage: bool = True  # Use low CPU memory loading

    # Performance Optimizations
    group_by_length: bool = True  # Group sequences by length
    length_column_name: str = "length"  # Column name for sequence lengths
    remove_unused_columns: bool = True  # Remove unused dataset columns

    def __post_init__(self):
        """Initialize default values and validate configuration"""

        # ============================================================================
        # LORA CONFIGURATION DEFAULTS
        # ============================================================================
        if self.lora_config is None:
            self.lora_config = {
                "r": 16,  # Rank (4, 8, 16, 32, 64) - higher = more parameters
                "lora_alpha": 32,  # Scaling factor (usually 2*r)
                "target_modules": "all-linear",  # Apply LoRA to all linear layers
                "target_parameters": [
                    "7.mlp.experts.gate_up_proj",
                    "7.mlp.experts.down_proj",
                    "15.mlp.experts.gate_up_proj",
                    "15.mlp.experts.down_proj",
                    "23.mlp.experts.gate_up_proj",
                    "23.mlp.experts.down_proj",
                ],
                "bias": "none",  # "none", "all", "lora_only"
                "task_type": "CAUSAL_LM",
                "lora_dropout": 0.05,  # LoRA dropout rate
            }

        # ============================================================================
        # QUANTIZATION CONFIGURATION DEFAULTS
        # ============================================================================
        if self.quantization_config is None:
            self.quantization_config = {
                "dequantize": True,  # Use Mxfp4Config as per GPT-OSS tutorial
                "load_in_4bit": False,  # Set to True for extreme memory optimization
                "bnb_4bit_compute_dtype": "bfloat16",  # For 4-bit quantization
                "bnb_4bit_use_double_quant": True,  # Double quantization
                "bnb_4bit_quant_type": "nf4"  # Quantization type
            }

        # ============================================================================
        # MODEL LOADING CONFIGURATION DEFAULTS
        # ============================================================================
        if self.model_kwargs is None:
            self.model_kwargs = {
                "attn_implementation": "eager",  # "eager", "flash_attention_2"
                "torch_dtype": "auto",  # "auto", "bfloat16", "float16"
                "use_cache": False,  # Disable KV cache for training
                "device_map": "auto",  # Automatic device mapping
                "low_cpu_mem_usage": self.low_cpu_mem_usage,
            }

        # Add memory constraints if specified
        if self.max_memory_per_gpu:
            self.model_kwargs["max_memory"] = {0: self.max_memory_per_gpu}

        # ============================================================================
        # GENERATION CONFIGURATION DEFAULTS
        # ============================================================================
        if self.generation_config is None:
            self.generation_config = {
                "max_new_tokens": 512,  # Maximum tokens to generate
                "do_sample": True,  # Use sampling
                "temperature": 0.7,  # Sampling temperature
                "top_p": 0.9,  # Nucleus sampling
                "top_k": 50,  # Top-k sampling
                "repetition_penalty": 1.1,  # Repetition penalty
                "pad_token_id": None,  # Will be set from tokenizer
                "eos_token_id": None,  # Will be set from tokenizer
            }

        # ============================================================================
        # LANGUAGE CONFIGURATION DEFAULTS
        # ============================================================================
        if self.reasoning_languages is None:
            if self.primary_language == "fr":
                self.reasoning_languages = [
                    "French", "English", "Spanish", "Italian", "German"
                ]
            else:
                self.reasoning_languages = [
                    "English", "Spanish", "French", "Italian", "German",
                    "Chinese", "Hindi", "Japanese", "Korean", "Arabic"
                ]

        # ============================================================================
        # SCHEDULER CONFIGURATION DEFAULTS
        # ============================================================================
        if self.lr_scheduler_kwargs is None:
            self.lr_scheduler_kwargs = {"min_lr_rate": 0.1}

        # ============================================================================
        # CHAT TEMPLATE CONFIGURATION DEFAULTS (GPT-OSS Harmony Format)
        # ============================================================================
        if self.chat_template_kwargs is None:
            self.chat_template_kwargs = {
                "add_generation_prompt": True,
                "tokenize": False,
                "auto_insert_role": True,
                # GPT-OSS Harmony Format specific settings
                "reasoning_effort": "medium",  # low, medium, high
                "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
                "builtin_tools": [],  # Can include "browser" and/or "python"
            }

        # ============================================================================
        # VALIDATION AND COMPUTED VALUES
        # ============================================================================
        # Compute effective batch size
        effective_batch_size = self.batch_size * self.gradient_accumulation_steps

        # Set warmup steps if not provided
        if self.warmup_steps is None and self.max_steps:
            self.warmup_steps = int(self.max_steps * self.warmup_ratio)

        # Set max_length for dataset filtering
        if self.max_length is None:
            self.max_length = self.max_seq_length

        # Validate configuration
        self._validate_config()

        # Print comprehensive configuration summary
        self._print_config_summary(effective_batch_size)

    def _validate_config(self):
        """Validate configuration parameters"""

        # Validate batch configuration
        if self.batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if self.gradient_accumulation_steps < 1:
            raise ValueError("gradient_accumulation_steps must be >= 1")

        # Validate learning rate
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be > 0")
        if self.min_lr >= self.learning_rate:
            raise ValueError("min_lr must be < learning_rate")

        # Validate sequence length
        if self.max_seq_length < 1:
            raise ValueError("max_seq_length must be >= 1")

        # Validate dataset format
        valid_formats = ["openhermes_fr", "messages", "text", "custom"]
        if self.dataset_format not in valid_formats:
            raise ValueError(f"dataset_format must be one of {valid_formats}")

    def _print_config_summary(self, effective_batch_size):
        """Print detailed configuration summary"""

        print("\n" + "="*80)
        print("🚀 GPT-OSS ENHANCED CUSTOM CONFIGURATION")
        print("="*80)

        print(f"📊 Model & Training:")
        print(f"   • Model: {self.model_name}")
        print(f"   • Dataset: {self.dataset_name} ({self.dataset_format})")
        print(f"   • Primary Language: {self.primary_language}")
        print(f"   • Sequence Length: {self.max_seq_length}")
        print(f"   • Epochs: {self.num_train_epochs}")

        print(f"\n🔄 Batch Configuration:")
        print(f"   • Per-device Batch Size: {self.batch_size}")
        print(f"   • Gradient Accumulation: {self.gradient_accumulation_steps}")
        print(f"   • Effective Batch Size: {effective_batch_size}")

        print(f"\n📈 Learning Configuration:")
        print(f"   • Learning Rate: {self.learning_rate}")
        print(f"   • Min Learning Rate: {self.min_lr}")
        print(f"   • Weight Decay: {self.weight_decay}")
        print(f"   • Warmup Ratio: {self.warmup_ratio}")

        print(f"\n🎛️ LoRA Configuration:")
        print(f"   • Rank: {self.lora_config['r']}")
        print(f"   • Alpha: {self.lora_config['lora_alpha']}")
        print(f"   • Target Modules: {self.lora_config['target_modules']}")

        print(f"\n📁 Dataset Configuration:")
        print(f"   • Input Field: {self.input_field}")
        print(f"   • Target Field: {self.target_field}")
        print(f"   • Filter Bad Entries: {self.filter_bad_entries}")
        print(f"   • Max Samples: {self.max_samples or 'All'}")

        print(f"\n💾 Memory & Performance:")
        print(f"   • Mixed Precision: {'BF16' if self.bf16 else 'FP32'}")
        print(f"   • Gradient Checkpointing: {self.use_gradient_checkpointing}")
        print(f"   • Data Workers: {self.dataloader_num_workers}")
        print(f"   • Group by Length: {self.group_by_length}")

        print("="*80 + "\n")

# Create the config instance with OpenHermes-FR optimized defaults
config = GPTOSSEnhancedCustomConfig()
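For context, a minimal sketch of how the dict fields above could be materialized into library objects (assuming peft >= 0.17 for LoRA `target_parameters` and a transformers release that ships `Mxfp4Config`; the repo's actual wiring lives in scripts/training/train_gpt_oss.py, so this is illustrative, not the commit's implementation):

from peft import LoraConfig
from transformers import AutoModelForCausalLM, Mxfp4Config

from config.train_gpt_oss_custom import config

# Map the plain-dict LoRA settings onto a peft LoraConfig.
peft_config = LoraConfig(
    r=config.lora_config["r"],
    lora_alpha=config.lora_config["lora_alpha"],
    lora_dropout=config.lora_config["lora_dropout"],
    target_modules=config.lora_config["target_modules"],
    target_parameters=config.lora_config["target_parameters"],  # MoE expert params
    bias=config.lora_config["bias"],
    task_type=config.lora_config["task_type"],
)

# Dequantize the native MXFP4 checkpoint on load, per the GPT-OSS tutorial,
# and forward the remaining loading options from model_kwargs.
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    quantization_config=Mxfp4Config(dequantize=config.quantization_config["dequantize"]),
    **config.model_kwargs,
)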
config/train_gpt_oss_openhermes_fr.py
ADDED
@@ -0,0 +1,174 @@
"""
GPT-OSS OpenHermes-FR Optimized Configuration
Specifically optimized for the legmlai/openhermes-fr dataset
800K French instruction-response pairs with quality filtering
"""

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# OpenHermes-FR optimized configuration
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR Specific
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",

    # OpenHermes-FR field mapping
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions

    # Quality filtering using OpenHermes-FR metadata
    filter_bad_entries=True,  # Use built-in quality flags
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",

    # Data processing optimized for French with GPT-OSS Harmony Format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,
    use_harmony_format=True,  # Enable GPT-OSS harmony format

    # Dataset sampling (use all 800K examples by default)
    max_samples=None,  # Use full dataset
    min_length=20,  # Minimum for meaningful French text
    max_length=None,  # Auto-set to max_seq_length

    # ============================================================================
    # TRAINING HYPERPARAMETERS - French Language Optimized
    # ============================================================================
    num_train_epochs=1.5,  # 1.5 epochs optimal for large dataset
    batch_size=6,  # Balanced for most GPUs
    gradient_accumulation_steps=6,  # Effective batch size: 36

    # Learning rate schedule optimized for French fine-tuning
    learning_rate=2.5e-4,  # Slightly higher for multilingual
    min_lr=2.5e-5,  # 10% of max learning rate
    warmup_ratio=0.05,  # 5% warmup for stability
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping

    # ============================================================================
    # MODEL CONFIGURATION - Optimized for French
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=3072,  # Balanced length for French
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Mixed precision for efficiency
    fp16=False,
    bf16=True,  # Better for GPT-OSS

    # ============================================================================
    # LORA CONFIGURATION - Optimized for French Language Learning
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 24,  # Higher rank for language adaptation
        "lora_alpha": 48,  # 2x rank scaling
        "lora_dropout": 0.05,  # Light regularization
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },

    # ============================================================================
    # QUANTIZATION - Balanced Performance/Memory
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # MXFP4 as per GPT-OSS tutorial
        "load_in_4bit": False,  # Standard precision for quality
    },

    # ============================================================================
    # PERFORMANCE OPTIMIZATION
    # ============================================================================
    # Data loading optimized for large dataset
    dataloader_num_workers=6,  # More workers for large dataset
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=3,  # Higher prefetch for efficiency

    # Memory management
    low_cpu_mem_usage=True,
    group_by_length=True,  # Efficient batching
    remove_unused_columns=True,

    # ============================================================================
    # EVALUATION & LOGGING
    # ============================================================================
    eval_strategy="steps",
    eval_steps=200,  # Evaluate every 200 steps
    logging_steps=20,  # Log every 20 steps

    save_strategy="steps",
    save_steps=500,  # Save every 500 steps
    save_total_limit=3,  # Keep 3 best checkpoints

    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # ============================================================================
    # MULTILINGUAL & FRENCH SPECIFIC SETTINGS
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning
    domain_focus="instruction",  # Instruction following

    # ============================================================================
    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
    # ============================================================================
    generation_config={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS Harmony Format specific settings
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },

    # ============================================================================
    # HF HUB INTEGRATION
    # ============================================================================
    push_to_hub=False,  # Set to True to auto-push
    hub_model_id=None,  # Will be set by launch script
    hub_private_repo=False,

    # ============================================================================
    # MONITORING
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)

# Print configuration summary on import
print("\n🇫🇷 OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"📊 Dataset: {config.dataset_name}")
print(f"🗣️ Language: French (with {config.dataset_format} format)")
print(f"📈 Training: {config.num_train_epochs} epochs")
print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']}")
print(f"📏 Sequence Length: {config.max_seq_length}")
print(f"🔍 Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)
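Since this config enables `use_harmony_format`, here is a minimal sketch of the rendering that implies for one prompt/completion pair. The token layout follows the published GPT-OSS harmony spec (including the `<|return|>` terminator the memory-optimized config mentions below); the helper name `to_harmony` and the exact system-message wording are illustrative assumptions, not the repo's implementation:

MODEL_IDENTITY = "You are GPT-Tonic, a large language model trained by TonicAI."

def to_harmony(prompt: str, completion: str, reasoning_effort: str = "medium") -> str:
    """Render one OpenHermes-FR pair as a single harmony-format training text."""
    return (
        f"<|start|>system<|message|>{MODEL_IDENTITY}\n"
        f"Reasoning: {reasoning_effort}<|end|>"
        f"<|start|>user<|message|>{prompt}<|end|>"
        f"<|start|>assistant<|channel|>final<|message|>{completion}<|return|>"
    )

# Usage with the OpenHermes-FR fields mapped above:
print(to_harmony("Quelle est la capitale de la France ?",
                 "La capitale de la France est Paris."))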
config/train_gpt_oss_openhermes_fr_memory_optimized.py
ADDED
@@ -0,0 +1,233 @@
"""
GPT-OSS OpenHermes-FR Memory-Optimized Configuration
Combines memory optimization best practices with OpenHermes-FR dataset
Optimized for GPT-OSS harmony format and MXFP4 quantization
Based on OpenAI GPT-OSS specifications and memory optimization principles
"""

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# Memory-optimized OpenHermes-FR configuration for GPT-OSS
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR with Harmony Format
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",

    # OpenHermes-FR field mapping optimized for harmony format
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions

    # Enhanced quality filtering for memory-constrained training
    filter_bad_entries=True,  # Critical for memory efficiency
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",

    # Memory-optimized data processing with GPT-OSS Harmony Format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,  # Required for proper training
    use_harmony_format=True,  # Enable GPT-OSS harmony format

    # Dataset sampling optimized for memory constraints
    max_samples=200000,  # Reduced from 800K for memory efficiency
    min_length=15,  # Slightly higher minimum for quality
    max_length=2048,  # Explicit max length for memory control

    # ============================================================================
    # MEMORY-OPTIMIZED TRAINING HYPERPARAMETERS
    # ============================================================================
    # Batch configuration following memory optimization principles
    num_train_epochs=1.0,  # Single epoch to reduce memory pressure
    batch_size=2,  # Reduced from 6 for memory efficiency
    gradient_accumulation_steps=16,  # Increased to maintain effective batch size 32

    # Learning rate optimized for single epoch + memory constraints
    learning_rate=2e-4,  # Standard GPT-OSS learning rate
    min_lr=2e-5,  # 10% of max learning rate
    warmup_ratio=0.03,  # Reduced warmup for memory efficiency
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping for stability

    # ============================================================================
    # MODEL CONFIGURATION - Memory Optimized for GPT-OSS
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=1024,  # Reduced from 3072 for memory optimization
    use_flash_attention=True,  # Critical for memory efficiency
    use_gradient_checkpointing=True,  # Essential for memory optimization

    # Mixed precision optimized for GPT-OSS MXFP4
    fp16=False,  # Not recommended for GPT-OSS
    bf16=True,  # Required for GPT-OSS stability
    tf32=True,  # Enable TF32 for A100/H100 efficiency

    # ============================================================================
    # LORA CONFIGURATION - Memory Optimized for GPT-OSS MoE
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 8,  # Reduced rank for memory efficiency
        "lora_alpha": 16,  # 2x rank scaling (memory optimized)
        "lora_dropout": 0.1,  # Higher dropout for better generalization
        "target_modules": "all-linear",  # Apply to all linear layers
        "target_parameters": [
            # GPT-OSS specific MoE expert targeting
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",  # No bias adaptation for memory efficiency
        "task_type": "CAUSAL_LM",
        "modules_to_save": [],  # Don't save additional modules for memory
    },

    # ============================================================================
    # QUANTIZATION - GPT-OSS Native MXFP4 Optimization
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # Use native MXFP4 as per GPT-OSS specs
        "load_in_4bit": False,  # Don't use BNB 4-bit with MXFP4
        "mxfp4_config": {  # Native GPT-OSS MXFP4 settings
            "enabled": True,
            "block_size": 32,  # Optimized block size for MoE
        }
    },

    # ============================================================================
    # MEMORY OPTIMIZATION CONFIGURATION
    # ============================================================================
    # Model loading with memory constraints
    model_kwargs={
        "attn_implementation": "eager",  # Memory-safe attention
        "torch_dtype": "auto",  # Let model decide (MXFP4 compatible)
        "use_cache": False,  # Disable KV cache for training
        "device_map": "auto",  # Automatic device mapping
        "low_cpu_mem_usage": True,  # Critical for memory optimization
        "max_memory": {0: "75GB"},  # Reserve memory for other processes
    },

    # Data loading optimized for memory efficiency
    dataloader_num_workers=2,  # Reduced workers to save memory
    dataloader_pin_memory=False,  # Disable to save memory
    dataloader_prefetch_factor=1,  # Minimal prefetch for memory

    # Memory management optimizations
    max_memory_per_gpu="75GB",  # Explicit memory limit
    low_cpu_mem_usage=True,  # Essential for large models
    group_by_length=True,  # Efficient batching for memory
    remove_unused_columns=True,  # Remove unnecessary data

    # ============================================================================
    # EVALUATION & LOGGING - Memory Efficient
    # ============================================================================
    eval_strategy="steps",
    eval_steps=500,  # Less frequent evaluation for memory
    logging_steps=50,  # Reduced logging frequency

    save_strategy="steps",
    save_steps=1000,  # Less frequent saves for memory/storage
    save_total_limit=2,  # Keep only 2 checkpoints for memory
    save_only_model=True,  # Save only model weights

    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Evaluation memory optimization
    eval_accumulation_steps=4,  # Accumulate eval outputs to save memory
    eval_batch_size=1,  # Smaller eval batch size

    # ============================================================================
    # GPT-OSS HARMONY FORMAT OPTIMIZATION
    # ============================================================================
    # Chat template for harmony format compatibility (following exact template)
    use_chat_template=False,  # Use custom harmony format instead
    chat_template_kwargs={
        "add_generation_prompt": True,
        "tokenize": False,
        # GPT-OSS Harmony Format specific settings (exact template format)
        "reasoning_effort": "medium",  # low, medium, high
        "model_identity": "You are GPT-Tonic, a large language model trained by TonicAI.",
        "builtin_tools": [],  # Can include "browser" and/or "python"
    },

    # Generation config optimized for GPT-OSS harmony format (exact template compliance)
    generation_config={
        "max_new_tokens": 256,  # Reduced for memory efficiency
        "do_sample": True,
        "temperature": 0.6,  # Slightly lower for more focused training
        "top_p": 0.9,
        "top_k": 40,  # Reduced for memory efficiency
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS Harmony Format specific settings (exact template format)
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },

    # ============================================================================
    # MULTILINGUAL & REASONING OPTIMIZATION
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning capability
    domain_focus="reasoning",  # Align with GPT-OSS reasoning focus

    # ============================================================================
    # OPTIMIZER & SCHEDULER - Memory Optimized
    # ============================================================================
    optimizer="adamw_torch",  # Memory-efficient optimizer
    beta1=0.9,
    beta2=0.95,  # GPT-OSS optimized beta2
    eps=1e-8,

    scheduler="cosine_with_min_lr",  # Stable scheduler for single epoch
    lr_scheduler_kwargs={
        "min_lr_rate": 0.1,
        "warmup_steps": None,  # Use warmup_ratio instead
    },

    # ============================================================================
    # MONITORING & HUB INTEGRATION
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=False,  # Disable to save memory/storage
    log_metrics=True,
    log_config=True,

    push_to_hub=False,  # Set to True after successful training
    hub_model_id=None,
    hub_private_repo=False,
)

# Configuration validation and optimization tips
print("\n🔧 GPT-OSS Memory-Optimized OpenHermes-FR Configuration")
print("=" * 60)
print(f"📊 Dataset: {config.dataset_name} (200K samples)")
print(f"🗣️ Language: French with GPT-OSS Harmony Format")
print(f"📈 Training: {config.num_train_epochs} epoch (memory optimized)")
print(f"🔄 Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']} (memory optimized)")
print(f"📏 Sequence Length: {config.max_seq_length} (memory optimized)")
print(f"💾 Memory Limit: {config.max_memory_per_gpu}")
print(f"⚡ Quantization: MXFP4 (GPT-OSS native)")
print(f"🔍 Quality Filtering: Enabled")
print(f"🎵 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 60)
print("\n💡 Memory Optimization Features:")
print("   • Native MXFP4 quantization for GPT-OSS MoE layers")
print("   • Reduced batch size with increased gradient accumulation")
print("   • Limited sequence length for memory efficiency")
print("   • Reduced LoRA rank while maintaining effectiveness")
print("   • Dataset sampling (200K from 800K) for faster training")
print("   • Gradient checkpointing and efficient data loading")
print("   • Exact GPT-OSS Harmony format with <|return|> tokens")
print("=" * 60)
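A quick arithmetic check of the trade-off this memory-optimized recipe makes, with values copied from the config above:

# Smaller per-device batches are offset by more accumulation steps, so the
# effective batch stays close to the standard recipe above (6 * 6 = 36).
batch_size, gradient_accumulation_steps = 2, 16
effective_batch = batch_size * gradient_accumulation_steps  # 32
max_samples, num_train_epochs = 200_000, 1.0
optimizer_steps = int(max_samples * num_train_epochs) // effective_batch
print(effective_batch, optimizer_steps)  # 32 effective batch, 6250 optimizer steps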
docs/output.svg
ADDED
launch.sh
CHANGED
@@ -234,7 +234,34 @@ show_training_configs() {
|
|
234 |
echo " - 4-bit quantization + reduced LoRA"
|
235 |
echo " - Optimized for limited GPU memory"
|
236 |
echo ""
|
237 |
-
echo "9.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
echo " - User-defined parameters"
|
239 |
echo ""
|
240 |
}
|
@@ -325,12 +352,142 @@ get_training_config() {
|
|
325 |
MAX_SEQ_LENGTH=1024
|
326 |
CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
|
327 |
;;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
"Custom Configuration")
|
329 |
get_custom_config
|
330 |
;;
|
331 |
esac
|
332 |
}
|
333 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
# Function to get custom configuration
|
335 |
get_custom_config() {
|
336 |
print_step "Custom Configuration Setup"
|
@@ -352,6 +509,136 @@ get_custom_config() {
|
|
352 |
fi
|
353 |
}
|
354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
# Function to create training configuration file
|
356 |
create_training_config() {
|
357 |
local config_file="$1"
|
@@ -499,7 +786,7 @@ print_step "Step 2: Training Configuration"
|
|
499 |
echo "=================================="
|
500 |
|
501 |
show_training_configs
|
502 |
-
select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "Custom Configuration" TRAINING_CONFIG_TYPE
|
503 |
|
504 |
get_training_config "$TRAINING_CONFIG_TYPE"
|
505 |
|
@@ -836,13 +1123,25 @@ print_info "Dataset: $DATASET_NAME"
|
|
836 |
print_info "Batch size: $BATCH_SIZE"
|
837 |
print_info "Learning rate: $LEARNING_RATE"
|
838 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
839 |
# Step 15: Start training
|
840 |
print_step "Step 15: Starting Training"
|
841 |
echo "=============================="
|
842 |
|
843 |
print_info "Starting training with configuration: $CONFIG_FILE"
|
844 |
print_info "Experiment: $EXPERIMENT_NAME"
|
845 |
-
print_info "Output:
|
846 |
print_info "Trackio: $TRACKIO_URL"
|
847 |
|
848 |
# Ensure environment variables are available for training
|
@@ -852,6 +1151,7 @@ export HF_TOKEN="$HF_TOKEN"
|
|
852 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
853 |
export HF_USERNAME="$HF_USERNAME"
|
854 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
|
|
855 |
|
856 |
# Run the appropriate training script based on model type
|
857 |
if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
@@ -859,7 +1159,7 @@ if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
|
859 |
python scripts/training/train_gpt_oss.py \
|
860 |
--config "$CONFIG_FILE" \
|
861 |
--experiment-name "$EXPERIMENT_NAME" \
|
862 |
-
--output-dir
|
863 |
--trackio-url "$TRACKIO_URL" \
|
864 |
--trainer-type "$TRAINER_TYPE_LOWER"
|
865 |
else
|
@@ -867,7 +1167,7 @@ else
|
|
867 |
python scripts/training/train.py \
|
868 |
--config "$CONFIG_FILE" \
|
869 |
--experiment-name "$EXPERIMENT_NAME" \
|
870 |
-
--output-dir
|
871 |
--trackio-url "$TRACKIO_URL" \
|
872 |
--trainer-type "$TRAINER_TYPE_LOWER"
|
873 |
fi
|
@@ -877,7 +1177,7 @@ print_step "Step 16: Pushing Model to HF Hub"
|
|
877 |
echo "====================================="
|
878 |
|
879 |
print_info "Pushing model to: $REPO_NAME"
|
880 |
-
print_info "Checkpoint:
|
881 |
|
882 |
# Ensure environment variables are available for model push
|
883 |
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
@@ -886,26 +1186,43 @@ export HF_TOKEN="$HF_TOKEN"
|
|
886 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
887 |
export HF_USERNAME="$HF_USERNAME"
|
888 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
|
|
889 |
|
890 |
# Run the appropriate push script based on model type
|
891 |
if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
|
892 |
print_info "Using GPT-OSS specialized push script..."
|
893 |
-
python scripts/model_tonic/push_gpt_oss_to_huggingface.py
|
894 |
--token "$HF_TOKEN" \
|
895 |
--trackio-url "$TRACKIO_URL" \
|
896 |
--experiment-name "$EXPERIMENT_NAME" \
|
897 |
--dataset-repo "$TRACKIO_DATASET_REPO" \
|
898 |
--author-name "$AUTHOR_NAME" \
|
899 |
-
--model-description "$MODEL_DESCRIPTION"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
900 |
else
|
901 |
print_info "Using standard SmolLM3 push script..."
|
902 |
-
python scripts/model_tonic/push_to_huggingface.py
|
903 |
--token "$HF_TOKEN" \
|
904 |
--trackio-url "$TRACKIO_URL" \
|
905 |
--experiment-name "$EXPERIMENT_NAME" \
|
906 |
--dataset-repo "$TRACKIO_DATASET_REPO" \
|
907 |
--author-name "$AUTHOR_NAME" \
|
908 |
-
--model-description "$MODEL_DESCRIPTION"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
909 |
fi
|
910 |
|
911 |
# Step 16.5: Switch Trackio Space to Read Token (Security)
|
@@ -1018,7 +1335,7 @@ fi)
|
|
1018 |
|
1019 |
## Files Created
|
1020 |
- Training configuration: \`$CONFIG_FILE\`
|
1021 |
-
- Model checkpoint:
|
1022 |
- Training logs: \`training.log\`
|
1023 |
- Summary report: \`training_summary.md\`
|
1024 |
EOF
|
|
|
234 |
echo " - 4-bit quantization + reduced LoRA"
|
235 |
echo " - Optimized for limited GPU memory"
|
236 |
echo ""
|
237 |
+
echo "9. GPT-OSS OpenHermes-FR (Recommended)"
|
238 |
+
echo " - Model: openai/gpt-oss-20b"
|
239 |
+
echo " - Dataset: legmlai/openhermes-fr (800K French examples)"
|
240 |
+
echo " - Epochs: 1.5"
|
241 |
+
echo " - Batch Size: 6 (effective 36 with accumulation)"
|
242 |
+
echo " - Learning Rate: 2.5e-4"
|
243 |
+
echo " - Optimized for French language training"
|
244 |
+
echo " - Quality filtering enabled"
|
245 |
+
echo ""
|
246 |
+
echo "10. GPT-OSS OpenHermes-FR Memory Optimized"
|
247 |
+
echo " - Model: openai/gpt-oss-20b"
|
248 |
+
echo " - Dataset: legmlai/openhermes-fr (200K samples)"
|
249 |
+
echo " - Epochs: 1"
|
250 |
+
echo " - Batch Size: 2 (effective 32 with accumulation)"
|
251 |
+
echo " - Learning Rate: 2e-4"
|
252 |
+
echo " - Native MXFP4 quantization"
|
253 |
+
echo " - Memory optimized for 40-80GB GPUs"
|
254 |
+
echo " - Harmony format compatible"
|
255 |
+
echo ""
|
256 |
+
echo "10. GPT-OSS Custom Dataset"
|
257 |
+
echo " - Model: openai/gpt-oss-20b"
|
258 |
+
echo " - Dataset: User-defined (fully customizable)"
|
259 |
+
echo " - Epochs: Configurable"
|
260 |
+
echo " - Batch Size: Configurable"
|
261 |
+
echo " - Learning Rate: Configurable"
|
262 |
+
echo " - Maximum flexibility with all parameters"
|
263 |
+
echo ""
|
264 |
+
echo "11. Custom Configuration"
|
265 |
echo " - User-defined parameters"
|
266 |
echo ""
|
267 |
}
|
|
|
352 |
MAX_SEQ_LENGTH=1024
|
353 |
CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
|
354 |
;;
|
355 |
+
"GPT-OSS OpenHermes-FR (Recommended)")
|
356 |
+
MODEL_NAME="openai/gpt-oss-20b"
|
357 |
+
DATASET_NAME="legmlai/openhermes-fr"
|
358 |
+
MAX_EPOCHS=1.5
|
359 |
+
BATCH_SIZE=6
|
360 |
+
GRADIENT_ACCUMULATION_STEPS=6
|
361 |
+
LEARNING_RATE=2.5e-4
|
362 |
+
MAX_SEQ_LENGTH=3072
|
363 |
+
CONFIG_FILE="config/train_gpt_oss_openhermes_fr.py"
|
364 |
+
;;
|
365 |
+
"GPT-OSS OpenHermes-FR Memory Optimized")
|
366 |
+
MODEL_NAME="openai/gpt-oss-20b"
|
367 |
+
DATASET_NAME="legmlai/openhermes-fr"
|
368 |
+
MAX_EPOCHS=1
|
369 |
+
BATCH_SIZE=2
|
370 |
+
GRADIENT_ACCUMULATION_STEPS=16
|
371 |
+
LEARNING_RATE=2e-4
|
372 |
+
MAX_SEQ_LENGTH=1024
|
373 |
+
CONFIG_FILE="config/train_gpt_oss_openhermes_fr_memory_optimized.py"
|
374 |
+
;;
|
375 |
+
"GPT-OSS Custom Dataset")
|
376 |
+
MODEL_NAME="openai/gpt-oss-20b"
|
377 |
+
DATASET_NAME="legmlai/openhermes-fr" # Will be customizable
|
378 |
+
MAX_EPOCHS=1
|
379 |
+
BATCH_SIZE=4
|
380 |
+
GRADIENT_ACCUMULATION_STEPS=4
|
381 |
+
LEARNING_RATE=2e-4
|
382 |
+
MAX_SEQ_LENGTH=2048
|
383 |
+
CONFIG_FILE="config/train_gpt_oss_custom.py"
|
384 |
+
get_custom_dataset_config
|
385 |
+
;;
|
386 |
"Custom Configuration")
|
387 |
get_custom_config
|
388 |
;;
|
389 |
esac
|
390 |
}
|
391 |
|
392 |
+
# Function to get custom dataset configuration
|
393 |
+
get_custom_dataset_config() {
|
394 |
+
print_step "GPT-OSS Custom Configuration"
|
395 |
+
echo "======================================"
|
396 |
+
|
397 |
+
echo "Configure your GPT-OSS training:"
|
398 |
+
echo ""
|
399 |
+
|
400 |
+
# Dataset Configuration
|
401 |
+
print_info "📊 Dataset Configuration"
|
402 |
+
get_input "Dataset name (HuggingFace format: username/dataset)" "legmlai/openhermes-fr" DATASET_NAME
|
403 |
+
get_input "Dataset split" "train" DATASET_SPLIT
|
404 |
+
|
405 |
+
echo ""
|
406 |
+
echo "Dataset format options:"
|
407 |
+
echo "1. OpenHermes-FR (prompt + accepted_completion fields)"
|
408 |
+
echo "2. Messages format (chat conversations)"
|
409 |
+
echo "3. Text format (plain text field)"
|
410 |
+
echo "4. Custom format (specify field names)"
|
411 |
+
echo ""
|
412 |
+
|
413 |
+
select_option "Select dataset format:" "OpenHermes-FR" "Messages format" "Text format" "Custom format" DATASET_FORMAT
|
414 |
+
|
415 |
+
case "$DATASET_FORMAT" in
|
416 |
+
"OpenHermes-FR")
|
417 |
+
INPUT_FIELD="prompt"
|
418 |
+
TARGET_FIELD="accepted_completion"
|
419 |
+
DATASET_FORMAT_CODE="openhermes_fr"
|
420 |
+
FILTER_BAD_ENTRIES="true"
|
421 |
+
;;
|
422 |
+
"Messages format")
|
423 |
+
INPUT_FIELD="messages"
|
424 |
+
TARGET_FIELD=""
|
425 |
+
DATASET_FORMAT_CODE="messages"
|
426 |
+
FILTER_BAD_ENTRIES="false"
|
427 |
+
;;
|
428 |
+
"Text format")
|
429 |
+
INPUT_FIELD="text"
|
430 |
+
TARGET_FIELD=""
|
431 |
+
DATASET_FORMAT_CODE="text"
|
432 |
+
FILTER_BAD_ENTRIES="false"
|
433 |
+
;;
|
434 |
+
"Custom format")
|
435 |
+
get_input "Input field name" "prompt" INPUT_FIELD
|
436 |
+
get_input "Target field name (leave empty if not needed)" "accepted_completion" TARGET_FIELD
|
437 |
+
DATASET_FORMAT_CODE="custom"
|
438 |
+
get_input "Filter bad entries? (true/false)" "false" FILTER_BAD_ENTRIES
|
439 |
+
;;
|
440 |
+
esac
|
441 |
+
|
442 |
+
# Dataset Filtering Options
|
443 |
+
echo ""
|
444 |
+
print_info "🔍 Dataset Filtering Options"
|
445 |
+
get_input "Maximum samples to use (leave empty for all)" "" MAX_SAMPLES
|
446 |
+
get_input "Minimum sequence length" "10" MIN_LENGTH
|
447 |
+
get_input "Maximum sequence length (leave empty for auto)" "" MAX_LENGTH
|
448 |
+
|
449 |
+
# Training Hyperparameters
|
450 |
+
echo ""
|
451 |
+
print_info "⚙️ Training Hyperparameters"
|
452 |
+
get_input "Number of epochs" "1.0" NUM_EPOCHS
|
453 |
+
get_input "Batch size per device" "4" BATCH_SIZE
|
454 |
+
get_input "Gradient accumulation steps" "4" GRAD_ACCUM_STEPS
|
455 |
+
get_input "Learning rate" "2e-4" LEARNING_RATE
|
456 |
+
get_input "Minimum learning rate" "2e-5" MIN_LR
|
457 |
+
get_input "Weight decay" "0.01" WEIGHT_DECAY
|
458 |
+
get_input "Warmup ratio" "0.03" WARMUP_RATIO
|
459 |
+
|
460 |
+
# Sequence Length
|
461 |
+
echo ""
|
462 |
+
print_info "📏 Sequence Configuration"
|
463 |
+
get_input "Maximum sequence length" "2048" MAX_SEQ_LENGTH
|
464 |
+
|
465 |
+
# LoRA Configuration
|
466 |
+
echo ""
|
467 |
+
print_info "🎛️ LoRA Configuration"
|
468 |
+
get_input "LoRA rank" "16" LORA_RANK
|
469 |
+
get_input "LoRA alpha" "32" LORA_ALPHA
|
470 |
+
get_input "LoRA dropout" "0.05" LORA_DROPOUT
|
471 |
+
|
472 |
+
# Memory & Performance
|
473 |
+
echo ""
|
474 |
+
print_info "💾 Memory & Performance"
|
475 |
+
select_option "Mixed precision:" "BF16 (recommended)" "FP16" "FP32" MIXED_PRECISION
|
476 |
+
get_input "Data loading workers" "4" NUM_WORKERS
|
477 |
+
select_option "Quantization:" "MXFP4 (default)" "4-bit BNB" "None" QUANTIZATION_TYPE
|
478 |
+
|
479 |
+
# Advanced Options
|
480 |
+
echo ""
|
481 |
+
echo "Advanced options (press Enter for defaults):"
|
482 |
+
get_input "Max gradient norm" "1.0" MAX_GRAD_NORM
|
483 |
+
get_input "Logging steps" "10" LOGGING_STEPS
|
484 |
+
get_input "Evaluation steps" "100" EVAL_STEPS
|
485 |
+
get_input "Save steps" "500" SAVE_STEPS
|
486 |
+
|
487 |
+
# Update the custom config file with user's choices
|
488 |
+
update_enhanced_gpt_oss_config
|
489 |
+
}
|
490 |
+
|
491 |
# Function to get custom configuration
|
492 |
get_custom_config() {
|
493 |
print_step "Custom Configuration Setup"
|
|
|
509 |
fi
|
510 |
}
|
511 |
|
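For readers mapping these prompts onto the dataset itself: the OpenHermes-FR branch assumes the legmlai/openhermes-fr schema, where each record carries a `prompt`, an `accepted_completion`, and boolean quality flags that the training script later uses for filtering. A minimal sketch of one record (field names from the dataset; the values are invented):

    # Illustrative legmlai/openhermes-fr record; values are made up.
    record = {
        "prompt": "Explique la photosynthèse en une phrase.",
        "accepted_completion": "La photosynthèse convertit la lumière en énergie chimique.",
        "bad_entry": False,             # FILTER_BAD_ENTRIES="true" drops records flagged True
        "bad_prompt_detected": False,
        "bad_response_detected": False,
    }

    # INPUT_FIELD / TARGET_FIELD chosen above select this training pair:
    pair = (record["prompt"], record["accepted_completion"])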
+# Function to update the enhanced GPT-OSS config with user choices
+update_enhanced_gpt_oss_config() {
+    print_info "Generating enhanced custom GPT-OSS configuration..."
+
+    # Process mixed precision setting
+    case "$MIXED_PRECISION" in
+        "BF16 (recommended)")
+            FP16="False"
+            BF16="True"
+            ;;
+        "FP16")
+            FP16="True"
+            BF16="False"
+            ;;
+        "FP32")
+            FP16="False"
+            BF16="False"
+            ;;
+    esac
+
+    # Process quantization setting
+    case "$QUANTIZATION_TYPE" in
+        "MXFP4 (default)")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": True, "load_in_4bit": False}'
+            ;;
+        "4-bit BNB")
+            USE_QUANTIZATION="True"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": True, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}'
+            ;;
+        "None")
+            USE_QUANTIZATION="False"
+            QUANTIZATION_CONFIG='{"dequantize": False, "load_in_4bit": False}'
+            ;;
+    esac
+
+    # Create the enhanced config file with all user choices
+    cat > "$CONFIG_FILE" << EOF
+"""
+GPT-OSS Enhanced Custom Training Configuration - Generated by launch.sh
+Dataset: $DATASET_NAME ($DATASET_FORMAT)
+Optimized for: ${DATASET_FORMAT} format with full customization
+"""
+
+from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
+
+# Create enhanced config with all customizations
+config = GPTOSSEnhancedCustomConfig(
+    # ============================================================================
+    # DATASET CONFIGURATION
+    # ============================================================================
+    dataset_name="$DATASET_NAME",
+    dataset_split="$DATASET_SPLIT",
+    dataset_format="$DATASET_FORMAT_CODE",
+    input_field="$INPUT_FIELD",
+    target_field=$(if [ -n "$TARGET_FIELD" ]; then echo "\"$TARGET_FIELD\""; else echo "None"; fi),
+    filter_bad_entries=$FILTER_BAD_ENTRIES,
+    max_samples=$(if [ -n "$MAX_SAMPLES" ]; then echo "$MAX_SAMPLES"; else echo "None"; fi),
+    min_length=$MIN_LENGTH,
+    max_length=$(if [ -n "$MAX_LENGTH" ]; then echo "$MAX_LENGTH"; else echo "None"; fi),
+
+    # ============================================================================
+    # TRAINING HYPERPARAMETERS
+    # ============================================================================
+    num_train_epochs=$NUM_EPOCHS,
+    batch_size=$BATCH_SIZE,
+    gradient_accumulation_steps=$GRAD_ACCUM_STEPS,
+    learning_rate=$LEARNING_RATE,
+    min_lr=$MIN_LR,
+    weight_decay=$WEIGHT_DECAY,
+    warmup_ratio=$WARMUP_RATIO,
+    max_grad_norm=$MAX_GRAD_NORM,
+
+    # ============================================================================
+    # MODEL CONFIGURATION
+    # ============================================================================
+    max_seq_length=$MAX_SEQ_LENGTH,
+
+    # ============================================================================
+    # MIXED PRECISION
+    # ============================================================================
+    fp16=$FP16,
+    bf16=$BF16,
+
+    # ============================================================================
+    # LORA CONFIGURATION
+    # ============================================================================
+    lora_config={
+        "r": $LORA_RANK,
+        "lora_alpha": $LORA_ALPHA,
+        "lora_dropout": $LORA_DROPOUT,
+        "target_modules": "all-linear",
+        "bias": "none",
+        "task_type": "CAUSAL_LM",
+    },
+
+    # ============================================================================
+    # QUANTIZATION CONFIGURATION
+    # ============================================================================
+    use_quantization=$USE_QUANTIZATION,
+    quantization_config=$QUANTIZATION_CONFIG,
+
+    # ============================================================================
+    # PERFORMANCE CONFIGURATION
+    # ============================================================================
+    dataloader_num_workers=$NUM_WORKERS,
+    dataloader_pin_memory=True,
+    group_by_length=True,
+
+    # ============================================================================
+    # LOGGING & EVALUATION
+    # ============================================================================
+    logging_steps=$LOGGING_STEPS,
+    eval_steps=$EVAL_STEPS,
+    save_steps=$SAVE_STEPS,
+
+    # ============================================================================
+    # RUNTIME CONFIGURATION
+    # ============================================================================
+    experiment_name="$EXPERIMENT_NAME",
+    trackio_url="$TRACKIO_URL",
+    dataset_repo="$TRACKIO_DATASET_REPO",
+    enable_tracking=True,
+)
+EOF
+
+    print_status "Enhanced GPT-OSS configuration generated successfully!"
+    print_info "Configuration saved to: $CONFIG_FILE"
+}
+
# Function to create training configuration file
create_training_config() {
    local config_file="$1"
...
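The `QUANTIZATION_CONFIG` strings generated above are consumed when the training script loads the base model. As a rough sketch of what the "4-bit BNB" branch implies at load time (the exact loading code lives in `scripts/training/train_gpt_oss.py`; the `BitsAndBytesConfig` mapping below is an assumption, not a quote from that script):

    import torch
    from transformers import BitsAndBytesConfig

    # Hypothetical translation of the "4-bit BNB" settings into bitsandbytes terms.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    # The "MXFP4 (default)" branch instead dequantizes the checkpoint's native
    # MXFP4 weights on load, and "None" loads unquantized weights.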
    echo "=================================="

    show_training_configs
+    select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "GPT-OSS OpenHermes-FR (Recommended)" "GPT-OSS OpenHermes-FR Memory Optimized" "GPT-OSS Custom Dataset" "Custom Configuration" TRAINING_CONFIG_TYPE

    get_training_config "$TRAINING_CONFIG_TYPE"
...
    print_info "Batch size: $BATCH_SIZE"
    print_info "Learning rate: $LEARNING_RATE"

+    # Step 14.5: Define output directory
+    print_step "Step 14.5: Output Directory Configuration"
+    echo "============================================="
+
+    # Define the output directory for training results
+    OUTPUT_DIR="./outputs/${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"
+    print_info "Training output directory: $OUTPUT_DIR"
+
+    # Create the output directory
+    mkdir -p "$OUTPUT_DIR"
+    print_status "Output directory created: $OUTPUT_DIR"
+
    # Step 15: Start training
    print_step "Step 15: Starting Training"
    echo "=============================="

    print_info "Starting training with configuration: $CONFIG_FILE"
    print_info "Experiment: $EXPERIMENT_NAME"
+    print_info "Output: $OUTPUT_DIR"
    print_info "Trackio: $TRACKIO_URL"

    # Ensure environment variables are available for training
...
    export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
    export HF_USERNAME="$HF_USERNAME"
    export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+    export OUTPUT_DIR="$OUTPUT_DIR"

    # Run the appropriate training script based on model type
    if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
...
        python scripts/training/train_gpt_oss.py \
            --config "$CONFIG_FILE" \
            --experiment-name "$EXPERIMENT_NAME" \
+            --output-dir "$OUTPUT_DIR" \
            --trackio-url "$TRACKIO_URL" \
            --trainer-type "$TRAINER_TYPE_LOWER"
    else
...
        python scripts/training/train.py \
            --config "$CONFIG_FILE" \
            --experiment-name "$EXPERIMENT_NAME" \
+            --output-dir "$OUTPUT_DIR" \
            --trackio-url "$TRACKIO_URL" \
            --trainer-type "$TRAINER_TYPE_LOWER"
    fi
...
    echo "====================================="

    print_info "Pushing model to: $REPO_NAME"
+    print_info "Checkpoint: $OUTPUT_DIR"

    # Ensure environment variables are available for model push
    export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
...
    export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
    export HF_USERNAME="$HF_USERNAME"
    export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
+    export OUTPUT_DIR="$OUTPUT_DIR"

    # Run the appropriate push script based on model type
    if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
        print_info "Using GPT-OSS specialized push script..."
+        python scripts/model_tonic/push_gpt_oss_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
            --token "$HF_TOKEN" \
            --trackio-url "$TRACKIO_URL" \
            --experiment-name "$EXPERIMENT_NAME" \
            --dataset-repo "$TRACKIO_DATASET_REPO" \
            --author-name "$AUTHOR_NAME" \
+            --model-description "$MODEL_DESCRIPTION" \
+            --training-config-type "$TRAINING_CONFIG_TYPE" \
+            --model-name "$MODEL_NAME" \
+            --dataset-name "$DATASET_NAME" \
+            --batch-size "$BATCH_SIZE" \
+            --learning-rate "$LEARNING_RATE" \
+            --max-epochs "$MAX_EPOCHS" \
+            --max-seq-length "$MAX_SEQ_LENGTH" \
+            --trainer-type "$TRAINER_TYPE"
    else
        print_info "Using standard SmolLM3 push script..."
+        python scripts/model_tonic/push_to_huggingface.py "$OUTPUT_DIR" "$REPO_NAME" \
            --token "$HF_TOKEN" \
            --trackio-url "$TRACKIO_URL" \
            --experiment-name "$EXPERIMENT_NAME" \
            --dataset-repo "$TRACKIO_DATASET_REPO" \
            --author-name "$AUTHOR_NAME" \
+            --model-description "$MODEL_DESCRIPTION" \
+            --training-config-type "$TRAINING_CONFIG_TYPE" \
+            --model-name "$MODEL_NAME" \
+            --dataset-name "$DATASET_NAME" \
+            --batch-size "$BATCH_SIZE" \
+            --learning-rate "$LEARNING_RATE" \
+            --max-epochs "$MAX_EPOCHS" \
+            --max-seq-length "$MAX_SEQ_LENGTH" \
+            --trainer-type "$TRAINER_TYPE"
    fi

    # Step 16.5: Switch Trackio Space to Read Token (Security)
...

    ## Files Created
    - Training configuration: \`$CONFIG_FILE\`
+    - Model checkpoint: \`$OUTPUT_DIR/\`
    - Training logs: \`training.log\`
    - Summary report: \`training_summary.md\`
EOF
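The timestamp suffix in `OUTPUT_DIR` is what makes the results directory dynamic: every run of the same experiment gets its own checkpoint folder instead of overwriting the last one. A Python sketch of the same naming scheme, for scripts that need to reproduce it (the training scripts themselves receive the path through `--output-dir` and the `OUTPUT_DIR` environment variable; the experiment name is a placeholder):

    import os
    from datetime import datetime

    experiment_name = "my_experiment"  # placeholder
    output_dir = f"./outputs/{experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    os.makedirs(output_dir, exist_ok=True)
    # e.g. ./outputs/my_experiment_20250807_142301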
scripts/model_tonic/push_gpt_oss_to_huggingface.py
CHANGED
@@ -43,8 +43,59 @@ def merge_lora_weights(checkpoint_path, base_model_name, output_path):
 
     return model, tokenizer
 
-def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
-    """Create a comprehensive model card for GPT-OSS models"""
+def create_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description, training_config_type=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
+    """Create a comprehensive model card for GPT-OSS models using generate_model_card.py"""
+
+    try:
+        # Import the model card generator
+        import sys
+        import os
+        sys.path.append(os.path.join(os.path.dirname(__file__)))
+        from generate_model_card import ModelCardGenerator, create_default_variables
+
+        # Create generator
+        generator = ModelCardGenerator()
+
+        # Create variables for the model card
+        variables = create_default_variables()
+
+        # Update with GPT-OSS specific values
+        variables.update({
+            "repo_name": model_name,
+            "model_name": model_name.split('/')[-1],
+            "experiment_name": experiment_name or "gpt_oss_finetune",
+            "dataset_repo": dataset_repo,
+            "author_name": author_name or "GPT-OSS Fine-tuner",
+            "model_description": model_description or "A fine-tuned version of OpenAI's GPT-OSS-20B model for multilingual reasoning tasks.",
+            "training_config_type": training_config_type or "GPT-OSS Configuration",
+            "base_model": "openai/gpt-oss-20b",
+            "dataset_name": dataset_name or "HuggingFaceH4/Multilingual-Thinking",
+            "trainer_type": trainer_type or "SFTTrainer",
+            "batch_size": str(batch_size) if batch_size else "4",
+            "learning_rate": str(learning_rate) if learning_rate else "2e-4",
+            "max_epochs": str(max_epochs) if max_epochs else "1",
+            "max_seq_length": str(max_seq_length) if max_seq_length else "2048",
+            "hardware_info": "GPU (H100/A100)",
+            "trackio_url": trackio_url or "N/A",
+            "training_loss": "N/A",
+            "validation_loss": "N/A",
+            "perplexity": "N/A",
+            "quantized_models": False
+        })
+
+        # Generate the model card
+        model_card_content = generator.generate_model_card(variables)
+
+        print("✅ Model card generated using generate_model_card.py")
+        return model_card_content
+
+    except Exception as e:
+        print(f"❌ Failed to generate model card with generator: {e}")
+        print("🔄 Falling back to original GPT-OSS model card")
+        return _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description)
+
+def _create_original_gpt_oss_model_card(model_name, experiment_name, trackio_url, dataset_repo, author_name, model_description):
+    """Create the original GPT-OSS model card as a fallback"""
 
     card_content = f"""---
 language:

@@ -196,7 +247,7 @@ This model is licensed under the MIT License.
 
     return card_content
 
-def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description):
+def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description, training_config_type=None, model_name=None, dataset_name=None, batch_size=None, learning_rate=None, max_epochs=None, max_seq_length=None, trainer_type=None):
     """Push GPT-OSS model to Hugging Face Hub"""
 
     print("=== GPT-OSS Model Push Pipeline ===")

@@ -230,7 +281,14 @@ def push_gpt_oss_model(checkpoint_path, repo_name, hf_token, trackio_url, experiment_name, dataset_repo, author_name, model_description):
         trackio_url=trackio_url,
         dataset_repo=dataset_repo,
         author_name=author_name,
-        model_description=model_description
+        model_description=model_description,
+        training_config_type=training_config_type,
+        dataset_name=dataset_name,
+        batch_size=batch_size,
+        learning_rate=learning_rate,
+        max_epochs=max_epochs,
+        max_seq_length=max_seq_length,
+        trainer_type=trainer_type
     )
 
     # Save model card

@@ -291,6 +349,14 @@ def main():
     parser.add_argument("--dataset-repo", help="Dataset repository")
     parser.add_argument("--author-name", help="Author name")
     parser.add_argument("--model-description", help="Model description")
+    parser.add_argument("--training-config-type", help="Training configuration type")
+    parser.add_argument("--model-name", help="Base model name")
+    parser.add_argument("--dataset-name", help="Dataset name")
+    parser.add_argument("--batch-size", help="Batch size")
+    parser.add_argument("--learning-rate", help="Learning rate")
+    parser.add_argument("--max-epochs", help="Maximum epochs")
+    parser.add_argument("--max-seq-length", help="Maximum sequence length")
+    parser.add_argument("--trainer-type", help="Trainer type")
 
     args = parser.parse_args()

@@ -308,7 +374,15 @@ def main():
         experiment_name=experiment_name,
         dataset_repo=dataset_repo,
         author_name=author_name,
-        model_description=model_description
+        model_description=model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     sys.exit(0 if success else 1)
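With the metadata flags wired through `main()`, the push pipeline can also be driven directly from Python. A hedged sketch of the equivalent call (assumes the repository root is on `PYTHONPATH`; every argument value below is a placeholder):

    from scripts.model_tonic.push_gpt_oss_to_huggingface import push_gpt_oss_model

    success = push_gpt_oss_model(
        checkpoint_path="./outputs/my_experiment_20250807_142301",   # placeholder path
        repo_name="your-username/gpt-oss-20b-openhermes-fr",         # placeholder repo
        hf_token="hf_...",
        trackio_url="https://your-trackio-space.hf.space",
        experiment_name="gpt_oss_openhermes_fr",
        dataset_repo="your-username/trackio-experiments",
        author_name="Your Name",
        model_description="GPT-OSS-20B fine-tuned on legmlai/openhermes-fr",
        training_config_type="GPT-OSS OpenHermes-FR (Recommended)",
        dataset_name="legmlai/openhermes-fr",
        batch_size="4",
        learning_rate="2e-4",
        max_epochs="1",
        max_seq_length="2048",
        trainer_type="SFTTrainer",
    )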
scripts/model_tonic/push_to_huggingface.py
CHANGED
@@ -62,7 +62,15 @@ class HuggingFacePusher:
         dataset_repo: Optional[str] = None,
         hf_token: Optional[str] = None,
         author_name: Optional[str] = None,
-        model_description: Optional[str] = None
+        model_description: Optional[str] = None,
+        training_config_type: Optional[str] = None,
+        model_name: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        batch_size: Optional[str] = None,
+        learning_rate: Optional[str] = None,
+        max_epochs: Optional[str] = None,
+        max_seq_length: Optional[str] = None,
+        trainer_type: Optional[str] = None
     ):
         self.model_path = Path(model_path)
         self.repo_name = repo_name

@@ -73,6 +81,16 @@
         self.author_name = author_name
         self.model_description = model_description
 
+        # Training configuration details for model card generation
+        self.training_config_type = training_config_type
+        self.model_name = model_name
+        self.dataset_name = dataset_name
+        self.batch_size = batch_size
+        self.learning_rate = learning_rate
+        self.max_epochs = max_epochs
+        self.max_seq_length = max_seq_length
+        self.trainer_type = trainer_type
+
         # HF Datasets configuration
         self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
         self.hf_token = hf_token or os.getenv('HF_TOKEN')

@@ -156,9 +174,53 @@
         return True
 
     def create_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
-        """Create a comprehensive model card using the
+        """Create a comprehensive model card using the generate_model_card.py script"""
+        try:
+            # Import the model card generator
+            import sys
+            sys.path.append(os.path.join(os.path.dirname(__file__)))
+            from generate_model_card import ModelCardGenerator, create_default_variables
+
+            # Create generator
+            generator = ModelCardGenerator()
+
+            # Create variables for the model card
+            variables = create_default_variables()
+
+            # Update with actual values
+            variables.update({
+                "repo_name": self.repo_name,
+                "model_name": self.repo_name.split('/')[-1],
+                "experiment_name": self.experiment_name or "model_push",
+                "dataset_repo": self.dataset_repo,
+                "author_name": self.author_name or "Model Author",
+                "model_description": self.model_description or "A fine-tuned version of SmolLM3-3B for improved text generation capabilities.",
+                "training_config_type": self.training_config_type or "Custom Configuration",
+                "base_model": self.model_name or "HuggingFaceTB/SmolLM3-3B",
+                "dataset_name": self.dataset_name or "Custom Dataset",
+                "trainer_type": self.trainer_type or "SFTTrainer",
+                "batch_size": str(self.batch_size) if self.batch_size else "8",
+                "learning_rate": str(self.learning_rate) if self.learning_rate else "5e-6",
+                "max_epochs": str(self.max_epochs) if self.max_epochs else "3",
+                "max_seq_length": str(self.max_seq_length) if self.max_seq_length else "2048",
+                "hardware_info": self._get_hardware_info(),
+                "trackio_url": self.trackio_url or "N/A",
+                "training_loss": str(results.get('train_loss', 'N/A')),
+                "validation_loss": str(results.get('eval_loss', 'N/A')),
+                "perplexity": str(results.get('perplexity', 'N/A')),
+                "quantized_models": False  # Set to True if quantized models are available
+            })
+
+            # Generate the model card
+            model_card_content = generator.generate_model_card(variables)
+
+            logger.info("✅ Model card generated using generate_model_card.py")
+            return model_card_content
+
+        except Exception as e:
+            logger.error(f"❌ Failed to generate model card with generator: {e}")
+            logger.info("🔄 Falling back to simple model card")
+            return self._create_simple_model_card(training_config, results)
 
     def _create_simple_model_card(self, training_config: Dict[str, Any], results: Dict[str, Any]) -> str:
         """Create a simple model card without complex YAML to avoid formatting issues"""

@@ -531,6 +593,14 @@ def parse_args():
     parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
     parser.add_argument('--author-name', type=str, default=None, help='Author name for model card')
    parser.add_argument('--model-description', type=str, default=None, help='Model description for model card')
+    parser.add_argument('--training-config-type', type=str, default=None, help='Training configuration type')
+    parser.add_argument('--model-name', type=str, default=None, help='Base model name')
+    parser.add_argument('--dataset-name', type=str, default=None, help='Dataset name')
+    parser.add_argument('--batch-size', type=str, default=None, help='Batch size')
+    parser.add_argument('--learning-rate', type=str, default=None, help='Learning rate')
+    parser.add_argument('--max-epochs', type=str, default=None, help='Maximum epochs')
+    parser.add_argument('--max-seq-length', type=str, default=None, help='Maximum sequence length')
+    parser.add_argument('--trainer-type', type=str, default=None, help='Trainer type')
 
     return parser.parse_args()

@@ -558,7 +628,15 @@ def main():
         dataset_repo=args.dataset_repo,
         hf_token=args.hf_token,
         author_name=args.author_name,
-        model_description=args.model_description
+        model_description=args.model_description,
+        training_config_type=args.training_config_type,
+        model_name=args.model_name,
+        dataset_name=args.dataset_name,
+        batch_size=args.batch_size,
+        learning_rate=args.learning_rate,
+        max_epochs=args.max_epochs,
+        max_seq_length=args.max_seq_length,
+        trainer_type=args.trainer_type
     )
 
     # Push model
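The same metadata now flows into `HuggingFacePusher`, so model cards for standard (non-GPT-OSS) runs pick up the real training settings instead of generic defaults. A sketch of constructing the pusher with the new keyword arguments (placeholder values; assumes the repository root is on `PYTHONPATH` and that any constructor parameters not shown keep their defaults):

    from scripts.model_tonic.push_to_huggingface import HuggingFacePusher

    pusher = HuggingFacePusher(
        model_path="./outputs/my_experiment_20250807_142301",  # placeholder
        repo_name="your-username/smollm3-finetune",            # placeholder
        author_name="Your Name",
        model_description="SmolLM3-3B fine-tuned on a custom dataset",
        training_config_type="Basic Training",
        model_name="HuggingFaceTB/SmolLM3-3B",
        dataset_name="Custom Dataset",
        batch_size="8",
        learning_rate="5e-6",
        max_epochs="3",
        max_seq_length="2048",
        trainer_type="SFTTrainer",
    )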
scripts/training/train_gpt_oss.py
CHANGED
@@ -95,12 +95,215 @@ def setup_lora_for_gpt_oss(model, config):
 
     return peft_model
 
-def 
-    """Load
+def load_dataset_from_config(config):
+    """Load dataset based on configuration"""
+
+    dataset_name = getattr(config, 'dataset_name', 'HuggingFaceH4/Multilingual-Thinking')
+    dataset_split = getattr(config, 'dataset_split', 'train')
+    dataset_config = getattr(config, 'dataset_config', None)
+
+    print(f"Loading dataset: {dataset_name}")
+    print(f"Dataset split: {dataset_split}")
+    if dataset_config:
+        print(f"Dataset config: {dataset_config}")
+
+    # Load the dataset
+    if dataset_config:
+        dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
+    else:
+        dataset = load_dataset(dataset_name, split=dataset_split)
+
+    print(f"Original dataset size: {len(dataset)} examples")
+
+    # Apply filtering based on configuration
+    dataset = apply_dataset_filtering(dataset, config)
+
+    # Apply dataset processing based on format
+    dataset = process_dataset_format(dataset, config)
+
+    print(f"Final dataset size: {len(dataset)} examples")
+
+    return dataset
+
+def apply_dataset_filtering(dataset, config):
+    """Apply filtering based on configuration"""
+
+    # Filter bad entries if specified
+    if getattr(config, 'filter_bad_entries', False):
+        bad_entry_field = getattr(config, 'bad_entry_field', 'bad_entry')
+        bad_prompt_field = getattr(config, 'bad_prompt_field', 'bad_prompt_detected')
+        bad_response_field = getattr(config, 'bad_response_field', 'bad_response_detected')
+
+        original_size = len(dataset)
+
+        # Filter out bad entries
+        if bad_entry_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_entry_field, False))
+            print(f"Filtered {original_size - len(dataset)} bad entries")
+
+        # Filter out bad prompts
+        if bad_prompt_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_prompt_field, False))
+            print(f"Filtered bad prompts, remaining: {len(dataset)} examples")
+
+        # Filter out bad responses
+        if bad_response_field in dataset.column_names:
+            dataset = dataset.filter(lambda x: not x.get(bad_response_field, False))
+            print(f"Filtered bad responses, remaining: {len(dataset)} examples")
+
+    # Apply length filtering
+    min_length = getattr(config, 'min_length', 10)
+    max_length = getattr(config, 'max_length', None)
+
+    input_field = getattr(config, 'input_field', 'prompt')
+    target_field = getattr(config, 'target_field', 'accepted_completion')
+
+    if min_length > 0 or max_length:
+        def length_filter(example):
+            input_len = len(example.get(input_field, ''))
+            target_len = len(example.get(target_field, ''))
+            total_len = input_len + target_len
+
+            if total_len < min_length:
+                return False
+            if max_length and total_len > max_length:
+                return False
+            return True
+
+        original_size = len(dataset)
+        dataset = dataset.filter(length_filter)
+        print(f"Length filtering: {original_size} -> {len(dataset)} examples")
+
+    # Apply sampling if specified
+    max_samples = getattr(config, 'max_samples', None)
+    if max_samples and len(dataset) > max_samples:
+        dataset = dataset.shuffle(seed=42).select(range(max_samples))
+        print(f"Sampled {max_samples} examples from dataset")
+
+    return dataset
+
+def format_gpt_oss_harmony(prompt, completion, add_eos_token=True):
+    """
+    Format data for the GPT-OSS Harmony format, following the exact template structure.
+    Based on: https://huggingface.co/openai/gpt-oss-20b/raw/main/chat_template.jinja
+    """
+    # GPT-OSS Harmony format structure (exact template compliance)
+    # User message: <|start|>user<|message|>content<|end|>
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|end|> (inference)
+    # Assistant message: <|start|>assistant<|channel|>final<|message|>content<|return|> (training)
+
+    harmony_text = f"<|start|>user<|message|>{prompt}<|end|><|start|>assistant<|channel|>final<|message|>{completion}"
+
+    if add_eos_token:
+        # Use <|return|> for training, as per the template specification;
+        # it marks the end of generation during training
+        harmony_text += "<|return|>"
+    else:
+        # Use <|end|> for inference
+        harmony_text += "<|end|>"
+
+    return harmony_text
+
+def process_dataset_format(dataset, config):
+    """Process dataset based on format configuration with exact GPT-OSS Harmony compliance"""
+
+    dataset_format = getattr(config, 'dataset_format', 'openhermes_fr')
+    input_field = getattr(config, 'input_field', 'prompt')
+    target_field = getattr(config, 'target_field', 'accepted_completion')
+    concatenate_fields = getattr(config, 'concatenate_fields', True)
+    field_separator = getattr(config, 'field_separator', '\n\n### Response:\n')
+    add_eos_token = getattr(config, 'add_eos_token', True)
+    use_harmony_format = getattr(config, 'use_harmony_format', True)
+
+    print(f"Processing dataset format: {dataset_format}")
+    print(f"Input field: {input_field}, Target field: {target_field}")
+    print(f"GPT-OSS Harmony Format: {'Enabled' if use_harmony_format else 'Disabled'}")
+
+    if dataset_format == "openhermes_fr":
+        # Process OpenHermes-FR format: prompt + accepted_completion
+        def format_openhermes_fr(example):
+            prompt = example.get(input_field, '')
+            completion = example.get(target_field, '')
+
+            if concatenate_fields:
+                if use_harmony_format:
+                    # Use the exact GPT-OSS Harmony format from the template
+                    text = format_gpt_oss_harmony(prompt, completion, add_eos_token)
+                else:
+                    # Fall back to the standard format with a separator
+                    text = prompt + field_separator + completion
+                    if add_eos_token:
+                        text += "</s>"
+
+                return {"text": text}
+            else:
+                # Keep fields separate for more advanced training setups
+                return {
+                    "input": prompt,
+                    "output": completion
+                }
+
+        dataset = dataset.map(format_openhermes_fr, remove_columns=dataset.column_names)
+
+    elif dataset_format == "messages":
+        # Process messages format (like HuggingFaceH4/Multilingual-Thinking)
+        def format_messages(example):
+            messages = example.get(input_field, [])
+
+            if use_harmony_format and len(messages) >= 2:
+                # Extract user and assistant messages for the harmony format
+                user_message = ""
+                assistant_message = ""
+
+                for message in messages:
+                    role = message.get("role", "")
+                    content = message.get("content", "")
+
+                    if role == "user":
+                        user_message = content
+                    elif role == "assistant":
+                        assistant_message = content
+
+                if user_message and assistant_message:
+                    # Use the GPT-OSS Harmony format
+                    text = format_gpt_oss_harmony(user_message, assistant_message, add_eos_token)
+                else:
+                    # Fall back to simple concatenation
+                    text = ""
+                    for message in messages:
+                        role = message.get("role", "")
+                        content = message.get("content", "")
+                        text += f"{role}: {content}\n"
+                    if add_eos_token:
+                        text += "</s>"
+            else:
+                # Standard format - convert messages to simple text
+                text = ""
+                for message in messages:
+                    role = message.get("role", "")
+                    content = message.get("content", "")
+                    text += f"{role}: {content}\n"
+                if add_eos_token:
+                    text += "</s>"
+
+            return {"text": text}
+
+        dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
+
+    elif dataset_format == "text":
+        # Process plain text format
+        text_field = input_field
+        def format_text(example):
+            text = example.get(text_field, '')
+            if add_eos_token:
+                text += "</s>"
+            return {"text": text}
+
+        dataset = dataset.map(format_text, remove_columns=dataset.column_names)
+
+    elif dataset_format == "custom":
+        # Custom format - the user handles this in their config
+        print("Using custom dataset format - no automatic processing")
 
     return dataset
 

@@ -127,25 +330,111 @@ def setup_trackio_tracking(config):
 
     return trackio_client
 
-def create_sft_config(config):
-    """Create SFTConfig for GPT-OSS training"""
-
-    print("Creating SFT configuration...")
+def create_sft_config(config, output_dir):
+    """Create an enhanced SFTConfig for GPT-OSS training"""
+
+    print("Creating enhanced SFT configuration...")
+
+    # Extract training parameters from the config, with enhanced defaults
+    num_train_epochs = getattr(config, 'num_train_epochs', 1.0)
+    max_steps = getattr(config, 'max_steps', None)
+    warmup_ratio = getattr(config, 'warmup_ratio', 0.03)
+    warmup_steps = getattr(config, 'warmup_steps', None)
+
+    # Learning rate configuration
+    learning_rate = config.learning_rate
+    lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
+    lr_scheduler_kwargs = getattr(config, 'lr_scheduler_kwargs', {"min_lr_rate": 0.1})
+
+    # Batch configuration
+    per_device_train_batch_size = config.batch_size
+    per_device_eval_batch_size = getattr(config, 'eval_batch_size', config.batch_size)
+    gradient_accumulation_steps = config.gradient_accumulation_steps
+
+    # Evaluation and logging
+    eval_strategy = getattr(config, 'eval_strategy', 'steps')
+    eval_steps = getattr(config, 'eval_steps', 100)
+    logging_steps = getattr(config, 'logging_steps', 10)
+
+    # Saving configuration
+    save_strategy = getattr(config, 'save_strategy', 'steps')
+    save_steps = getattr(config, 'save_steps', 500)
+    save_total_limit = getattr(config, 'save_total_limit', 3)
+
+    # Mixed precision
+    fp16 = getattr(config, 'fp16', False)
+    bf16 = getattr(config, 'bf16', True)
+
+    # Regularization
+    weight_decay = getattr(config, 'weight_decay', 0.01)
+    max_grad_norm = getattr(config, 'max_grad_norm', 1.0)
+
+    # HuggingFace Hub integration
+    push_to_hub = getattr(config, 'push_to_hub', False)
+
+    print(f" • Epochs: {num_train_epochs}")
+    print(f" • Learning rate: {learning_rate}")
+    print(f" • Batch size: {per_device_train_batch_size}")
+    print(f" • Gradient accumulation: {gradient_accumulation_steps}")
+    print(f" • Effective batch size: {per_device_train_batch_size * gradient_accumulation_steps}")
 
     sft_config = SFTConfig(
-        ...
+        # Training duration
+        num_train_epochs=num_train_epochs,
+        max_steps=max_steps,
+
+        # Learning rate
+        learning_rate=learning_rate,
+        lr_scheduler_type=lr_scheduler_type,
+        lr_scheduler_kwargs=lr_scheduler_kwargs,
+        warmup_ratio=warmup_ratio,
+        warmup_steps=warmup_steps,
+
+        # Batch configuration
+        per_device_train_batch_size=per_device_train_batch_size,
+        per_device_eval_batch_size=per_device_eval_batch_size,
+        gradient_accumulation_steps=gradient_accumulation_steps,
+
+        # Model configuration
+        max_seq_length=config.max_seq_length,
+        gradient_checkpointing=getattr(config, 'use_gradient_checkpointing', True),
+
+        # Mixed precision
+        fp16=fp16,
+        bf16=bf16,
+
+        # Regularization
+        weight_decay=weight_decay,
+        max_grad_norm=max_grad_norm,
+
+        # Evaluation
+        evaluation_strategy=eval_strategy,
+        eval_steps=eval_steps,
+
+        # Logging
+        logging_steps=logging_steps,
+
+        # Saving
+        save_strategy=save_strategy,
+        save_steps=save_steps,
+        save_total_limit=save_total_limit,
+
+        # Output
+        output_dir=output_dir,
+
+        # Data loading
+        dataloader_num_workers=getattr(config, 'dataloader_num_workers', 4),
+        dataloader_pin_memory=getattr(config, 'dataloader_pin_memory', True),
+
+        # Performance
+        group_by_length=getattr(config, 'group_by_length', True),
+        remove_unused_columns=getattr(config, 'remove_unused_columns', True),
+
+        # HuggingFace Hub
+        push_to_hub=push_to_hub,
+
+        # Monitoring
+        report_to="trackio" if getattr(config, 'enable_tracking', False) else None,
     )
 
     return sft_config

@@ -193,13 +482,13 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer_type):
     peft_model = setup_lora_for_gpt_oss(model, config)
 
     # Load dataset
-    dataset = 
+    dataset = load_dataset_from_config(config)
 
     # Setup Trackio tracking
     trackio_client = setup_trackio_tracking(config)
 
     # Create SFT configuration
-    sft_config = create_sft_config(config)
+    sft_config = create_sft_config(config, output_dir)
 
     # Create trainer
     print("Creating SFT trainer...")
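To make the Harmony formatting concrete, here is the exact string `format_gpt_oss_harmony` builds for a toy prompt/completion pair in training mode (`add_eos_token=True`):

    prompt = "What is 2 + 2?"
    completion = "2 + 2 = 4."

    # Built exactly as in format_gpt_oss_harmony above:
    text = (
        f"<|start|>user<|message|>{prompt}<|end|>"
        f"<|start|>assistant<|channel|>final<|message|>{completion}<|return|>"
    )
    print(text)
    # <|start|>user<|message|>What is 2 + 2?<|end|><|start|>assistant<|channel|>final<|message|>2 + 2 = 4.<|return|>

With `add_eos_token=False`, the trailing token is `<|end|>` instead, matching the inference-time template.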
templates/spaces/demo_gpt/README.md
CHANGED
@@ -6,7 +6,7 @@ colorTo: pink
 sdk: gradio
 sdk_version: 5.40.0
 app_file: app.py
-pinned: 
+pinned: false
 short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
 ---
 