Spaces:

Tonic
/

SmolFactory

Running

App Files Community

Tonic committed 20 days ago

Commit

7181190

1 Parent(s): dfcb060

adds memory optimized configuration

Browse files

Files changed (4) hide show

config/train_gpt_oss_memory_optimized.py +144 -0
launch.sh +21 -2
requirements/requirements_core.txt +2 -1
scripts/training/train_gpt_oss.py +15 -5

config/train_gpt_oss_memory_optimized.py ADDED Viewed

	@@ -0,0 +1,144 @@

+"""
+GPT-OSS Memory Optimized Training Configuration
+Based on OpenAI's GPT-OSS fine-tuning tutorial
+Optimized for limited GPU memory (40-80GB)
+"""
+import os
+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class GPTOSSMemoryOptimizedConfig:
+    """Memory-optimized configuration for GPT-OSS fine-tuning"""
+    trainer_type: str = "sft"
+    model_name: str = "openai/gpt-oss-20b"
+    max_seq_length: int = 1024  # Reduced from 4096
+    use_flash_attention: bool = True
+    use_gradient_checkpointing: bool = True
+    batch_size: int = 1  # Reduced from 8
+    gradient_accumulation_steps: int = 16  # Increased to maintain effective batch size
+    learning_rate: float = 2e-4
+    weight_decay: float = 0.01
+    warmup_steps: int = 50
+    max_iters: int = 500  # Reduced for faster testing
+    eval_interval: int = 50
+    log_interval: int = 5
+    save_interval: int = 100
+    optimizer: str = "adamw_torch"
+    beta1: float = 0.9
+    beta2: float = 0.95
+    eps: float = 1e-8
+    scheduler: str = "cosine_with_min_lr"
+    min_lr: float = 2e-5
+    lr_scheduler_kwargs: dict = None
+    fp16: bool = False
+    bf16: bool = True
+    ddp_backend: str = "nccl"
+    ddp_find_unused_parameters: bool = False
+    save_steps: int = 100
+    eval_steps: int = 50
+    logging_steps: int = 5
+    save_total_limit: Optional[int] = 2
+    eval_strategy: str = "steps"
+    metric_for_best_model: str = "eval_loss"
+    greater_is_better: bool = False
+    load_best_model_at_end: bool = True
+    dataset_name: str = "HuggingFaceH4/Multilingual-Thinking"
+    dataset_split: str = "train"
+    input_field: str = "messages"
+    target_field: str = None
+    filter_bad_entries: bool = False
+    bad_entry_field: str = "bad_entry"
+    use_chat_template: bool = True
+    chat_template_kwargs: dict = None
+    enable_tracking: bool = True
+    trackio_url: Optional[str] = None
+    trackio_token: Optional[str] = None
+    log_artifacts: bool = True
+    log_metrics: bool = True
+    log_config: bool = True
+    experiment_name: Optional[str] = None
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+    use_lora: bool = True
+    lora_config: dict = None
+    use_quantization: bool = True
+    quantization_config: dict = None
+    model_kwargs: dict = None
+    generation_config: dict = None
+    reasoning_languages: list = None
+    def __post_init__(self):
+        """Set default values for complex fields"""
+        if self.lora_config is None:
+            self.lora_config = {
+                "r": 4,  # Reduced from 16
+                "lora_alpha": 8,  # Reduced from 32
+                "target_modules": "all-linear",
+                "target_parameters": [
+                    "7.mlp.experts.gate_up_proj",
+                    "7.mlp.experts.down_proj",
+                    "15.mlp.experts.gate_up_proj",
+                    "15.mlp.experts.down_proj",
+                    "23.mlp.experts.gate_up_proj",
+                    "23.mlp.experts.down_proj",
+                ],
+                "bias": "none",
+                "task_type": "CAUSAL_LM"
+            }
+        if self.quantization_config is None:
+            self.quantization_config = {
+                "dequantize": True,
+                "load_in_4bit": True,
+                "bnb_4bit_compute_dtype": "bfloat16",
+                "bnb_4bit_use_double_quant": True,
+                "bnb_4bit_quant_type": "nf4"
+            }
+        if self.model_kwargs is None:
+            self.model_kwargs = {
+                "attn_implementation": "eager",
+                "torch_dtype": "auto",
+                "use_cache": False,
+                "device_map": "auto",
+                "low_cpu_mem_usage": True,
+                "max_memory": {0: "75GB"},  # Reserve some memory
+            }
+        if self.generation_config is None:
+            self.generation_config = {
+                "max_new_tokens": 256,  # Reduced from 512
+                "do_sample": True,
+                "temperature": 0.6,
+                "top_p": 0.9,
+                "repetition_penalty": 1.1
+            }
+        if self.reasoning_languages is None:
+            self.reasoning_languages = [
+                "English", "Spanish", "French", "Italian", "German",
+                "Chinese", "Hindi", "Japanese", "Korean", "Arabic"
+            ]
+        if self.lr_scheduler_kwargs is None:
+            self.lr_scheduler_kwargs = {"min_lr_rate": 0.1}
+        if self.chat_template_kwargs is None:
+            self.chat_template_kwargs = {
+                "add_generation_prompt": True,
+                "tokenize": False,
+                "auto_insert_role": True
+            }
+        # Print memory optimization stats
+        effective_batch_size = self.batch_size * self.gradient_accumulation_steps
+        print("=== GPT-OSS Memory Optimized Configuration ===")
+        print(f"Effective batch size: {effective_batch_size}")
+        print(f"Max sequence length: {self.max_seq_length}")
+        print(f"LoRA rank: {self.lora_config['r']}")
+        print(f"Gradient accumulation steps: {self.gradient_accumulation_steps}")
+        print(f"Memory optimization: Enabled")
+        print(f"Quantization: {self.quantization_config}")
+        print(f"Max memory per GPU: {self.model_kwargs.get('max_memory', 'Auto')}")
+        print("==================================================")

launch.sh CHANGED Viewed

@@ -225,7 +225,16 @@ show_training_configs() {
     echo "   - Specialized for reasoning tasks"
     echo "   - Supports 10+ languages"
     echo ""
-    echo "8. Custom Configuration"
     echo "   - User-defined parameters"
     echo ""
 }
@@ -306,6 +315,16 @@ get_training_config() {
             MAX_SEQ_LENGTH=2048
             CONFIG_FILE="config/train_gpt_oss_multilingual_reasoning.py"
             ;;
         "Custom Configuration")
             get_custom_config
             ;;
@@ -478,7 +497,7 @@ print_step "Step 2: Training Configuration"
 echo "=================================="
 show_training_configs
-select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "Custom Configuration" TRAINING_CONFIG_TYPE
 get_training_config "$TRAINING_CONFIG_TYPE"

     echo "   - Specialized for reasoning tasks"
     echo "   - Supports 10+ languages"
     echo ""
+    echo "8. GPT-OSS Memory Optimized"
+    echo "   - Model: openai/gpt-oss-20b"
+    echo "   - Dataset: Multilingual-Thinking"
+    echo "   - Epochs: 1"
+    echo "   - Batch Size: 1 (effective 16 with accumulation)"
+    echo "   - Learning Rate: 2e-4"
+    echo "   - 4-bit quantization + reduced LoRA"
+    echo "   - Optimized for limited GPU memory"
+    echo ""
+    echo "9. Custom Configuration"
     echo "   - User-defined parameters"
     echo ""
 }
             MAX_SEQ_LENGTH=2048
             CONFIG_FILE="config/train_gpt_oss_multilingual_reasoning.py"
             ;;
+        "GPT-OSS Memory Optimized")
+            MODEL_NAME="openai/gpt-oss-20b"
+            DATASET_NAME="HuggingFaceH4/Multilingual-Thinking"
+            MAX_EPOCHS=1
+            BATCH_SIZE=1
+            GRADIENT_ACCUMULATION_STEPS=16
+            LEARNING_RATE=2e-4
+            MAX_SEQ_LENGTH=1024
+            CONFIG_FILE="config/train_gpt_oss_memory_optimized.py"
+            ;;
         "Custom Configuration")
             get_custom_config
             ;;
 echo "=================================="
 show_training_configs
+select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "GPT-OSS Basic Training" "GPT-OSS H100 Optimized" "GPT-OSS Multilingual Reasoning" "GPT-OSS Memory Optimized" "Custom Configuration" TRAINING_CONFIG_TYPE
 get_training_config "$TRAINING_CONFIG_TYPE"

requirements/requirements_core.txt CHANGED Viewed

@@ -20,4 +20,5 @@ pynvml>=12.0.0
 # GPT-OSS specific dependencies
 # Note: GPT-OSS requires specific versions for optimal performance
-# These are compatible with the tutorial requirements

 # GPT-OSS specific dependencies
 # Note: GPT-OSS requires specific versions for optimal performance
+# These are compatible with the tutorial requirements
+bitsandbytes>=0.41.0  # For 4-bit quantization

scripts/training/train_gpt_oss.py CHANGED Viewed

@@ -23,10 +23,20 @@ def load_gpt_oss_model_and_tokenizer(config):
     print("Loading GPT-OSS model with quantization...")
     # Import quantization config
-    from transformers import Mxfp4Config
-    # Set up quantization config
-    quantization_config = Mxfp4Config(dequantize=True)
     # Model kwargs as per tutorial
     model_kwargs = {
@@ -144,7 +154,7 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
             # Try to find a config class
             for attr_name in dir(config_module):
                 attr = getattr(config_module, attr_name)
-                if hasattr(attr, 'model_name') and 'gpt_oss' in attr.model_name.lower():
                     config = attr
                     break
             else:

     print("Loading GPT-OSS model with quantization...")
     # Import quantization config
+    from transformers import BitsAndBytesConfig
+    # Set up quantization config based on config
+    if config.quantization_config and config.quantization_config.get("load_in_4bit"):
+        # Use BitsAndBytesConfig for 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4"
+        )
+    else:
+        # 4-bit quantization not requested: load the model without a quantization config
+        quantization_config = None
     # Model kwargs as per tutorial
     model_kwargs = {
             # Try to find a config class
             for attr_name in dir(config_module):
                 attr = getattr(config_module, attr_name)
+                if hasattr(attr, 'model_name') and ('gpt_oss' in attr.model_name.lower() or 'GPTOSS' in attr_name):
                     config = attr
                     break
             else: