Tonic committed
Commit 54ebacf · verified · 1 Parent(s): 967ff41

fix config bug

config/train_smollm3_h100_lightweight.py CHANGED
@@ -3,112 +3,162 @@ SmolLM3 H100 Lightweight Training Configuration
 Optimized for rapid training on H100 with 80K Hermes-FR samples
 """
 
+import os
+from dataclasses import dataclass
+from typing import Optional
 from config.train_smollm3 import SmolLM3Config
 
-config = SmolLM3Config(
-    # Model configuration
-    model_name="HuggingFaceTB/SmolLM3-3B",
-    max_seq_length=8192,
-    use_flash_attention=True,
-    use_gradient_checkpointing=True,
-
-    # Training configuration - Optimized for H100
-    batch_size=16,  # Larger batch size for H100
-    gradient_accumulation_steps=4,  # Reduced for faster updates
-    learning_rate=8e-6,  # Slightly higher for rapid convergence
-    weight_decay=0.01,
-    warmup_steps=50,  # Reduced warmup for rapid training
-    max_iters=None,  # Will be calculated based on epochs
-    eval_interval=50,  # More frequent evaluation
-    log_interval=5,  # More frequent logging
-    save_interval=200,  # More frequent saving
-
-    # Optimizer configuration - Optimized for rapid training
-    optimizer="adamw",
-    beta1=0.9,
-    beta2=0.95,
-    eps=1e-8,
-
-    # Scheduler configuration - Faster learning
-    scheduler="cosine",
-    min_lr=2e-6,  # Higher minimum LR
+@dataclass
+class SmolLM3ConfigH100Lightweight(SmolLM3Config):
+    """Configuration for SmolLM3 fine-tuning on OpenHermes-FR dataset - H100 Lightweight"""
+
+    # Model configuration - optimized for H100
+    model_name: str = "HuggingFaceTB/SmolLM3-3B"
+    max_seq_length: int = 8192  # Increased for better context understanding
+    use_flash_attention: bool = True
+    use_gradient_checkpointing: bool = True  # Enabled for memory efficiency
+
+    # Training configuration - H100 optimized for rapid training
+    batch_size: int = 16  # Larger batch size for H100
+    gradient_accumulation_steps: int = 4  # Reduced for faster updates
+    learning_rate: float = 8e-6  # Slightly higher for rapid convergence
+    weight_decay: float = 0.01
+    warmup_steps: int = 50  # Reduced warmup for rapid training
+    max_iters: Optional[int] = None  # Will be calculated based on epochs
+    eval_interval: int = 50  # More frequent evaluation
+    log_interval: int = 5  # More frequent logging
+    save_interval: int = 200  # More frequent saving
+
+    # Optimizer configuration - optimized for rapid training
+    optimizer: str = "adamw_torch"
+    beta1: float = 0.9
+    beta2: float = 0.95
+    eps: float = 1e-8
+
+    # Scheduler configuration - faster learning
+    scheduler: str = "cosine"
+    min_lr: float = 2e-6  # Higher minimum LR
 
     # Mixed precision - Full precision for H100
-    fp16=True,
-    bf16=False,
+    fp16: bool = True
+    bf16: bool = False
 
-    # Logging and saving - More frequent for rapid training
-    save_steps=200,
-    eval_steps=50,
-    logging_steps=5,
-    save_total_limit=2,  # Keep fewer checkpoints
+    # Logging and saving - more frequent for rapid training
+    save_steps: int = 200
+    eval_steps: int = 50
+    logging_steps: int = 5
+    save_total_limit: Optional[int] = 2  # Keep fewer checkpoints
 
     # Evaluation
-    eval_strategy="steps",
-    metric_for_best_model="eval_loss",
-    greater_is_better=False,
-    load_best_model_at_end=True,
-
-    # Data configuration - Hermes-FR with sampling
-    dataset_name="legmlai/openhermes-fr",
-    dataset_split="train",
-    input_field="prompt",
-    target_field="completion",
-    filter_bad_entries=False,
-    bad_entry_field="bad_entry",
-    sample_size=80000,  # 80K samples for lightweight training
-    sample_seed=42,  # For reproducibility
+    eval_strategy: str = "steps"
+    metric_for_best_model: str = "eval_loss"
+    greater_is_better: bool = False
+    load_best_model_at_end: bool = True
+
+    # OpenHermes-FR dataset configuration with sampling
+    dataset_name: str = "legmlai/openhermes-fr"
+    dataset_split: str = "train"
+    input_field: str = "prompt"
+    target_field: str = "completion"
+    filter_bad_entries: bool = False
+    bad_entry_field: str = "bad_entry"
+    sample_size: int = 80000  # 80K samples for lightweight training
+    sample_seed: int = 42  # For reproducibility
+
+    # Data configuration (not used for HF datasets but kept for compatibility)
+    data_dir: str = "my_dataset"
+    train_file: str = "train.json"
+    validation_file: Optional[str] = "validation.json"
+    test_file: Optional[str] = None
 
     # Chat template configuration
-    use_chat_template=True,
-    chat_template_kwargs={
-        "enable_thinking": False,
-        "add_generation_prompt": True,
-        "no_think_system_message": True
-    },
+    use_chat_template: bool = True
+    chat_template_kwargs: Optional[dict] = None  # Filled in __post_init__
 
     # Trackio monitoring configuration
-    enable_tracking=True,
-    trackio_url=None,  # Will be set by launch script
-    trackio_token=None,
-    log_artifacts=True,
-    log_metrics=True,
-    log_config=True,
-    experiment_name=None,  # Will be set by launch script
+    enable_tracking: bool = True
+    trackio_url: Optional[str] = None  # Will be set by launch script
+    trackio_token: Optional[str] = None
+    log_artifacts: bool = True
+    log_metrics: bool = True
+    log_config: bool = True
+    experiment_name: Optional[str] = None  # Will be set by launch script
 
     # HF Datasets configuration
-    dataset_repo=None,  # Will be set by launch script
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None  # Will be set by launch script
 
     # H100-specific optimizations
-    dataloader_num_workers=4,  # Optimized for H100
-    dataloader_pin_memory=True,
-    gradient_clipping=1.0,  # Prevent gradient explosion
+    dataloader_num_workers: int = 4  # Optimized for H100
+    dataloader_pin_memory: bool = True
+    dataloader_prefetch_factor: int = 2
 
     # Memory optimizations for rapid training
-    max_grad_norm=1.0,
-    warmup_ratio=0.1,  # 10% warmup
-    lr_scheduler_type="cosine",
-
-    # Early stopping for rapid training
-    early_stopping_patience=3,
-    early_stopping_threshold=0.001,
-
-    # H100-specific training optimizations
-    remove_unused_columns=False,
-    group_by_length=True,  # Group similar length sequences
-    length_column_name="length",
-    ignore_data_skip=False,
-
-    # Reporting
-    report_to=["tensorboard"],
-    run_name="smollm3-h100-lightweight",
-
-    # Seed for reproducibility
-    seed=42,
-
-    # Data collator settings
-    data_collator_kwargs={
-        "pad_to_multiple_of": 8,  # Optimized for H100
-        "return_tensors": "pt"
-    }
-)
+    max_grad_norm: float = 1.0
+    group_by_length: bool = True  # Group similar length sequences
+
+    # Training duration calculations
+    # With 80K datapoints and an effective batch size of 64:
+    #   steps per epoch = 80,000 / 64 = 1,250
+    #   1 epoch  -> 1,250 steps
+    #   2 epochs -> 2,500 steps
+
+    def __post_init__(self):
+        if self.chat_template_kwargs is None:
+            self.chat_template_kwargs = {
+                "enable_thinking": False,
+                "add_generation_prompt": True,
+                "no_think_system_message": True
+            }
+
+        # Validate configuration
+        if self.fp16 and self.bf16:
+            raise ValueError("Cannot use both fp16 and bf16")
+
+        if self.max_seq_length > 131072:  # 128k limit
+            raise ValueError("max_seq_length cannot exceed 131072")
+
+        # Calculate training statistics
+        effective_batch_size = self.batch_size * self.gradient_accumulation_steps
+        steps_per_epoch = self.sample_size // effective_batch_size  # For the 80K sample
+        epochs_for_max_iters = self.max_iters / steps_per_epoch if self.max_iters else 1
+
+        print("=== H100 Lightweight Training Configuration ===")
+        print(f"Effective batch size: {effective_batch_size}")
+        print(f"Steps per epoch: ~{steps_per_epoch}")
+        print(f"Training for ~{epochs_for_max_iters:.1f} epochs")
+        print(f"Total training steps: {self.max_iters or 'auto'}")
+        print(f"Learning rate: {self.learning_rate}")
+        print(f"Mixed precision: {'fp16' if self.fp16 else 'bf16'}")
+        print(f"Max sequence length: {self.max_seq_length}")
+        print(f"Gradient checkpointing: {self.use_gradient_checkpointing}")
+        print(f"Dataset sample size: {self.sample_size}")
+        print("=" * 50)
+
+        # Set default experiment name if not provided
+        if self.experiment_name is None:
+            self.experiment_name = "smollm3_h100_lightweight"
+
+def get_config(config_path: str) -> SmolLM3ConfigH100Lightweight:
+    """Load configuration from file or return the default."""
+    if os.path.exists(config_path):
+        # Load from file if it exists
+        import importlib.util
+        spec = importlib.util.spec_from_file_location("config_module", config_path)
+        config_module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(config_module)
+
+        if hasattr(config_module, 'config'):
+            return config_module.config
+        else:
+            # Try to find a config instance
+            for attr_name in dir(config_module):
+                attr = getattr(config_module, attr_name)
+                if isinstance(attr, SmolLM3ConfigH100Lightweight):
+                    return attr
+
+    # Return default configuration
+    return SmolLM3ConfigH100Lightweight()
+
+# Default configuration instance
+config = SmolLM3ConfigH100Lightweight()
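
A quick usage sketch for reviewers (not part of the commit; it assumes the repository layout above, and note that importing the module constructs the default instance and triggers the `__post_init__` printout):

# Hypothetical usage sketch, assuming the file layout in this commit.
from config.train_smollm3_h100_lightweight import get_config

cfg = get_config("config/train_smollm3_h100_lightweight.py")
effective = cfg.batch_size * cfg.gradient_accumulation_steps  # 16 * 4 = 64
print(cfg.sample_size // effective)  # 80,000 // 64 = 1,250 steps per epoch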
scripts/training/train.py CHANGED
@@ -53,6 +53,12 @@ def main():
         type=str,
         help="Trackio token for authentication"
     )
+    parser.add_argument(
+        "--dataset-dir",
+        type=str,
+        default="my_dataset",
+        help="Dataset directory path"
+    )
 
     args = parser.parse_args()
 
@@ -65,13 +71,13 @@ def main():
     # Import all available configurations
     from config.train_smollm3_openhermes_fr_a100_large import get_config as get_large_config
     from config.train_smollm3_openhermes_fr_a100_multiple_passes import get_config as get_multiple_passes_config
-    from config.train_smollm3_h100_lightweight import config as h100_lightweight_config
+    from config.train_smollm3_h100_lightweight import get_config as get_h100_lightweight_config
 
     # Map config files to their respective functions
    config_map = {
         "config/train_smollm3_openhermes_fr_a100_large.py": get_large_config,
         "config/train_smollm3_openhermes_fr_a100_multiple_passes.py": get_multiple_passes_config,
-        "config/train_smollm3_h100_lightweight.py": lambda x: h100_lightweight_config,
+        "config/train_smollm3_h100_lightweight.py": get_h100_lightweight_config,
     }
 
     if args.config in config_map:
@@ -116,7 +122,15 @@ def main():
     print(f"Max iterations: {config.max_iters}")
     print(f"Max sequence length: {config.max_seq_length}")
     print(f"Mixed precision: {'bf16' if config.bf16 else 'fp16'}")
-    print(f"Dataset: {config.dataset_name}")
+    if hasattr(config, 'dataset_name') and config.dataset_name:
+        print(f"Dataset: {config.dataset_name}")
+        if hasattr(config, 'sample_size') and config.sample_size:
+            print(f"Sample size: {config.sample_size}")
+    else:
+        print(f"Dataset directory: {config.data_dir}")
+        print(f"Training file: {config.train_file}")
+        if config.validation_file:
+            print(f"Validation file: {config.validation_file}")
     if config.trackio_url:
         print(f"Trackio URL: {config.trackio_url}")
     if config.trackio_token:
@@ -151,6 +165,9 @@ def main():
     if args.experiment_name:
         train_args.extend(["--experiment_name", args.experiment_name])
 
+    # Add dataset directory argument
+    train_args.extend(["--dataset_dir", args.dataset_dir])
+
     # Override sys.argv for the training script
     original_argv = sys.argv
     sys.argv = ["train.py"] + train_args
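
The config bug appears to be in this dispatch: the old mapping imported the module-level `config` instance eagerly (executing the config module at import time) and discarded the path argument via `lambda x: ...`. After the change, every entry is a uniform path-to-config callable. A minimal sketch of the pattern, with names taken from the diff:

# Minimal sketch of the uniform dispatch pattern used above.
from config.train_smollm3_h100_lightweight import get_config as get_h100_lightweight_config

config_map = {
    "config/train_smollm3_h100_lightweight.py": get_h100_lightweight_config,
}
config_path = "config/train_smollm3_h100_lightweight.py"
config = config_map[config_path](config_path)  # every value accepts the path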
src/train.py CHANGED
@@ -174,13 +174,17 @@ def main():
     )
 
     # Determine dataset path
+    # Check if using Hugging Face dataset or local dataset
     if hasattr(config, 'dataset_name') and config.dataset_name:
         # Use Hugging Face dataset
         dataset_path = config.dataset_name
         logger.info(f"Using Hugging Face dataset: {dataset_path}")
     else:
-        # Use local dataset
-        dataset_path = os.path.join('/input', args.dataset_dir)
+        # Use local dataset from config or command line argument
+        if args.dataset_dir:
+            dataset_path = os.path.join('/input', args.dataset_dir)
+        else:
+            dataset_path = os.path.join('/input', config.data_dir)
         logger.info(f"Using local dataset: {dataset_path}")
 
     # Load dataset with filtering options and sampling
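
Resolution order after this hunk: a Hugging Face `dataset_name` on the config wins; otherwise the `--dataset_dir` argument is used, falling back to the config's `data_dir`, with local paths rooted under `/input`. A self-contained sketch of that precedence (a hypothetical helper, not in the commit):

import os

def resolve_dataset_path(config, dataset_dir_arg=None):
    # Hypothetical helper mirroring the precedence in src/train.py above.
    if getattr(config, 'dataset_name', None):
        return config.dataset_name  # Hugging Face dataset id
    return os.path.join('/input', dataset_dir_arg or config.data_dir)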
test_config.py ADDED
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""
+Test script to verify H100 lightweight configuration loads correctly
+"""
+
+import sys
+import os
+
+# Add project root to path
+project_root = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, project_root)
+
+def test_h100_lightweight_config():
+    """Test the H100 lightweight configuration"""
+    try:
+        from config.train_smollm3_h100_lightweight import config
+
+        print("✅ H100 Lightweight configuration loaded successfully!")
+        print(f"Model: {config.model_name}")
+        print(f"Dataset: {config.dataset_name}")
+        print(f"Sample size: {config.sample_size}")
+        print(f"Batch size: {config.batch_size}")
+        print(f"Learning rate: {config.learning_rate}")
+        print(f"Max sequence length: {config.max_seq_length}")
+
+        return True
+    except Exception as e:
+        print(f"❌ Error loading H100 lightweight configuration: {e}")
+        return False
+
+def test_training_script_import():
+    """Test that the training script can import the configuration"""
+    try:
+        from scripts.training.train import main
+        print("✅ Training script imports successfully!")
+        return True
+    except Exception as e:
+        print(f"❌ Error importing training script: {e}")
+        return False
+
+if __name__ == "__main__":
+    print("Testing H100 Lightweight Configuration...")
+    print("=" * 50)
+
+    success = True
+    success &= test_h100_lightweight_config()
+    success &= test_training_script_import()
+
+    if success:
+        print("\n🎉 All tests passed! Configuration is ready for training.")
+    else:
+        print("\n❌ Some tests failed. Please check the configuration.")
+        sys.exit(1)
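
This smoke test is meant to be run from the repository root, e.g. `python test_config.py`; with the defaults above it should print the 80K-sample, batch-16 settings and exit with status 0, or exit 1 if either the config load or the training-script import fails.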