Spaces:

Tonic
/

SmolFactory

Running

App Files Files Community

Tonic committed on Jul 20

Commit

6c63876

verified ·

1 Parent(s): c33a1e3

attempt to fix bfloat16 issue

Browse files

Files changed (5) hide show

config/train_smollm3_h100_lightweight.py +4 -3
quick_test_training.py +60 -0
src/model.py +29 -6
test_mixed_precision.py +63 -0
test_training_fix.py +62 -0

config/train_smollm3_h100_lightweight.py CHANGED Viewed

@@ -39,9 +39,10 @@ class SmolLM3ConfigH100Lightweight(SmolLM3Config):
     scheduler: str = "cosine"
     min_lr: float = 2e-6  # Higher minimum LR
-    # Mixed precision - Full precision for H100
-    fp16: bool = True
-    bf16: bool = False
     # Logging and saving - more frequent for rapid training
     save_steps: int = 200

     scheduler: str = "cosine"
     min_lr: float = 2e-6  # Higher minimum LR
+    # Mixed precision - using bf16 (fp16 is disabled below)
+    # Note: bf16 needs hardware support (e.g. CUDA compute capability >= 8); set fp16=True and bf16=False on GPUs without it
+    fp16: bool = False
+    bf16: bool = True
     # Logging and saving - more frequent for rapid training
     save_steps: int = 200

quick_test_training.py ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env python3
+"""
+Quick test for the training fix
+"""
+import os
+import sys
+# Add project root to path
+project_root = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, project_root)
+def main():
+    print("🔧 Testing H100 Lightweight Training Fix")
+    print("=" * 50)
+    # Set CUDA allocator and debugging environment variables to help diagnose mixed precision issues
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+    os.environ["TORCH_USE_CUDA_DSA"] = "1"
+    print("✅ Environment variables set")
+    # Test configuration
+    try:
+        from config.train_smollm3_h100_lightweight import SmolLM3ConfigH100Lightweight
+        config = SmolLM3ConfigH100Lightweight()
+        print(f"✅ Configuration loaded: fp16={config.fp16}, bf16={config.bf16}")
+        # Test model loading (without actually loading the full model)
+        from src.model import SmolLM3Model
+        # Create model instance
+        model = SmolLM3Model(
+            model_name="HuggingFaceTB/SmolLM3-3B",
+            max_seq_length=4096,
+            config=config
+        )
+        print(f"✅ Model dtype: {model.torch_dtype}")
+        print(f"✅ Model device map: {model.device_map}")
+        # Test training arguments
+        training_args = model.get_training_arguments("/tmp/test")
+        print(f"✅ Training args: fp16={training_args.fp16}, bf16={training_args.bf16}")
+        print("\n🎉 All tests passed!")
+        print("You can now run the training with:")
+        print("  ./launch.sh")
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+    return 0
+if __name__ == "__main__":
+    exit(main())

src/model.py CHANGED Viewed

@@ -36,7 +36,16 @@ class SmolLM3Model:
         # Set device and dtype
         if torch_dtype is None:
             if torch.cuda.is_available():
-                self.torch_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
             else:
                 self.torch_dtype = torch.float32
         else:
@@ -110,11 +119,25 @@ class SmolLM3Model:
                     # If flash attention is not supported, skip it
                     pass
-            self.model = AutoModelForCausalLM.from_pretrained(
-                self.model_name,
-                config=model_config,
-                **model_kwargs
-            )
             # Enable gradient checkpointing if specified
             if self.config and self.config.use_gradient_checkpointing:

         # Set device and dtype
         if torch_dtype is None:
             if torch.cuda.is_available():
+                # Check if config specifies mixed precision
+                if config and hasattr(config, 'fp16') and config.fp16:
+                    # Use fp16 if explicitly configured
+                    self.torch_dtype = torch.float16
+                elif config and hasattr(config, 'bf16') and config.bf16:
+                    # Use bf16 if explicitly configured
+                    self.torch_dtype = torch.bfloat16
+                else:
+                    # Default to bf16 when the config enables neither fp16 nor bf16
+                    self.torch_dtype = torch.bfloat16
             else:
                 self.torch_dtype = torch.float32
         else:
                     # If flash attention is not supported, skip it
                     pass
+            # Try to load the model; fall back to fp16 if loading raises a bfloat16-related RuntimeError
+            try:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_name,
+                    config=model_config,
+                    **model_kwargs
+                )
+            except RuntimeError as e:
+                if "bfloat16" in str(e) or "BFloat16" in str(e):
+                    logger.warning("BFloat16 not supported, falling back to Float16")
+                    model_kwargs["torch_dtype"] = torch.float16
+                    self.torch_dtype = torch.float16
+                    self.model = AutoModelForCausalLM.from_pretrained(
+                        self.model_name,
+                        config=model_config,
+                        **model_kwargs
+                    )
+                else:
+                    raise
             # Enable gradient checkpointing if specified
             if self.config and self.config.use_gradient_checkpointing:

test_mixed_precision.py ADDED Viewed

	@@ -0,0 +1,63 @@

+#!/usr/bin/env python3
+"""
+Test script to verify mixed precision configuration
+"""
+import torch
+import sys
+import os
+# Add project root to path
+project_root = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, project_root)
+def test_mixed_precision():
+    """Test mixed precision configuration"""
+    print("Testing mixed precision configuration...")
+    # Test 1: Check CUDA availability
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        print(f"CUDA device count: {torch.cuda.device_count()}")
+        print(f"CUDA device capability: {torch.cuda.get_device_capability()}")
+        print(f"Current device: {torch.cuda.current_device()}")
+    # Test 2: Test model loading with different dtypes
+    try:
+        from src.model import SmolLM3Model
+        from config.train_smollm3_h100_lightweight import SmolLM3ConfigH100Lightweight
+        config = SmolLM3ConfigH100Lightweight()
+        print(f"Config fp16: {config.fp16}")
+        print(f"Config bf16: {config.bf16}")
+        # Test model loading
+        model = SmolLM3Model(
+            model_name="HuggingFaceTB/SmolLM3-3B",
+            max_seq_length=4096,
+            config=config
+        )
+        print(f"Model dtype: {model.torch_dtype}")
+        print(f"Model device map: {model.device_map}")
+        print("✅ Model loading successful!")
+        # Test training arguments
+        training_args = model.get_training_arguments("/tmp/test")
+        print(f"Training args fp16: {training_args.fp16}")
+        print(f"Training args bf16: {training_args.bf16}")
+        print("✅ Training arguments created successfully!")
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        return False
+    return True
+if __name__ == "__main__":
+    success = test_mixed_precision()
+    if success:
+        print("\n🎉 Mixed precision test passed!")
+    else:
+        print("\n❌ Mixed precision test failed!")
+        sys.exit(1)

test_training_fix.py ADDED Viewed

	@@ -0,0 +1,62 @@

+#!/usr/bin/env python3
+"""
+Quick test to verify the training configuration fix
+"""
+import os
+import sys
+# Add project root to path
+project_root = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, project_root)
+def test_configuration():
+    """Test the H100 lightweight configuration"""
+    print("Testing H100 Lightweight Configuration...")
+    try:
+        from config.train_smollm3_h100_lightweight import SmolLM3ConfigH100Lightweight
+        config = SmolLM3ConfigH100Lightweight()
+        print("✅ Configuration loaded successfully")
+        print(f"  Model: {config.model_name}")
+        print(f"  Batch size: {config.batch_size}")
+        print(f"  Learning rate: {config.learning_rate}")
+        print(f"  FP16: {config.fp16}")
+        print(f"  BF16: {config.bf16}")
+        print(f"  Mixed precision: {'fp16' if config.fp16 else 'bf16'}")
+        print(f"  Sample size: {config.sample_size}")
+        # Test training arguments creation
+        from src.model import SmolLM3Model
+        # Create a minimal model instance for testing
+        model = SmolLM3Model(
+            model_name="HuggingFaceTB/SmolLM3-3B",
+            max_seq_length=4096,
+            config=config
+        )
+        # Test training arguments
+        training_args = model.get_training_arguments("/tmp/test")
+        print(f"✅ Training arguments created successfully")
+        print(f"  Training args FP16: {training_args.fp16}")
+        print(f"  Training args BF16: {training_args.bf16}")
+        return True
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+if __name__ == "__main__":
+    success = test_configuration()
+    if success:
+        print("\n🎉 Configuration test passed!")
+        print("You can now run the training with: ./launch.sh")
+    else:
+        print("\n❌ Configuration test failed!")
+        sys.exit(1)