Commit: adds formatting fix

Note: this view is limited to 50 files because the commit contains too many changes.

Changed files:
- .gitignore +14 -11
- FORMATTING_FIX_SUMMARY.md +15 -8
- H100_LIGHTWEIGHT_GUIDE.md +276 -0
- INTERACTIVE_PIPELINE_IMPROVEMENTS.md +330 -0
- PIPELINE_SUMMARY.md +330 -0
- README.md +1 -1
- README_END_TO_END.md +304 -0
- cloud_deployment.sh +0 -279
- config/train_smollm3.py +4 -0
- config/train_smollm3_h100_lightweight.py +112 -0
- config/train_smollm3_openhermes_fr.py +4 -0
- config/train_smollm3_openhermes_fr_a100_balanced.py +4 -0
- config/train_smollm3_openhermes_fr_a100_large.py +4 -0
- config/train_smollm3_openhermes_fr_a100_max_performance.py +4 -0
- config/train_smollm3_openhermes_fr_a100_multiple_passes.py +4 -0
- A100_LARGE_SCALE_GUIDE.md → docs/A100_LARGE_SCALE_GUIDE.md +0 -0
- docs/APP_CONFIGURATION_GUIDE.md +234 -0
- CLOUD_DEPLOYMENT_GUIDE.md → docs/CLOUD_DEPLOYMENT_GUIDE.md +0 -0
- CLOUD_TRAINING_GUIDE.md → docs/CLOUD_TRAINING_GUIDE.md +0 -0
- DEPLOYMENT_GUIDE.md → docs/DEPLOYMENT_GUIDE.md +0 -0
- docs/ENVIRONMENT_VARIABLES.md +113 -0
- docs/HF_DATASETS_GUIDE.md +269 -0
- docs/HF_SPACES_GUIDE.md +163 -0
- docs/MONITORING_IMPROVEMENTS_SUMMARY.md +191 -0
- docs/MONITORING_INTEGRATION_GUIDE.md +245 -0
- NO_THINK_TAG_GUIDE.md → docs/NO_THINK_TAG_GUIDE.md +0 -0
- PUSH_GUIDE.md → docs/PUSH_GUIDE.md +0 -0
- docs/PUSH_SCRIPT_GUIDE.md +267 -0
- TRACKIO_INTEGRATION.md → docs/TRACKIO_INTEGRATION.md +0 -0
- TRACKIO_INTEGRATION_VERIFICATION.md → docs/TRACKIO_INTEGRATION_VERIFICATION.md +0 -0
- TRACKIO_INTERFACE_GUIDE.md → docs/TRACKIO_INTERFACE_GUIDE.md +0 -0
- launch.sh +690 -0
- requirements.txt → requirements/requirements.txt +0 -0
- requirements_core.txt → requirements/requirements_core.txt +7 -1
- requirements_minimal.txt → requirements/requirements_minimal.txt +0 -0
- add_demo_data.py → scripts/dataset_tonic/add_demo_data.py +0 -0
- scripts/dataset_tonic/setup_hf_dataset.py +275 -0
- push_to_huggingface.py → scripts/model_tonic/push_to_huggingface.py +56 -13
- scripts/trackio_tonic/configure_trackio.py +145 -0
- deploy_trackio_space.py → scripts/trackio_tonic/deploy_trackio_space.py +1 -1
- scripts/trackio_tonic/trackio_api_client.py +286 -0
- run_a100_large_experiment.py → scripts/training/train.py +0 -0
- setup_launch.py +283 -0
- config.py → src/config.py +0 -0
- data.py → src/data.py +0 -0
- model.py → src/model.py +0 -0
- monitoring.py → src/monitoring.py +157 -58
- train.py → src/train.py +76 -0
- trainer.py → src/trainer.py +0 -0
- templates/datasets/readme.md +0 -0
.gitignore
CHANGED
```diff
@@ -1,3 +1,6 @@
+.cursorrules/
+*.mdc
+
 # Python
 __pycache__/
 *.py[cod]
@@ -59,17 +62,17 @@ Thumbs.db
 logs/
 tensorboard_logs/
 
-# Model outputs
-output/
-checkpoints/
-models/
-wandb/
+# # Model outputs
+# output/
+# checkpoints/
+# models/
+# wandb/
 
 # Datasets
-data/
-datasets/
-my_dataset/
-test_dataset/
+# data/
+# datasets/
+# my_dataset/
+# test_dataset/
 
 # Temporary files
 tmp/
@@ -86,9 +89,9 @@ accelerate_config.yaml
 
 # Training outputs
 runs/
-*.json
+#*.json
 !config/*.json
-!*.json.example
+#!*.json.example
 
 # Evaluation results
 eval_results/
```
FORMATTING_FIX_SUMMARY.md
CHANGED
````diff
@@ -19,10 +19,10 @@ I fixed the issue by standardizing all logging statements to use traditional str
 
 ### Files Fixed
 
-1. **`monitoring.py`** - Fixed all logging statements
-2. **`trainer.py`** - Fixed all logging statements
-3. **`model.py`** - Fixed all logging statements
-4. **`data.py`** - Fixed all logging statements
+1. **`src/monitoring.py`** - Fixed all logging statements
+2. **`src/trainer.py`** - Fixed all logging statements
+3. **`src/model.py`** - Fixed all logging statements
+4. **`src/data.py`** - Fixed all logging statements
 
 ### Changes Made
 
@@ -52,6 +52,7 @@ This script tests:
 - ✅ Logging functionality
 - ✅ Module imports
 - ✅ Configuration loading
+- ✅ Monitoring creation
 - ✅ Error handling
 
 ## 🚀 Usage
@@ -68,25 +69,29 @@ python run_a100_large_experiment.py \
 
 ## 📋 Key Changes
 
-### 1. Monitoring Module (`monitoring.py`)
+### 1. Monitoring Module (`src/monitoring.py`)
 - Fixed all `logger.info()`, `logger.error()`, `logger.warning()` calls
 - Replaced f-strings with `%` formatting
 - Fixed string concatenation in file paths
+- Fixed HF Datasets integration logging
 
-### 2. Trainer Module (`trainer.py`)
+### 2. Trainer Module (`src/trainer.py`)
 - Fixed logging in `SmolLM3Trainer` class
 - Fixed console output formatting
 - Fixed error message formatting
+- Fixed callback logging
 
-### 3. Model Module (`model.py`)
+### 3. Model Module (`src/model.py`)
 - Fixed model loading logging
 - Fixed configuration logging
 - Fixed error reporting
+- Fixed parameter logging
 
-### 4. Data Module (`data.py`)
+### 4. Data Module (`src/data.py`)
 - Fixed dataset loading logging
 - Fixed processing progress logging
 - Fixed error handling
+- Fixed split processing logging
 
 ## 🔧 Technical Details
 
@@ -119,6 +124,7 @@ To verify the fix works:
 - ✅ Logging tests
 - ✅ Import tests
 - ✅ Configuration tests
+- ✅ Monitoring creation tests
 
 3. **Run your training command**:
    ```bash
@@ -131,6 +137,7 @@ To verify the fix works:
 - No changes to the training logic or configuration
 - All error messages and logging remain informative
 - The fix is backward compatible
+- HF Datasets integration is preserved
 
 ## 🚨 Prevention
 
````
H100_LIGHTWEIGHT_GUIDE.md
ADDED
# H100 Lightweight Training Configuration Guide

This guide explains the new **H100 Lightweight (Rapid)** training configuration, optimized for rapid fine-tuning on H100 GPUs with a small, carefully selected dataset.

## 🎯 Overview

The H100 Lightweight configuration is designed for:
- **Rapid experimentation** on H100 GPUs
- **Efficient training** with 80K carefully selected samples
- **Quick iteration** for research and development
- **Cost-effective** training sessions

## 🚀 Key Features

### **Optimized for H100**
- **Batch Size**: 16 (larger than the A100 configs)
- **Gradient Accumulation**: 4 (reduced for faster updates)
- **Learning Rate**: 8e-6 (slightly higher for rapid convergence)
- **Sequence Length**: 8192 (full context window)

### **Dataset Sampling**
- **Source**: OpenHermes-FR dataset
- **Sample Size**: 80,000 random samples
- **Validation**: 1,000 samples (if available)
- **Reproducibility**: Fixed random seed (42), see the sketch below

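The sampling described above can be reproduced with the `datasets` library; the snippet below is an illustrative sketch, not the pipeline's exact implementation:

```python
from datasets import load_dataset

# Shuffle once with the fixed seed, then slice off the train and
# validation subsets (illustrative sketch, not the project's actual code)
dataset = load_dataset("legmlai/openhermes-fr", split="train")
shuffled = dataset.shuffle(seed=42)
train_split = shuffled.select(range(80_000))
val_split = shuffled.select(range(80_000, 81_000))
print(len(train_split), len(val_split))
```
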
### **Training Optimizations**
- **Warmup Steps**: 50 (reduced for rapid training)
- **Evaluation**: Every 50 steps
- **Logging**: Every 5 steps
- **Saving**: Every 200 steps
- **Checkpoints**: Keep only 2 (to save storage)

## 📊 Configuration Details

### **Model Configuration**
```python
model_name="HuggingFaceTB/SmolLM3-3B"
max_seq_length=8192
use_flash_attention=True
use_gradient_checkpointing=True
```

### **Training Parameters**
```python
batch_size=16
gradient_accumulation_steps=4
learning_rate=8e-6
warmup_steps=50
max_epochs=1
```

### **H100-Specific Optimizations**
```python
dataloader_num_workers=4
dataloader_pin_memory=True
gradient_clipping=1.0
group_by_length=True
pad_to_multiple_of=8
```

### **Memory Optimizations**
```python
save_total_limit=2
early_stopping_patience=3
max_grad_norm=1.0
warmup_ratio=0.1
```

## 🔧 Usage

### **Interactive Selection**
```bash
./launch.sh
# Select "H100 Lightweight (Rapid)" when prompted
```

### **Expected Training Time**
- **H100**: ~2-4 hours (depending on hardware)
- **A100**: ~4-6 hours
- **V100**: ~6-8 hours

### **Memory Requirements**
- **GPU Memory**: 40GB+ (H100 recommended)
- **System RAM**: 32GB+
- **Storage**: 50GB+ for dataset and checkpoints

## 📈 Performance Characteristics

### **Training Speed**
- **Steps per Second**: ~2-3 (on H100)
- **Samples per Second**: ~32-48
- **Effective Batch Size**: 64 (16 × 4)

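For reference, an effective batch size of 64 means one epoch over the 80,000-sample subset takes 80,000 / 64 = 1,250 optimizer updates; the larger "5000 total steps" figure in the example output further below matches 80,000 / 16 = 5,000, i.e. steps counted per forward pass of batch size 16 before gradient accumulation.
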
### **Convergence**
- **Expected Loss**: 1.2-1.8 (after 1 epoch)
- **Evaluation Frequency**: Every 50 steps
- **Early Stopping**: After 3 evaluations without improvement

### **Dataset Efficiency**
- **80K samples**: ~1.3% of full OpenHermes-FR
- **Random sampling**: Ensures diversity
- **Fixed seed**: Reproducible results

## 🎯 Use Cases

### **Perfect For**
- **Rapid prototyping** of new ideas
- **Hyperparameter tuning** experiments
- **Model comparison** studies
- **Research validation** before full training
- **Educational purposes** and learning

### **Not Recommended For**
- **Production models** (use Multiple Passes instead)
- **Competition submissions** (use the full dataset)
- **Research papers** (use complete training)

## 🔄 Comparison with Other Configurations

| Configuration | Dataset Size | Batch Size | Epochs | Training Time | Use Case |
|---------------|--------------|------------|--------|---------------|----------|
| **Basic Training** | Full SmolTalk | 2 | 3 | 6-8 hours | Learning |
| **H100 Lightweight** | 80K Hermes-FR | 16 | 1 | 2-4 hours | Rapid experiments |
| **A100 Large Scale** | Full Hermes-FR | 8 | 1.3 | 8-12 hours | Serious research |
| **Multiple Passes** | Full Hermes-FR | 6 | 4 | 24-36 hours | Production |

## 🛠️ Customization

### **Modifying Sample Size**
```bash
# In the launch script, you can modify:
DATASET_SAMPLE_SIZE=50000   # For 50K samples
DATASET_SAMPLE_SIZE=100000  # For 100K samples
```

### **Adjusting Training Parameters**
```bash
# Modify in config/train_smollm3_h100_lightweight.py:
batch_size=12       # Smaller batch size
learning_rate=6e-6  # Lower learning rate
warmup_steps=100    # More warmup steps
```

### **Changing Dataset**
```bash
# Modify the dataset name in the configuration:
dataset_name="your-custom-dataset"
```

## 📊 Monitoring and Results

### **Trackio Integration**
- **Real-time metrics**: Loss, learning rate, gradient norm
- **Training curves**: Visual progress tracking
- **Resource usage**: GPU utilization, memory consumption
- **Artifacts**: Model checkpoints, logs

### **Expected Metrics**
- **Training Loss**: Starts ~3.0, ends ~1.5
- **Validation Loss**: Should be close to the training loss
- **Learning Rate**: Cosine decay from 8e-6 to 2e-6
- **Gradient Norm**: Should stay below 1.0

### **Success Indicators**
- **Converging loss**: Steady decrease over time
- **Stable gradients**: Consistent gradient norms
- **Good validation**: Validation loss follows training loss
- **No overfitting**: Validation loss doesn't increase

## 🚨 Troubleshooting

### **Common Issues**

#### **Out of Memory (OOM)**
```bash
# Reduce batch size in config:
batch_size=12                  # Instead of 16
gradient_accumulation_steps=6  # Instead of 4
```

#### **Slow Training**
```bash
# Check GPU utilization:
nvidia-smi
# Ensure CUDA is properly installed
python -c "import torch; print(torch.cuda.is_available())"
```

#### **Poor Convergence**
```bash
# Try a different learning rate:
learning_rate=6e-6  # Instead of 8e-6
# Or increase warmup:
warmup_steps=100  # Instead of 50
```

#### **Dataset Issues**
```bash
# Check dataset loading:
python -c "from datasets import load_dataset; print(len(load_dataset('legmlai/openhermes-fr')['train']))"
```

### **Performance Tips**

1. **Use an H100 if available**: Significantly faster than an A100
2. **Monitor GPU memory**: Keep utilization below 90%
3. **Check logs regularly**: Look for convergence issues
4. **Save checkpoints**: Don't lose progress
5. **Use early stopping**: Prevent overfitting

## 📋 Example Workflow

### **Complete H100 Lightweight Training**
```bash
# 1. Setup
python setup_launch.py

# 2. Check requirements
python check_requirements.py

# 3. Run interactive pipeline
./launch.sh

# 4. Select configuration
# Choose: "H100 Lightweight (Rapid)"

# 5. Monitor training
# Watch Trackio Space for real-time progress

# 6. Check results
# Model will be pushed to HF Hub
# Summary in training_summary.md
```

### **Expected Output**
```
✅ Dataset prepared: 80000 train samples, 1000 validation samples
📈 Training started with 5000 total steps
⏱️ Estimated time: 2-4 hours
📊 Monitor progress at: https://huggingface.co/spaces/...
```

## 🎉 Benefits

### **Speed**
- **3-4x faster** than full-dataset training
- **Rapid iteration** for research
- **Quick validation** of ideas

### **Efficiency**
- **Reduced costs** (less GPU time)
- **Lower storage** requirements
- **Faster experimentation** cycle

### **Quality**
- **Still high-quality** results
- **Good for prototyping**
- **Suitable for many use cases**

## 🔮 Future Enhancements

### **Planned Improvements**
- **Adaptive sampling**: Smart dataset selection
- **Multi-GPU support**: Distributed training
- **Advanced monitoring**: More detailed metrics
- **Auto-tuning**: Automatic hyperparameter optimization

### **Extensibility**
- **Custom datasets**: Easy integration
- **Different models**: Support for other architectures
- **Advanced sampling**: Stratified, balanced sampling

---

**Happy Rapid Training on H100! 🚀**
INTERACTIVE_PIPELINE_IMPROVEMENTS.md
ADDED
# Interactive Pipeline Improvements

This document explains the improvements made to the `launch.sh` script to make it interactive and configurable for different training scenarios.

## 🎯 Key Improvements

### 1. **Interactive User Interface**
- **Colored Output**: Added color-coded status messages for better UX
- **Input Validation**: Real-time validation of user inputs
- **Default Values**: Smart defaults for common configurations
- **Error Handling**: Graceful error handling with helpful messages

### 2. **Training Configuration Selection**
The script now offers four predefined training configurations, plus a custom option:

#### **Basic Training (Default)**
```bash
Model: SmolLM3-3B
Dataset: SmolTalk
Epochs: 3
Batch Size: 2
Learning Rate: 5e-6
Sequence Length: 4096
Best for: Quick experiments, learning
```

#### **H100 Lightweight (Rapid)**
```bash
Model: SmolLM3-3B
Dataset: OpenHermes-FR (80K samples)
Epochs: 1
Batch Size: 16
Learning Rate: 8e-6
Sequence Length: 8192
Best for: Rapid training on H100
```

#### **A100 Large Scale**
```bash
Model: SmolLM3-3B
Dataset: OpenHermes-FR
Epochs: 1.3 passes
Batch Size: 8
Learning Rate: 5e-6
Sequence Length: 8192
Best for: High-performance training
```

#### **Multiple Passes**
```bash
Model: SmolLM3-3B
Dataset: OpenHermes-FR
Epochs: 4 passes
Batch Size: 6
Learning Rate: 3e-6
Sequence Length: 8192
Best for: Thorough training
```

#### **Custom Configuration**
- User-defined parameters
- Flexible model and dataset selection
- Custom training parameters

### 3. **Enhanced User Experience**

#### **Step-by-Step Guidance**
1. **Authentication** - HF username and token validation
2. **Configuration Selection** - Choose from predefined configs
3. **Experiment Setup** - Configure experiment details
4. **Training Parameters** - Adjust hyperparameters
5. **Deployment Setup** - Trackio Space configuration
6. **Confirmation** - Review and confirm settings

#### **Input Functions**
```bash
# Get input with default value
get_input "Prompt" "default_value" VARIABLE_NAME

# Select from options
select_option "Choose option:" "Option 1" "Option 2" "Option 3" VARIABLE_NAME

# Validate HF token
validate_hf_token "$HF_TOKEN"
```

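A minimal sketch of how `get_input` could be implemented is shown below; this is an assumed implementation for illustration, not necessarily the script's exact code:

```bash
# Prompt with a default value and assign the result to a named variable.
# Illustrative sketch only; the real definition lives in launch.sh.
get_input() {
    local prompt="$1" default="$2" var_name="$3" value
    read -r -p "$prompt [$default]: " value
    printf -v "$var_name" '%s' "${value:-$default}"
}

get_input "Model name" "HuggingFaceTB/SmolLM3-3B" MODEL_NAME
echo "Using model: $MODEL_NAME"
```
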
#### **Colored Output Functions**
```bash
print_status "Success message"   # Green ✅
print_warning "Warning message"  # Yellow ⚠️
print_error "Error message"      # Red ❌
print_info "Info message"        # Blue ℹ️
print_header "Header message"    # Purple 🚀
print_step "Step message"        # Cyan 📋
```

### 4. **Dynamic Configuration Generation**

The script now generates training configurations based on user selection:

```python
# Generated config file
config = SmolLM3Config(
    model_name="$MODEL_NAME",
    max_seq_length=$MAX_SEQ_LENGTH,
    batch_size=$BATCH_SIZE,
    learning_rate=$LEARNING_RATE,
    # ... other parameters
)
```

### 5. **Improved Error Handling**

#### **Input Validation**
- Required field validation
- HF token validation
- Numeric input validation
- Choice validation

#### **Graceful Degradation**
- Clear error messages
- Recovery suggestions
- Exit on critical errors

### 6. **Configuration Management**

#### **User Credentials**
- Interactive username input
- Secure token input
- Real-time token validation

#### **Experiment Details**
- Dynamic experiment naming
- Repository name generation
- Dataset repository configuration

#### **Training Parameters**
- Batch size selection
- Learning rate adjustment
- Sequence length configuration
- Save/eval/logging steps

### 7. **Enhanced Monitoring Integration**

#### **Trackio Space**
- Dynamic space naming
- Automatic deployment
- URL generation

#### **HF Datasets**
- Dataset repository setup
- Experiment data storage
- Access configuration

## 🔧 Technical Improvements

### 1. **Modular Functions**
```bash
# Input handling
get_input()          # Get user input with defaults
select_option()      # Select from options
validate_hf_token()  # Validate HF token

# Configuration
show_training_configs()   # Display available configs
get_training_config()     # Get config based on selection
create_training_config()  # Generate config file

# Output formatting
print_status()   # Success messages
print_warning()  # Warning messages
print_error()    # Error messages
print_info()     # Info messages
print_header()   # Header messages
print_step()     # Step messages
```

### 2. **Configuration Selection Logic**
```bash
case "$config_type" in
    "Basic Training")
        MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
        DATASET_NAME="HuggingFaceTB/smoltalk"
        # ... other parameters
        ;;
    "A100 Large Scale")
        MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
        DATASET_NAME="legmlai/openhermes-fr"
        # ... other parameters
        ;;
    # ... other configurations
esac
```

### 3. **Dynamic File Generation**
```bash
# Generate training config
create_training_config "$CONFIG_FILE"

# Generate deployment input
cat > deploy_input.txt << EOF
$HF_USERNAME
$TRACKIO_SPACE_NAME
$HF_TOKEN
EOF
```

## 📊 User Workflow

### **Before (Static)**
1. Edit `launch.sh` manually
2. Update hardcoded variables
3. Run the script
4. Hope the configuration is correct

### **After (Interactive)**
1. Run `./launch.sh`
2. Follow the interactive prompts
3. Select a training configuration
4. Confirm settings
5. Watch the automated pipeline

## 🎯 Benefits

### **For Users**
- **No Manual Editing**: No need to edit script files
- **Guided Experience**: Step-by-step prompts
- **Validation**: Real-time input validation
- **Flexibility**: Multiple configuration options
- **Safety**: Confirmation before execution

### **For Developers**
- **Maintainable**: Modular function structure
- **Extensible**: Easy to add new configurations
- **Robust**: Comprehensive error handling
- **User-Friendly**: Clear feedback and guidance

### **For Different Use Cases**
- **Beginners**: Basic Training configuration
- **H100 Users**: H100 Lightweight for rapid experiments
- **Researchers**: A100 Large Scale for serious experiments
- **Production**: Multiple Passes for thorough training
- **Custom**: User-defined parameters for specific needs

## 🔄 Configuration Examples

### **Quick Start (Basic Training)**
```bash
./launch.sh
# Follow prompts:
# 1. Enter HF username and token
# 2. Select "Basic Training"
# 3. Confirm settings
# 4. Watch automated pipeline
```

### **High-Performance Training (A100)**
```bash
./launch.sh
# Follow prompts:
# 1. Enter HF username and token
# 2. Select "A100 Large Scale"
# 3. Adjust parameters if needed
# 4. Confirm and run
```

### **Rapid Training (H100)**
```bash
./launch.sh
# Follow prompts:
# 1. Enter HF username and token
# 2. Select "H100 Lightweight (Rapid)"
# 3. Confirm settings
# 4. Watch rapid training on H100
```

### **Custom Training**
```bash
./launch.sh
# Follow prompts:
# 1. Enter HF username and token
# 2. Select "Custom Configuration"
# 3. Enter custom parameters:
#    - Model: microsoft/DialoGPT-medium
#    - Dataset: your-custom-dataset
#    - Epochs: 5
#    - Batch Size: 4
#    - Learning Rate: 1e-5
# 4. Confirm and run
```

## 🚀 Future Enhancements

### **Planned Improvements**
- **GUI Interface**: Web-based configuration interface
- **Configuration Templates**: Save/load custom configurations
- **Advanced Validation**: More sophisticated input validation
- **Progress Tracking**: Real-time progress indicators
- **Rollback Capability**: Undo changes if needed

### **Extensibility**
- **Plugin System**: Add custom training configurations
- **API Integration**: Connect to external services
- **Multi-GPU Support**: Distributed training options
- **Advanced Monitoring**: Enhanced tracking capabilities

## 📋 Migration Guide

### **For Existing Users**
1. **Backup**: Save your current `launch.sh`
2. **Update**: Replace it with the new interactive version
3. **Test**: Run with the basic configuration first
4. **Migrate**: Use the interactive prompts instead of manual editing

### **For New Users**
1. **Setup**: Run `python setup_launch.py`
2. **Check**: Run `python check_requirements.py`
3. **Launch**: Run `./launch.sh`
4. **Follow**: Use the interactive prompts

## 🎉 Conclusion

The interactive pipeline provides a much better user experience with:
- **Guided Configuration**: No manual editing required
- **Multiple Options**: Predefined configurations for different use cases
- **Validation**: Real-time input validation and error handling
- **Flexibility**: Custom configuration support
- **Safety**: Confirmation steps and error recovery

The script is now production-ready for users of all skill levels, from beginners to advanced researchers.
PIPELINE_SUMMARY.md
ADDED
# SmolLM3 End-to-End Pipeline - Implementation Summary

This document summarizes the comprehensive refactoring and enhancement of the SmolLM3 fine-tuning codebase to create a complete end-to-end pipeline.

## 🎯 Overview

The pipeline now provides a complete solution from Trackio Space deployment to model push, with integrated monitoring, dataset management, and automated deployment.

## 📁 Files Created/Modified

### **Core Pipeline Files**

1. **`launch.sh`** - Complete end-to-end pipeline script
   - 16-step comprehensive pipeline
   - Automated environment setup
   - Integrated monitoring and deployment
   - Dynamic configuration generation

2. **`setup_launch.py`** - User configuration helper
   - Interactive setup for user credentials
   - Automatic script configuration
   - Requirements checker generation

3. **`test_pipeline.py`** - Comprehensive testing suite
   - Import testing
   - Component verification
   - CUDA and HF token validation

4. **`README_END_TO_END.md`** - Complete documentation
   - Step-by-step usage guide
   - Troubleshooting section
   - Advanced configuration options

### **Scripts and Utilities**

5. **`scripts/trackio_tonic/trackio_api_client.py`** - API client for Trackio
   - Complete API client implementation
   - Error handling and retry logic
   - Support for both JSON and SSE responses

6. **`scripts/trackio_tonic/deploy_trackio_space.py`** - Space deployment
   - Automated HF Space creation
   - File upload and configuration
   - Space testing and validation

7. **`scripts/trackio_tonic/configure_trackio.py`** - Configuration helper
   - Environment variable setup
   - Dataset repository configuration
   - Usage examples and validation

8. **`scripts/model_tonic/push_to_huggingface.py`** - Model deployment
   - Complete model upload pipeline
   - Model card generation
   - Training results documentation

9. **`scripts/dataset_tonic/setup_hf_dataset.py`** - Dataset setup
   - HF Dataset repository creation
   - Initial experiment data structure
   - Dataset access configuration

### **Source Code Updates**

10. **`src/monitoring.py`** - Enhanced monitoring
    - HF Datasets integration
    - Trackio API client integration
    - Comprehensive metrics logging

11. **`src/train.py`** - Updated training script
    - Monitoring integration
    - HF Datasets support
    - Enhanced error handling

12. **`src/config.py`** - Configuration management
    - Dynamic config loading
    - Multiple config type support
    - Fallback mechanisms

13. **`src/data.py`** - Enhanced dataset handling
    - Multiple format support
    - Automatic conversion
    - Bad entry filtering

14. **`src/model.py`** - Model wrapper
    - SmolLM3-specific optimizations
    - Flash attention support
    - Long context handling

15. **`src/trainer.py`** - Training orchestration
    - Monitoring callback integration
    - Enhanced logging
    - Checkpoint management

## 🔧 Key Improvements

### **1. Import Path Fixes**
- Fixed all import paths to work with the refactored structure
- Added proper `sys.path` handling for cross-module imports
- Ensured compatibility between different script locations

### **2. Monitoring Integration**
- **Trackio Space**: Real-time experiment tracking
- **HF Datasets**: Persistent experiment storage
- **System Metrics**: GPU, memory, and CPU monitoring
- **Training Callbacks**: Automatic metric logging (see the sketch below)

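The callback-based logging can be pictured as a `transformers` `TrainerCallback`; the class below is a hedged sketch (the class name and print-based output are assumptions, the real implementation lives in `src/monitoring.py`):

```python
from transformers import TrainerCallback

class TrackioLoggingCallback(TrainerCallback):  # hypothetical name
    """Forward training metrics to the tracking backend on each log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            # In the real pipeline this would send metrics to Trackio / HF Datasets
            print(f"step {state.global_step}: {logs}")
```
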
### **3. Dataset Handling**
- **Multi-format Support**: Prompt/completion, instruction/output, chat formats
- **Automatic Conversion**: Handles different dataset structures (see the sketch below)
- **Validation**: Ensures data quality and completeness
- **Splitting**: Automatic train/validation/test splits

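A normalization step of this kind might look like the following sketch (assumed function name, for illustration only):

```python
def to_prompt_completion(example: dict) -> dict:
    """Map any of the supported record formats onto a prompt/completion pair."""
    if "prompt" in example and "completion" in example:
        return {"prompt": example["prompt"], "completion": example["completion"]}
    if "instruction" in example and "output" in example:
        return {"prompt": example["instruction"], "completion": example["output"]}
    if "messages" in example:
        # Chat format: everything before the last turn becomes the prompt
        *context, last = example["messages"]
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in context)
        return {"prompt": prompt, "completion": last["content"]}
    raise ValueError(f"Unsupported record format: {sorted(example)}")
```
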
### **4. Configuration Management**
- **Dynamic Generation**: Creates configs based on user input
- **Multiple Types**: Support for different training configurations
- **Environment Variables**: Proper integration with the environment
- **Validation**: Ensures configuration correctness

### **5. Deployment Automation**
- **Model Upload**: Complete model push to HF Hub
- **Model Cards**: Comprehensive documentation generation
- **Training Results**: Complete experiment documentation
- **Testing**: Automated model validation

## 🚀 Pipeline Steps

The end-to-end pipeline performs these 16 steps:

1. **Environment Setup** - System dependencies and Python environment
2. **PyTorch Installation** - CUDA-enabled PyTorch installation
3. **Dependencies** - All required Python packages
4. **Authentication** - HF token setup and validation
5. **Trackio Deployment** - HF Space creation and configuration
6. **Dataset Setup** - HF Dataset repository creation
7. **Trackio Configuration** - Environment and dataset configuration
8. **Training Config** - Dynamic configuration generation
9. **Dataset Preparation** - Download and format conversion
10. **Parameter Calculation** - Training steps and batch calculations (sketched below)
11. **Training Execution** - Model fine-tuning with monitoring
12. **Model Push** - Upload to HF Hub with documentation
13. **Model Testing** - Validation of the uploaded model
14. **Summary Report** - Complete training documentation
15. **Resource Links** - All online resource URLs
16. **Next Steps** - Usage instructions and recommendations

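The step-count arithmetic in step 10 reduces to a few lines; the helper below is a sketch under assumed names:

```python
def total_training_steps(num_samples: int, batch_size: int,
                         grad_accum: int, epochs: float) -> int:
    """Optimizer updates for one run: samples per update, times epochs."""
    effective_batch = batch_size * grad_accum
    return int(num_samples // effective_batch * epochs)

# Example with the H100 lightweight numbers: 80,000 samples, batch 16, accumulation 4
print(total_training_steps(80_000, 16, 4, 1.0))  # 1250 optimizer updates
```
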
## 📊 Monitoring Features

### **Trackio Space Interface**
- Real-time training metrics
- Experiment comparison
- System resource monitoring
- Training progress visualization

### **HF Dataset Storage**
- Persistent experiment data
- Version-controlled history
- Collaborative sharing
- Automated backup

### **Comprehensive Logging**
- Training metrics (loss, accuracy, etc.)
- System metrics (GPU, memory, CPU)
- Configuration parameters
- Training artifacts

## 🔧 Configuration Options

### **User Configuration**
```bash
# Required
HF_TOKEN="your_token"
HF_USERNAME="your_username"

# Optional
MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
DATASET_NAME="HuggingFaceTB/smoltalk"
```

### **Training Parameters**
```bash
BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=8
LEARNING_RATE=5e-6
MAX_EPOCHS=3
MAX_SEQ_LENGTH=4096
```

### **Monitoring Configuration**
```bash
TRACKIO_DATASET_REPO="username/trackio-experiments"
EXPERIMENT_NAME="smollm3_finetune_YYYYMMDD_HHMMSS"
```

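In practice these settings reach the monitoring code as environment variables; the snippet below is an illustrative sketch of how they might be consumed (the defaults are placeholders):

```python
import os
from datetime import datetime, timezone

dataset_repo = os.environ.get("TRACKIO_DATASET_REPO", "username/trackio-experiments")
experiment = os.environ.get(
    "EXPERIMENT_NAME",
    f"smollm3_finetune_{datetime.now(timezone.utc):%Y%m%d_%H%M%S}",
)
print(f"Logging experiment {experiment} to {dataset_repo}")
```
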
## 🛠️ Error Handling

### **Comprehensive Error Handling**
- Import error detection and reporting
- Configuration validation
- Network timeout handling
- Graceful degradation

### **Debugging Support**
- Detailed logging at all levels
- Component-specific error messages
- Fallback mechanisms
- Testing utilities

## 📈 Performance Optimizations

### **Training Optimizations**
- Flash Attention for efficiency
- Gradient checkpointing for memory
- Mixed precision training
- Optimized data loading

### **Monitoring Optimizations**
- Asynchronous logging
- Batch metric updates
- Efficient data storage
- Minimal overhead

## 🔄 Integration Points

### **Hugging Face Ecosystem**
- **HF Hub**: Model and dataset storage
- **HF Spaces**: Trackio monitoring interface
- **HF Datasets**: Experiment data persistence
- **HF CLI**: Authentication and deployment

### **External Services**
- **Trackio**: Experiment tracking
- **CUDA**: GPU acceleration
- **PyTorch**: Deep learning framework
- **Transformers**: Model library

## 🎯 Usage Workflow

### **1. Setup Phase**
```bash
python setup_launch.py   # Configure with user info
python test_pipeline.py  # Verify all components
```

### **2. Execution Phase**
```bash
chmod +x launch.sh  # Make executable
./launch.sh         # Run complete pipeline
```

### **3. Monitoring Phase**
- Track progress in the Trackio Space
- Monitor metrics in real time
- Check logs for issues
- Validate results

### **4. Results Phase**
- Access the model on HF Hub
- Review the training summary
- Test model performance
- Share results

## 📋 Quality Assurance

### **Testing Coverage**
- Import testing for all modules
- Script availability verification
- Configuration validation
- CUDA and token testing
- Component integration testing

### **Documentation**
- Comprehensive README
- Step-by-step guides
- Troubleshooting section
- Advanced usage examples

### **Error Recovery**
- Graceful error handling
- Detailed error messages
- Recovery mechanisms
- Fallback options

## 🚀 Future Enhancements

### **Planned Improvements**
- Multi-GPU training support
- Distributed training
- Advanced hyperparameter tuning
- Custom dataset upload
- Model evaluation metrics
- Automated testing pipeline

### **Extensibility**
- Plugin architecture for custom components
- Configuration templates
- Custom monitoring backends
- Advanced deployment options

## 📊 Success Metrics

### **Pipeline Completeness**
- ✅ All 16 steps implemented
- ✅ Error handling at each step
- ✅ Monitoring integration
- ✅ Documentation complete

### **User Experience**
- ✅ Simple setup process
- ✅ Clear error messages
- ✅ Comprehensive documentation
- ✅ Testing utilities

### **Technical Quality**
- ✅ Import path fixes
- ✅ Configuration management
- ✅ Monitoring integration
- ✅ Deployment automation

## 🎉 Conclusion

The SmolLM3 end-to-end pipeline provides a complete solution for fine-tuning with integrated monitoring, automated deployment, and comprehensive documentation. The refactored codebase is now production-ready with proper error handling, testing, and user experience considerations.

**Key Achievements:**
- Complete end-to-end automation
- Integrated monitoring and tracking
- Comprehensive error handling
- Production-ready deployment
- Extensive documentation
- Testing and validation suite

The pipeline is now ready for users to easily fine-tune SmolLM3 models with full monitoring and deployment capabilities.
README.md
CHANGED
```diff
@@ -1,4 +1,4 @@
-# SmolLM3 Fine-tuning
+# SmolLM3 Fine-tuning
 
 This repository provides a complete setup for fine-tuning SmolLM3 models using the FlexAI console, following the nanoGPT structure but adapted for modern transformer models.
 
```

(The changed heading reads identically on both sides; the +1/-1 change is presumably whitespace-only.)
README_END_TO_END.md
ADDED
# SmolLM3 End-to-End Fine-tuning Pipeline

This repository provides a complete end-to-end pipeline for fine-tuning SmolLM3 models with integrated experiment tracking, monitoring, and model deployment.

## 🚀 Quick Start

### 1. Setup Configuration

```bash
# Run the setup script to configure with your information
python setup_launch.py
```

This will prompt you for:
- Your Hugging Face username
- Your Hugging Face token
- Optional model and dataset customizations

### 2. Check Requirements

```bash
# Verify all dependencies are installed
python check_requirements.py
```

### 3. Run the Pipeline

```bash
# Make the script executable and run
chmod +x launch.sh
./launch.sh
```

## 📋 What the Pipeline Does

The end-to-end pipeline performs the following steps:

### 1. **Environment Setup**
- Installs system dependencies
- Creates a Python virtual environment
- Installs PyTorch with CUDA support
- Installs all required Python packages

### 2. **Trackio Space Deployment**
- Creates a new Hugging Face Space for experiment tracking
- Configures the Trackio monitoring interface
- Sets up environment variables

### 3. **HF Dataset Setup**
- Creates a Hugging Face Dataset repository for experiment storage
- Configures dataset access and permissions
- Sets up the initial experiment data structure

### 4. **Dataset Preparation**
- Downloads the specified dataset from Hugging Face Hub
- Converts it to training format (prompt/completion pairs)
- Handles multiple dataset formats automatically
- Creates train/validation splits

### 5. **Training Configuration**
- Creates an optimized training configuration
- Sets up monitoring integration
- Configures model parameters and hyperparameters

### 6. **Model Training**
- Runs the SmolLM3 fine-tuning process
- Logs metrics to the Trackio Space in real time
- Saves experiment data to the HF Dataset
- Creates checkpoints during training

### 7. **Model Deployment**
- Pushes the trained model to Hugging Face Hub
- Creates a comprehensive model card
- Uploads training results and logs
- Tests the uploaded model

### 8. **Summary Report**
- Generates a detailed training summary
- Provides links to all resources
- Documents configuration and results

## 🎯 Features

### **Integrated Monitoring**
- Real-time experiment tracking via Trackio Space
- Persistent storage in Hugging Face Datasets
- Comprehensive metrics logging
- System resource monitoring

### **Flexible Dataset Support**
- Automatic format detection and conversion
- Support for multiple dataset types
- Built-in data preprocessing
- Train/validation split handling

### **Optimized Training**
- Flash Attention support for efficiency
- Gradient checkpointing for memory optimization
- Mixed precision training
- Automatic hyperparameter optimization

### **Complete Deployment**
- Automated model upload to Hugging Face Hub
- Comprehensive model cards
- Training results documentation
- Model testing and validation

## 📊 Monitoring & Tracking

### **Trackio Space Interface**
- Real-time training metrics visualization
- Experiment management and comparison
- System resource monitoring
- Training progress tracking

### **HF Dataset Storage**
- Persistent experiment data storage
- Version-controlled experiment history
- Collaborative experiment sharing
- Automated data backup

## 🔧 Configuration

### **Required Configuration**
Update these variables in `launch.sh`:

```bash
# Your Hugging Face credentials
HF_TOKEN="your_hf_token_here"
HF_USERNAME="your-username"

# Model and dataset
MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
DATASET_NAME="HuggingFaceTB/smoltalk"

# Output repositories
REPO_NAME="your-username/smollm3-finetuned-$(date +%Y%m%d)"
TRACKIO_DATASET_REPO="your-username/trackio-experiments"
```

### **Training Parameters**
Customize training parameters:

```bash
# Training configuration
BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=8
LEARNING_RATE=5e-6
MAX_EPOCHS=3
MAX_SEQ_LENGTH=4096
```

## 📁 Output Structure

After running the pipeline, you'll have:

```
├── training_dataset/          # Prepared dataset
│   ├── train.json
│   └── validation.json
├── /output-checkpoint/        # Model checkpoints
│   ├── config.json
│   ├── pytorch_model.bin
│   └── training_results/
├── training.log               # Training logs
├── training_summary.md        # Summary report
└── config/train_smollm3_end_to_end.py  # Training config
```

## 🌐 Online Resources

The pipeline creates these online resources:

- **Model Repository**: `https://huggingface.co/your-username/smollm3-finetuned-YYYYMMDD`
- **Trackio Space**: `https://huggingface.co/spaces/your-username/trackio-monitoring-YYYYMMDD`
- **Experiment Dataset**: `https://huggingface.co/datasets/your-username/trackio-experiments`

## 🛠️ Troubleshooting

### **Common Issues**

1. **HF Token Issues**
   ```bash
   # Verify your token is correct
   huggingface-cli whoami
   ```

2. **CUDA Issues**
   ```bash
   # Check CUDA availability
   python -c "import torch; print(torch.cuda.is_available())"
   ```

3. **Memory Issues**
   ```bash
   # Reduce batch size or increase gradient accumulation
   BATCH_SIZE=1
   GRADIENT_ACCUMULATION_STEPS=16
   ```

4. **Dataset Issues**
   ```bash
   # Test dataset access
   python -c "from datasets import load_dataset; print(load_dataset('your-dataset'))"
   ```

### **Debug Mode**

Run individual components for debugging:

```bash
# Test Trackio deployment
cd scripts/trackio_tonic
python deploy_trackio_space.py

# Test dataset setup
cd scripts/dataset_tonic
python setup_hf_dataset.py

# Test training
python src/train.py config/train_smollm3_end_to_end.py --help
```

## 📚 Advanced Usage

### **Custom Datasets**

For custom datasets, ensure they use one of these formats:

```json
// Format 1: Prompt/Completion
{
  "prompt": "What is machine learning?",
  "completion": "Machine learning is..."
}

// Format 2: Instruction/Output
{
  "instruction": "Explain machine learning",
  "output": "Machine learning is..."
}

// Format 3: Chat format
{
  "messages": [
    {"role": "user", "content": "What is ML?"},
    {"role": "assistant", "content": "ML is..."}
  ]
}
```

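A quick way to check which of these formats a record uses is sketched below (hypothetical helper, for illustration only):

```python
def detect_format(record: dict) -> str:
    """Return which of the three supported formats a record matches."""
    if {"prompt", "completion"} <= record.keys():
        return "prompt/completion"
    if {"instruction", "output"} <= record.keys():
        return "instruction/output"
    if "messages" in record:
        return "chat"
    raise ValueError(f"Unrecognized record with keys: {sorted(record)}")

print(detect_format({"prompt": "What is ML?", "completion": "ML is..."}))
```
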
### **Custom Models**

To use a different model, update the configuration:

```bash
MODEL_NAME="microsoft/DialoGPT-medium"
MAX_SEQ_LENGTH=1024
```

### **Custom Training**

Modify training parameters in the generated config:

```python
# In config/train_smollm3_end_to_end.py
config = SmolLM3Config(
    learning_rate=1e-5,  # Custom learning rate
    max_iters=5000,      # Custom training steps
    # ... other parameters
)
```

## 🤝 Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Test the pipeline
5. Submit a pull request

## 📄 License

This project is licensed under the MIT License - see the LICENSE file for details.

## 🙏 Acknowledgments

- Hugging Face for the excellent transformers library
- The SmolLM3 team for the base model
- The Trackio team for experiment tracking
- The open-source community for contributions

## 📞 Support

For issues and questions:

1. Check the troubleshooting section
2. Review the logs in `training.log`
3. Check the Trackio Space for monitoring data
4. Open an issue on GitHub

---

**Happy Fine-tuning! 🚀**
cloud_deployment.sh
DELETED
@@ -1,279 +0,0 @@

```bash
#!/bin/bash
# Cloud Deployment Script for SmolLM3 DPO Training
# This script sets up a cloud instance for training and uploading to Hugging Face

set -e  # Exit on any error

echo "🚀 Starting SmolLM3 DPO Cloud Deployment"
echo "=========================================="

# Configuration
MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
DATASET_NAME="HuggingFaceTB/smoltalk"
EXPERIMENT_NAME="smollm3_dpo_6epochs"
REPO_NAME="your-username/smollm3-dpo-6epochs"        # Change this to your username
TRACKIO_URL="https://your-trackio-space.hf.space"    # Change this to your Trackio Space URL
HF_TOKEN="your_hf_token_here"                        # Change this to your HF token

# Training Configuration
BATCH_SIZE=2
GRADIENT_ACCUMULATION_STEPS=8
LEARNING_RATE=5e-6
MAX_EPOCHS=6
MAX_SEQ_LENGTH=4096
SAVE_STEPS=500
EVAL_STEPS=100
LOGGING_STEPS=10

echo "📋 Configuration:"
echo "  Model: $MODEL_NAME"
echo "  Dataset: $DATASET_NAME"
echo "  Experiment: $EXPERIMENT_NAME"
echo "  Repository: $REPO_NAME"
echo "  Epochs: $MAX_EPOCHS"
echo "  Batch Size: $BATCH_SIZE"
echo "  Learning Rate: $LEARNING_RATE"

# Step 1: Update system and install dependencies
echo ""
echo "🔧 Step 1: Installing system dependencies..."
sudo apt-get update
sudo apt-get install -y git curl wget unzip

# Step 2: Install Python and pip
echo ""
echo "🐍 Step 2: Installing Python dependencies..."
sudo apt-get install -y python3 python3-pip python3-venv

# Step 3: Create virtual environment
echo ""
echo "📦 Step 3: Setting up Python virtual environment..."
python3 -m venv smollm3_env
source smollm3_env/bin/activate

# Step 4: Install PyTorch with CUDA support
echo ""
echo "🔥 Step 4: Installing PyTorch with CUDA support..."
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Step 5: Install project dependencies
echo ""
echo "📚 Step 5: Installing project dependencies..."
pip install -r requirements.txt

# Step 6: Install additional dependencies for DPO
echo ""
echo "🎯 Step 6: Installing DPO-specific dependencies..."
# Quote the version specifiers so the shell does not treat ">" as redirection
pip install "trl>=0.7.0"
pip install "peft>=0.4.0"
pip install "accelerate>=0.20.0"

# Step 7: Set up Hugging Face token
echo ""
echo "🔑 Step 7: Setting up Hugging Face authentication..."
export HF_TOKEN="$HF_TOKEN"
huggingface-cli login --token "$HF_TOKEN"

# Step 8: Create DPO configuration
echo ""
echo "⚙️ Step 8: Creating DPO configuration..."
cat > config/train_smollm3_dpo_6epochs.py << EOF
"""
SmolLM3 DPO Training Configuration - 6 Epochs
Optimized for cloud deployment
"""

from config.train_smollm3_dpo import SmolLM3DPOConfig

config = SmolLM3DPOConfig(
    # Model configuration
    model_name="$MODEL_NAME",
    max_seq_length=$MAX_SEQ_LENGTH,
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Training configuration
    batch_size=$BATCH_SIZE,
    gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS,
    learning_rate=$LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=100,
    max_iters=None,  # Will be calculated based on epochs
    eval_interval=100,
    log_interval=10,
    save_interval=500,

    # DPO configuration
    beta=0.1,
    max_prompt_length=$((MAX_SEQ_LENGTH / 2)),  # half the sequence length (bash integer division uses "/")

    # Optimizer configuration
    optimizer="adamw",
    beta1=0.9,
    beta2=0.95,
    eps=1e-8,

    # Scheduler configuration
    scheduler="cosine",
    min_lr=1e-6,

    # Mixed precision
    fp16=True,
    bf16=False,

    # Logging and saving
    save_steps=$SAVE_STEPS,
    eval_steps=$EVAL_STEPS,
    logging_steps=$LOGGING_STEPS,
    save_total_limit=3,

    # Evaluation
    eval_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Data configuration
    data_dir="smoltalk_dataset",
    train_file="train.json",
    validation_file="validation.json",

    # Chat template configuration
    use_chat_template=True,
    chat_template_kwargs={
        "enable_thinking": False,
        "add_generation_prompt": True
    },

    # Trackio monitoring configuration
    enable_tracking=True,
    trackio_url="$TRACKIO_URL",
    trackio_token=None,
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
    experiment_name="$EXPERIMENT_NAME"
)
EOF

# Step 9: Download and prepare dataset
echo ""
echo "📊 Step 9: Downloading and preparing dataset..."
python -c "
from datasets import load_dataset
import json
import os

# Load the SmolTalk dataset
print('Loading SmolTalk dataset...')
dataset = load_dataset('$DATASET_NAME')

# Create the dataset directory
os.makedirs('smoltalk_dataset', exist_ok=True)

# Convert to DPO format (preference pairs)
def convert_to_dpo_format(example):
    # For SmolTalk, we create preference pairs based on response quality
    # This is a simplified example - you may need to adjust based on your needs
    return {
        'prompt': example.get('prompt', ''),
        'chosen': example.get('chosen', ''),
        'rejected': example.get('rejected', '')
    }

# Process the train split
train_data = []
for example in dataset['train']:
    dpo_example = convert_to_dpo_format(example)
    if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
        train_data.append(dpo_example)

# Process the validation split
val_data = []
for example in dataset['validation']:
    dpo_example = convert_to_dpo_format(example)
    if dpo_example['prompt'] and dpo_example['chosen'] and dpo_example['rejected']:
        val_data.append(dpo_example)

# Save to files
with open('smoltalk_dataset/train.json', 'w') as f:
    json.dump(train_data, f, indent=2)

with open('smoltalk_dataset/validation.json', 'w') as f:
    json.dump(val_data, f, indent=2)

print(f'Dataset prepared: {len(train_data)} train samples, {len(val_data)} validation samples')
"

# Step 10: Calculate training steps based on epochs
echo ""
echo "📈 Step 10: Calculating training parameters..."
TOTAL_SAMPLES=$(python -c "import json; data=json.load(open('smoltalk_dataset/train.json')); print(len(data))")
EFFECTIVE_BATCH_SIZE=$((BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
STEPS_PER_EPOCH=$((TOTAL_SAMPLES / EFFECTIVE_BATCH_SIZE))
MAX_STEPS=$((STEPS_PER_EPOCH * MAX_EPOCHS))

echo "  Total samples: $TOTAL_SAMPLES"
echo "  Effective batch size: $EFFECTIVE_BATCH_SIZE"
echo "  Steps per epoch: $STEPS_PER_EPOCH"
echo "  Total training steps: $MAX_STEPS"

# Step 11: Start DPO training
echo ""
echo "🎯 Step 11: Starting DPO training..."
python train.py config/train_smollm3_dpo_6epochs.py \
    --dataset_dir smoltalk_dataset \
    --out_dir /output-checkpoint \
    --init_from scratch \
    --max_iters $MAX_STEPS \
    --batch_size $BATCH_SIZE \
    --learning_rate $LEARNING_RATE \
    --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
    --max_seq_length $MAX_SEQ_LENGTH \
    --save_steps $SAVE_STEPS \
    --eval_steps $EVAL_STEPS \
    --logging_steps $LOGGING_STEPS \
    --enable_tracking \
    --trackio_url "$TRACKIO_URL" \
    --experiment_name "$EXPERIMENT_NAME"

# Step 12: Push model to Hugging Face Hub
echo ""
echo "📤 Step 12: Pushing model to Hugging Face Hub..."
python push_to_huggingface.py /output-checkpoint "$REPO_NAME" \
    --token "$HF_TOKEN" \
    --trackio-url "$TRACKIO_URL" \
    --experiment-name "$EXPERIMENT_NAME"

# Step 13: Test the uploaded model
echo ""
echo "🧪 Step 13: Testing uploaded model..."
python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print('Loading uploaded model...')
model = AutoModelForCausalLM.from_pretrained('$REPO_NAME', torch_dtype=torch.float16, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained('$REPO_NAME')

print('Testing model generation...')
prompt = 'Hello, how are you?'
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f'Prompt: {prompt}')
print(f'Response: {response}')
print('✅ Model test completed successfully!')
"

echo ""
echo "🎉 Deployment completed successfully!"
echo "====================================="
echo "📊 Model: https://huggingface.co/$REPO_NAME"
echo "📈 Trackio: $TRACKIO_URL"
echo "📋 Experiment: $EXPERIMENT_NAME"
echo ""
echo "Next steps:"
echo "1. Monitor training progress in your Trackio Space"
echo "2. Check the model repository on Hugging Face Hub"
echo "3. Use the model in your applications"
```

config/train_smollm3.py
CHANGED
```diff
@@ -76,6 +76,10 @@ class SmolLM3Config:
     log_metrics: bool = True
     log_config: bool = True
     experiment_name: Optional[str] = None
+    # HF Datasets configuration
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+
 
     def __post_init__(self):
         if self.chat_template_kwargs is None:
```

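Both new fields default to `None`. A minimal sketch of populating them from the `HF_TOKEN` and `TRACKIO_DATASET_REPO` environment variables (illustrative wiring, not the repository's actual code):

```python
import os

from config.train_smollm3 import SmolLM3Config

config = SmolLM3Config()
# Fall back to the environment when the fields are not set explicitly
config.hf_token = config.hf_token or os.environ.get("HF_TOKEN")
config.dataset_repo = config.dataset_repo or os.environ.get(
    "TRACKIO_DATASET_REPO", "tonic/trackio-experiments"
)
```
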
config/train_smollm3_h100_lightweight.py
ADDED
@@ -0,0 +1,112 @@
```python
"""
SmolLM3 H100 Lightweight Training Configuration
Optimized for rapid training on H100 with 80K Hermes-FR samples
"""

from config.train_smollm3 import SmolLM3Config

config = SmolLM3Config(
    # Model configuration
    model_name="HuggingFaceTB/SmolLM3-3B",
    max_seq_length=8192,
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Training configuration - optimized for H100
    batch_size=16,                  # Larger batch size for H100
    gradient_accumulation_steps=4,  # Reduced for faster updates
    learning_rate=8e-6,             # Slightly higher for rapid convergence
    weight_decay=0.01,
    warmup_steps=50,                # Reduced warmup for rapid training
    max_iters=None,                 # Will be calculated based on epochs
    eval_interval=50,               # More frequent evaluation
    log_interval=5,                 # More frequent logging
    save_interval=200,              # More frequent saving

    # Optimizer configuration - optimized for rapid training
    optimizer="adamw",
    beta1=0.9,
    beta2=0.95,
    eps=1e-8,

    # Scheduler configuration - faster learning
    scheduler="cosine",
    min_lr=2e-6,  # Higher minimum LR

    # Mixed precision (fp16) for H100
    fp16=True,
    bf16=False,

    # Logging and saving - more frequent for rapid training
    save_steps=200,
    eval_steps=50,
    logging_steps=5,
    save_total_limit=2,  # Keep fewer checkpoints

    # Evaluation
    eval_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Data configuration - Hermes-FR with sampling
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    input_field="prompt",
    target_field="completion",
    filter_bad_entries=False,
    bad_entry_field="bad_entry",

    # Chat template configuration
    use_chat_template=True,
    chat_template_kwargs={
        "enable_thinking": False,
        "add_generation_prompt": True,
        "no_think_system_message": True
    },

    # Trackio monitoring configuration
    enable_tracking=True,
    trackio_url=None,      # Will be set by launch script
    trackio_token=None,
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
    experiment_name=None,  # Will be set by launch script

    # HF Datasets configuration
    dataset_repo=None,     # Will be set by launch script

    # H100-specific optimizations
    dataloader_num_workers=4,  # Optimized for H100
    dataloader_pin_memory=True,
    gradient_clipping=1.0,     # Prevent gradient explosion

    # Memory optimizations for rapid training
    max_grad_norm=1.0,
    warmup_ratio=0.1,  # 10% warmup
    lr_scheduler_type="cosine",

    # Early stopping for rapid training
    early_stopping_patience=3,
    early_stopping_threshold=0.001,

    # H100-specific training optimizations
    remove_unused_columns=False,
    group_by_length=True,  # Group similar length sequences
    length_column_name="length",
    ignore_data_skip=False,

    # Reporting
    report_to=["tensorboard"],
    run_name="smollm3-h100-lightweight",

    # Seed for reproducibility
    seed=42,

    # Data collator settings
    data_collator_kwargs={
        "pad_to_multiple_of": 8,  # Optimized for H100
        "return_tensors": "pt"
    }
)
```

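For reference, the effective batch implied by these settings works out as follows (a quick check; the token count assumes every sequence is padded to the full `max_seq_length`, so with `group_by_length=True` it is an upper bound):

```python
batch_size = 16
gradient_accumulation_steps = 4
max_seq_length = 8192

effective_batch = batch_size * gradient_accumulation_steps  # 64 sequences per optimizer step
max_tokens_per_step = effective_batch * max_seq_length      # 524,288 tokens, upper bound
print(effective_batch, max_tokens_per_step)
```
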
config/train_smollm3_openhermes_fr.py
CHANGED
```diff
@@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFR(SmolLM3Config):
     log_metrics: bool = True
     log_config: bool = True
     experiment_name: Optional[str] = None
+    # HF Datasets configuration
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+
 
     def __post_init__(self):
         if self.chat_template_kwargs is None:
```

config/train_smollm3_openhermes_fr_a100_balanced.py
CHANGED
```diff
@@ -91,6 +91,10 @@ class SmolLM3ConfigOpenHermesFRBalanced(SmolLM3Config):
     log_metrics: bool = True
     log_config: bool = True
     experiment_name: Optional[str] = None
+    # HF Datasets configuration
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+
 
     # Additional A100 optimizations for balanced performance
     dataloader_num_workers: int = 10  # More workers for faster data loading
```

config/train_smollm3_openhermes_fr_a100_large.py
CHANGED
```diff
@@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFRA100Large(SmolLM3Config):
     log_metrics: bool = True
     log_config: bool = True
     experiment_name: Optional[str] = None
+    # HF Datasets configuration
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+
 
     # Additional A100 optimizations
     dataloader_num_workers: int = 8  # More workers for faster data loading
```

config/train_smollm3_openhermes_fr_a100_max_performance.py
CHANGED
```diff
@@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFRMaxPerformance(SmolLM3Config):
     log_metrics: bool = True
     log_config: bool = True
     experiment_name: Optional[str] = None
+    # HF Datasets configuration
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+
 
     # Additional A100 optimizations for maximum performance
     dataloader_num_workers: int = 12  # More workers for faster data loading
```

config/train_smollm3_openhermes_fr_a100_multiple_passes.py
CHANGED
```diff
@@ -85,6 +85,10 @@ class SmolLM3ConfigOpenHermesFRMultiplePasses(SmolLM3Config):
     log_metrics: bool = True
     log_config: bool = True
     experiment_name: Optional[str] = None
+    # HF Datasets configuration
+    hf_token: Optional[str] = None
+    dataset_repo: Optional[str] = None
+
 
     # Additional A100 optimizations
     dataloader_num_workers: int = 8  # More workers for faster data loading
```

A100_LARGE_SCALE_GUIDE.md → docs/A100_LARGE_SCALE_GUIDE.md
RENAMED
File without changes

docs/APP_CONFIGURATION_GUIDE.md
ADDED
@@ -0,0 +1,234 @@
# ⚙️ App Configuration Guide

## Overview

The Trackio app now includes a **Configuration tab** that allows you to set your Hugging Face token and dataset repository directly through the interface, providing an alternative to environment variables.

## 🚀 New Features

### **Configuration Tab**
- ✅ **HF Token Input**: Secure password field for your Hugging Face token
- ✅ **Dataset Repository Input**: Text field for your dataset repository
- ✅ **Update Configuration**: Apply new settings and reload experiments
- ✅ **Test Connection**: Verify access to the dataset repository
- ✅ **Create Dataset**: Create a new dataset repository if it doesn't exist

### **Flexible Configuration**
- ✅ **Environment Variables**: Still supported as a fallback
- ✅ **Interface Input**: New direct input method
- ✅ **Dynamic Updates**: Change configuration without restarting
- ✅ **Validation**: Input validation and error handling

## 📋 Configuration Tab Usage

### **1. Access the Configuration Tab**
- Open the Trackio app
- Click on the "⚙️ Configuration" tab
- You'll see input fields for HF Token and Dataset Repository

### **2. Set Your HF Token**
```
Hugging Face Token: hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
```
- **Type**: Password field (hidden for security)
- **Required**: Yes (for dataset access)
- **Format**: Your HF token starting with `hf_`
- **Help**: Click the help text for instructions on getting your token

### **3. Set Your Dataset Repository**
```
Dataset Repository: your-username/your-dataset-name
```
- **Type**: Text field
- **Required**: No (defaults to `tonic/trackio-experiments`)
- **Format**: `username/dataset-name`
- **Examples**:
  - `tonic/trackio-experiments`
  - `your-username/my-experiments`
  - `your-org/team-experiments`

### **4. Use the Action Buttons**

#### **Update Configuration**
- Applies new settings immediately
- Reloads experiments with the new configuration
- Shows current status and experiment count

#### **Test Connection**
- Verifies access to the dataset repository
- Tests HF token permissions
- Shows dataset information and experiment count

#### **Create Dataset**
- Creates a new dataset repository if it doesn't exist
- Sets up the correct schema for experiments
- Makes the dataset private by default

## 🔧 Configuration Methods

### **Method 1: Interface Input (New)**
1. Go to the "⚙️ Configuration" tab
2. Enter your HF token and dataset repository
3. Click "Update Configuration"
4. Verify with "Test Connection"

### **Method 2: Environment Variables (Existing)**
```bash
# Set environment variables
export HF_TOKEN=your_hf_token_here
export TRACKIO_DATASET_REPO=your-username/your-dataset-name

# Or for HF Spaces, add to the Space settings
HF_TOKEN=your_hf_token_here
TRACKIO_DATASET_REPO=your-username/your-dataset-name
```

### **Method 3: Hybrid Approach**
- Set environment variables as defaults
- Override specific values through the interface
- Interface values take precedence over environment variables

## 📊 Configuration Priority

The app uses this priority order for configuration (a sketch of the resolution logic follows the list):

1. **Interface Input** (highest priority)
2. **Environment Variables** (fallback)
3. **Default Values** (lowest priority)
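
A minimal sketch of that resolution order, assuming a helper like `resolve_setting` (not the app's actual function name):

```python
import os
from typing import Optional

def resolve_setting(interface_value: Optional[str], env_var: str, default: str) -> str:
    """Resolve a setting using the priority order described above."""
    if interface_value:                   # 1. interface input wins
        return interface_value
    env_value = os.environ.get(env_var)   # 2. environment variable fallback
    if env_value:
        return env_value
    return default                        # 3. built-in default

# Example: resolving the dataset repository with no interface input
repo = resolve_setting(None, "TRACKIO_DATASET_REPO", "tonic/trackio-experiments")
```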

## 🛠️ Getting Your HF Token

### **Step-by-Step Instructions**
1. Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
2. Click "New token"
3. Give it a name (e.g., "Trackio Access")
4. Select "Write" permissions
5. Click "Generate token"
6. Copy the token (starts with `hf_`)
7. Paste it in the app's HF Token field

### **Token Permissions**
- **Read**: Required for loading experiments
- **Write**: Required for saving experiments
- **Scope**: Should have access to your dataset repositories

## 📁 Dataset Repository Format

### **Correct Format**
```
username/dataset-name
```

### **Examples**
- `tonic/trackio-experiments` (default)
- `your-username/my-experiments`
- `your-org/team-experiments`
- `your-username/smollm3-experiments`

### **Validation**
- Must contain exactly one `/`
- Username must be a valid HF username
- Dataset name must be valid (alphanumeric plus hyphens)

## 🔍 Testing Your Configuration

### **1. Test Connection**
- Enter your HF token and dataset repository
- Click "Test Connection"
- Should show: "✅ Connection successful!"

### **2. Create Dataset (if needed)**
- If the dataset doesn't exist, click "Create Dataset"
- Should show: "✅ Dataset created successfully!"

### **3. Update Configuration**
- Click "Update Configuration"
- Should show: "✅ Configuration updated successfully!"

## 🚨 Troubleshooting

### **Issue: "Please provide a Hugging Face token"**
**Solution**:
- Enter your HF token in the interface
- Or set the `HF_TOKEN` environment variable

### **Issue: "Connection failed: 401 Unauthorized"**
**Solutions**:
1. Check your HF token is correct
2. Verify the token has read access to the dataset
3. Ensure the dataset repository exists

### **Issue: "Failed to create dataset"**
**Solutions**:
1. Check your HF token has write permissions
2. Verify the username in the repository name
3. Ensure the dataset name is valid

### **Issue: "Dataset repository must be in format: username/dataset-name"**
**Solution**:
- Use the correct format: `username/dataset-name`
- Example: `your-username/my-experiments`

## 📈 Benefits

### **For Users**
- ✅ **Easy Setup**: No need to set environment variables
- ✅ **Visual Interface**: Clear input fields and validation
- ✅ **Immediate Feedback**: Test the connection and see results
- ✅ **Flexible**: Change configuration at any time

### **For Development**
- ✅ **Backward Compatible**: Environment variables still work
- ✅ **Fallback Support**: Graceful degradation
- ✅ **Error Handling**: Clear error messages
- ✅ **Validation**: Input validation and testing

### **For Deployment**
- ✅ **HF Spaces Ready**: Works on Hugging Face Spaces
- ✅ **No Restart Required**: Dynamic configuration updates
- ✅ **Secure**: Password field for token input
- ✅ **User-Friendly**: Clear instructions and help text

## 🎯 Usage Examples

### **Basic Setup**
1. Open the app
2. Go to the "⚙️ Configuration" tab
3. Enter your HF token
4. Enter your dataset repository
5. Click "Update Configuration"
6. Click "Test Connection" to verify

### **Advanced Setup**
1. Set environment variables as defaults
2. Use the interface to override specific values
3. Test the connection to verify access
4. Create the dataset if it doesn't exist
5. Start using the app with persistent storage

### **Team Setup**
1. Create a shared dataset repository
2. Share the repository name with the team
3. Each team member sets their own HF token
4. All experiments are stored in the shared dataset

## 📋 Configuration Status

The app shows the current configuration status:
```
📊 Dataset: your-username/your-dataset
🔑 HF Token: Set
📈 Experiments: 5
```

## 🔄 Updating Configuration

You can update the configuration at any time:
1. Go to the "⚙️ Configuration" tab
2. Change the HF token or dataset repository
3. Click "Update Configuration"
4. Experiments will reload with the new settings

---

**🎉 Your Trackio app is now more flexible and user-friendly with direct configuration input!**

CLOUD_DEPLOYMENT_GUIDE.md → docs/CLOUD_DEPLOYMENT_GUIDE.md
RENAMED
File without changes

CLOUD_TRAINING_GUIDE.md → docs/CLOUD_TRAINING_GUIDE.md
RENAMED
File without changes

DEPLOYMENT_GUIDE.md → docs/DEPLOYMENT_GUIDE.md
RENAMED
File without changes

docs/ENVIRONMENT_VARIABLES.md
ADDED
@@ -0,0 +1,113 @@
# 🔧 Trackio Environment Variables Reference

## Quick Setup

Set these environment variables in your Hugging Face Space:

```bash
# Required: Your HF token for dataset access
HF_TOKEN=your_hf_token_here

# Optional: Dataset repository to use (defaults to tonic/trackio-experiments)
TRACKIO_DATASET_REPO=your-username/your-dataset-name
```

## Environment Variables

| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `HF_TOKEN` | ✅ Yes | None | Your Hugging Face token for dataset access |
| `TRACKIO_DATASET_REPO` | ❌ No | `tonic/trackio-experiments` | Dataset repository to load experiments from |
| `SPACE_ID` | 🔄 Auto | None | HF Space ID (automatically detected) |

## Configuration Examples

### 1. Default Setup
```bash
HF_TOKEN=your_token_here
# Uses: tonic/trackio-experiments
```

### 2. Personal Dataset
```bash
HF_TOKEN=your_token_here
TRACKIO_DATASET_REPO=your-username/trackio-experiments
```

### 3. Team Dataset
```bash
HF_TOKEN=your_token_here
TRACKIO_DATASET_REPO=your-org/team-experiments
```

### 4. Project-Specific Dataset
```bash
HF_TOKEN=your_token_here
TRACKIO_DATASET_REPO=your-username/smollm3-experiments
```

## How to Set in HF Spaces

1. Go to your Hugging Face Space settings
2. Navigate to "Settings" → "Environment variables"
3. Add the variables:
   - `HF_TOKEN`: Your HF token
   - `TRACKIO_DATASET_REPO`: Your dataset repository (optional)

## Testing Configuration

Run the configuration script to check your setup:

```bash
python configure_trackio.py
```

This will:
- ✅ Show current environment variables
- 🧪 Test dataset access
- 📊 Display the experiment count
- 💾 Generate a configuration file

## Getting Your HF Token

1. Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
2. Click "New token"
3. Give it a name (e.g., "Trackio Access")
4. Select "Write" permissions
5. Copy the token and set it as `HF_TOKEN`

## Dataset Repository Format

The `TRACKIO_DATASET_REPO` should follow this format (a validation sketch follows the examples):
```
username/dataset-name
```

Examples:
- `tonic/trackio-experiments`
- `your-username/my-experiments`
- `your-org/team-experiments`

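A small validation sketch mirroring these rules; the helper name and regex are illustrative, and Hugging Face's exact naming rules may differ slightly:

```python
import re

# Hypothetical validator for "username/dataset-name" repository ids
REPO_PATTERN = re.compile(r"^[A-Za-z0-9][\w.-]*/[A-Za-z0-9][\w.-]*$")

def is_valid_repo(repo: str) -> bool:
    """A repo id must be exactly 'username/dataset-name'."""
    return repo.count("/") == 1 and bool(REPO_PATTERN.match(repo))

assert is_valid_repo("tonic/trackio-experiments")
assert not is_valid_repo("trackio-experiments")  # missing username
assert not is_valid_repo("a/b/c")                # too many slashes
```
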
90 |
+
|
91 |
+
## Troubleshooting
|
92 |
+
|
93 |
+
### Issue: "HF_TOKEN not found"
|
94 |
+
**Solution**: Set your HF token in the Space environment variables
|
95 |
+
|
96 |
+
### Issue: "Failed to load dataset"
|
97 |
+
**Solutions**:
|
98 |
+
1. Check your token has read access to the dataset
|
99 |
+
2. Verify the dataset repository exists
|
100 |
+
3. Try the backup fallback (automatic)
|
101 |
+
|
102 |
+
### Issue: "Failed to save experiments"
|
103 |
+
**Solutions**:
|
104 |
+
1. Check your token has write permissions
|
105 |
+
2. Verify the dataset repository exists
|
106 |
+
3. Check network connectivity
|
107 |
+
|
108 |
+
## Security Notes
|
109 |
+
|
110 |
+
- 🔒 Dataset is private by default
|
111 |
+
- 🔑 Only accessible with your HF_TOKEN
|
112 |
+
- 🛡️ No sensitive data exposed publicly
|
113 |
+
- 🔐 Secure storage on HF infrastructure
|
docs/HF_DATASETS_GUIDE.md
ADDED
@@ -0,0 +1,269 @@
# 🚀 Trackio with Hugging Face Datasets - Complete Guide

## Overview

This guide explains how to use Hugging Face Datasets for persistent storage of Trackio experiments, providing reliable data persistence across Hugging Face Spaces deployments.

## 🏗️ Architecture

### Why HF Datasets?

1. **Persistent Storage**: Data survives Space restarts and redeployments
2. **Version Control**: Automatic versioning of experiment data
3. **Access Control**: Private datasets for security
4. **Reliability**: HF's infrastructure ensures data availability
5. **Scalability**: Handles large amounts of experiment data

### Data Flow

```
Training Script → Trackio App → HF Dataset → Trackio App → Plots
```

## 🚀 Setup Instructions

### 1. Create an HF Token

1. Go to [Hugging Face Settings](https://huggingface.co/settings/tokens)
2. Create a new token with `write` permissions
3. Copy the token for use in your Space

### 2. Set Up the Dataset Repository

```bash
# Run the setup script
python setup_hf_dataset.py
```

This will:
- Create a private dataset: `tonic/trackio-experiments`
- Add your existing experiments
- Configure the dataset for Trackio

### 3. Configure the Hugging Face Space

#### Environment Variables
Set these in your HF Space settings:
```bash
HF_TOKEN=your_hf_token_here
TRACKIO_DATASET_REPO=your-username/your-dataset-name
```

**Environment Variables Explained:**
- `HF_TOKEN`: Your Hugging Face token (required for dataset access)
- `TRACKIO_DATASET_REPO`: Dataset repository to use (optional, defaults to `tonic/trackio-experiments`)

**Example Configurations:**
```bash
# Use the default dataset
HF_TOKEN=your_token_here

# Use a personal dataset
HF_TOKEN=your_token_here
TRACKIO_DATASET_REPO=your-username/trackio-experiments

# Use a team dataset
HF_TOKEN=your_token_here
TRACKIO_DATASET_REPO=your-org/team-experiments

# Use a project-specific dataset
HF_TOKEN=your_token_here
TRACKIO_DATASET_REPO=your-username/smollm3-experiments
```

#### Requirements
Update your `requirements.txt`:
```txt
gradio>=4.0.0
plotly>=5.0.0
pandas>=1.5.0
numpy>=1.24.0
datasets>=2.14.0
huggingface-hub>=0.16.0
requests>=2.31.0
```

### 4. Deploy the Updated App

The updated `app.py` now:
- Loads experiments from the HF Dataset
- Saves new experiments to the dataset
- Falls back to backup data if the dataset is unavailable
- Provides better error handling

### 5. Configure Environment Variables

Use the configuration script to check your setup:

```bash
python configure_trackio.py
```

This script will:
- Show current environment variables
- Test dataset access
- Generate a configuration file
- Provide usage examples

**Available Environment Variables:**

| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `HF_TOKEN` | Yes | None | Your Hugging Face token |
| `TRACKIO_DATASET_REPO` | No | `tonic/trackio-experiments` | Dataset repository to use |
| `SPACE_ID` | Auto | None | HF Space ID (auto-detected) |

## 📊 Dataset Schema

The HF Dataset contains these columns:

| Column | Type | Description |
|--------|------|-------------|
| `experiment_id` | string | Unique experiment identifier |
| `name` | string | Experiment name |
| `description` | string | Experiment description |
| `created_at` | string | ISO timestamp |
| `status` | string | running/completed/failed |
| `metrics` | string | JSON array of metric entries |
| `parameters` | string | JSON object of experiment parameters |
| `artifacts` | string | JSON array of artifacts |
| `logs` | string | JSON array of log entries |
| `last_updated` | string | ISO timestamp of last update |

## 🔧 Technical Details

### Loading Experiments

```python
import json
import os

from datasets import load_dataset

HF_TOKEN = os.environ.get("HF_TOKEN")  # token with read access

# Load from the HF Dataset
dataset = load_dataset("tonic/trackio-experiments", token=HF_TOKEN)

# Convert rows to an experiments dict
experiments = {}
for row in dataset['train']:
    experiments[row['experiment_id']] = {
        'id': row['experiment_id'],
        'metrics': json.loads(row['metrics']),
        'parameters': json.loads(row['parameters']),
        # ... other fields
    }
```

### Saving Experiments

```python
import json

from datasets import Dataset

# Convert experiments to the dataset format
dataset_data = []
for exp_id, exp_data in experiments.items():
    dataset_data.append({
        'experiment_id': exp_id,
        'metrics': json.dumps(exp_data['metrics']),
        'parameters': json.dumps(exp_data['parameters']),
        # ... other fields
    })

# Push to the HF Hub
dataset = Dataset.from_list(dataset_data)
dataset.push_to_hub("tonic/trackio-experiments", token=HF_TOKEN, private=True)
```

## 📈 Your Current Experiments

### Available Experiments

1. **`exp_20250720_130853`** (petite-elle-l-aime-3)
   - 4 metric entries (steps 25, 50, 75, 100)
   - Loss decreasing: 1.1659 → 1.1528
   - Good convergence pattern

2. **`exp_20250720_134319`** (petite-elle-l-aime-3-1)
   - 2 metric entries (step 25)
   - Loss: 1.166
   - GPU memory tracking

### Metrics Available for Plotting

- `loss` - Training loss curve
- `learning_rate` - Learning rate schedule
- `mean_token_accuracy` - Token-level accuracy
- `grad_norm` - Gradient norm
- `num_tokens` - Tokens processed
- `epoch` - Training epoch
- `gpu_0_memory_allocated` - GPU memory usage
- `cpu_percent` - CPU usage
- `memory_percent` - System memory

## 🎯 Usage Instructions

### 1. View Experiments
- Go to the "View Experiments" tab
- Enter an experiment ID: `exp_20250720_130853` or `exp_20250720_134319`
- Click "View Experiment"

### 2. Create Plots
- Go to the "Visualizations" tab
- Enter an experiment ID
- Select a metric to plot
- Click "Create Plot"

### 3. Compare Experiments
- Use the "Experiment Comparison" feature
- Enter: `exp_20250720_130853,exp_20250720_134319`
- Compare loss curves

## 🔍 Troubleshooting

### Issue: "No metrics data available"
**Solutions**:
1. Check `HF_TOKEN` is set correctly
2. Verify the dataset repository exists
3. Check network connectivity to the HF Hub

### Issue: "Failed to load from dataset"
**Solutions**:
1. The app falls back to backup data automatically
2. Check dataset permissions
3. Verify the token has read access

### Issue: "Failed to save experiments"
**Solutions**:
1. Check the token has write permissions
2. Verify the dataset repository exists
3. Check network connectivity

## 🚀 Benefits of This Approach

### ✅ Advantages
- **Persistent**: Data survives Space restarts
- **Reliable**: HF's infrastructure ensures availability
- **Secure**: Private datasets protect your data
- **Scalable**: Handles large amounts of experiment data
- **Versioned**: Automatic versioning of experiment data

### 🔄 Fallback Strategy
1. **Primary**: Load from the HF Dataset
2. **Secondary**: Use backup data (your existing experiments)
3. **Tertiary**: Create new experiments locally

## 📋 Next Steps

1. **Set HF_TOKEN**: Add your token to the Space environment
2. **Run Setup**: Execute `setup_hf_dataset.py`
3. **Deploy App**: Push the updated `app.py` to your Space
4. **Test Plots**: Verify experiments load and plots work
5. **Monitor Training**: New experiments will be saved to the dataset

## 🔐 Security Notes

- The dataset is **private** by default
- Only accessible with your `HF_TOKEN`
- Experiment data is stored securely on HF infrastructure
- No sensitive data is exposed publicly

---

**Your experiments are now configured for reliable persistence using Hugging Face Datasets!** 🎉

docs/HF_SPACES_GUIDE.md
ADDED
@@ -0,0 +1,163 @@
# 🚀 Trackio on Hugging Face Spaces - Complete Guide

## Overview

This guide explains how to properly deploy and use Trackio on Hugging Face Spaces, addressing the unique challenges of ephemeral storage and data persistence.

## 🏗️ Hugging Face Spaces Architecture

### Key Challenges

1. **Ephemeral Storage**: The file system gets reset between deployments
2. **No Persistent Storage**: Files written during runtime don't persist
3. **Multiple Instances**: Training and monitoring might run in different environments
4. **Limited File System**: Restricted write permissions in certain directories

### How Trackio Handles HF Spaces

The updated Trackio app now includes:

- **Automatic HF Spaces Detection**: Detects when running on HF Spaces
- **Persistent Path Selection**: Uses `/tmp/` for better persistence
- **Backup Recovery**: Automatically recovers experiments from backup data
- **Fallback Storage**: Multiple storage locations for redundancy

## 📊 Your Current Experiments

Based on your logs, you have these experiments available:

### Experiment 1: `exp_20250720_130853`
- **Name**: petite-elle-l-aime-3
- **Status**: Running
- **Metrics**: 4 entries (steps 25, 50, 75, 100)
- **Key Metrics**: Loss decreasing from 1.1659 to 1.1528

### Experiment 2: `exp_20250720_134319`
- **Name**: petite-elle-l-aime-3-1
- **Status**: Running
- **Metrics**: 2 entries (step 25)
- **Key Metrics**: Loss 1.166, GPU memory usage

## 🎯 How to Use Your Experiments

### 1. View Experiments
- Go to the "View Experiments" tab
- Enter an experiment ID: `exp_20250720_130853` or `exp_20250720_134319`
- Click "View Experiment" to see details

### 2. Create Plots
- Go to the "Visualizations" tab
- Enter an experiment ID
- Select a metric to plot:
  - `loss` - Training loss curve
  - `learning_rate` - Learning rate schedule
  - `mean_token_accuracy` - Token accuracy
  - `grad_norm` - Gradient norm
  - `gpu_0_memory_allocated` - GPU memory usage

### 3. Compare Experiments
- Use the "Experiment Comparison" feature
- Enter: `exp_20250720_130853,exp_20250720_134319`
- Compare loss curves between experiments

## 🔧 Technical Details

### Data Persistence Strategy

```python
# HF Spaces detection
if os.environ.get('SPACE_ID'):
    data_file = "/tmp/trackio_experiments.json"
else:
    data_file = "trackio_experiments.json"
```

### Backup Recovery

The app automatically recovers your experiments from backup data when:
- Running on HF Spaces
- No existing experiments are found
- The data file is missing or empty

### Storage Locations

Listed in order of preference (a small sketch of the fallback order follows the list):

1. **Primary**: `/tmp/trackio_experiments.json`
2. **Backup**: `/tmp/trackio_backup.json`
3. **Fallback**: Local directory (for development)
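
A minimal sketch of that fallback order, assuming a helper like `pick_data_file` (illustrative, not the app's actual function):

```python
import os

def pick_data_file() -> str:
    """Resolve the experiments file following the order listed above."""
    if os.environ.get("SPACE_ID"):  # running on HF Spaces
        primary = "/tmp/trackio_experiments.json"
        backup = "/tmp/trackio_backup.json"
        if os.path.exists(primary):
            return primary
        if os.path.exists(backup):
            return backup
        return primary  # will be created on first save
    return "trackio_experiments.json"  # local development fallback
```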
87 |
+
|
88 |
+
## 🚀 Deployment Best Practices
|
89 |
+
|
90 |
+
### 1. Environment Variables
|
91 |
+
```bash
|
92 |
+
# Set in HF Spaces environment
|
93 |
+
SPACE_ID=your-space-id
|
94 |
+
TRACKIO_URL=https://your-space.hf.space
|
95 |
+
```
|
96 |
+
|
97 |
+
### 2. File Structure
|
98 |
+
```
|
99 |
+
your-space/
|
100 |
+
├── app.py # Main Trackio app
|
101 |
+
├── requirements.txt # Dependencies
|
102 |
+
├── README.md # Space description
|
103 |
+
└── .gitignore # Ignore temporary files
|
104 |
+
```
|
105 |
+
|
106 |
+
### 3. Requirements
|
107 |
+
```txt
|
108 |
+
gradio>=4.0.0
|
109 |
+
plotly>=5.0.0
|
110 |
+
pandas>=1.5.0
|
111 |
+
numpy>=1.24.0
|
112 |
+
```
|
113 |
+
|
114 |
+
## 📈 Monitoring Your Training
|
115 |
+
|
116 |
+
### Real-time Metrics
|
117 |
+
Your experiments show:
|
118 |
+
- **Loss**: Decreasing from 1.1659 to 1.1528 (good convergence)
|
119 |
+
- **Learning Rate**: Properly scheduled from 7e-08 to 2.8875e-07
|
120 |
+
- **Token Accuracy**: Around 75-76% (reasonable for early training)
|
121 |
+
- **GPU Memory**: ~17GB allocated, 75GB reserved
|
122 |
+
|
123 |
+
### Expected Behavior
|
124 |
+
- Loss should continue decreasing
|
125 |
+
- Learning rate will follow cosine schedule
|
126 |
+
- Token accuracy should improve over time
|
127 |
+
- GPU memory usage should remain stable
|
128 |
+
|
129 |
+
## 🔍 Troubleshooting
|
130 |
+
|
131 |
+
### Issue: "No metrics data available"
|
132 |
+
**Solution**: The app now automatically recovers experiments from backup
|
133 |
+
|
134 |
+
### Issue: Plots not showing
|
135 |
+
**Solution**:
|
136 |
+
1. Check experiment ID is correct
|
137 |
+
2. Try different metrics (loss, learning_rate, etc.)
|
138 |
+
3. Refresh the page
|
139 |
+
|
140 |
+
### Issue: Data not persisting
|
141 |
+
**Solution**:
|
142 |
+
1. App now uses `/tmp/` for better persistence
|
143 |
+
2. Backup recovery ensures data availability
|
144 |
+
3. Multiple storage locations provide redundancy
|
145 |
+
|
146 |
+
## 🎯 Next Steps
|
147 |
+
|
148 |
+
1. **Deploy Updated App**: Push the updated `app.py` to your HF Space
|
149 |
+
2. **Test Plots**: Try plotting your experiments
|
150 |
+
3. **Monitor Training**: Continue monitoring your training runs
|
151 |
+
4. **Add New Experiments**: Create new experiments as needed
|
152 |
+
|
153 |
+
## 📞 Support
|
154 |
+
|
155 |
+
If you encounter issues:
|
156 |
+
1. Check the logs in your HF Space
|
157 |
+
2. Verify experiment IDs are correct
|
158 |
+
3. Try the backup recovery feature
|
159 |
+
4. Contact for additional support
|
160 |
+
|
161 |
+
---
|
162 |
+
|
163 |
+
**Your experiments are now properly configured and should display correctly in the Trackio interface!** 🎉
|
docs/MONITORING_IMPROVEMENTS_SUMMARY.md
ADDED
@@ -0,0 +1,191 @@
# 🚀 Monitoring Improvements Summary

## Overview

The monitoring system has been significantly enhanced to support **Hugging Face Datasets** for persistent experiment storage, making it ideal for deployment on Hugging Face Spaces and other cloud environments.

## ✅ Key Improvements Made

### 1. **Enhanced `monitoring.py`**
- ✅ **HF Datasets Integration**: Added support for saving experiments to HF Datasets repositories
- ✅ **Environment Variables**: Automatic detection of `HF_TOKEN` and `TRACKIO_DATASET_REPO`
- ✅ **Fallback Support**: Graceful degradation if HF Datasets is unavailable
- ✅ **Dual Storage**: Experiments saved to both Trackio and HF Datasets
- ✅ **Periodic Saving**: Metrics saved to the HF Dataset every 10 steps (sketched after this list)
- ✅ **Error Handling**: Robust error logging and recovery
+
|
17 |
+
### 2. **Updated `train.py`**
|
18 |
+
- ✅ **Monitoring Integration**: Automatic monitoring setup in training scripts
|
19 |
+
- ✅ **Configuration Logging**: Experiment configuration logged at start
|
20 |
+
- ✅ **Training Callbacks**: Monitoring callbacks added to trainer
|
21 |
+
- ✅ **Summary Logging**: Training summaries logged at completion
|
22 |
+
- ✅ **Error Logging**: Errors logged to monitoring system
|
23 |
+
- ✅ **Cleanup**: Proper monitoring session cleanup
|
24 |
+
|
### 3. **Configuration Files Updated**
- ✅ **HF Datasets Config**: Added `hf_token` and `dataset_repo` parameters
- ✅ **Environment Support**: Environment variables automatically detected
- ✅ **Backward Compatible**: Existing configurations still work

### 4. **New Utility Scripts**
- ✅ **`configure_trackio.py`**: Configuration testing and setup
- ✅ **`integrate_monitoring.py`**: Automated integration script
- ✅ **`test_monitoring_integration.py`**: Comprehensive testing
- ✅ **`setup_hf_dataset.py`**: Dataset repository setup

### 5. **Documentation**
- ✅ **`MONITORING_INTEGRATION_GUIDE.md`**: Comprehensive usage guide
- ✅ **`ENVIRONMENT_VARIABLES.md`**: Environment variable reference
- ✅ **`HF_DATASETS_GUIDE.md`**: Detailed HF Datasets guide

## 🔧 Environment Variables

| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `HF_TOKEN` | ✅ Yes | None | Your Hugging Face token |
| `TRACKIO_DATASET_REPO` | ❌ No | `tonic/trackio-experiments` | Dataset repository |
| `TRACKIO_URL` | ❌ No | None | Trackio server URL |
| `TRACKIO_TOKEN` | ❌ No | None | Trackio authentication token |

## 📊 What Gets Monitored

### **Training Metrics**
- Loss values (training and validation)
- Learning rate
- Gradient norms
- Training steps and epochs

### **System Metrics**
- GPU memory usage
- GPU utilization
- CPU usage
- Memory usage

### **Experiment Data**
- Configuration parameters
- Model checkpoints
- Evaluation results
- Training summaries

### **Artifacts**
- Configuration files
- Training logs
- Evaluation results
- Model checkpoints
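
The system metrics above come from `psutil` and `pynvml`, both added to the core requirements. A minimal sampler under that assumption (the function name is illustrative):

```python
import psutil
import pynvml

def sample_system_metrics() -> dict:
    """Collect the GPU/CPU/memory readings listed above."""
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    metrics = {
        "gpu_memory_used_gb": mem.used / 1024**3,
        "gpu_utilization_pct": util.gpu,
        "cpu_percent": psutil.cpu_percent(),
        "memory_percent": psutil.virtual_memory().percent,
    }
    pynvml.nvmlShutdown()
    return metrics
```
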
## 🚀 Usage Examples

### **Basic Training**
```bash
# Set environment variables
export HF_TOKEN=your_token_here
export TRACKIO_DATASET_REPO=your-username/experiments

# Run training with monitoring
python train.py config/train_smollm3_openhermes_fr.py
```

### **Advanced Configuration**
```bash
# Train with custom settings
python train.py config/train_smollm3_openhermes_fr.py \
    --experiment_name "smollm3_french_v2" \
    --hf_token your_token_here \
    --dataset_repo your-username/french-experiments
```

### **Testing Setup**
```bash
# Test configuration
python configure_trackio.py

# Test monitoring integration
python test_monitoring_integration.py

# Test dataset access
python test_hf_datasets.py
```

## 📈 Benefits

### **For HF Spaces Deployment**
- ✅ **Persistent Storage**: Data survives Space restarts
- ✅ **No Local Storage**: No dependency on ephemeral storage
- ✅ **Scalable**: Works with any dataset size
- ✅ **Secure**: Private dataset storage

### **For Experiment Management**
- ✅ **Centralized**: All experiments in one place
- ✅ **Searchable**: Easy to find specific experiments
- ✅ **Versioned**: Dataset versioning for experiments
- ✅ **Collaborative**: Share experiments with team

### **For Development**
- ✅ **Flexible**: Easy to switch between datasets
- ✅ **Configurable**: Environment-based configuration
- ✅ **Robust**: Fallback mechanisms
- ✅ **Debuggable**: Comprehensive logging

## 🧪 Testing Results

All monitoring integration tests passed:
- ✅ Module Import
- ✅ Monitor Creation
- ✅ Config Creation
- ✅ Metrics Logging
- ✅ Configuration Logging
- ✅ System Metrics
- ✅ Training Summary
- ✅ Callback Creation

## 📋 Files Modified/Created

### **Core Files**
- `monitoring.py` - Enhanced with HF Datasets support
- `train.py` - Updated with monitoring integration
- `requirements_core.txt` - Added monitoring dependencies
- `requirements_space.txt` - Updated for HF Spaces

### **Configuration Files**
- `config/train_smollm3.py` - Added HF Datasets config
- `config/train_smollm3_openhermes_fr.py` - Added HF Datasets config
- `config/train_smollm3_openhermes_fr_a100_balanced.py` - Added HF Datasets config
- `config/train_smollm3_openhermes_fr_a100_large.py` - Added HF Datasets config
- `config/train_smollm3_openhermes_fr_a100_max_performance.py` - Added HF Datasets config
- `config/train_smollm3_openhermes_fr_a100_multiple_passes.py` - Added HF Datasets config

### **New Utility Scripts**
- `configure_trackio.py` - Configuration testing
- `integrate_monitoring.py` - Automated integration
- `test_monitoring_integration.py` - Comprehensive testing
- `setup_hf_dataset.py` - Dataset setup

### **Documentation**
- `MONITORING_INTEGRATION_GUIDE.md` - Usage guide
- `ENVIRONMENT_VARIABLES.md` - Environment reference
- `HF_DATASETS_GUIDE.md` - HF Datasets guide
- `MONITORING_IMPROVEMENTS_SUMMARY.md` - This summary

## 🎯 Next Steps

1. **Set up your HF token and dataset repository**
2. **Test the configuration with `python configure_trackio.py`**
3. **Run a training experiment to verify full functionality**
4. **Check your HF Dataset repository for experiment data**
5. **View results in your Trackio interface**

## 🔍 Troubleshooting

### **Common Issues**
- **HF_TOKEN not set**: Set your Hugging Face token
- **Dataset access failed**: Check token permissions and repository existence
- **Monitoring not working**: Run `python test_monitoring_integration.py` to diagnose

### **Getting Help**
- Check the comprehensive guides in the documentation files
- Run the test scripts to verify your setup
- Check logs for specific error messages

---

**🎉 The monitoring system is now ready for production use with persistent HF Datasets storage!**
docs/MONITORING_INTEGRATION_GUIDE.md
ADDED
@@ -0,0 +1,245 @@
# 🔧 Improved Monitoring Integration Guide

## Overview

The monitoring system has been enhanced to support **Hugging Face Datasets** for persistent experiment storage, making it ideal for deployment on Hugging Face Spaces and other cloud environments.

## 🚀 Key Improvements

### 1. **HF Datasets Integration**
- ✅ **Persistent Storage**: Experiments are saved to HF Datasets repositories
- ✅ **Environment Variables**: Configurable via `HF_TOKEN` and `TRACKIO_DATASET_REPO`
- ✅ **Fallback Support**: Graceful degradation if HF Datasets unavailable
- ✅ **Automatic Backup**: Local files as backup

### 2. **Enhanced Monitoring Features**
- 📊 **Real-time Metrics**: Training metrics logged to both Trackio and HF Datasets
- 🔧 **System Metrics**: GPU memory, CPU usage, and system performance
- 📈 **Training Summaries**: Comprehensive experiment summaries
- 🛡️ **Error Handling**: Robust error logging and recovery

### 3. **Easy Integration**
- 🔌 **Automatic Setup**: Environment variables automatically detected
- 📝 **Configuration**: Simple setup with environment variables
- 🔄 **Backward Compatible**: Works with existing Trackio setup

## 📋 Environment Variables

| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `HF_TOKEN` | ✅ Yes | None | Your Hugging Face token |
| `TRACKIO_DATASET_REPO` | ❌ No | `tonic/trackio-experiments` | Dataset repository |
| `TRACKIO_URL` | ❌ No | None | Trackio server URL |
| `TRACKIO_TOKEN` | ❌ No | None | Trackio authentication token |

## 🛠️ Setup Instructions

### 1. **Get Your HF Token**
```bash
# Go to https://huggingface.co/settings/tokens
# Create a new token with "Write" permissions
# Copy the token
```

### 2. **Set Environment Variables**
```bash
# For HF Spaces, add these to your Space settings:
HF_TOKEN=your_hf_token_here
TRACKIO_DATASET_REPO=your-username/your-dataset-name

# For local development:
export HF_TOKEN=your_hf_token_here
export TRACKIO_DATASET_REPO=your-username/your-dataset-name
```

### 3. **Create Dataset Repository**
```bash
# Run the setup script
python setup_hf_dataset.py

# Or manually create a dataset on HF Hub
# Go to https://huggingface.co/datasets
# Create a new dataset repository
```

### 4. **Test Configuration**
```bash
# Test your setup
python configure_trackio.py

# Test dataset access
python test_hf_datasets.py
```

## 🚀 Usage Examples

### **Basic Training with Monitoring**
```bash
# Train with default monitoring
python train.py config/train_smollm3_openhermes_fr.py

# Train with custom dataset repository
TRACKIO_DATASET_REPO=your-username/smollm3-experiments python train.py config/train_smollm3_openhermes_fr.py
```

### **Advanced Training Configuration**
```bash
# Train with custom experiment name
python train.py config/train_smollm3_openhermes_fr.py \
    --experiment_name "smollm3_french_tuning_v2" \
    --hf_token your_token_here \
    --dataset_repo your-username/french-experiments
```

### **Training Scripts with Monitoring**
```bash
# All training scripts now support monitoring:
python train.py config/train_smollm3_openhermes_fr_a100_balanced.py
python train.py config/train_smollm3_openhermes_fr_a100_large.py
python train.py config/train_smollm3_openhermes_fr_a100_max_performance.py
python train.py config/train_smollm3_openhermes_fr_a100_multiple_passes.py
```

## 📊 What Gets Monitored

### **Training Metrics**
- Loss values (training and validation)
- Learning rate
- Gradient norms
- Training steps and epochs

### **System Metrics**
- GPU memory usage
- GPU utilization
- CPU usage
- Memory usage

### **Experiment Data**
- Configuration parameters
- Model checkpoints
- Evaluation results
- Training summaries

### **Artifacts**
- Configuration files
- Training logs
- Evaluation results
- Model checkpoints

## 🔍 Viewing Results

### **1. Trackio Interface**
- Visit your Trackio Space
- Navigate to "Experiments" tab
- View real-time metrics and plots

### **2. HF Dataset Repository**
- Go to your dataset repository on HF Hub
- Browse experiment data
- Download experiment files
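
Experiment records can also be pulled down programmatically — a minimal sketch, with the repo id standing in for whatever you set in `TRACKIO_DATASET_REPO`:

```python
import os
from datasets import load_dataset

ds = load_dataset(
    "your-username/trackio-experiments",  # your TRACKIO_DATASET_REPO value
    split="train",
    token=os.environ.get("HF_TOKEN"),  # required for private datasets
)
print(ds.column_names)
print(ds[0])  # first logged experiment record
```
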
### **3. Local Files**
- Check local backup files
- Review training logs
- Examine configuration files

## 🛠️ Configuration Examples

### **Default Setup**
```python
# Uses default dataset: tonic/trackio-experiments
# Requires only HF_TOKEN
```

### **Personal Dataset**
```bash
export HF_TOKEN=your_token_here
export TRACKIO_DATASET_REPO=your-username/trackio-experiments
```

### **Team Dataset**
```bash
export HF_TOKEN=your_token_here
export TRACKIO_DATASET_REPO=your-org/team-experiments
```

### **Project-Specific Dataset**
```bash
export HF_TOKEN=your_token_here
export TRACKIO_DATASET_REPO=your-username/smollm3-experiments
```

## 🔧 Troubleshooting

### **Issue: "HF_TOKEN not found"**
```bash
# Solution: Set your HF token
export HF_TOKEN=your_token_here
# Or add to HF Space environment variables
```

### **Issue: "Failed to load dataset"**
```bash
# Solutions:
# 1. Check token has read access
# 2. Verify dataset repository exists
# 3. Run setup script: python setup_hf_dataset.py
```

### **Issue: "Failed to save experiments"**
```bash
# Solutions:
# 1. Check token has write permissions
# 2. Verify dataset repository exists
# 3. Check network connectivity
```

### **Issue: "Monitoring not working"**
```bash
# Solutions:
# 1. Check environment variables
# 2. Run configuration test: python configure_trackio.py
# 3. Check logs for specific errors
```

## 📈 Benefits

### **For HF Spaces Deployment**
- ✅ **Persistent Storage**: Data survives Space restarts
- ✅ **No Local Storage**: No dependency on ephemeral storage
- ✅ **Scalable**: Works with any dataset size
- ✅ **Secure**: Private dataset storage

### **For Experiment Management**
- ✅ **Centralized**: All experiments in one place
- ✅ **Searchable**: Easy to find specific experiments
- ✅ **Versioned**: Dataset versioning for experiments
- ✅ **Collaborative**: Share experiments with team

### **For Development**
- ✅ **Flexible**: Easy to switch between datasets
- ✅ **Configurable**: Environment-based configuration
- ✅ **Robust**: Fallback mechanisms
- ✅ **Debuggable**: Comprehensive logging

## 🎯 Next Steps

1. **Set up your HF token and dataset repository**
2. **Test the configuration with `python configure_trackio.py`**
3. **Run a training experiment to verify monitoring**
4. **Check your HF Dataset repository for experiment data**
5. **View results in your Trackio interface**

## 📚 Related Files

- `monitoring.py` - Enhanced monitoring with HF Datasets support
- `train.py` - Updated training script with monitoring integration
- `configure_trackio.py` - Configuration and testing script
- `setup_hf_dataset.py` - Dataset repository setup
- `test_hf_datasets.py` - Dataset access testing
- `ENVIRONMENT_VARIABLES.md` - Environment variable reference
- `HF_DATASETS_GUIDE.md` - Detailed HF Datasets guide

---

**🎉 Your experiments are now persistently stored and easily accessible!**
NO_THINK_TAG_GUIDE.md → docs/NO_THINK_TAG_GUIDE.md
RENAMED
File without changes

PUSH_GUIDE.md → docs/PUSH_GUIDE.md
RENAMED
File without changes
docs/PUSH_SCRIPT_GUIDE.md
ADDED
@@ -0,0 +1,267 @@
# 🚀 Push to Hugging Face Script Guide

## Overview

The `push_to_huggingface.py` script has been enhanced to integrate with **HF Datasets** for experiment tracking and provides complete model deployment with persistent experiment storage.

## 🚀 Key Improvements

### **1. HF Datasets Integration**
- ✅ **Dataset Repository Support**: Configurable dataset repository for experiment storage
- ✅ **Environment Variables**: Automatic detection of `HF_TOKEN` and `TRACKIO_DATASET_REPO`
- ✅ **Enhanced Logging**: Logs push actions to both Trackio and HF Datasets
- ✅ **Model Card Integration**: Includes dataset repository information in model cards

### **2. Enhanced Configuration**
- ✅ **Flexible Token Input**: Multiple ways to provide HF token
- ✅ **Dataset Repository Tracking**: Links models to their experiment datasets
- ✅ **Environment Variable Support**: Fallback to environment variables
- ✅ **Command Line Arguments**: New arguments for HF Datasets integration

### **3. Improved Model Cards**
- ✅ **Dataset Repository Info**: Shows which dataset contains experiment data
- ✅ **Experiment Tracking Section**: Explains how to access training data
- ✅ **Enhanced Documentation**: Better model cards with experiment links

## 📋 Usage Examples

### **Basic Usage**
```bash
# Push model with default settings
python push_to_huggingface.py /path/to/model username/repo-name
```

### **With HF Datasets Integration**
```bash
# Push model with custom dataset repository
python push_to_huggingface.py /path/to/model username/repo-name \
    --dataset-repo username/experiments
```

### **With Custom Token**
```bash
# Push model with custom HF token
python push_to_huggingface.py /path/to/model username/repo-name \
    --hf-token your_token_here
```

### **Complete Example**
```bash
# Push model with all options
python push_to_huggingface.py /path/to/model username/repo-name \
    --dataset-repo username/experiments \
    --hf-token your_token_here \
    --private \
    --experiment-name "smollm3_finetune_v2"
```

## 🔧 Command Line Arguments

| Argument | Required | Default | Description |
|----------|----------|---------|-------------|
| `model_path` | ✅ Yes | None | Path to trained model directory |
| `repo_name` | ✅ Yes | None | HF repository name (username/repo-name) |
| `--token` | ❌ No | `HF_TOKEN` env | Hugging Face token |
| `--hf-token` | ❌ No | `HF_TOKEN` env | HF token (alternative to --token) |
| `--private` | ❌ No | False | Make repository private |
| `--trackio-url` | ❌ No | None | Trackio Space URL for logging |
| `--experiment-name` | ❌ No | None | Experiment name for Trackio |
| `--dataset-repo` | ❌ No | `TRACKIO_DATASET_REPO` env | HF Dataset repository |

## 🛠️ Configuration Methods

### **Method 1: Command Line Arguments**
```bash
python push_to_huggingface.py model_path repo_name \
    --dataset-repo username/experiments \
    --hf-token your_token_here
```

### **Method 2: Environment Variables**
```bash
export HF_TOKEN=your_token_here
export TRACKIO_DATASET_REPO=username/experiments
python push_to_huggingface.py model_path repo_name
```

### **Method 3: Hybrid Approach**
```bash
# Set defaults via environment variables
export HF_TOKEN=your_token_here
export TRACKIO_DATASET_REPO=username/experiments

# Override specific values via command line
python push_to_huggingface.py model_path repo_name \
    --dataset-repo username/specific-experiments
```

## 📊 What Gets Pushed

### **Model Files**
- ✅ **Model Weights**: `pytorch_model.bin`
- ✅ **Configuration**: `config.json`
- ✅ **Tokenizer**: `tokenizer.json`, `tokenizer_config.json`
- ✅ **All Other Files**: Any additional files in model directory

### **Documentation**
- ✅ **Model Card**: Comprehensive README.md with model information
- ✅ **Training Configuration**: JSON configuration used for training
- ✅ **Training Results**: JSON results and metrics
- ✅ **Training Logs**: Text logs from training process

### **Experiment Data**
- ✅ **Dataset Repository**: Links to HF Dataset containing experiment data
- ✅ **Training Metrics**: All training metrics stored in dataset
- ✅ **Configuration**: Training configuration stored in dataset
- ✅ **Artifacts**: Training artifacts and logs

## 🔍 Enhanced Model Cards

The improved script creates enhanced model cards that include:

### **Model Information**
- Base model and architecture
- Training date and model size
- **Dataset repository** for experiment data

### **Training Configuration**
- Complete training parameters
- Hardware information
- Training duration and steps

### **Experiment Tracking**
- Links to HF Dataset repository
- Instructions for accessing experiment data
- Training metrics and results

### **Usage Examples**
- Code examples for loading and using the model
- Generation examples
- Performance information

## 📈 Logging Integration

### **Trackio Logging**
- ✅ **Push Actions**: Logs model push events
- ✅ **Model Information**: Repository name, size, configuration
- ✅ **Training Data**: Links to experiment dataset

### **HF Datasets Logging**
- ✅ **Experiment Summary**: Final training summary
- ✅ **Push Metadata**: Model repository and push date
- ✅ **Configuration**: Complete training configuration

### **Dual Storage**
- ✅ **Trackio**: Real-time monitoring and visualization
- ✅ **HF Datasets**: Persistent experiment storage
- ✅ **Synchronized**: Both systems updated together
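
A rough sketch of what this dual write looks like — the helper name and the Trackio `/api/log` route are assumptions for illustration, not the script's actual internals:

```python
import os
import requests
from datasets import Dataset

def log_push_event(record: dict, trackio_url: str, dataset_repo: str):
    # 1) Real-time view in Trackio (hypothetical endpoint)
    try:
        requests.post(f"{trackio_url}/api/log", json=record, timeout=10)
    except requests.RequestException:
        pass  # a Trackio outage should not block the model push
    # 2) Persistent copy in the HF Dataset
    Dataset.from_list([record]).push_to_hub(
        dataset_repo, token=os.environ.get("HF_TOKEN")
    )
```
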
## 🚨 Troubleshooting

### **Issue: "Missing required files"**
**Solutions**:
1. Check model directory contains required files
2. Ensure model was saved correctly during training
3. Verify file permissions

### **Issue: "Failed to create repository"**
**Solutions**:
1. Check HF token has write permissions
2. Verify repository name format: `username/repo-name`
3. Ensure repository doesn't already exist (or use `--private`)

### **Issue: "Failed to upload files"**
**Solutions**:
1. Check network connectivity
2. Verify HF token is valid
3. Ensure repository was created successfully

### **Issue: "Dataset repository not found"**
**Solutions**:
1. Check dataset repository exists
2. Verify HF token has read access
3. Use `--dataset-repo` to specify correct repository

## 📋 Workflow Integration

### **Complete Training Workflow**
1. **Train Model**: Use training scripts with monitoring
2. **Monitor Progress**: View metrics in Trackio interface
3. **Push Model**: Use improved push script
4. **Access Data**: View experiments in HF Dataset repository

### **Example Workflow**
```bash
# 1. Train model with monitoring
python train.py config/train_smollm3_openhermes_fr.py \
    --experiment_name "smollm3_french_v2"

# 2. Push model to HF Hub
python push_to_huggingface.py outputs/model username/smollm3-french \
    --dataset-repo username/experiments \
    --experiment-name "smollm3_french_v2"

# 3. View results
# - Model: https://huggingface.co/username/smollm3-french
# - Experiments: https://huggingface.co/datasets/username/experiments
# - Trackio: Your Trackio Space interface
```

## 🎯 Benefits

### **For Model Deployment**
- ✅ **Complete Documentation**: Enhanced model cards with experiment links
- ✅ **Persistent Storage**: Experiment data stored in HF Datasets
- ✅ **Easy Access**: Direct links to training data and metrics
- ✅ **Reproducibility**: Complete training configuration included

### **For Experiment Management**
- ✅ **Centralized Storage**: All experiments in HF Dataset repository
- ✅ **Version Control**: Model versions linked to experiment data
- ✅ **Collaboration**: Share experiments and models easily
- ✅ **Searchability**: Easy to find specific experiments

### **For Development**
- ✅ **Flexible Configuration**: Multiple ways to set parameters
- ✅ **Backward Compatible**: Works with existing setups
- ✅ **Error Handling**: Clear error messages and troubleshooting
- ✅ **Integration**: Works with existing monitoring system

## 📊 Testing Results

All push script tests passed:
- ✅ **HuggingFacePusher Initialization**: Works with new parameters
- ✅ **Model Card Creation**: Includes HF Datasets integration
- ✅ **Logging Integration**: Logs to both Trackio and HF Datasets
- ✅ **Argument Parsing**: Handles new command line arguments
- ✅ **Environment Variables**: Proper fallback handling

## 🔄 Migration Guide

### **From Old Script**
```bash
# Old way
python push_to_huggingface.py model_path repo_name --token your_token

# New way (same functionality)
python push_to_huggingface.py model_path repo_name --hf-token your_token

# New way with HF Datasets
python push_to_huggingface.py model_path repo_name \
    --hf-token your_token \
    --dataset-repo username/experiments
```

### **Environment Variables**
```bash
# Set environment variables for automatic detection
export HF_TOKEN=your_token_here
export TRACKIO_DATASET_REPO=username/experiments

# Then use simple command
python push_to_huggingface.py model_path repo_name
```

---

**🎉 Your push script is now fully integrated with HF Datasets for complete experiment tracking and model deployment!**
TRACKIO_INTEGRATION.md → docs/TRACKIO_INTEGRATION.md
RENAMED
File without changes

TRACKIO_INTEGRATION_VERIFICATION.md → docs/TRACKIO_INTEGRATION_VERIFICATION.md
RENAMED
File without changes

TRACKIO_INTERFACE_GUIDE.md → docs/TRACKIO_INTERFACE_GUIDE.md
RENAMED
File without changes
launch.sh
ADDED
@@ -0,0 +1,690 @@
#!/bin/bash
# Interactive SmolLM3 End-to-End Fine-tuning Pipeline
# This script creates a complete finetuning pipeline with user configuration

set -e  # Exit on any error

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Function to print colored output
print_status() {
    echo -e "${GREEN}✅ $1${NC}"
}

print_warning() {
    echo -e "${YELLOW}⚠️ $1${NC}"
}

print_error() {
    echo -e "${RED}❌ $1${NC}"
}

print_info() {
    echo -e "${BLUE}ℹ️ $1${NC}"
}

print_header() {
    echo -e "${PURPLE}🚀 $1${NC}"
}

print_step() {
    echo -e "${CYAN}📋 $1${NC}"
}

# Function to get user input with default value
get_input() {
    local prompt="$1"
    local default="$2"
    local var_name="$3"

    if [ -n "$default" ]; then
        read -p "$prompt [$default]: " input
        if [ -z "$input" ]; then
            input="$default"
        fi
    else
        read -p "$prompt: " input
        while [ -z "$input" ]; do
            print_error "This field is required!"
            read -p "$prompt: " input
        done
    fi

    eval "$var_name=\"$input\""
}

# Function to select from options
select_option() {
    local prompt="$1"
    local var_name="${!#}"
    # The last argument is the output variable name, so exclude it from the
    # options list ("${@:2}" alone would list the variable name as an option)
    local options=("${@:2:$#-2}")

    echo "$prompt"
    for i in "${!options[@]}"; do
        echo "  $((i+1)). ${options[$i]}"
    done

    while true; do
        read -p "Enter your choice (1-${#options[@]}): " choice
        if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -ge 1 ] && [ "$choice" -le "${#options[@]}" ]; then
            eval "$var_name=\"${options[$((choice-1))]}\""
            break
        else
            print_error "Invalid choice. Please enter a number between 1 and ${#options[@]}"
        fi
    done
}

# Function to validate HF token
validate_hf_token() {
    local token="$1"
    if [ -z "$token" ]; then
        return 1
    fi

    # Test the token
    export HF_TOKEN="$token"
    if huggingface-cli whoami >/dev/null 2>&1; then
        return 0
    else
        return 1
    fi
}

# Function to show training configurations
show_training_configs() {
    echo ""
    print_header "Available Training Configurations"
    echo "======================================"
    echo ""
    echo "1. Basic Training (Default)"
    echo "   - Model: SmolLM3-3B"
    echo "   - Dataset: SmolTalk"
    echo "   - Epochs: 3"
    echo "   - Batch Size: 2"
    echo "   - Learning Rate: 5e-6"
    echo ""
    echo "2. H100 Lightweight (Rapid)"
    echo "   - Model: SmolLM3-3B"
    echo "   - Dataset: OpenHermes-FR (80K samples)"
    echo "   - Epochs: 1"
    echo "   - Batch Size: 16"
    echo "   - Learning Rate: 8e-6"
    echo "   - Sequence Length: 8192"
    echo "   - Optimized for H100 rapid training"
    echo ""
    echo "3. A100 Large Scale"
    echo "   - Model: SmolLM3-3B"
    echo "   - Dataset: OpenHermes-FR"
    echo "   - Epochs: 1.3 passes"
    echo "   - Batch Size: 8"
    echo "   - Learning Rate: 5e-6"
    echo "   - Sequence Length: 8192"
    echo ""
    echo "4. Multiple Passes"
    echo "   - Model: SmolLM3-3B"
    echo "   - Dataset: OpenHermes-FR"
    echo "   - Epochs: 4 passes"
    echo "   - Batch Size: 6"
    echo "   - Learning Rate: 3e-6"
    echo "   - Sequence Length: 8192"
    echo ""
    echo "5. Custom Configuration"
    echo "   - User-defined parameters"
    echo ""
}

# Function to get training configuration
get_training_config() {
    local config_type="$1"

    case "$config_type" in
        "Basic Training")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="HuggingFaceTB/smoltalk"
            MAX_EPOCHS=3
            BATCH_SIZE=2
            GRADIENT_ACCUMULATION_STEPS=8
            LEARNING_RATE=5e-6
            MAX_SEQ_LENGTH=4096
            CONFIG_FILE="config/train_smollm3.py"
            ;;
        "H100 Lightweight (Rapid)")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="legmlai/openhermes-fr"
            MAX_EPOCHS=1
            BATCH_SIZE=16
            GRADIENT_ACCUMULATION_STEPS=4
            LEARNING_RATE=8e-6
            MAX_SEQ_LENGTH=8192
            DATASET_SAMPLE_SIZE=80000
            CONFIG_FILE="config/train_smollm3_h100_lightweight.py"
            ;;
        "A100 Large Scale")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="legmlai/openhermes-fr"
            MAX_EPOCHS=1
            BATCH_SIZE=8
            GRADIENT_ACCUMULATION_STEPS=16
            LEARNING_RATE=5e-6
            MAX_SEQ_LENGTH=8192
            CONFIG_FILE="config/train_smollm3_openhermes_fr_a100_large.py"
            ;;
        "Multiple Passes")
            MODEL_NAME="HuggingFaceTB/SmolLM3-3B"
            DATASET_NAME="legmlai/openhermes-fr"
            MAX_EPOCHS=4
            BATCH_SIZE=6
            GRADIENT_ACCUMULATION_STEPS=20
            LEARNING_RATE=3e-6
            MAX_SEQ_LENGTH=8192
            CONFIG_FILE="config/train_smollm3_openhermes_fr_a100_multiple_passes.py"
            ;;
        "Custom Configuration")
            get_custom_config
            ;;
    esac
}

# Function to get custom configuration
get_custom_config() {
    print_step "Custom Configuration Setup"
    echo "============================="

    get_input "Model name" "HuggingFaceTB/SmolLM3-3B" MODEL_NAME
    get_input "Dataset name" "HuggingFaceTB/smoltalk" DATASET_NAME
    get_input "Number of epochs" "3" MAX_EPOCHS
    get_input "Batch size" "2" BATCH_SIZE
    get_input "Gradient accumulation steps" "8" GRADIENT_ACCUMULATION_STEPS
    get_input "Learning rate" "5e-6" LEARNING_RATE
    get_input "Max sequence length" "4096" MAX_SEQ_LENGTH

    # Select config file based on dataset
    if [[ "$DATASET_NAME" == *"openhermes"* ]]; then
        CONFIG_FILE="config/train_smollm3_openhermes_fr.py"
    else
        CONFIG_FILE="config/train_smollm3.py"
    fi
}

# Function to create training configuration file
create_training_config() {
    local config_file="$1"

    cat > "$config_file" << EOF
"""
SmolLM3 Training Configuration - Generated by launch.sh
Optimized for: $TRAINING_CONFIG_TYPE
"""

from config.train_smollm3 import SmolLM3Config

config = SmolLM3Config(
    # Model configuration
    model_name="$MODEL_NAME",
    max_seq_length=$MAX_SEQ_LENGTH,
    use_flash_attention=True,
    use_gradient_checkpointing=True,

    # Training configuration
    batch_size=$BATCH_SIZE,
    gradient_accumulation_steps=$GRADIENT_ACCUMULATION_STEPS,
    learning_rate=$LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=100,
    max_iters=None,  # Will be calculated based on epochs
    eval_interval=100,
    log_interval=10,
    save_interval=500,

    # Optimizer configuration
    optimizer="adamw",
    beta1=0.9,
    beta2=0.95,
    eps=1e-8,

    # Scheduler configuration
    scheduler="cosine",
    min_lr=1e-6,

    # Mixed precision
    fp16=True,
    bf16=False,

    # Logging and saving
    save_steps=$SAVE_STEPS,
    eval_steps=$EVAL_STEPS,
    logging_steps=$LOGGING_STEPS,
    save_total_limit=3,

    # Evaluation
    eval_strategy="steps",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,

    # Data configuration
    dataset_name="$DATASET_NAME",
    dataset_split="train",
    input_field="prompt",
    target_field="completion",
    filter_bad_entries=False,
    bad_entry_field="bad_entry",

    # Chat template configuration
    use_chat_template=True,
    chat_template_kwargs={
        "enable_thinking": False,
        "add_generation_prompt": True,
        "no_think_system_message": True
    },

    # Trackio monitoring configuration
    enable_tracking=True,
    trackio_url="$TRACKIO_URL",
    trackio_token=None,
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
    experiment_name="$EXPERIMENT_NAME",

    # HF Datasets configuration
    dataset_repo="$TRACKIO_DATASET_REPO"
)
EOF
}

# Main script starts here
print_header "SmolLM3 End-to-End Fine-tuning Pipeline"
echo "=============================================="
echo ""

# Step 1: Get user credentials
print_step "Step 1: User Authentication"
echo "================================"

get_input "Hugging Face username" "" HF_USERNAME
get_input "Hugging Face token (get from https://huggingface.co/settings/tokens)" "" HF_TOKEN

# Validate HF token
print_info "Validating Hugging Face token..."
if validate_hf_token "$HF_TOKEN"; then
    print_status "HF token validated successfully"
else
    print_error "Invalid HF token. Please check your token and try again."
    exit 1
fi

# Step 2: Select training configuration
print_step "Step 2: Training Configuration"
echo "=================================="

show_training_configs
select_option "Select training configuration:" "Basic Training" "H100 Lightweight (Rapid)" "A100 Large Scale" "Multiple Passes" "Custom Configuration" TRAINING_CONFIG_TYPE

get_training_config "$TRAINING_CONFIG_TYPE"

# Step 3: Get experiment details
print_step "Step 3: Experiment Details"
echo "=============================="

get_input "Experiment name" "smollm3_finetune_$(date +%Y%m%d_%H%M%S)" EXPERIMENT_NAME
get_input "Model repository name" "$HF_USERNAME/smollm3-finetuned-$(date +%Y%m%d)" REPO_NAME
get_input "Trackio dataset repository" "$HF_USERNAME/trackio-experiments" TRACKIO_DATASET_REPO

# Step 4: Training parameters
print_step "Step 4: Training Parameters"
echo "==============================="

echo "Current configuration:"
echo "  Model: $MODEL_NAME"
echo "  Dataset: $DATASET_NAME"
if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
    echo "  Dataset Sample Size: ${DATASET_SAMPLE_SIZE:-80000}"
fi
echo "  Epochs: $MAX_EPOCHS"
echo "  Batch Size: $BATCH_SIZE"
echo "  Gradient Accumulation: $GRADIENT_ACCUMULATION_STEPS"
echo "  Learning Rate: $LEARNING_RATE"
echo "  Sequence Length: $MAX_SEQ_LENGTH"

get_input "Save steps" "500" SAVE_STEPS
get_input "Evaluation steps" "100" EVAL_STEPS
get_input "Logging steps" "10" LOGGING_STEPS

# Step 5: Trackio Space configuration
print_step "Step 5: Trackio Space Configuration"
echo "======================================"

get_input "Trackio Space name" "trackio-monitoring-$(date +%Y%m%d)" TRACKIO_SPACE_NAME
TRACKIO_URL="https://huggingface.co/spaces/$HF_USERNAME/$TRACKIO_SPACE_NAME"

# Step 6: Confirm configuration
print_step "Step 6: Configuration Summary"
echo "================================="

echo ""
echo "📋 Configuration Summary:"
echo "========================"
echo "  User: $HF_USERNAME"
echo "  Experiment: $EXPERIMENT_NAME"
echo "  Model: $MODEL_NAME"
echo "  Dataset: $DATASET_NAME"
echo "  Training Config: $TRAINING_CONFIG_TYPE"
if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
    echo "  Dataset Sample Size: ${DATASET_SAMPLE_SIZE:-80000}"
fi
echo "  Epochs: $MAX_EPOCHS"
echo "  Batch Size: $BATCH_SIZE"
echo "  Learning Rate: $LEARNING_RATE"
echo "  Model Repo: $REPO_NAME"
echo "  Trackio Space: $TRACKIO_URL"
echo "  HF Dataset: $TRACKIO_DATASET_REPO"
echo ""

read -p "Proceed with this configuration? (y/N): " confirm
if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
    print_info "Configuration cancelled. Exiting."
    exit 0
fi

# Step 7: Environment setup
print_step "Step 7: Environment Setup"
echo "============================"

print_info "Installing system dependencies..."
sudo apt-get update
sudo apt-get install -y git curl wget unzip python3-pip python3-venv

print_info "Creating Python virtual environment..."
python3 -m venv smollm3_env
source smollm3_env/bin/activate

print_info "Installing PyTorch with CUDA support..."
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

print_info "Installing project dependencies..."
pip install -r requirements/requirements_core.txt

print_info "Installing additional dependencies..."
# Version specifiers are quoted so the shell does not treat ">=" as a redirection
pip install "trl>=0.7.0"
pip install "peft>=0.4.0"
pip install "accelerate>=0.20.0"
pip install "huggingface-hub>=0.16.0"
pip install "datasets>=2.14.0"
pip install "requests>=2.31.0"

# Step 8: Authentication setup
print_step "Step 8: Authentication Setup"
echo "================================"

export HF_TOKEN="$HF_TOKEN"
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
huggingface-cli login --token "$HF_TOKEN"

# Step 9: Deploy Trackio Space
print_step "Step 9: Deploying Trackio Space"
echo "==================================="

cd scripts/trackio_tonic

# Create deployment script input
cat > deploy_input.txt << EOF
$HF_USERNAME
$TRACKIO_SPACE_NAME
$HF_TOKEN
EOF

# Run deployment script
python deploy_trackio_space.py < deploy_input.txt

print_status "Trackio Space deployed: $TRACKIO_URL"

# Step 10: Setup HF Dataset
print_step "Step 10: Setting up HF Dataset"
echo "=================================="

cd ../dataset_tonic
python setup_hf_dataset.py

# Step 11: Configure Trackio
print_step "Step 11: Configuring Trackio"
echo "================================="

cd ../trackio_tonic
python configure_trackio.py

# Step 12: Create training configuration
print_step "Step 12: Creating Training Configuration"
echo "==========================================="

cd ../..
create_training_config "$CONFIG_FILE"

# Step 13: Download and prepare dataset
print_step "Step 13: Preparing Dataset"
echo "==============================="

python -c "
from datasets import load_dataset
import json
import os
import random

# Load dataset
print('Loading dataset: $DATASET_NAME')
dataset = load_dataset('$DATASET_NAME')

# Create dataset directory
os.makedirs('training_dataset', exist_ok=True)

# Convert to training format
def convert_to_training_format(example):
    # Handle different dataset formats
    if 'prompt' in example and 'completion' in example:
        return {
            'prompt': example['prompt'],
            'completion': example['completion']
        }
    elif 'instruction' in example and 'output' in example:
        return {
            'prompt': example['instruction'],
            'completion': example['output']
        }
    elif 'messages' in example:
        # Handle chat format
        messages = example['messages']
        if len(messages) >= 2:
            return {
                'prompt': messages[0]['content'],
                'completion': messages[1]['content']
            }
    # Fallback (also reached when a chat example has fewer than two messages,
    # which previously returned None and crashed the filtering below)
    return {
        'prompt': str(example.get('input', '')),
        'completion': str(example.get('output', ''))
    }

# Process train split
train_data = []
for example in dataset['train']:
    training_example = convert_to_training_format(example)
    if training_example['prompt'] and training_example['completion']:
        train_data.append(training_example)

# Apply dataset sampling for lightweight configuration
if '$TRAINING_CONFIG_TYPE' == 'H100 Lightweight (Rapid)' and len(train_data) > ${DATASET_SAMPLE_SIZE:-0}:
    print(f'Sampling {${DATASET_SAMPLE_SIZE:-80000}} random samples from {len(train_data)} total samples')
    random.seed(42)  # For reproducibility
    train_data = random.sample(train_data, ${DATASET_SAMPLE_SIZE:-80000})
    print(f'Selected {len(train_data)} samples for lightweight training')

# Process validation split if available
val_data = []
if 'validation' in dataset:
    for example in dataset['validation']:
        training_example = convert_to_training_format(example)
        if training_example['prompt'] and training_example['completion']:
            val_data.append(training_example)

    # For lightweight config, also sample validation if it's large
    if '$TRAINING_CONFIG_TYPE' == 'H100 Lightweight (Rapid)' and len(val_data) > 1000:
        print(f'Sampling 1000 random validation samples from {len(val_data)} total')
        random.seed(42)  # For reproducibility
        val_data = random.sample(val_data, 1000)

# Save to files
with open('training_dataset/train.json', 'w') as f:
    json.dump(train_data, f, indent=2)

if val_data:
    with open('training_dataset/validation.json', 'w') as f:
        json.dump(val_data, f, indent=2)

print(f'Dataset prepared: {len(train_data)} train samples, {len(val_data)} validation samples')
"

# Step 14: Calculate training parameters
print_step "Step 14: Calculating Training Parameters"
echo "============================================"

# Worked example (illustrative): with 80000 samples, BATCH_SIZE=16 and
# GRADIENT_ACCUMULATION_STEPS=4, the effective batch size is 16*4 = 64,
# giving 80000/64 = 1250 steps per epoch and 1250 total steps for 1 epoch.
TOTAL_SAMPLES=$(python -c "import json; data=json.load(open('training_dataset/train.json')); print(len(data))")
EFFECTIVE_BATCH_SIZE=$((BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))
STEPS_PER_EPOCH=$((TOTAL_SAMPLES / EFFECTIVE_BATCH_SIZE))
MAX_STEPS=$((STEPS_PER_EPOCH * MAX_EPOCHS))

echo "  Total samples: $TOTAL_SAMPLES"
echo "  Effective batch size: $EFFECTIVE_BATCH_SIZE"
echo "  Steps per epoch: $STEPS_PER_EPOCH"
echo "  Total training steps: $MAX_STEPS"

# Step 15: Start training
print_step "Step 15: Starting Training"
echo "=============================="

python src/train.py "$CONFIG_FILE" \
    --dataset_dir training_dataset \
    --out_dir /output-checkpoint \
    --init_from scratch \
    --max_iters $MAX_STEPS \
    --batch_size $BATCH_SIZE \
    --learning_rate $LEARNING_RATE \
    --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
    --max_seq_length $MAX_SEQ_LENGTH \
    --save_steps $SAVE_STEPS \
    --eval_steps $EVAL_STEPS \
    --logging_steps $LOGGING_STEPS \
    --enable_tracking \
    --trackio_url "$TRACKIO_URL" \
    --experiment_name "$EXPERIMENT_NAME" \
    --hf_token "$HF_TOKEN" \
    --dataset_repo "$TRACKIO_DATASET_REPO"

# Step 16: Push model to Hugging Face Hub
print_step "Step 16: Pushing Model to HF Hub"
echo "====================================="

python scripts/model_tonic/push_to_huggingface.py /output-checkpoint "$REPO_NAME" \
    --token "$HF_TOKEN" \
    --trackio-url "$TRACKIO_URL" \
    --experiment-name "$EXPERIMENT_NAME" \
    --dataset-repo "$TRACKIO_DATASET_REPO"

# Step 17: Test the uploaded model
print_step "Step 17: Testing Uploaded Model"
echo "==================================="

python -c "
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print('Loading uploaded model...')
model = AutoModelForCausalLM.from_pretrained('$REPO_NAME', torch_dtype=torch.float16, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained('$REPO_NAME')

print('Testing model generation...')
prompt = 'Hello, how are you?'
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f'Prompt: {prompt}')
print(f'Response: {response}')
print('✅ Model test completed successfully!')
"

# Step 18: Create summary report
print_step "Step 18: Creating Summary Report"
echo "===================================="

cat > training_summary.md << EOF
# SmolLM3 Fine-tuning Summary

## Configuration
- **Model**: $MODEL_NAME
- **Dataset**: $DATASET_NAME
- **Experiment**: $EXPERIMENT_NAME
- **Repository**: $REPO_NAME
- **Trackio Space**: $TRACKIO_URL
- **HF Dataset**: $TRACKIO_DATASET_REPO
- **Training Config**: $TRAINING_CONFIG_TYPE
$(if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
echo "- **Dataset Sample Size**: ${DATASET_SAMPLE_SIZE:-80000}"
fi)

## Training Parameters
- **Batch Size**: $BATCH_SIZE
- **Gradient Accumulation**: $GRADIENT_ACCUMULATION_STEPS
- **Learning Rate**: $LEARNING_RATE
- **Max Epochs**: $MAX_EPOCHS
- **Max Steps**: $MAX_STEPS
- **Total Samples**: $TOTAL_SAMPLES
- **Sequence Length**: $MAX_SEQ_LENGTH

## Results
- **Model Repository**: https://huggingface.co/$REPO_NAME
- **Trackio Monitoring**: $TRACKIO_URL
- **Experiment Data**: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO

## Next Steps
1. Monitor training progress in your Trackio Space
2. Check the model repository on Hugging Face Hub
3. Use the model in your applications
4. Share your results with the community

## Files Created
- Training configuration: \`$CONFIG_FILE\`
- Dataset: \`training_dataset/\`
- Model checkpoint: \`/output-checkpoint/\`
- Training logs: \`training.log\`
- Summary report: \`training_summary.md\`
EOF

print_status "Summary report saved to: training_summary.md"

# Final summary
echo ""
print_header "🎉 End-to-End Pipeline Completed Successfully!"
echo "=================================================="
echo ""
echo "📊 Model: https://huggingface.co/$REPO_NAME"
echo "📈 Trackio: $TRACKIO_URL"
echo "📋 Experiment: $EXPERIMENT_NAME"
echo "📊 Dataset: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO"
echo ""
echo "📋 Summary report saved to: training_summary.md"
echo ""
echo "🚀 Next steps:"
echo "1. Monitor training progress in your Trackio Space"
echo "2. Check the model repository on Hugging Face Hub"
echo "3. Use the model in your applications"
echo "4. Share your results with the community"
echo ""
print_status "Pipeline completed successfully!"
requirements.txt → requirements/requirements.txt
RENAMED
File without changes

requirements_core.txt → requirements/requirements_core.txt
RENAMED
@@ -9,6 +9,12 @@ tokenizers>=0.13.0
 bitsandbytes>=0.41.0
 numpy>=1.24.0
 tqdm>=4.65.0
+
+
+# Monitoring dependencies
+requests>=2.31.0
+pandas>=2.0.0
+plotly>=5.0.0
 trackio>=0.1.0
 psutil>=5.9.0
-pynvml>=12.0.0
+pynvml>=12.0.0
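The added monitoring stack can be smoke-tested after installation. A minimal sketch (not part of the repo), assuming the packages from `requirements/requirements_core.txt` were installed into the active environment:

```python
# Sanity-check the newly added monitoring dependencies.
import requests
import pandas
import plotly
import psutil
import pynvml  # importing does not require a GPU; NVML is only initialized on demand

print("requests", requests.__version__)
print("pandas", pandas.__version__)
print("plotly", plotly.__version__)
print("psutil", psutil.__version__)
```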
requirements_minimal.txt → requirements/requirements_minimal.txt
RENAMED
File without changes

add_demo_data.py → scripts/dataset_tonic/add_demo_data.py
RENAMED
File without changes

scripts/dataset_tonic/setup_hf_dataset.py
ADDED
@@ -0,0 +1,275 @@
#!/usr/bin/env python3
"""
Setup script for Hugging Face Dataset repository for Trackio experiments
"""

import os
import json
from datetime import datetime
from datasets import Dataset
from huggingface_hub import HfApi

def setup_trackio_dataset():
    """Set up the Trackio experiments dataset on Hugging Face Hub"""

    # Configuration - get from environment variables with fallbacks
    dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
    hf_token = os.environ.get('HF_TOKEN')

    if not hf_token:
        print("❌ HF_TOKEN not found. Please set the HF_TOKEN environment variable.")
        print("You can get your token from: https://huggingface.co/settings/tokens")
        return False

    print(f"🚀 Setting up Trackio dataset: {dataset_repo}")
    print(f"🔧 Using dataset repository: {dataset_repo}")

    # Initial experiment data
    initial_experiments = [
        {
            'experiment_id': 'exp_20250720_130853',
            'name': 'petite-elle-l-aime-3',
            'description': 'SmolLM3 fine-tuning experiment',
            'created_at': '2025-07-20T11:20:01.780908',
            'status': 'running',
            'metrics': json.dumps([
                {
                    'timestamp': '2025-07-20T11:20:01.780908',
                    'step': 25,
                    'metrics': {
                        'loss': 1.1659,
                        'grad_norm': 10.3125,
                        'learning_rate': 7e-08,
                        'num_tokens': 1642080.0,
                        'mean_token_accuracy': 0.75923578992486,
                        'epoch': 0.004851130919895701
                    }
                },
                {
                    'timestamp': '2025-07-20T11:26:39.042155',
                    'step': 50,
                    'metrics': {
                        'loss': 1.165,
                        'grad_norm': 10.75,
                        'learning_rate': 1.4291666666666667e-07,
                        'num_tokens': 3324682.0,
                        'mean_token_accuracy': 0.7577659255266189,
                        'epoch': 0.009702261839791402
                    }
                },
                {
                    'timestamp': '2025-07-20T11:33:16.203045',
                    'step': 75,
                    'metrics': {
                        'loss': 1.1639,
                        'grad_norm': 10.6875,
                        'learning_rate': 2.1583333333333334e-07,
                        'num_tokens': 4987941.0,
                        'mean_token_accuracy': 0.7581205774843692,
                        'epoch': 0.014553392759687101
                    }
                },
                {
                    'timestamp': '2025-07-20T11:39:53.453917',
                    'step': 100,
                    'metrics': {
                        'loss': 1.1528,
                        'grad_norm': 10.75,
                        'learning_rate': 2.8875e-07,
                        'num_tokens': 6630190.0,
                        'mean_token_accuracy': 0.7614579878747463,
                        'epoch': 0.019404523679582803
                    }
                }
            ]),
            'parameters': json.dumps({
                'model_name': 'HuggingFaceTB/SmolLM3-3B',
                'max_seq_length': 12288,
                'use_flash_attention': True,
                'use_gradient_checkpointing': False,
                'batch_size': 8,
                'gradient_accumulation_steps': 16,
                'learning_rate': 3.5e-06,
                'weight_decay': 0.01,
                'warmup_steps': 1200,
                'max_iters': 18000,
                'eval_interval': 1000,
                'log_interval': 25,
                'save_interval': 2000,
                'optimizer': 'adamw_torch',
                'beta1': 0.9,
                'beta2': 0.999,
                'eps': 1e-08,
                'scheduler': 'cosine',
                'min_lr': 3.5e-07,
                'fp16': False,
                'bf16': True,
                'ddp_backend': 'nccl',
                'ddp_find_unused_parameters': False,
                'save_steps': 2000,
                'eval_steps': 1000,
                'logging_steps': 25,
                'save_total_limit': 5,
                'eval_strategy': 'steps',
                'metric_for_best_model': 'eval_loss',
                'greater_is_better': False,
                'load_best_model_at_end': True,
                'data_dir': None,
                'train_file': None,
                'validation_file': None,
                'test_file': None,
                'use_chat_template': True,
                'chat_template_kwargs': {'add_generation_prompt': True, 'no_think_system_message': True},
                'enable_tracking': True,
                'trackio_url': 'https://tonic-test-trackio-test.hf.space',
                'trackio_token': None,
                'log_artifacts': True,
                'log_metrics': True,
                'log_config': True,
                'experiment_name': 'petite-elle-l-aime-3',
                'dataset_name': 'legmlai/openhermes-fr',
                'dataset_split': 'train',
                'input_field': 'prompt',
                'target_field': 'accepted_completion',
                'filter_bad_entries': True,
                'bad_entry_field': 'bad_entry',
                'packing': False,
                'max_prompt_length': 12288,
                'max_completion_length': 8192,
                'truncation': True,
                'dataloader_num_workers': 10,
                'dataloader_pin_memory': True,
                'dataloader_prefetch_factor': 3,
                'max_grad_norm': 1.0,
                'group_by_length': True
            }),
            'artifacts': json.dumps([]),
            'logs': json.dumps([]),
            'last_updated': datetime.now().isoformat()
        },
        {
            'experiment_id': 'exp_20250720_134319',
            'name': 'petite-elle-l-aime-3-1',
            'description': 'SmolLM3 fine-tuning experiment',
            'created_at': '2025-07-20T11:54:31.993219',
            'status': 'running',
            'metrics': json.dumps([
                {
                    'timestamp': '2025-07-20T11:54:31.993219',
                    'step': 25,
                    'metrics': {
                        'loss': 1.166,
                        'grad_norm': 10.375,
                        'learning_rate': 7e-08,
                        'num_tokens': 1642080.0,
                        'mean_token_accuracy': 0.7590958896279335,
                        'epoch': 0.004851130919895701
                    }
                },
                {
                    'timestamp': '2025-07-20T11:54:33.589487',
                    'step': 25,
                    'metrics': {
                        'gpu_0_memory_allocated': 17.202261447906494,
                        'gpu_0_memory_reserved': 75.474609375,
                        'gpu_0_utilization': 0,
                        'cpu_percent': 2.7,
                        'memory_percent': 10.1
                    }
                }
            ]),
            'parameters': json.dumps({
                'model_name': 'HuggingFaceTB/SmolLM3-3B',
                'max_seq_length': 12288,
                'use_flash_attention': True,
                'use_gradient_checkpointing': False,
                'batch_size': 8,
                'gradient_accumulation_steps': 16,
                'learning_rate': 3.5e-06,
                'weight_decay': 0.01,
                'warmup_steps': 1200,
                'max_iters': 18000,
                'eval_interval': 1000,
                'log_interval': 25,
                'save_interval': 2000,
                'optimizer': 'adamw_torch',
                'beta1': 0.9,
                'beta2': 0.999,
                'eps': 1e-08,
                'scheduler': 'cosine',
                'min_lr': 3.5e-07,
                'fp16': False,
                'bf16': True,
                'ddp_backend': 'nccl',
                'ddp_find_unused_parameters': False,
                'save_steps': 2000,
                'eval_steps': 1000,
                'logging_steps': 25,
                'save_total_limit': 5,
                'eval_strategy': 'steps',
                'metric_for_best_model': 'eval_loss',
                'greater_is_better': False,
                'load_best_model_at_end': True,
                'data_dir': None,
                'train_file': None,
                'validation_file': None,
                'test_file': None,
                'use_chat_template': True,
                'chat_template_kwargs': {'add_generation_prompt': True, 'no_think_system_message': True},
                'enable_tracking': True,
                'trackio_url': 'https://tonic-test-trackio-test.hf.space',
                'trackio_token': None,
                'log_artifacts': True,
                'log_metrics': True,
                'log_config': True,
                'experiment_name': 'petite-elle-l-aime-3-1',
                'dataset_name': 'legmlai/openhermes-fr',
                'dataset_split': 'train',
                'input_field': 'prompt',
                'target_field': 'accepted_completion',
                'filter_bad_entries': True,
                'bad_entry_field': 'bad_entry',
                'packing': False,
                'max_prompt_length': 12288,
                'max_completion_length': 8192,
                'truncation': True,
                'dataloader_num_workers': 10,
                'dataloader_pin_memory': True,
                'dataloader_prefetch_factor': 3,
                'max_grad_norm': 1.0,
                'group_by_length': True
            }),
            'artifacts': json.dumps([]),
            'logs': json.dumps([]),
            'last_updated': datetime.now().isoformat()
        }
    ]

    try:
        # Create dataset
        dataset = Dataset.from_list(initial_experiments)

        # Push to HF Hub
        api = HfApi(token=hf_token)
        dataset.push_to_hub(
            dataset_repo,
            token=hf_token,
            private=True  # Make it private for security
        )

        print(f"✅ Successfully created dataset: {dataset_repo}")
        print(f"📊 Added {len(initial_experiments)} experiments")
        print("🔒 Dataset is private (only accessible with your token)")
        print("\n🎯 Next steps:")
        print("1. Set HF_TOKEN in your Hugging Face Space environment")
        print("2. Deploy the updated app.py to your Space")
        print("3. The app will now load experiments from the dataset")

        return True

    except Exception as e:
        print(f"❌ Failed to create dataset: {e}")
        return False

if __name__ == "__main__":
    setup_trackio_dataset()
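A possible way to drive this bootstrap from another script, assuming it is run from the repository root with `HF_TOKEN` already exported; the repo name below is a placeholder:

```python
# Sketch: run the dataset bootstrap programmatically (paths/names are assumptions).
import os
import sys

sys.path.append("scripts/dataset_tonic")  # make the script importable
os.environ.setdefault("TRACKIO_DATASET_REPO", "your-username/trackio-experiments")

from setup_hf_dataset import setup_trackio_dataset

if setup_trackio_dataset():
    print("Dataset bootstrap finished")
```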
push_to_huggingface.py → scripts/model_tonic/push_to_huggingface.py
RENAMED
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Push Trained Model and Results to Hugging Face Hub
-Integrates with Trackio monitoring and
+Integrates with Trackio monitoring and HF Datasets for complete model deployment
 """
 
 import os
@@ -23,6 +23,9 @@ except ImportError:
 print("Warning: huggingface_hub not available. Install with: pip install huggingface_hub")
 
 try:
+    import sys
+    import os
+    sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
     from monitoring import SmolLM3Monitor
     MONITORING_AVAILABLE = True
 except ImportError:
@@ -32,7 +35,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 class HuggingFacePusher:
-    """Push trained models and results to Hugging Face Hub"""
+    """Push trained models and results to Hugging Face Hub with HF Datasets integration"""
 
     def __init__(
         self,
@@ -41,15 +44,21 @@ class HuggingFacePusher:
         token: Optional[str] = None,
         private: bool = False,
         trackio_url: Optional[str] = None,
-        experiment_name: Optional[str] = None
+        experiment_name: Optional[str] = None,
+        dataset_repo: Optional[str] = None,
+        hf_token: Optional[str] = None
     ):
         self.model_path = Path(model_path)
         self.repo_name = repo_name
-        self.token = token or os.getenv('HF_TOKEN')
+        self.token = token or hf_token or os.getenv('HF_TOKEN')
         self.private = private
         self.trackio_url = trackio_url
         self.experiment_name = experiment_name
 
+        # HF Datasets configuration
+        self.dataset_repo = dataset_repo or os.getenv('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
+        self.hf_token = hf_token or os.getenv('HF_TOKEN')
+
         # Initialize HF API
         if HF_AVAILABLE:
             self.api = HfApi(token=self.token)
@@ -58,14 +67,17 @@
 
         # Initialize monitoring if available
         self.monitor = None
-        if MONITORING_AVAILABLE
+        if MONITORING_AVAILABLE:
             self.monitor = SmolLM3Monitor(
                 experiment_name=experiment_name or "model_push",
                 trackio_url=trackio_url,
-                enable_tracking=
+                enable_tracking=bool(trackio_url),
+                hf_token=self.hf_token,
+                dataset_repo=self.dataset_repo
             )
 
         logger.info(f"Initialized HuggingFacePusher for {repo_name}")
+        logger.info(f"Dataset repository: {self.dataset_repo}")
 
     def create_repository(self) -> bool:
         """Create the Hugging Face repository"""
@@ -131,6 +143,7 @@ This is a fine-tuned SmolLM3 model based on the HuggingFaceTB/SmolLM3-3B archite
 - **Fine-tuning Method**: Supervised Fine-tuning
 - **Training Date**: {datetime.now().strftime('%Y-%m-%d')}
 - **Model Size**: {self._get_model_size():.1f} GB
+- **Dataset Repository**: {self.dataset_repo}
 
 ## Training Configuration
 
@@ -166,6 +179,7 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 - **Training Time**: {results.get('training_time_hours', 'Unknown')} hours
 - **Final Loss**: {results.get('final_loss', 'Unknown')}
 - **Final Accuracy**: {results.get('final_accuracy', 'Unknown')}
+- **Dataset Repository**: {self.dataset_repo}
 
 ## Model Performance
 
@@ -173,6 +187,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 - **Validation Loss**: {results.get('eval_loss', 'Unknown')}
 - **Training Steps**: {results.get('total_steps', 'Unknown')}
 
+## Experiment Tracking
+
+This model was trained with experiment tracking enabled. Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}`
+
 ## Limitations and Biases
 
 This model is fine-tuned for specific tasks and may not generalize well to all use cases. Please evaluate the model's performance on your specific task before deployment.
@@ -293,6 +311,7 @@
 - **Model Size**: {self._get_model_size():.1f} GB
 - **Training Steps**: {results.get('total_steps', 'Unknown')}
 - **Final Loss**: {results.get('final_loss', 'Unknown')}
+- **Dataset Repository**: {self.dataset_repo}
 
 ## Training Configuration
 
@@ -306,6 +325,10 @@
 {json.dumps(results, indent=2)}
 ```
 
+## Experiment Tracking
+
+Training metrics and configuration are stored in the HF Dataset repository: `{self.dataset_repo}`
+
 ## Files
 
 - `pytorch_model.bin`: Model weights
@@ -327,8 +350,8 @@ MIT License
             upload_file(
                 path_or_fileobj=str(readme_path),
                 path_in_repo="README.md",
+                token=self.token,
+                repo_id=self.repo_name
             )
 
             # Clean up
@@ -342,23 +365,36 @@
             return False
 
     def log_to_trackio(self, action: str, details: Dict[str, Any]):
-        """Log push action to Trackio"""
+        """Log push action to Trackio and HF Datasets"""
         if self.monitor:
             try:
+                # Log to Trackio
                 self.monitor.log_metrics({
                     "push_action": action,
                     "repo_name": self.repo_name,
                     "model_size_gb": self._get_model_size(),
+                    "dataset_repo": self.dataset_repo,
+                    **details
+                })
+
+                # Log training summary
+                self.monitor.log_training_summary({
+                    "model_push": True,
+                    "model_repo": self.repo_name,
+                    "dataset_repo": self.dataset_repo,
+                    "push_date": datetime.now().isoformat(),
                     **details
                 })
+
+                logger.info(f"✅ Logged {action} to Trackio and HF Datasets")
             except Exception as e:
                 logger.error(f"❌ Failed to log to Trackio: {e}")
 
     def push_model(self, training_config: Optional[Dict[str, Any]] = None,
                    results: Optional[Dict[str, Any]] = None) -> bool:
-        """Complete model push process"""
+        """Complete model push process with HF Datasets integration"""
         logger.info(f"🚀 Starting model push to {self.repo_name}")
+        logger.info(f"📊 Dataset repository: {self.dataset_repo}")
 
         # Validate model path
         if not self.validate_model_path():
@@ -399,7 +435,7 @@
         if results:
             self.upload_training_results(str(self.model_path))
 
-        # Log to Trackio
+        # Log to Trackio and HF Datasets
         self.log_to_trackio("model_push", {
             "model_path": str(self.model_path),
             "repo_name": self.repo_name,
@@ -409,6 +445,7 @@
         })
 
         logger.info(f"🎉 Model successfully pushed to: https://huggingface.co/{self.repo_name}")
+        logger.info(f"📊 Experiment data stored in: {self.dataset_repo}")
         return True
 
     def _load_training_config(self) -> Dict[str, Any]:
@@ -437,9 +474,11 @@ def parse_args():
 
     # Optional arguments
     parser.add_argument('--token', type=str, default=None, help='Hugging Face token')
+    parser.add_argument('--hf-token', type=str, default=None, help='Hugging Face token (alternative to --token)')
     parser.add_argument('--private', action='store_true', help='Make repository private')
     parser.add_argument('--trackio-url', type=str, default=None, help='Trackio Space URL for logging')
     parser.add_argument('--experiment-name', type=str, default=None, help='Experiment name for Trackio')
+    parser.add_argument('--dataset-repo', type=str, default=None, help='HF Dataset repository for experiment storage')
 
     return parser.parse_args()
@@ -463,7 +502,9 @@ def main():
         token=args.token,
         private=args.private,
         trackio_url=args.trackio_url,
-        experiment_name=args.experiment_name
+        experiment_name=args.experiment_name,
+        dataset_repo=args.dataset_repo,
+        hf_token=args.hf_token
     )
 
     # Push model
@@ -472,6 +513,8 @@
     if success:
         logger.info("✅ Model push completed successfully!")
         logger.info(f"🌐 View your model at: https://huggingface.co/{args.repo_name}")
+        if args.dataset_repo:
+            logger.info(f"📊 View experiment data at: https://huggingface.co/datasets/{args.dataset_repo}")
     else:
         logger.error("❌ Model push failed!")
         return 1
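For reference, one plausible invocation of the extended pusher from Python rather than the CLI; the checkpoint path, repository names, and Space URL below are placeholders, not values from this repository, and the script's directory is assumed to be on `sys.path`:

```python
# Sketch of the HF-Datasets-aware push flow (all identifiers below are examples).
from push_to_huggingface import HuggingFacePusher

pusher = HuggingFacePusher(
    model_path="/output-checkpoint",
    repo_name="your-username/smollm3-finetuned-demo",
    trackio_url="https://your-username-trackio.hf.space",
    experiment_name="smollm3_demo_push",
    dataset_repo="your-username/trackio-experiments",
)
if pusher.push_model():
    print("Pushed model and logged the push to Trackio/HF Datasets")
```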
scripts/trackio_tonic/configure_trackio.py
ADDED
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""
Configuration script for Trackio environment variables
"""

import os
import json
from datetime import datetime

def configure_trackio():
    """Configure Trackio environment variables"""

    print("🔧 Trackio Configuration")
    print("=" * 40)

    # Current configuration
    current_config = {
        'HF_TOKEN': os.environ.get('HF_TOKEN', 'Not set'),
        'TRACKIO_DATASET_REPO': os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments'),
        'SPACE_ID': os.environ.get('SPACE_ID', 'Not set')
    }

    print("📋 Current Configuration:")
    for key, value in current_config.items():
        status = "✅" if value != "Not set" else "❌"
        print(f"   {status} {key}: {value}")

    print("\n🎯 Configuration Options:")
    print("1. Set HF_TOKEN - Required for dataset access")
    print("2. Set TRACKIO_DATASET_REPO - Dataset repository (optional)")
    print("3. Set SPACE_ID - HF Space ID (auto-detected)")

    # Check if running on HF Spaces
    if os.environ.get('SPACE_ID'):
        print("\n🚀 Running on Hugging Face Spaces")
        print(f"   Space ID: {os.environ.get('SPACE_ID')}")

    # Validate configuration
    print("\n🔍 Configuration Validation:")

    # Check HF_TOKEN
    if current_config['HF_TOKEN'] != 'Not set':
        print("✅ HF_TOKEN is set")
        print("   This allows the app to read/write to HF Datasets")
    else:
        print("❌ HF_TOKEN is not set")
        print("   Please set HF_TOKEN to enable dataset functionality")
        print("   Get your token from: https://huggingface.co/settings/tokens")

    # Check dataset repository
    dataset_repo = current_config['TRACKIO_DATASET_REPO']
    print(f"📊 Dataset Repository: {dataset_repo}")

    # Test dataset access if token is available
    if current_config['HF_TOKEN'] != 'Not set':
        print("\n🧪 Testing Dataset Access...")
        try:
            from datasets import load_dataset

            dataset = load_dataset(dataset_repo, token=current_config['HF_TOKEN'])
            print(f"✅ Successfully loaded dataset: {dataset_repo}")

            # Show experiment count
            if 'train' in dataset:
                experiment_count = len(dataset['train'])
                print(f"📈 Found {experiment_count} experiments in dataset")

                # Show sample experiments (select() yields rows; slicing a
                # Dataset yields columns, so iterate over a selection instead)
                if experiment_count > 0:
                    print("🔬 Sample experiments:")
                    for i, row in enumerate(dataset['train'].select(range(min(3, experiment_count)))):
                        exp_id = row.get('experiment_id', 'Unknown')
                        name = row.get('name', 'Unnamed')
                        print(f"   {i+1}. {exp_id}: {name}")

        except Exception as e:
            print(f"❌ Failed to load dataset: {e}")
            print("   This might be normal if the dataset doesn't exist yet")

    # Generate configuration file
    config_file = "trackio_config.json"
    config_data = {
        'hf_token': current_config['HF_TOKEN'],
        'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
        'space_id': current_config['SPACE_ID'],
        'last_updated': datetime.now().isoformat(),
        'notes': 'Trackio configuration - set these as environment variables in your HF Space'
    }

    with open(config_file, 'w') as f:
        json.dump(config_data, f, indent=2)

    print(f"\n💾 Configuration saved to: {config_file}")

    # Show environment variable commands
    print("\n📝 Environment Variables for HF Space:")
    print("=" * 50)
    print(f"HF_TOKEN={current_config['HF_TOKEN']}")
    print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")

    print("\n🎯 Next Steps:")
    print("1. Set HF_TOKEN in your HF Space environment variables")
    print("2. Optionally set TRACKIO_DATASET_REPO to use a different dataset")
    print("3. Deploy your updated app.py to the Space")
    print("4. Run setup_hf_dataset.py if you haven't created the dataset yet")

def show_usage_examples():
    """Show usage examples for different dataset repositories"""

    print("\n📚 Usage Examples")
    print("=" * 30)

    examples = [
        {
            'name': 'Default Dataset',
            'repo': 'tonic/trackio-experiments',
            'description': 'Default dataset for your experiments'
        },
        {
            'name': 'Personal Dataset',
            'repo': 'your-username/trackio-experiments',
            'description': 'Your personal experiment dataset'
        },
        {
            'name': 'Team Dataset',
            'repo': 'your-org/team-experiments',
            'description': 'Shared dataset for team experiments'
        },
        {
            'name': 'Project Dataset',
            'repo': 'your-username/smollm3-experiments',
            'description': 'Dataset specific to SmolLM3 experiments'
        }
    ]

    for i, example in enumerate(examples, 1):
        print(f"{i}. {example['name']}")
        print(f"   Repository: {example['repo']}")
        print(f"   Description: {example['description']}")
        print(f"   Set with: TRACKIO_DATASET_REPO={example['repo']}")
        print()

if __name__ == "__main__":
    configure_trackio()
    show_usage_examples()
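The script can also be exercised in-process, for example from a notebook; the token and repo below are placeholders, and the script's directory is assumed to be importable:

```python
# Sketch: run the configuration check with explicit environment values.
import os

os.environ["HF_TOKEN"] = "hf_xxx"  # placeholder, never commit a real token
os.environ["TRACKIO_DATASET_REPO"] = "your-username/trackio-experiments"

from configure_trackio import configure_trackio, show_usage_examples

configure_trackio()
show_usage_examples()
```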
deploy_trackio_space.py → scripts/trackio_tonic/deploy_trackio_space.py
RENAMED
@@ -95,7 +95,7 @@ class TrackioSpaceDeployer:
 
         # Write README.md for the space
         space_readme = f"""---
-title: Trackio
+title: Trackio Tonic
 emoji: 🐠
 colorFrom: indigo
 colorTo: yellow
scripts/trackio_tonic/trackio_api_client.py
ADDED
@@ -0,0 +1,286 @@
#!/usr/bin/env python3
"""
Trackio API Client for Hugging Face Spaces
Connects to the Trackio Space using the actual API endpoints
"""

import requests
import json
import time
import logging
from typing import Dict, Any, Optional
from datetime import datetime

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class TrackioAPIClient:
    """API client for Trackio Space"""

    def __init__(self, space_url: str):
        self.space_url = space_url.rstrip('/')
        self.base_url = f"{self.space_url}/gradio_api/call"

    def _make_api_call(self, endpoint: str, data: list, max_retries: int = 3) -> Dict[str, Any]:
        """Make an API call to the Trackio Space"""
        url = f"{self.base_url}/{endpoint}"

        payload = {
            "data": data
        }

        for attempt in range(max_retries):
            try:
                logger.debug(f"Attempt {attempt + 1}: Making POST request to {url}")

                # POST request to get EVENT_ID
                response = requests.post(
                    url,
                    json=payload,
                    headers={"Content-Type": "application/json"},
                    timeout=30
                )

                if response.status_code != 200:
                    logger.error(f"POST request failed: {response.status_code} - {response.text}")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)  # Exponential backoff
                        continue
                    return {"error": f"POST failed: {response.status_code}"}

                # Extract EVENT_ID from response
                response_data = response.json()
                logger.debug(f"POST response: {response_data}")

                # Check for event_id (correct field name)
                if "event_id" in response_data:
                    event_id = response_data["event_id"]
                elif "hash" in response_data:
                    event_id = response_data["hash"]
                else:
                    logger.error(f"No event_id or hash in response: {response_data}")
                    return {"error": "No EVENT_ID in response"}

                # GET request to get results
                get_url = f"{url}/{event_id}"
                logger.debug(f"Making GET request to: {get_url}")

                # Wait a bit for the processing to complete
                time.sleep(1)

                get_response = requests.get(get_url, timeout=30)

                if get_response.status_code != 200:
                    logger.error(f"GET request failed: {get_response.status_code} - {get_response.text}")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)
                        continue
                    return {"error": f"GET failed: {get_response.status_code}"}

                # Check if response is empty
                if not get_response.content:
                    logger.warning(f"Empty response from GET request (attempt {attempt + 1})")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)
                        continue
                    return {"error": "Empty response from server"}

                # Parse the response - handle both JSON and SSE formats
                response_text = get_response.text.strip()
                logger.debug(f"Raw response: {response_text}")

                # Try to parse as JSON first
                try:
                    result_data = get_response.json()
                    logger.debug(f"Parsed as JSON: {result_data}")

                    if "data" in result_data and len(result_data["data"]) > 0:
                        return {"success": True, "data": result_data["data"][0]}
                    else:
                        logger.warning(f"No data in JSON response (attempt {attempt + 1}): {result_data}")
                        if attempt < max_retries - 1:
                            time.sleep(2 ** attempt)
                            continue
                        return {"error": "No data in JSON response", "raw": result_data}

                except json.JSONDecodeError:
                    # Try to parse as Server-Sent Events (SSE) format
                    logger.debug("Response is not JSON, trying SSE format")

                    # Parse SSE format: "event: complete\ndata: [\"message\"]"
                    lines = response_text.split('\n')
                    data_line = None

                    for line in lines:
                        if line.startswith('data: '):
                            data_line = line[6:]  # Remove 'data: ' prefix
                            break

                    if data_line:
                        try:
                            # Parse the data array from SSE
                            import ast
                            data_array = ast.literal_eval(data_line)

                            if isinstance(data_array, list) and len(data_array) > 0:
                                result_message = data_array[0]
                                logger.debug(f"Parsed SSE data: {result_message}")
                                return {"success": True, "data": result_message}
                            else:
                                logger.warning(f"Invalid SSE data format (attempt {attempt + 1}): {data_array}")
                                if attempt < max_retries - 1:
                                    time.sleep(2 ** attempt)
                                    continue
                                return {"error": "Invalid SSE data format", "raw": data_array}

                        except (ValueError, SyntaxError) as e:
                            logger.error(f"Failed to parse SSE data: {e}")
                            logger.debug(f"Raw SSE data: {data_line}")
                            if attempt < max_retries - 1:
                                time.sleep(2 ** attempt)
                                continue
                            return {"error": f"Failed to parse SSE data: {e}"}
                    else:
                        logger.error("No data line found in SSE response")
                        if attempt < max_retries - 1:
                            time.sleep(2 ** attempt)
                            continue
                        return {"error": "No data line in SSE response", "raw": response_text}

            except requests.exceptions.RequestException as e:
                logger.error(f"API call failed (attempt {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return {"error": f"Request failed: {e}"}
            except Exception as e:
                logger.error(f"Unexpected error (attempt {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                return {"error": f"Unexpected error: {e}"}

        return {"error": f"Failed after {max_retries} attempts"}

    def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
        """Create a new experiment"""
        logger.info(f"Creating experiment: {name}")

        result = self._make_api_call("create_experiment_interface", [name, description])

        if "success" in result:
            logger.info(f"Experiment created successfully: {result['data']}")
            return result
        else:
            logger.error(f"Failed to create experiment: {result}")
            return result

    def log_metrics(self, experiment_id: str, metrics: Dict[str, Any], step: Optional[int] = None) -> Dict[str, Any]:
        """Log metrics for an experiment"""
        metrics_json = json.dumps(metrics)
        step_str = str(step) if step is not None else ""

        logger.info(f"Logging metrics for experiment {experiment_id} at step {step}")

        result = self._make_api_call("log_metrics_interface", [experiment_id, metrics_json, step_str])

        if "success" in result:
            logger.info(f"Metrics logged successfully: {result['data']}")
            return result
        else:
            logger.error(f"Failed to log metrics: {result}")
            return result

    def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Log parameters for an experiment"""
        parameters_json = json.dumps(parameters)

        logger.info(f"Logging parameters for experiment {experiment_id}")

        result = self._make_api_call("log_parameters_interface", [experiment_id, parameters_json])

        if "success" in result:
            logger.info(f"Parameters logged successfully: {result['data']}")
            return result
        else:
            logger.error(f"Failed to log parameters: {result}")
            return result

    def get_experiment_details(self, experiment_id: str) -> Dict[str, Any]:
        """Get experiment details"""
        logger.info(f"Getting details for experiment {experiment_id}")

        result = self._make_api_call("get_experiment_details_interface", [experiment_id])

        if "success" in result:
            logger.info(f"Experiment details retrieved: {result['data']}")
            return result
        else:
            logger.error(f"Failed to get experiment details: {result}")
            return result

    def list_experiments(self) -> Dict[str, Any]:
        """List all experiments"""
        logger.info("Listing experiments")

        result = self._make_api_call("list_experiments_interface", [])

        if "success" in result:
            logger.info(f"Experiments listed successfully: {result['data']}")
            return result
        else:
            logger.error(f"Failed to list experiments: {result}")
            return result

    def update_experiment_status(self, experiment_id: str, status: str) -> Dict[str, Any]:
        """Update experiment status"""
        logger.info(f"Updating experiment {experiment_id} status to {status}")

        result = self._make_api_call("update_experiment_status_interface", [experiment_id, status])

        if "success" in result:
            logger.info(f"Experiment status updated successfully: {result['data']}")
            return result
        else:
            logger.error(f"Failed to update experiment status: {result}")
            return result

    def simulate_training_data(self, experiment_id: str) -> Dict[str, Any]:
        """Simulate training data for testing"""
        logger.info(f"Simulating training data for experiment {experiment_id}")

        result = self._make_api_call("simulate_training_data_interface", [experiment_id])

        if "success" in result:
            logger.info(f"Training data simulated successfully: {result['data']}")
            return result
        else:
            logger.error(f"Failed to simulate training data: {result}")
            return result

    def get_training_metrics(self, experiment_id: str) -> Dict[str, Any]:
        """Get training metrics for an experiment"""
        logger.info(f"Getting training metrics for experiment {experiment_id}")

        result = self._make_api_call("get_training_metrics_interface", [experiment_id])

        if "success" in result:
            logger.info(f"Training metrics retrieved: {result['data']}")
            return result
        else:
            logger.error(f"Failed to get training metrics: {result}")
            return result

    def get_experiment_metrics_history(self, experiment_id: str) -> Dict[str, Any]:
        """Get experiment metrics history"""
        logger.info(f"Getting metrics history for experiment {experiment_id}")

        result = self._make_api_call("get_experiment_metrics_history_interface", [experiment_id])

        if "success" in result:
            logger.info(f"Metrics history retrieved: {result['data']}")
            return result
        else:
            logger.error(f"Failed to get metrics history: {result}")
            return result
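A short end-to-end sketch of the client against a deployed Space; the Space URL is a placeholder, and in practice the experiment ID would be parsed out of the `create_experiment` response rather than hard-coded:

```python
# Sketch: exercise the Gradio-backed Trackio API client.
from trackio_api_client import TrackioAPIClient

client = TrackioAPIClient("https://your-username-trackio.hf.space")

created = client.create_experiment("smoke-test", "API client sanity check")
if "success" in created:
    experiment_id = "exp_smoke_test"  # placeholder; extract the real ID from created["data"]
    client.log_parameters(experiment_id, {"learning_rate": 5e-6, "batch_size": 8})
    client.log_metrics(experiment_id, {"loss": 1.23}, step=1)
    print(client.list_experiments())
```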
run_a100_large_experiment.py → scripts/training/train.py
RENAMED
File without changes

setup_launch.py
ADDED
@@ -0,0 +1,283 @@
#!/usr/bin/env python3
"""
Setup script for the interactive SmolLM3 end-to-end fine-tuning pipeline
Helps users prepare for the interactive launch script
"""

import os
import re
from pathlib import Path

def setup_launch_script():
    """Setup the launch.sh script with user configuration"""

    print("🚀 SmolLM3 Interactive End-to-End Fine-tuning Setup")
    print("=" * 60)

    print("\n📋 This setup will help you prepare for the interactive pipeline.")
    print("The launch script will now prompt you for all necessary information.")

    # Check if launch.sh exists
    launch_path = Path("launch.sh")
    if not launch_path.exists():
        print("❌ launch.sh not found")
        return False

    print("\n✅ launch.sh found - no configuration needed!")
    print("The script is now interactive and will prompt you for all settings.")

    return True

def create_requirements_check():
    """Create a requirements check script"""

    check_script = """#!/usr/bin/env python3
\"\"\"
Requirements check for SmolLM3 fine-tuning
\"\"\"

import sys
import subprocess

def check_requirements():
    \"\"\"Check if all requirements are met\"\"\"

    print("🔍 Checking requirements...")

    # Check Python version
    if sys.version_info < (3, 8):
        print("❌ Python 3.8+ required")
        return False
    else:
        print(f"✅ Python {sys.version_info.major}.{sys.version_info.minor}")

    # Check required packages
    required_packages = [
        'torch',
        'transformers',
        'datasets',
        'accelerate',
        'trl',
        'huggingface_hub',
        'requests'
    ]

    missing_packages = []
    for package in required_packages:
        try:
            __import__(package)
            print(f"✅ {package}")
        except ImportError:
            print(f"❌ {package}")
            missing_packages.append(package)

    if missing_packages:
        print(f"\\n📦 Install missing packages:")
        print(f"pip install {' '.join(missing_packages)}")
        return False

    # Check CUDA
    try:
        import torch
        if torch.cuda.is_available():
            print(f"✅ CUDA available: {torch.cuda.get_device_name(0)}")
        else:
            print("⚠️  CUDA not available (training will be slower)")
    except Exception:
        print("⚠️  Could not check CUDA availability")

    print("\\n✅ All requirements met!")
    return True

if __name__ == "__main__":
    check_requirements()
"""

    with open("check_requirements.py", 'w') as f:
        f.write(check_script)

    print("✅ Created check_requirements.py")

def create_quick_start_guide():
    """Create a quick start guide"""

    guide = """# SmolLM3 Interactive Pipeline - Quick Start Guide

## 🚀 Quick Start

### 1. Check Requirements
```bash
python check_requirements.py
```

### 2. Run the Interactive Pipeline
```bash
chmod +x launch.sh
./launch.sh
```

## 📋 What the Interactive Pipeline Does

The pipeline will guide you through:

1. **Authentication** - Enter your HF username and token
2. **Configuration Selection** - Choose from predefined training configs:
   - Basic Training (SmolLM3 + SmolTalk)
   - H100 Lightweight (Rapid training on H100)
   - A100 Large Scale (SmolLM3 + OpenHermes-FR)
   - Multiple Passes (Extended training)
   - Custom Configuration (User-defined)
3. **Experiment Setup** - Configure experiment name and repositories
4. **Training Parameters** - Adjust batch size, learning rate, etc.
5. **Deployment** - Automatic Trackio Space and HF Dataset setup
6. **Training** - Monitored fine-tuning with real-time tracking
7. **Model Push** - Upload to HF Hub with documentation

## 🎯 Available Training Configurations

### 1. Basic Training (Default)
- **Model**: SmolLM3-3B
- **Dataset**: SmolTalk
- **Epochs**: 3
- **Batch Size**: 2
- **Learning Rate**: 5e-6
- **Best for**: Quick experiments, learning

### 2. H100 Lightweight (Rapid)
- **Model**: SmolLM3-3B
- **Dataset**: OpenHermes-FR (80K samples)
- **Epochs**: 1
- **Batch Size**: 16
- **Learning Rate**: 8e-6
- **Sequence Length**: 8192
- **Best for**: Rapid training on H100

### 3. A100 Large Scale
- **Model**: SmolLM3-3B
- **Dataset**: OpenHermes-FR
- **Epochs**: 1.3 passes
- **Batch Size**: 8
- **Learning Rate**: 5e-6
- **Sequence Length**: 8192
- **Best for**: High-performance training

### 4. Multiple Passes
- **Model**: SmolLM3-3B
- **Dataset**: OpenHermes-FR
- **Epochs**: 4 passes
- **Batch Size**: 6
- **Learning Rate**: 3e-6
- **Sequence Length**: 8192
- **Best for**: Thorough training

### 5. Custom Configuration
- **User-defined parameters**
- **Flexible model and dataset selection**
- **Custom training parameters**

## 🔧 Prerequisites

1. **Hugging Face Account**
   - Create account at https://huggingface.co
   - Generate token at https://huggingface.co/settings/tokens

2. **System Requirements**
   - Python 3.8+
   - CUDA-compatible GPU (recommended)
   - 16GB+ RAM
   - 50GB+ storage

3. **Dependencies**
   - PyTorch with CUDA
   - Transformers
   - Datasets
   - Accelerate
   - TRL

## 📊 Expected Outputs

After running the pipeline, you'll have:

- **Model Repository**: `https://huggingface.co/your-username/smollm3-finetuned-YYYYMMDD`
- **Trackio Space**: `https://huggingface.co/spaces/your-username/trackio-monitoring-YYYYMMDD`
- **Experiment Dataset**: `https://huggingface.co/datasets/your-username/trackio-experiments`
- **Training Summary**: `training_summary.md`

## 🛠️ Troubleshooting

### Common Issues

1. **HF Token Issues**
   ```bash
   huggingface-cli whoami
   ```

2. **CUDA Issues**
   ```bash
   python -c "import torch; print(torch.cuda.is_available())"
   ```

3. **Memory Issues**
   - Reduce batch size in custom configuration
   - Increase gradient accumulation steps

4. **Network Issues**
   - Check internet connection
   - Verify HF token permissions

## 🎯 Tips for Success

1. **Start with Basic Training** for your first run
2. **Use H100 Lightweight** for rapid experiments on H100
3. **Use A100 Large Scale** for serious experiments
4. **Monitor in Trackio Space** for real-time progress
5. **Check logs** if something goes wrong
6. **Test the model** after training completes

## 📞 Support

- Check the troubleshooting section
- Review logs in `training.log`
- Monitor progress in Trackio Space
- Open an issue on GitHub

---

**Happy Fine-tuning! 🚀**
"""

    with open("QUICK_START_GUIDE.md", 'w') as f:
        f.write(guide)

    print("✅ Created QUICK_START_GUIDE.md")

def main():
    """Main setup function"""

    print("Welcome to SmolLM3 Interactive End-to-End Fine-tuning Setup!")
    print("This will help you prepare for the interactive pipeline.")

    if setup_launch_script():
        create_requirements_check()
        create_quick_start_guide()

        print("\n🎉 Setup completed successfully!")
        print("\n📋 Files created:")
        print("   - check_requirements.py (requirement checker)")
        print("   - QUICK_START_GUIDE.md (usage guide)")

        print("\n🚀 Ready to start training!")
        print("Next steps:")
        print("1. Run: python check_requirements.py")
        print("2. Run: chmod +x launch.sh")
        print("3. Run: ./launch.sh")
        print("4. Follow the interactive prompts")

        print("\n📚 For detailed information, see:")
        print("   - QUICK_START_GUIDE.md")
        print("   - README_END_TO_END.md")
    else:
        print("\n❌ Setup failed. Please check your input and try again.")

if __name__ == "__main__":
    main()
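Running the helper is a one-liner; a sketch assuming it is executed from the repository root where `launch.sh` lives:

```python
# Sketch: generate the requirement checker and quick-start guide without training.
from setup_launch import main

main()  # writes check_requirements.py and QUICK_START_GUIDE.md next to launch.sh
```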
config.py → src/config.py
RENAMED
File without changes

data.py → src/data.py
RENAMED
File without changes

model.py → src/model.py
RENAMED
File without changes
monitoring.py → src/monitoring.py
RENAMED

@@ -1,6 +1,6 @@
 """
 Trackio Monitoring Integration for SmolLM3 Fine-tuning
-Provides comprehensive experiment tracking and monitoring capabilities
+Provides comprehensive experiment tracking and monitoring capabilities with HF Datasets support
 """
 
 import os
@@ -13,7 +13,7 @@ from pathlib import Path
 
 # Import the real API client
 try:
-    from trackio_api_client import TrackioAPIClient
+    from scripts.trackio_tonic.trackio_api_client import TrackioAPIClient
     TRACKIO_AVAILABLE = True
 except ImportError:
     TRACKIO_AVAILABLE = False
@@ -22,7 +22,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 class SmolLM3Monitor:
-    """Monitoring and tracking for SmolLM3 fine-tuning experiments"""
+    """Monitoring and tracking for SmolLM3 fine-tuning experiments with HF Datasets support"""
 
     def __init__(
         self,
@@ -32,7 +32,9 @@
         enable_tracking: bool = True,
         log_artifacts: bool = True,
         log_metrics: bool = True,
-        log_config: bool = True
+        log_config: bool = True,
+        hf_token: Optional[str] = None,
+        dataset_repo: Optional[str] = None
     ):
         self.experiment_name = experiment_name
         self.enable_tracking = enable_tracking and TRACKIO_AVAILABLE
@@ -40,6 +42,10 @@
         self.log_metrics_enabled = log_metrics  # Rename to avoid conflict
         self.log_config_enabled = log_config  # Rename to avoid conflict
 
+        # HF Datasets configuration
+        self.hf_token = hf_token or os.environ.get('HF_TOKEN')
+        self.dataset_repo = dataset_repo or os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
+
         # Initialize experiment metadata first
         self.experiment_id = None
         self.start_time = datetime.now()
@@ -51,7 +57,33 @@
         if self.enable_tracking:
             self._setup_trackio(trackio_url, trackio_token)
 
+        # Initialize HF Datasets client
+        self.hf_dataset_client = None
+        if self.hf_token:
+            self._setup_hf_datasets()
+
         logger.info("Initialized monitoring for experiment: %s", experiment_name)
+        logger.info("Dataset repository: %s", self.dataset_repo)
+
+    def _setup_hf_datasets(self):
+        """Setup HF Datasets client for persistent storage"""
+        try:
+            from datasets import Dataset
+            from huggingface_hub import HfApi
+
+            self.hf_dataset_client = {
+                'Dataset': Dataset,
+                'HfApi': HfApi,
+                'api': HfApi(token=self.hf_token)
+            }
+            logger.info("✅ HF Datasets client initialized for %s", self.dataset_repo)
+
+        except ImportError:
+            logger.warning("⚠️ datasets or huggingface-hub not available. Install with: pip install datasets huggingface-hub")
+            self.hf_dataset_client = None
+        except Exception as e:
+            logger.error("Failed to initialize HF Datasets client: %s", e)
+            self.hf_dataset_client = None
 
     def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
         """Setup Trackio API client"""
@@ -91,6 +123,44 @@
             logger.error("Failed to initialize Trackio API: %s", e)
             self.enable_tracking = False
 
+    def _save_to_hf_dataset(self, experiment_data: Dict[str, Any]):
+        """Save experiment data to HF Dataset"""
+        if not self.hf_dataset_client:
+            return False
+
+        try:
+            # Convert experiment data to dataset format
+            dataset_data = [{
+                'experiment_id': self.experiment_id or "exp_{}".format(datetime.now().strftime('%Y%m%d_%H%M%S')),
+                'name': self.experiment_name,
+                'description': "SmolLM3 fine-tuning experiment",
+                'created_at': self.start_time.isoformat(),
+                'status': 'running',
+                'metrics': json.dumps(self.metrics_history),
+                'parameters': json.dumps(experiment_data),
+                'artifacts': json.dumps(self.artifacts),
+                'logs': json.dumps([]),
+                'last_updated': datetime.now().isoformat()
+            }]
+
+            # Create dataset
+            Dataset = self.hf_dataset_client['Dataset']
+            dataset = Dataset.from_list(dataset_data)
+
+            # Push to HF Hub
+            dataset.push_to_hub(
+                self.dataset_repo,
+                token=self.hf_token,
+                private=True
+            )
+
+            logger.info("✅ Saved experiment data to %s", self.dataset_repo)
+            return True
+
+        except Exception as e:
+            logger.error("Failed to save to HF Dataset: %s", e)
+            return False
+
     def log_configuration(self, config: Dict[str, Any]):
         """Log experiment configuration"""
         if not self.enable_tracking or not self.log_config_enabled:
@@ -98,24 +168,30 @@
 
         try:
             # Log configuration as parameters
-            if "success" in result:
-                # Also save config locally
-                config_path = "config_{}_{}.json".format(
-                    self.experiment_name,
-                    self.start_time.strftime('%Y%m%d_%H%M%S')
+            if self.trackio_client:
+                result = self.trackio_client.log_parameters(
+                    experiment_id=self.experiment_id,
+                    parameters=config
                 )
-            with open(config_path, 'w') as f:
-                json.dump(config, f, indent=2, default=str)
 
+                if "success" in result:
+                    logger.info("Configuration logged to Trackio")
+                else:
+                    logger.error("Failed to log configuration: %s", result)
+
+            # Save to HF Dataset
+            self._save_to_hf_dataset(config)
+
+            # Also save config locally
+            config_path = "config_{}_{}.json".format(
+                self.experiment_name,
+                self.start_time.strftime('%Y%m%d_%H%M%S')
+            )
+            with open(config_path, 'w') as f:
+                json.dump(config, f, indent=2, default=str)
+
+            self.artifacts.append(config_path)
+            logger.info("Configuration saved to %s", config_path)
 
         except Exception as e:
             logger.error("Failed to log configuration: %s", e)
@@ -136,18 +212,26 @@
             metrics['step'] = step
 
             # Log to Trackio
+            if self.trackio_client:
+                result = self.trackio_client.log_metrics(
+                    experiment_id=self.experiment_id,
+                    metrics=metrics,
+                    step=step
+                )
+
+                if "success" in result:
+                    logger.debug("Metrics logged to Trackio")
+                else:
+                    logger.error("Failed to log metrics to Trackio: %s", result)
 
+            # Store locally
+            self.metrics_history.append(metrics)
+
+            # Save to HF Dataset periodically
+            if len(self.metrics_history) % 10 == 0:  # Save every 10 metrics
+                self._save_to_hf_dataset({'metrics': self.metrics_history})
+
+            logger.debug("Metrics logged: %s", metrics)
 
         except Exception as e:
             logger.error("Failed to log metrics: %s", e)
@@ -166,16 +250,19 @@
                 "checkpoint_size": os.path.getsize(checkpoint_path) if os.path.exists(checkpoint_path) else 0
             }
 
+            if self.trackio_client:
+                result = self.trackio_client.log_parameters(
+                    experiment_id=self.experiment_id,
+                    parameters=checkpoint_info
+                )
+
+                if "success" in result:
+                    logger.info("Checkpoint logged to Trackio")
+                else:
+                    logger.error("Failed to log checkpoint to Trackio: %s", result)
 
-                logger.info("Checkpoint logged: %s", checkpoint_path)
-            else:
-                logger.error("Failed to log checkpoint: %s", result)
+            self.artifacts.append(checkpoint_path)
+            logger.info("Checkpoint logged: %s", checkpoint_path)
 
         except Exception as e:
             logger.error("Failed to log checkpoint: %s", e)
@@ -245,25 +332,31 @@
             summary['experiment_duration_seconds'] = duration
             summary['experiment_duration_hours'] = duration / 3600
 
-            # Log final summary
-            if "success" in result:
-                # Save summary locally
-                summary_path = "training_summary_{}_{}.json".format(
-                    self.experiment_name,
-                    self.start_time.strftime('%Y%m%d_%H%M%S')
+            # Log final summary to Trackio
+            if self.trackio_client:
+                result = self.trackio_client.log_parameters(
+                    experiment_id=self.experiment_id,
+                    parameters=summary
                 )
-            with open(summary_path, 'w') as f:
-                json.dump(summary, f, indent=2, default=str)
 
+                if "success" in result:
+                    logger.info("Training summary logged to Trackio")
+                else:
+                    logger.error("Failed to log training summary to Trackio: %s", result)
+
+            # Save to HF Dataset
+            self._save_to_hf_dataset(summary)
+
+            # Save summary locally
+            summary_path = "training_summary_{}_{}.json".format(
+                self.experiment_name,
+                self.start_time.strftime('%Y%m%d_%H%M%S')
+            )
+            with open(summary_path, 'w') as f:
+                json.dump(summary, f, indent=2, default=str)
+
+            self.artifacts.append(summary_path)
+            logger.info("Training summary logged and saved to %s", summary_path)
 
         except Exception as e:
             logger.error("Failed to log training summary: %s", e)
@@ -356,6 +449,10 @@
             logger.error("Failed to close monitoring session: %s", result)
         except Exception as e:
             logger.error("Failed to close monitoring session: %s", e)
+
+        # Final save to HF Dataset
+        if self.hf_dataset_client:
+            self._save_to_hf_dataset({'status': 'completed'})
 
 # Utility function to create monitor from config
 def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
@@ -370,5 +467,7 @@ def create_monitor_from_config(config, experiment_name: Optional[str] = None) ->
         enable_tracking=getattr(config, 'enable_tracking', True),
         log_artifacts=getattr(config, 'log_artifacts', True),
         log_metrics=getattr(config, 'log_metrics', True),
-        log_config=getattr(config, 'log_config', True)
+        log_config=getattr(config, 'log_config', True),
+        hf_token=getattr(config, 'hf_token', None),
+        dataset_repo=getattr(config, 'dataset_repo', None)
     )
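The reworked `SmolLM3Monitor` above can also be driven standalone, outside the training script. A minimal sketch, assuming `src/` is on `sys.path`; the experiment name, token, and dataset repo are placeholders:

```python
# Standalone use of the monitor from the diff above.
# Values in <...> are placeholders, not repo defaults.
import os

from monitoring import SmolLM3Monitor  # src/monitoring.py, with src/ on sys.path

os.environ.setdefault('HF_TOKEN', '<hf-token>')  # read when hf_token is not passed

monitor = SmolLM3Monitor(
    experiment_name='smollm3-demo',
    dataset_repo='<username>/trackio-experiments',  # else TRACKIO_DATASET_REPO is used
)

monitor.log_configuration({'learning_rate': 2e-5, 'batch_size': 8})
for step in range(50):
    monitor.log_metrics({'loss': 1.0 / (step + 1)}, step=step)
# Every 10th log_metrics call also pushes the accumulated history to the HF Dataset.
```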
train.py → src/train.py
RENAMED

@@ -20,6 +20,7 @@ from config import get_config
 from model import SmolLM3Model
 from data import SmolLM3Dataset
 from trainer import SmolLM3Trainer
+from monitoring import create_monitor_from_config
 
 def setup_logging():
     """Setup logging configuration"""
@@ -86,6 +87,12 @@
     parser.add_argument('--experiment_name', type=str, default=None,
                        help='Custom experiment name for tracking')
 
+    # HF Datasets arguments
+    parser.add_argument('--hf_token', type=str, default=None,
+                       help='Hugging Face token for dataset access')
+    parser.add_argument('--dataset_repo', type=str, default=None,
+                       help='HF Dataset repository for experiment storage')
+
     return parser.parse_args()
 
 def main():
@@ -119,6 +126,12 @@
     if args.experiment_name is not None:
         config.experiment_name = args.experiment_name
 
+    # Override HF Datasets configuration
+    if args.hf_token is not None:
+        os.environ['HF_TOKEN'] = args.hf_token
+    if args.dataset_repo is not None:
+        os.environ['TRACKIO_DATASET_REPO'] = args.dataset_repo
+
     # Setup paths
     output_path = args.out_dir
 
@@ -127,6 +140,22 @@
 
     logger.info(f"Output path: {output_path}")
 
+    # Initialize monitoring
+    monitor = None
+    if config.enable_tracking:
+        try:
+            monitor = create_monitor_from_config(config, args.experiment_name)
+            logger.info(f"✅ Monitoring initialized for experiment: {monitor.experiment_name}")
+            logger.info(f"📊 Dataset repository: {monitor.dataset_repo}")
+
+            # Log configuration
+            config_dict = {k: v for k, v in vars(config).items() if not k.startswith('_')}
+            monitor.log_configuration(config_dict)
+
+        except Exception as e:
+            logger.error(f"Failed to initialize monitoring: {e}")
+            logger.warning("Continuing without monitoring...")
+
     # Initialize model
     model = SmolLM3Model(
         model_name=args.model_name,
@@ -162,13 +191,60 @@
         init_from=args.init_from
     )
 
+    # Add monitoring callback if available
+    if monitor:
+        try:
+            callback = monitor.create_monitoring_callback()
+            trainer.add_callback(callback)
+            logger.info("✅ Monitoring callback added to trainer")
+        except Exception as e:
+            logger.error(f"Failed to add monitoring callback: {e}")
+
     # Start training
     try:
         trainer.train()
         logger.info("Training completed successfully!")
+
+        # Log training summary
+        if monitor:
+            try:
+                summary = {
+                    'final_loss': getattr(trainer, 'final_loss', None),
+                    'total_steps': getattr(trainer, 'total_steps', None),
+                    'training_duration': getattr(trainer, 'training_duration', None),
+                    'model_path': output_path,
+                    'config_file': args.config
+                }
+                monitor.log_training_summary(summary)
+                logger.info("✅ Training summary logged")
+            except Exception as e:
+                logger.error(f"Failed to log training summary: {e}")
+
     except Exception as e:
         logger.error(f"Training failed: {e}")
+
+        # Log error to monitoring
+        if monitor:
+            try:
+                error_summary = {
+                    'error': str(e),
+                    'status': 'failed',
+                    'model_path': output_path,
+                    'config_file': args.config
+                }
+                monitor.log_training_summary(error_summary)
+            except Exception as log_error:
+                logger.error(f"Failed to log error to monitoring: {log_error}")
+
         raise
+    finally:
+        # Close monitoring
+        if monitor:
+            try:
+                monitor.close()
+                logger.info("✅ Monitoring session closed")
+            except Exception as e:
+                logger.error(f"Failed to close monitoring: {e}")
 
 if __name__ == '__main__':
     main()
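`monitor.create_monitoring_callback()` is referenced above but its body is not part of this diff. Under the assumption that `SmolLM3Trainer` wraps a Hugging Face `Trainer`, the callback would plausibly look like the sketch below; this is illustrative, not the repo's actual implementation:

```python
# Hypothetical shape of the callback wired in by src/train.py above.
from transformers import TrainerCallback

class MonitoringCallback(TrainerCallback):
    """Forwards Trainer events to a SmolLM3Monitor instance."""

    def __init__(self, monitor):
        self.monitor = monitor

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Each metrics dict the Trainer emits goes to Trackio / the HF Dataset.
        if logs:
            self.monitor.log_metrics(dict(logs), step=state.global_step)

    def on_save(self, args, state, control, **kwargs):
        # Record checkpoint paths as they are written.
        self.monitor.log_checkpoint(f"{args.output_dir}/checkpoint-{state.global_step}")
```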
trainer.py → src/trainer.py
RENAMED
File without changes

templates/datasets/readme.md
ADDED
File without changes
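Because `_save_to_hf_dataset` pushes snapshots with `push_to_hub(..., private=True)`, the persisted experiments can be read back with the standard `datasets` API. A sketch; the repo id and token are placeholders:

```python
# Read experiment rows back from the private tracking dataset.
import json

from datasets import load_dataset

ds = load_dataset('<username>/trackio-experiments', split='train', token='<hf-token>')

for row in ds:
    metrics = json.loads(row['metrics'])  # the monitor stores metrics as a JSON string
    print(row['experiment_id'], row['status'], len(metrics), 'logged snapshots')
```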