Spaces:
Running
Running
fixes linter errors in setup hf dataset
Browse files
scripts/dataset_tonic/setup_hf_dataset.py
CHANGED
@@ -32,7 +32,7 @@ def get_username_from_token(token: str) -> Optional[str]:
|
|
32 |
user_info = api.whoami()
|
33 |
username = user_info.get("name", user_info.get("username"))
|
34 |
|
35 |
-
|
36 |
except Exception as e:
|
37 |
print(f"❌ Error getting username from token: {e}")
|
38 |
return None
|
@@ -162,20 +162,20 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
162 |
if not token:
|
163 |
print("⚠️ No token available for uploading data")
|
164 |
return False
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
'experiment_id': f'exp_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
|
170 |
'name': 'smollm3-finetune-demo',
|
171 |
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
172 |
'created_at': datetime.now().isoformat(),
|
173 |
'status': 'completed',
|
174 |
-
|
175 |
-
|
176 |
'timestamp': datetime.now().isoformat(),
|
177 |
-
|
178 |
-
|
179 |
'loss': 1.15,
|
180 |
'grad_norm': 10.5,
|
181 |
'learning_rate': 5e-6,
|
@@ -191,13 +191,13 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
191 |
'gpu_memory_allocated': 15.2,
|
192 |
'gpu_memory_reserved': 70.1,
|
193 |
'gpu_utilization': 85.2,
|
194 |
-
|
195 |
-
|
|
|
196 |
}
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
201 |
'max_seq_length': 4096,
|
202 |
'batch_size': 2,
|
203 |
'learning_rate': 5e-6,
|
@@ -208,8 +208,8 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
208 |
'mixed_precision': True,
|
209 |
'gradient_checkpointing': True,
|
210 |
'flash_attention': True
|
211 |
-
|
212 |
-
|
213 |
'logs': json.dumps([
|
214 |
{
|
215 |
'timestamp': datetime.now().isoformat(),
|
@@ -227,10 +227,10 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
227 |
'message': 'Dataset loaded and preprocessed'
|
228 |
}
|
229 |
]),
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
# Create dataset and upload
|
235 |
from datasets import Dataset
|
236 |
|
@@ -347,21 +347,19 @@ This dataset is public by default for easier sharing and collaboration. Only non
|
|
347 |
```json
|
348 |
{{
|
349 |
"experiment_id": "exp_20250720_130853",
|
350 |
-
"name": "
|
351 |
-
"description": "SmolLM3 fine-tuning experiment
|
352 |
-
"created_at": "2025-07-
|
353 |
-
"status": "
|
354 |
-
"metrics": "
|
355 |
-
"parameters": "{{
|
356 |
"artifacts": "[]",
|
357 |
-
"logs": "
|
358 |
-
"last_updated": "2025-07-
|
359 |
}}
|
360 |
```
|
361 |
|
362 |
-
|
363 |
-
|
364 |
-
This dataset is part of the Trackio experiment tracking system and follows the same license as the main project.
|
365 |
"""
|
366 |
|
367 |
# Upload README to the dataset repository
|
|
|
32 |
user_info = api.whoami()
|
33 |
username = user_info.get("name", user_info.get("username"))
|
34 |
|
35 |
+
return username
|
36 |
except Exception as e:
|
37 |
print(f"❌ Error getting username from token: {e}")
|
38 |
return None
|
|
|
162 |
if not token:
|
163 |
print("⚠️ No token available for uploading data")
|
164 |
return False
|
165 |
+
|
166 |
+
# Initial experiment data
|
167 |
+
initial_experiments = [
|
168 |
+
{
|
169 |
'experiment_id': f'exp_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
|
170 |
'name': 'smollm3-finetune-demo',
|
171 |
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
172 |
'created_at': datetime.now().isoformat(),
|
173 |
'status': 'completed',
|
174 |
+
'metrics': json.dumps([
|
175 |
+
{
|
176 |
'timestamp': datetime.now().isoformat(),
|
177 |
+
'step': 100,
|
178 |
+
'metrics': {
|
179 |
'loss': 1.15,
|
180 |
'grad_norm': 10.5,
|
181 |
'learning_rate': 5e-6,
|
|
|
191 |
'gpu_memory_allocated': 15.2,
|
192 |
'gpu_memory_reserved': 70.1,
|
193 |
'gpu_utilization': 85.2,
|
194 |
+
'cpu_percent': 2.7,
|
195 |
+
'memory_percent': 10.1
|
196 |
+
}
|
197 |
}
|
198 |
+
]),
|
199 |
+
'parameters': json.dumps({
|
200 |
+
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
|
|
201 |
'max_seq_length': 4096,
|
202 |
'batch_size': 2,
|
203 |
'learning_rate': 5e-6,
|
|
|
208 |
'mixed_precision': True,
|
209 |
'gradient_checkpointing': True,
|
210 |
'flash_attention': True
|
211 |
+
}),
|
212 |
+
'artifacts': json.dumps([]),
|
213 |
'logs': json.dumps([
|
214 |
{
|
215 |
'timestamp': datetime.now().isoformat(),
|
|
|
227 |
'message': 'Dataset loaded and preprocessed'
|
228 |
}
|
229 |
]),
|
230 |
+
'last_updated': datetime.now().isoformat()
|
231 |
+
}
|
232 |
+
]
|
233 |
+
|
234 |
# Create dataset and upload
|
235 |
from datasets import Dataset
|
236 |
|
|
|
347 |
```json
|
348 |
{{
|
349 |
"experiment_id": "exp_20250720_130853",
|
350 |
+
"name": "smollm3-finetune-demo",
|
351 |
+
"description": "SmolLM3 fine-tuning experiment demo",
|
352 |
+
"created_at": "2025-07-20T13:08:53",
|
353 |
+
"status": "completed",
|
354 |
+
"metrics": "[...]",
|
355 |
+
"parameters": "{{...}}",
|
356 |
"artifacts": "[]",
|
357 |
+
"logs": "[...]",
|
358 |
+
"last_updated": "2025-07-20T13:08:53"
|
359 |
}}
|
360 |
```
|
361 |
|
362 |
+
This dataset is maintained by the Trackio monitoring system and automatically updated during training runs.
|
|
|
|
|
363 |
"""
|
364 |
|
365 |
# Upload README to the dataset repository
|