Tonic committed on
Commit
2df26a0
·
verified ·
1 Parent(s): c61ed6b

Fixes linter errors in setup_hf_dataset.py

Browse files
scripts/dataset_tonic/setup_hf_dataset.py CHANGED
@@ -32,7 +32,7 @@ def get_username_from_token(token: str) -> Optional[str]:
32
  user_info = api.whoami()
33
  username = user_info.get("name", user_info.get("username"))
34
 
35
- return username
36
  except Exception as e:
37
  print(f"❌ Error getting username from token: {e}")
38
  return None
@@ -162,20 +162,20 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
162
  if not token:
163
  print("⚠️ No token available for uploading data")
164
  return False
165
-
166
- # Initial experiment data
167
- initial_experiments = [
168
- {
169
  'experiment_id': f'exp_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
170
  'name': 'smollm3-finetune-demo',
171
  'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
172
  'created_at': datetime.now().isoformat(),
173
  'status': 'completed',
174
- 'metrics': json.dumps([
175
- {
176
  'timestamp': datetime.now().isoformat(),
177
- 'step': 100,
178
- 'metrics': {
179
  'loss': 1.15,
180
  'grad_norm': 10.5,
181
  'learning_rate': 5e-6,
@@ -191,13 +191,13 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
191
  'gpu_memory_allocated': 15.2,
192
  'gpu_memory_reserved': 70.1,
193
  'gpu_utilization': 85.2,
194
- 'cpu_percent': 2.7,
195
- 'memory_percent': 10.1
 
196
  }
197
- }
198
- ]),
199
- 'parameters': json.dumps({
200
- 'model_name': 'HuggingFaceTB/SmolLM3-3B',
201
  'max_seq_length': 4096,
202
  'batch_size': 2,
203
  'learning_rate': 5e-6,
@@ -208,8 +208,8 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
208
  'mixed_precision': True,
209
  'gradient_checkpointing': True,
210
  'flash_attention': True
211
- }),
212
- 'artifacts': json.dumps([]),
213
  'logs': json.dumps([
214
  {
215
  'timestamp': datetime.now().isoformat(),
@@ -227,10 +227,10 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
227
  'message': 'Dataset loaded and preprocessed'
228
  }
229
  ]),
230
- 'last_updated': datetime.now().isoformat()
231
- }
232
- ]
233
-
234
  # Create dataset and upload
235
  from datasets import Dataset
236
 
@@ -347,21 +347,19 @@ This dataset is public by default for easier sharing and collaboration. Only non
347
  ```json
348
  {{
349
  "experiment_id": "exp_20250720_130853",
350
- "name": "smollm3_finetune",
351
- "description": "SmolLM3 fine-tuning experiment with comprehensive metrics",
352
- "created_at": "2025-07-20T11:20:01.780908",
353
- "status": "running",
354
- "metrics": "[{{\"timestamp\": \"2025-07-20T11:20:01.780908\", \"step\": 25, \"metrics\": {{\"loss\": 1.1659, \"accuracy\": 0.759, \"total_tokens\": 1642080.0, \"throughput\": 3284160.0, \"train/gate_ortho\": 0.0234, \"train/center\": 0.0156}}}}]",
355
- "parameters": "{{\"model_name\": \"HuggingFaceTB/SmolLM3-3B\", \"batch_size\": 8, \"learning_rate\": 3.5e-06, \"max_seq_length\": 12288}}",
356
  "artifacts": "[]",
357
- "logs": "[]",
358
- "last_updated": "2025-07-20T11:20:01.780908"
359
  }}
360
  ```
361
 
362
- ## License
363
-
364
- This dataset is part of the Trackio experiment tracking system and follows the same license as the main project.
365
  """
366
 
367
  # Upload README to the dataset repository
 
32
  user_info = api.whoami()
33
  username = user_info.get("name", user_info.get("username"))
34
 
35
+ return username
36
  except Exception as e:
37
  print(f"❌ Error getting username from token: {e}")
38
  return None
 
162
  if not token:
163
  print("⚠️ No token available for uploading data")
164
  return False
165
+
166
+ # Initial experiment data
167
+ initial_experiments = [
168
+ {
169
  'experiment_id': f'exp_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
170
  'name': 'smollm3-finetune-demo',
171
  'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
172
  'created_at': datetime.now().isoformat(),
173
  'status': 'completed',
174
+ 'metrics': json.dumps([
175
+ {
176
  'timestamp': datetime.now().isoformat(),
177
+ 'step': 100,
178
+ 'metrics': {
179
  'loss': 1.15,
180
  'grad_norm': 10.5,
181
  'learning_rate': 5e-6,
 
191
  'gpu_memory_allocated': 15.2,
192
  'gpu_memory_reserved': 70.1,
193
  'gpu_utilization': 85.2,
194
+ 'cpu_percent': 2.7,
195
+ 'memory_percent': 10.1
196
+ }
197
  }
198
+ ]),
199
+ 'parameters': json.dumps({
200
+ 'model_name': 'HuggingFaceTB/SmolLM3-3B',
 
201
  'max_seq_length': 4096,
202
  'batch_size': 2,
203
  'learning_rate': 5e-6,
 
208
  'mixed_precision': True,
209
  'gradient_checkpointing': True,
210
  'flash_attention': True
211
+ }),
212
+ 'artifacts': json.dumps([]),
213
  'logs': json.dumps([
214
  {
215
  'timestamp': datetime.now().isoformat(),
 
227
  'message': 'Dataset loaded and preprocessed'
228
  }
229
  ]),
230
+ 'last_updated': datetime.now().isoformat()
231
+ }
232
+ ]
233
+
234
  # Create dataset and upload
235
  from datasets import Dataset
236
 
 
347
  ```json
348
  {{
349
  "experiment_id": "exp_20250720_130853",
350
+ "name": "smollm3-finetune-demo",
351
+ "description": "SmolLM3 fine-tuning experiment demo",
352
+ "created_at": "2025-07-20T13:08:53",
353
+ "status": "completed",
354
+ "metrics": "{{...}}",
355
+ "parameters": "{{...}}",
356
  "artifacts": "[]",
357
+ "logs": "{{...}}",
358
+ "last_updated": "2025-07-20T13:08:53"
359
  }}
360
  ```
361
 
362
+ This dataset is maintained by the Trackio monitoring system and automatically updated during training runs.
 
 
363
  """
364
 
365
  # Upload README to the dataset repository