Tonic committed
Commit d47568c · 1 Parent(s): cb276d8

adds template files, adds non-destructive dataset updates
requirements/requirements_core.txt CHANGED
@@ -6,7 +6,7 @@ accelerate>=0.20.0
 peft>=0.17.0 # Updated for GPT-OSS LoRA support
 trl>=0.20.0 # Updated for GPT-OSS compatibility
 kernels
-
+openai-harmony
 # Hugging Face Hub for model and space management
 huggingface_hub>=0.19.0
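The new openai-harmony dependency provides the tokenizer/renderer for GPT-OSS's "harmony" response format. A minimal sketch of rendering a conversation with it, assuming the package's published API (not shown in this commit):

```python
# Sketch: render a GPT-OSS "harmony" conversation into prompt tokens.
# Assumes openai-harmony's documented API; adjust to your installed version.
from openai_harmony import (
    Conversation,
    HarmonyEncodingName,
    Message,
    Role,
    load_harmony_encoding,
)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
convo = Conversation.from_messages([
    Message.from_role_and_content(Role.USER, "Hello, GPT-OSS!"),
])
# Token IDs ready to feed to the model as a completion prompt
tokens = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
```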
scripts/trackio_tonic/app.py CHANGED
@@ -26,6 +26,7 @@ class TrackioSpace:
         self.experiments = {}
         self.current_experiment = None
         self.backup_mode = False
+        self.dataset_manager = None
 
         # Get dataset repository and HF token from parameters or environment variables
         # Respect explicit values; avoid hardcoded defaults that might point to test repos
@@ -38,6 +39,17 @@ class TrackioSpace:
         if not self.hf_token:
             logger.warning("⚠️ HF_TOKEN not found. Some features may not work.")
 
+        # Initialize dataset manager for safe, non-destructive operations
+        try:
+            import sys
+            sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
+            from dataset_utils import TrackioDatasetManager  # type: ignore
+            if self.hf_token and self.dataset_repo:
+                self.dataset_manager = TrackioDatasetManager(self.dataset_repo, self.hf_token)
+                logger.info("✅ Dataset manager initialized (data preservation enabled)")
+        except Exception as e:
+            logger.warning(f"⚠️ Dataset manager not available, using legacy save mode: {e}")
+
         self._load_experiments()
 
     def _load_experiments(self):
@@ -314,13 +326,45 @@ class TrackioSpace:
             self.current_experiment = 'exp_20250720_134319'
             logger.info(f"✅ Loaded {len(backup_experiments)} backup experiments")
 
+    def _upsert_experiment(self, experiment_id: str):
+        """Non-destructive upsert of a single experiment to the dataset if the manager is available."""
+        try:
+            if not self.dataset_manager or not self.hf_token:
+                # Fall back to the legacy save method
+                self._save_experiments()
+                return
+            exp = self.experiments.get(experiment_id)
+            if not exp:
+                return
+            # Build a dataset row with JSON-encoded fields
+            payload = {
+                'experiment_id': experiment_id,
+                'name': exp.get('name', ''),
+                'description': exp.get('description', ''),
+                'created_at': exp.get('created_at', ''),
+                'status': exp.get('status', 'running'),
+                'metrics': json.dumps(exp.get('metrics', []), default=str),
+                'parameters': json.dumps(exp.get('parameters', {}), default=str),
+                'artifacts': json.dumps(exp.get('artifacts', []), default=str),
+                'logs': json.dumps(exp.get('logs', []), default=str),
+                'last_updated': datetime.now().isoformat()
+            }
+            self.dataset_manager.upsert_experiment(payload)
+        except Exception as e:
+            logger.warning(f"⚠️ Upsert failed, falling back to legacy save: {e}")
+            self._save_experiments()
+
     def _save_experiments(self):
-        """Save experiments to HF Dataset"""
+        """Save experiments to HF Dataset (legacy fallback).
+
+        Prefer dataset-manager upserts in the per-operation paths. This method is
+        retained as a fallback for when the manager isn't available.
+        """
         try:
             if self.backup_mode:
                 logger.warning("⚠️ Backup mode active; skipping dataset save to avoid overwriting real data with demo values")
                 return
-            if self.hf_token:
+            if self.hf_token and not self.dataset_manager:
                 from datasets import Dataset
                 from huggingface_hub import HfApi
@@ -351,10 +395,10 @@ class TrackioSpace:
                     private=True  # Make it private for security
                 )
 
-                logger.info(f"✅ Saved {len(dataset_data)} experiments to {self.dataset_repo}")
+                logger.info(f"✅ Saved {len(dataset_data)} experiments to {self.dataset_repo} (legacy mode)")
 
             else:
-                logger.warning("⚠️ No HF_TOKEN available, experiments not saved to dataset")
+                logger.warning("⚠️ No dataset manager and/or HF_TOKEN available, experiments not saved to dataset")
 
         except Exception as e:
             logger.error(f"Failed to save experiments to dataset: {e}")
@@ -389,7 +433,8 @@ class TrackioSpace:
 
         self.experiments[experiment_id] = experiment
         self.current_experiment = experiment_id
-        self._save_experiments()
+        # Prefer the non-destructive upsert
+        self._upsert_experiment(experiment_id)
 
         logger.info(f"Created experiment: {experiment_id} - {name}")
         return experiment
@@ -406,7 +451,7 @@ class TrackioSpace:
         }
 
         self.experiments[experiment_id]['metrics'].append(metric_entry)
-        self._save_experiments()
+        self._upsert_experiment(experiment_id)
         logger.info(f"Logged metrics for experiment {experiment_id}: {metrics}")
 
     def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]):
@@ -415,7 +460,7 @@ class TrackioSpace:
             raise ValueError(f"Experiment {experiment_id} not found")
 
         self.experiments[experiment_id]['parameters'].update(parameters)
-        self._save_experiments()
+        self._upsert_experiment(experiment_id)
         logger.info(f"Logged parameters for experiment {experiment_id}: {parameters}")
 
     def log_artifact(self, experiment_id: str, artifact_name: str, artifact_data: str):
@@ -430,7 +475,7 @@ class TrackioSpace:
         }
 
         self.experiments[experiment_id]['artifacts'].append(artifact_entry)
-        self._save_experiments()
+        self._upsert_experiment(experiment_id)
         logger.info(f"Logged artifact for experiment {experiment_id}: {artifact_name}")
 
     def get_experiment(self, experiment_id: str) -> Optional[Dict[str, Any]]:
@@ -449,7 +494,7 @@ class TrackioSpace:
         """Update experiment status"""
        if experiment_id in self.experiments:
             self.experiments[experiment_id]['status'] = status
-            self._save_experiments()
+            self._upsert_experiment(experiment_id)
             logger.info(f"Updated experiment {experiment_id} status to {status}")
 
     def get_metrics_dataframe(self, experiment_id: str) -> pd.DataFrame:
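Each upsert pushes a single flat row whose nested fields are JSON-encoded strings, so every row in the HF dataset shares the same schema. For reference, a sketch of one row as built by `_upsert_experiment` above (values are illustrative):

```python
# Example row as built by _upsert_experiment; field names come from this
# commit, the values are made up for illustration.
import json
from datetime import datetime

payload = {
    'experiment_id': 'exp_20250807_120000',
    'name': 'smollm3-ft',
    'description': 'LoRA fine-tune run',
    'created_at': '2025-08-07T12:00:00',
    'status': 'running',
    # Nested structures are serialized so every row stays flat and uniform
    'metrics': json.dumps([{'timestamp': '2025-08-07T12:01:00', 'step': 10,
                            'metrics': {'loss': 1.23}}], default=str),
    'parameters': json.dumps({'learning_rate': 2e-4}, default=str),
    'artifacts': json.dumps([], default=str),
    'logs': json.dumps([], default=str),
    'last_updated': datetime.now().isoformat(),
}
```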
scripts/trackio_tonic/dataset_utils.py ADDED
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+"""
+Dataset utilities for Trackio experiment data management
+Provides functions for safe dataset operations with data preservation
+"""
+
+import json
+import logging
+from datetime import datetime
+from typing import Dict, Any, List, Optional, Union
+from datasets import Dataset, load_dataset
+
+logger = logging.getLogger(__name__)
+
+class TrackioDatasetManager:
+    """
+    Manager class for Trackio experiment datasets with data preservation.
+
+    This class ensures that existing experiment data is always preserved
+    when adding new experiments or updating existing ones.
+    """
+
+    def __init__(self, dataset_repo: str, hf_token: str):
+        """
+        Initialize the dataset manager.
+
+        Args:
+            dataset_repo (str): HF dataset repository ID (e.g., "username/dataset-name")
+            hf_token (str): Hugging Face token for authentication
+        """
+        self.dataset_repo = dataset_repo
+        self.hf_token = hf_token
+        self._validate_repo_format()
+
+    def _validate_repo_format(self):
+        """Validate dataset repository format"""
+        if not self.dataset_repo or '/' not in self.dataset_repo:
+            raise ValueError(f"Invalid dataset repository format: {self.dataset_repo}")
+
+    def check_dataset_exists(self) -> bool:
+        """
+        Check if the dataset repository exists and is accessible.
+
+        Returns:
+            bool: True if dataset exists and is accessible, False otherwise
+        """
+        try:
+            load_dataset(self.dataset_repo, token=self.hf_token)
+            logger.info(f"✅ Dataset {self.dataset_repo} exists and is accessible")
+            return True
+        except Exception as e:
+            logger.info(f"📊 Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
+            return False
+
+    def load_existing_experiments(self) -> List[Dict[str, Any]]:
+        """
+        Load all existing experiments from the dataset.
+
+        Returns:
+            List[Dict[str, Any]]: List of existing experiment dictionaries
+        """
+        try:
+            if not self.check_dataset_exists():
+                logger.info("📊 No existing dataset found, returning empty list")
+                return []
+
+            dataset = load_dataset(self.dataset_repo, token=self.hf_token)
+
+            if 'train' not in dataset:
+                logger.info("📊 No 'train' split found in dataset")
+                return []
+
+            experiments = list(dataset['train'])
+            logger.info(f"📊 Loaded {len(experiments)} existing experiments")
+
+            # Validate experiment structure
+            valid_experiments = []
+            for exp in experiments:
+                if self._validate_experiment_structure(exp):
+                    valid_experiments.append(exp)
+                else:
+                    logger.warning(f"⚠️ Skipping invalid experiment: {exp.get('experiment_id', 'unknown')}")
+
+            logger.info(f"📊 {len(valid_experiments)} valid experiments loaded")
+            return valid_experiments
+
+        except Exception as e:
+            logger.error(f"❌ Failed to load existing experiments: {e}")
+            return []
+
+    def _validate_experiment_structure(self, experiment: Dict[str, Any]) -> bool:
+        """
+        Validate that an experiment has the required structure.
+
+        Args:
+            experiment (Dict[str, Any]): Experiment dictionary to validate
+
+        Returns:
+            bool: True if experiment structure is valid
+        """
+        required_fields = [
+            'experiment_id', 'name', 'description', 'created_at',
+            'status', 'metrics', 'parameters', 'artifacts', 'logs'
+        ]
+
+        for field in required_fields:
+            if field not in experiment:
+                logger.warning(f"⚠️ Missing required field '{field}' in experiment")
+                return False
+
+        # Validate JSON fields
+        json_fields = ['metrics', 'parameters', 'artifacts', 'logs']
+        for field in json_fields:
+            if isinstance(experiment[field], str):
+                try:
+                    json.loads(experiment[field])
+                except json.JSONDecodeError:
+                    logger.warning(f"⚠️ Invalid JSON in field '{field}' for experiment {experiment.get('experiment_id')}")
+                    return False
+
+        return True
+
+    def save_experiments(self, experiments: List[Dict[str, Any]], commit_message: Optional[str] = None) -> bool:
+        """
+        Save a list of experiments to the dataset, preserving data integrity.
+
+        Args:
+            experiments (List[Dict[str, Any]]): List of experiment dictionaries
+            commit_message (Optional[str]): Custom commit message
+
+        Returns:
+            bool: True if save was successful, False otherwise
+        """
+        try:
+            if not experiments:
+                logger.warning("⚠️ No experiments to save")
+                return False
+
+            # Validate all experiments before saving
+            valid_experiments = []
+            for exp in experiments:
+                if self._validate_experiment_structure(exp):
+                    # Ensure last_updated is set
+                    if 'last_updated' not in exp:
+                        exp['last_updated'] = datetime.now().isoformat()
+                    valid_experiments.append(exp)
+                else:
+                    logger.error(f"❌ Invalid experiment structure: {exp.get('experiment_id', 'unknown')}")
+                    return False
+
+            # Create dataset
+            dataset = Dataset.from_list(valid_experiments)
+
+            # Generate commit message if not provided
+            if not commit_message:
+                commit_message = f"Update dataset with {len(valid_experiments)} experiments ({datetime.now().isoformat()})"
+
+            # Push to hub
+            dataset.push_to_hub(
+                self.dataset_repo,
+                token=self.hf_token,
+                private=True,
+                commit_message=commit_message
+            )
+
+            logger.info(f"✅ Successfully saved {len(valid_experiments)} experiments to {self.dataset_repo}")
+            return True
+
+        except Exception as e:
+            logger.error(f"❌ Failed to save experiments to dataset: {e}")
+            return False
+
+    def upsert_experiment(self, experiment: Dict[str, Any]) -> bool:
+        """
+        Insert a new experiment or update an existing one, preserving all other data.
+
+        Args:
+            experiment (Dict[str, Any]): Experiment dictionary to upsert
+
+        Returns:
+            bool: True if operation was successful, False otherwise
+        """
+        try:
+            # Validate the experiment structure
+            if not self._validate_experiment_structure(experiment):
+                logger.error(f"❌ Invalid experiment structure for {experiment.get('experiment_id', 'unknown')}")
+                return False
+
+            # Load existing experiments
+            existing_experiments = self.load_existing_experiments()
+
+            # Find if experiment already exists
+            experiment_id = experiment['experiment_id']
+            experiment_found = False
+            updated_experiments = []
+
+            for existing_exp in existing_experiments:
+                if existing_exp.get('experiment_id') == experiment_id:
+                    # Update existing experiment
+                    logger.info(f"🔄 Updating existing experiment: {experiment_id}")
+                    experiment['last_updated'] = datetime.now().isoformat()
+                    updated_experiments.append(experiment)
+                    experiment_found = True
+                else:
+                    # Preserve existing experiment
+                    updated_experiments.append(existing_exp)
+
+            # If experiment doesn't exist, add it
+            if not experiment_found:
+                logger.info(f"➕ Adding new experiment: {experiment_id}")
+                experiment['last_updated'] = datetime.now().isoformat()
+                updated_experiments.append(experiment)
+
+            # Save all experiments
+            commit_message = f"{'Update' if experiment_found else 'Add'} experiment {experiment_id} (preserving {len(existing_experiments)} existing experiments)"
+
+            return self.save_experiments(updated_experiments, commit_message)
+
+        except Exception as e:
+            logger.error(f"❌ Failed to upsert experiment: {e}")
+            return False
+
+    def get_experiment_by_id(self, experiment_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Retrieve a specific experiment by its ID.
+
+        Args:
+            experiment_id (str): The experiment ID to search for
+
+        Returns:
+            Optional[Dict[str, Any]]: The experiment dictionary if found, None otherwise
+        """
+        try:
+            experiments = self.load_existing_experiments()
+
+            for exp in experiments:
+                if exp.get('experiment_id') == experiment_id:
+                    logger.info(f"✅ Found experiment: {experiment_id}")
+                    return exp
+
+            logger.info(f"📊 Experiment not found: {experiment_id}")
+            return None
+
+        except Exception as e:
+            logger.error(f"❌ Failed to get experiment {experiment_id}: {e}")
+            return None
+
+    def list_experiments(self, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
+        """
+        List all experiments, optionally filtered by status.
+
+        Args:
+            status_filter (Optional[str]): Filter by experiment status (running, completed, failed, paused)
+
+        Returns:
+            List[Dict[str, Any]]: List of experiments matching the filter
+        """
+        try:
+            experiments = self.load_existing_experiments()
+
+            if status_filter:
+                filtered_experiments = [exp for exp in experiments if exp.get('status') == status_filter]
+                logger.info(f"📊 Found {len(filtered_experiments)} experiments with status '{status_filter}'")
+                return filtered_experiments
+
+            logger.info(f"📊 Found {len(experiments)} total experiments")
+            return experiments
+
+        except Exception as e:
+            logger.error(f"❌ Failed to list experiments: {e}")
+            return []
+
+    def backup_dataset(self, backup_suffix: Optional[str] = None) -> str:
+        """
+        Create a backup of the current dataset.
+
+        Args:
+            backup_suffix (Optional[str]): Optional suffix for backup repo name
+
+        Returns:
+            str: Backup repository name if successful, empty string otherwise
+        """
+        try:
+            if not backup_suffix:
+                backup_suffix = datetime.now().strftime('%Y%m%d_%H%M%S')
+
+            backup_repo = f"{self.dataset_repo}-backup-{backup_suffix}"
+
+            # Load current experiments
+            experiments = self.load_existing_experiments()
+
+            if not experiments:
+                logger.warning("⚠️ No experiments to backup")
+                return ""
+
+            # Create backup dataset manager
+            backup_manager = TrackioDatasetManager(backup_repo, self.hf_token)
+
+            # Save to backup
+            success = backup_manager.save_experiments(
+                experiments,
+                f"Backup of {self.dataset_repo} created on {datetime.now().isoformat()}"
+            )
+
+            if success:
+                logger.info(f"✅ Backup created: {backup_repo}")
+                return backup_repo
+            else:
+                logger.error("❌ Failed to create backup")
+                return ""
+
+        except Exception as e:
+            logger.error(f"❌ Failed to create backup: {e}")
+            return ""
+
+
+def create_dataset_manager(dataset_repo: str, hf_token: str) -> TrackioDatasetManager:
+    """
+    Factory function to create a TrackioDatasetManager instance.
+
+    Args:
+        dataset_repo (str): HF dataset repository ID
+        hf_token (str): Hugging Face token
+
+    Returns:
+        TrackioDatasetManager: Configured dataset manager instance
+    """
+    return TrackioDatasetManager(dataset_repo, hf_token)
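A short usage sketch of the new manager, using the signatures from the file above (repo name and token are placeholders):

```python
# Usage sketch for TrackioDatasetManager; the repo name and token are
# placeholders, everything else comes from dataset_utils.py above.
import json
import os

from dataset_utils import create_dataset_manager

manager = create_dataset_manager("username/trackio-experiments", os.environ["HF_TOKEN"])

row = {
    'experiment_id': 'exp_20250807_120000',
    'name': 'demo',
    'description': 'example run',
    'created_at': '2025-08-07T12:00:00',
    'status': 'running',
    # JSON-encoded fields, as required by _validate_experiment_structure
    'metrics': json.dumps([]),
    'parameters': json.dumps({'lr': 2e-4}),
    'artifacts': json.dumps([]),
    'logs': json.dumps([]),
}
manager.upsert_experiment(row)                          # preserves all other rows
running = manager.list_experiments(status_filter='running')
backup_repo = manager.backup_dataset()                  # "<repo>-backup-<timestamp>"
```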
scripts/trackio_tonic/deploy_trackio_space.py CHANGED
@@ -199,10 +199,14 @@ class TrackioSpaceDeployer:
         templates_dir = project_root / "templates" / "spaces" / "trackio"
 
         # Files to copy from templates/spaces/trackio
+        # Include dataset_utils.py and trackio_api_client.py to enable
+        # safe dataset upserts and local API client usage on the Space.
         files_to_copy = [
             "app.py",
             "requirements.txt",
-            "README.md"
+            "README.md",
+            "dataset_utils.py",
+            "trackio_api_client.py",
         ]
 
         # Copy files from templates/spaces/trackio to temp directory
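For context, the copy step that consumes this list typically looks like the following sketch; the actual loop body is not part of this diff, so the shutil-based copying is an assumption:

```python
# Sketch of a copy loop consuming files_to_copy (the real loop body is not
# shown in this commit; shutil-based copying is an assumption).
import shutil
from pathlib import Path

def copy_template_files(templates_dir: Path, temp_dir: Path, files_to_copy: list[str]) -> None:
    for name in files_to_copy:
        src = templates_dir / name
        if src.exists():
            shutil.copy2(src, temp_dir / name)  # preserve file metadata
        else:
            print(f"⚠️ Template file missing, skipping: {name}")
```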
scripts/training/train_gpt_oss.py CHANGED
@@ -61,13 +61,20 @@ def load_gpt_oss_model_and_tokenizer(config):
         # No quantization
         quantization_config = None
 
-    # Model kwargs as per tutorial
-    model_kwargs = {
+    # Build model kwargs with sensible defaults and allow config overrides
+    default_model_kwargs = {
         "attn_implementation": "eager",
         "torch_dtype": torch.bfloat16,
         "use_cache": False,
         "device_map": "auto",
     }
+
+    cfg_model_kwargs = getattr(config, "model_kwargs", None)
+    if isinstance(cfg_model_kwargs, dict):
+        # Config overrides defaults (e.g., attn_implementation="kernels-community/vllm-flash-attn3")
+        model_kwargs = {**default_model_kwargs, **cfg_model_kwargs}
+    else:
+        model_kwargs = default_model_kwargs.copy()
 
     # Only add quantization_config if it's not None
     if quantization_config is not None:
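With this change a training config can override any of the loader defaults. A hedged sketch of such a config: the class itself is illustrative, only the `model_kwargs` attribute is what `load_gpt_oss_model_and_tokenizer` reads:

```python
# Sketch of a config override picked up by load_gpt_oss_model_and_tokenizer.
# The dataclass is illustrative; only `model_kwargs` matters to the loader.
from dataclasses import dataclass, field

@dataclass
class GPTOSSTrainingConfig:
    model_name: str = "openai/gpt-oss-20b"
    # Merged on top of the defaults, so unspecified keys keep their
    # default values (eager attention, bfloat16, device_map="auto", ...).
    model_kwargs: dict = field(default_factory=lambda: {
        "attn_implementation": "kernels-community/vllm-flash-attn3",
    })

config = GPTOSSTrainingConfig()
# Inside the loader: model_kwargs = {**default_model_kwargs, **config.model_kwargs}
```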
src/monitoring.py CHANGED
@@ -204,36 +204,128 @@ class SmolLM3Monitor:
             self.experiment_id = f"exp_{timestamp}"
 
     def _save_to_hf_dataset(self, experiment_data: Dict[str, Any]):
-        """Save experiment data to HF Dataset with data preservation using dataset manager"""
+        """Save experiment data to HF Dataset with data preservation using the dataset manager.
+
+        This method MERGES with any existing experiment entry to avoid overwriting data:
+        - If experiment_data contains a 'metrics' list, append new metric entries
+          (de-duplicated by step+timestamp) and store them in the nested structure expected
+          by the Trackio Space (each entry has {timestamp, step, metrics: {...}}).
+        - Otherwise, treat experiment_data as a parameters update and dict-merge it into
+          the existing parameters.
+        - Artifacts are merged and de-duplicated by their string value.
+        """
         if not self.dataset_manager:
             logger.warning("⚠️ Dataset manager not available")
             return False
 
         try:
-            # Prepare current experiment data with standardized structure
+            experiment_id = self.experiment_id or f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+            # Load existing experiment (if any)
+            existing = self.dataset_manager.get_experiment_by_id(experiment_id) or {}
+
+            # Helper to safely parse JSON fields from the existing entry
+            def _parse_json_field(value, default):
+                try:
+                    if value is None:
+                        return default
+                    if isinstance(value, str):
+                        return json.loads(value) if value else default
+                    return value
+                except Exception:
+                    return default
+
+            existing_metrics = _parse_json_field(existing.get('metrics'), [])
+            existing_parameters = _parse_json_field(existing.get('parameters'), {})
+            existing_artifacts = _parse_json_field(existing.get('artifacts'), [])
+            existing_logs = _parse_json_field(existing.get('logs'), [])
+
+            # Start from existing fields
+            merged_metrics = list(existing_metrics) if isinstance(existing_metrics, list) else []
+            merged_parameters = dict(existing_parameters) if isinstance(existing_parameters, dict) else {}
+            merged_artifacts = list(existing_artifacts) if isinstance(existing_artifacts, list) else []
+
+            # Merge incoming data
+            if 'metrics' in experiment_data:
+                # Accept either a list of metric dicts or a single metrics dict
+                incoming_metrics = experiment_data.get('metrics')
+
+                # Build a set of (step, timestamp) keys to deduplicate
+                def _entry_key(entry: Dict[str, Any]):
+                    return (entry.get('step'), entry.get('timestamp'))
+
+                existing_keys = set()
+                for entry in merged_metrics:
+                    # Support both nested and flat formats in existing data
+                    if isinstance(entry, dict) and 'metrics' in entry:
+                        existing_keys.add(_entry_key(entry))
+                    elif isinstance(entry, dict):
+                        existing_keys.add((entry.get('step'), entry.get('timestamp')))
+
+                def _to_nested_entry(metric: Dict[str, Any]) -> Dict[str, Any]:
+                    # If already nested, return as-is
+                    if isinstance(metric, dict) and 'metrics' in metric:
+                        return metric
+                    # Convert a flat dict into the nested format expected by the Space
+                    step_val = metric.get('step')
+                    ts_val = metric.get('timestamp')
+                    metrics_only = {k: v for k, v in metric.items() if k not in ('step', 'timestamp')}
+                    return {
+                        'timestamp': ts_val,
+                        'step': step_val,
+                        'metrics': metrics_only
+                    }
+
+                if isinstance(incoming_metrics, list):
+                    for m in incoming_metrics:
+                        nested = _to_nested_entry(m if isinstance(m, dict) else {})
+                        if _entry_key(nested) not in existing_keys:
+                            merged_metrics.append(nested)
+                            existing_keys.add(_entry_key(nested))
+                elif isinstance(incoming_metrics, dict):
+                    nested = _to_nested_entry(incoming_metrics)
+                    if _entry_key(nested) not in existing_keys:
+                        merged_metrics.append(nested)
+                # else: ignore invalid metrics payload
+            else:
+                # Treat as a parameters update; merge dicts
+                try:
+                    if isinstance(experiment_data, dict):
+                        merged_parameters.update(experiment_data)
+                except Exception:
+                    pass
+
+            # Merge artifacts if provided
+            if 'artifacts' in experiment_data and isinstance(experiment_data['artifacts'], list):
+                # De-duplicate while preserving order
+                seen = set(merged_artifacts)
+                for a in experiment_data['artifacts']:
+                    if a not in seen:
+                        merged_artifacts.append(a)
+                        seen.add(a)
+
+            # Build the experiment payload to upsert
             current_experiment = {
-                'experiment_id': self.experiment_id or f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
-                'name': self.experiment_name,
-                'description': "SmolLM3 fine-tuning experiment",
-                'created_at': self.start_time.isoformat(),
-                'status': 'running',
-                'metrics': json.dumps(self.metrics_history, default=str),
-                'parameters': json.dumps(experiment_data, default=str),
-                'artifacts': json.dumps(self.artifacts, default=str),
-                'logs': json.dumps([], default=str),
+                'experiment_id': experiment_id,
+                'name': existing.get('name') or self.experiment_name,
+                'description': existing.get('description') or "SmolLM3 fine-tuning experiment",
+                'created_at': existing.get('created_at') or self.start_time.isoformat(),
+                'status': existing.get('status') or 'running',
+                'metrics': json.dumps(merged_metrics, default=str),
+                'parameters': json.dumps(merged_parameters, default=str),
+                'artifacts': json.dumps(merged_artifacts, default=str),
+                'logs': json.dumps(existing_logs, default=str),
                 'last_updated': datetime.now().isoformat()
             }
 
-            # Use dataset manager to safely upsert the experiment
             success = self.dataset_manager.upsert_experiment(current_experiment)
 
             if success:
                 logger.info(f"✅ Experiment data saved to HF Dataset: {self.dataset_repo}")
                 return True
             else:
-                logger.error(f"❌ Failed to save experiment data to HF Dataset")
+                logger.error("❌ Failed to save experiment data to HF Dataset")
                 return False
 
         except Exception as e:
             logger.error(f"❌ Failed to save to HF Dataset: {e}")
             return False
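Given these merge semantics, the same entry point handles both metric appends and parameter updates. A sketch of the accepted payload shapes, following the docstring above (monitor construction is elided; `monitor` is assumed to be an initialized SmolLM3Monitor):

```python
# Payload shapes accepted by _save_to_hf_dataset per the docstring above.
# (`monitor` is an initialized SmolLM3Monitor; construction elided.)

# 1) Metrics update: appended to the existing history, de-duplicated by
#    (step, timestamp), and stored in the nested Space format.
monitor._save_to_hf_dataset({
    'metrics': [{'step': 100, 'timestamp': '2025-08-07T12:05:00', 'loss': 1.18}]
})

# 2) Anything without a 'metrics' key is dict-merged into parameters.
monitor._save_to_hf_dataset({'learning_rate': 2e-4, 'batch_size': 8})

# 3) Artifacts are merged and de-duplicated by string value.
monitor._save_to_hf_dataset({'metrics': [], 'artifacts': ['model.safetensors']})
```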
templates/spaces/trackio/README.md CHANGED
@@ -6,7 +6,7 @@ colorTo: yellow
 sdk: gradio
 sdk_version: 5.38.0
 app_file: app.py
-pinned: true
+pinned: false
 license: mit
 short_description: trackio for training monitoring
 tags:
templates/spaces/trackio/app.py CHANGED
@@ -37,16 +37,20 @@ class TrackioSpace:
         self.dataset_manager = None
         if self.hf_token and self.dataset_repo:
             try:
-                # Import dataset manager
-                import sys
-                sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
-                from dataset_utils import TrackioDatasetManager
+                # Prefer local dataset_utils in the Space repo
+                from dataset_utils import TrackioDatasetManager  # type: ignore
                 self.dataset_manager = TrackioDatasetManager(self.dataset_repo, self.hf_token)
-                logger.info("✅ Dataset manager initialized for safe operations")
-            except ImportError:
-                logger.warning("⚠️ Dataset manager not available, using legacy data handling")
-            except Exception as e:
-                logger.warning(f"⚠️ Failed to initialize dataset manager: {e}")
+                logger.info("✅ Dataset manager initialized for safe operations (local)")
+            except Exception as local_e:
+                try:
+                    # Fallback: try the project src layout if present
+                    import sys
+                    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
+                    from dataset_utils import TrackioDatasetManager  # type: ignore
+                    self.dataset_manager = TrackioDatasetManager(self.dataset_repo, self.hf_token)
+                    logger.info("✅ Dataset manager initialized for safe operations (fallback src)")
+                except Exception as e:
+                    logger.warning(f"⚠️ Dataset manager not available, using legacy data handling: {local_e or e}")
 
         logger.info(f"🔧 Using dataset repository: {self.dataset_repo}")
 
@@ -426,7 +430,11 @@ class TrackioSpace:
             logger.info(f"✅ Loaded {len(backup_experiments)} backup experiments")
 
     def _save_experiments(self):
-        """Save experiments to HF Dataset with data preservation"""
+        """Save experiments to HF Dataset with data preservation.
+
+        Note: this saves the full in-memory set. Prefer the per-operation upsert via
+        the dataset manager when available to reduce overwrite risk.
+        """
         try:
             if self.using_backup_data:
                 logger.warning("⚠️ Using backup data; skip saving to dataset to avoid overwriting with demo values")
@@ -474,6 +482,33 @@ class TrackioSpace:
             logger.error(f"❌ Failed to save experiments: {e}")
             # Fallback to legacy method
             self._save_experiments_legacy()
+
+    def _upsert_experiment(self, experiment_id: str):
+        """Non-destructive upsert of a single experiment when the dataset manager is available."""
+        try:
+            if not self.dataset_manager:
+                # Fall back to a legacy save of the full set
+                self._save_experiments()
+                return
+            exp = self.experiments.get(experiment_id)
+            if not exp:
+                return
+            payload = {
+                'experiment_id': experiment_id,
+                'name': exp.get('name', ''),
+                'description': exp.get('description', ''),
+                'created_at': exp.get('created_at', ''),
+                'status': exp.get('status', 'running'),
+                'metrics': json.dumps(exp.get('metrics', []), default=str),
+                'parameters': json.dumps(exp.get('parameters', {}), default=str),
+                'artifacts': json.dumps(exp.get('artifacts', []), default=str),
+                'logs': json.dumps(exp.get('logs', []), default=str),
+                'last_updated': datetime.now().isoformat()
+            }
+            self.dataset_manager.upsert_experiment(payload)
+        except Exception as e:
+            logger.warning(f"⚠️ Upsert failed, falling back to legacy save: {e}")
+            self._save_experiments()
 
     def _save_experiments_legacy(self):
         """Legacy save method without data preservation (fallback only)"""
@@ -550,7 +585,7 @@ class TrackioSpace:
 
         self.experiments[experiment_id] = experiment
         self.current_experiment = experiment_id
-        self._save_experiments()
+        self._upsert_experiment(experiment_id)
 
         logger.info(f"Created experiment: {experiment_id} - {name}")
         return experiment
@@ -567,7 +602,7 @@ class TrackioSpace:
         }
 
         self.experiments[experiment_id]['metrics'].append(metric_entry)
-        self._save_experiments()
+        self._upsert_experiment(experiment_id)
         logger.info(f"Logged metrics for experiment {experiment_id}: {metrics}")
 
     def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]):
@@ -576,7 +611,7 @@ class TrackioSpace:
             raise ValueError(f"Experiment {experiment_id} not found")
 
         self.experiments[experiment_id]['parameters'].update(parameters)
-        self._save_experiments()
+        self._upsert_experiment(experiment_id)
         logger.info(f"Logged parameters for experiment {experiment_id}: {parameters}")
 
     def log_artifact(self, experiment_id: str, artifact_name: str, artifact_data: str):
@@ -610,7 +645,7 @@ class TrackioSpace:
         """Update experiment status"""
         if experiment_id in self.experiments:
             self.experiments[experiment_id]['status'] = status
-            self._save_experiments()
+            self._upsert_experiment(experiment_id)
             logger.info(f"Updated experiment {experiment_id} status to {status}")
 
     def get_metrics_dataframe(self, experiment_id: str) -> pd.DataFrame: