Non-destructive dataset operations
Files changed:
- src/dataset_utils.py (+12, -1)
- src/monitoring.py (+23, -11)
- templates/spaces/trackio/dataset_utils.py (+12, -1)
src/dataset_utils.py (CHANGED)

@@ -190,14 +190,25 @@ class TrackioDatasetManager:
 
         # Load existing experiments for union merge
         existing = {}
+        dataset_exists = self.check_dataset_exists()
         try:
-            for row in self.load_existing_experiments():
+            existing_list = self.load_existing_experiments()
+            for row in existing_list:
                 exp_id = row.get('experiment_id')
                 if exp_id:
                     existing[exp_id] = row
         except Exception:
             existing = {}
 
+        # Safety guard: avoid destructive overwrite if dataset exists but
+        # we failed to read any existing records (e.g., transient HF issue)
+        if dataset_exists and len(existing) == 0 and len(experiments) <= 3:
+            logger.error(
+                "❌ Refusing to overwrite dataset: existing records could not be loaded "
+                "but repository exists. Skipping save to prevent data loss."
+            )
+            return False
+
         # Validate and merge
         merged_map: Dict[str, Dict[str, Any]] = {}
         # Seed with existing
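The guard composes with the union merge that follows it: the save path keys every row on `experiment_id`, so a write can add or update experiments but never silently drop rows it failed to read back. Below is a minimal standalone sketch of that rule over plain lists of dicts; `union_merge` itself is illustrative, and only `experiment_id`, the exists-but-empty check, and the `<= 3` heuristic come from the diff:

from typing import Any, Dict, List, Optional

def union_merge(
    existing: List[Dict[str, Any]],
    incoming: List[Dict[str, Any]],
    dataset_exists: bool,
) -> Optional[List[Dict[str, Any]]]:
    """Return the merged row list, or None to signal 'refuse to save'."""
    merged: Dict[str, Dict[str, Any]] = {}
    # Seed with existing rows, keyed on experiment_id
    for row in existing:
        exp_id = row.get('experiment_id')
        if exp_id:
            merged[exp_id] = row
    # The diff's guard: a repository that exists but read back empty, paired
    # with a suspiciously small payload, looks like a failed read rather
    # than a fresh run
    if dataset_exists and not merged and len(incoming) <= 3:
        return None
    # Incoming rows win on conflicting experiment_ids
    for row in incoming:
        exp_id = row.get('experiment_id')
        if exp_id:
            merged[exp_id] = row
    return list(merged.values())

# Transient read failure against a populated repo: skip the push
assert union_merge([], [{'experiment_id': 'exp_1'}], dataset_exists=True) is None
# Genuinely fresh repo: first rows are accepted
assert union_merge([], [{'experiment_id': 'exp_1'}], dataset_exists=False) is not None

The `<= 3` threshold is a trade-off: it avoids blocking legitimate bulk restores while still catching the common failure mode of a near-empty payload overwriting a populated dataset.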
src/monitoring.py (CHANGED)

@@ -424,9 +424,18 @@ class SmolLM3Monitor:
         self.metrics_history.append(metrics)
 
         # Save to HF Dataset periodically (configurable)
-        flush_every = getattr(self, 'flush_interval', 10)
-        ...
+        flush_every = max(1, int(getattr(self, 'flush_interval', 10)))
+        # Only append the delta since last flush to minimize risk
+        try:
+            if not hasattr(self, '_last_flushed_index'):
+                self._last_flushed_index = 0
+            if len(self.metrics_history) - self._last_flushed_index >= flush_every:
+                new_slice = self.metrics_history[self._last_flushed_index:]
+                # Persist only the tail slice; merge code will union-append
+                self._save_to_hf_dataset({'metrics': new_slice})
+                self._last_flushed_index = len(self.metrics_history)
+        except Exception:
+            pass
 
         logger.debug("Metrics logged: %s", metrics)
 
@@ -690,20 +699,23 @@ class SmolLM3Monitor:
         # Final save to HF Dataset with proper status update
         if self.dataset_manager:
             try:
-                # Update experiment with final status
+                # Update experiment with final status without clobbering metrics
                 final_experiment_data = {
                     'status': final_status,
                     'experiment_end_time': datetime.now().isoformat(),
                     'final_metrics_count': len(self.metrics_history),
                     'total_artifacts': len(self.artifacts)
                 }
-                ...
+                self._save_to_hf_dataset(final_experiment_data)
+                # Also persist any unflushed metrics tail
+                try:
+                    last_idx = getattr(self, '_last_flushed_index', 0)
+                    if len(self.metrics_history) > last_idx:
+                        tail = self.metrics_history[last_idx:]
+                        self._save_to_hf_dataset({'metrics': tail})
+                        self._last_flushed_index = len(self.metrics_history)
+                except Exception:
+                    pass
             except Exception as e:
                 logger.error(f"❌ Failed to save final experiment data: {e}")
 
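Both hunks hang off one piece of bookkeeping: `_last_flushed_index` records how much of `metrics_history` has already been persisted, the periodic path ships the tail past it, and the final save ships whatever the periodic path missed. A toy version with a stubbed save; only `flush_interval`, `metrics_history`, and `_last_flushed_index` mirror the diff, while `DeltaFlusher` and its print are stand-ins:

from typing import Any, Dict, List

class DeltaFlusher:
    """Append-only metrics log that flushes only the unpersisted tail."""

    def __init__(self, flush_interval: int = 10):
        self.flush_interval = max(1, int(flush_interval))
        self.metrics_history: List[Dict[str, Any]] = []
        self._last_flushed_index = 0

    def _save(self, payload: Dict[str, Any]) -> None:
        # Stand-in for _save_to_hf_dataset in the diff
        print(f"saving {len(payload['metrics'])} new metric rows")

    def log(self, metrics: Dict[str, Any]) -> None:
        self.metrics_history.append(metrics)
        if len(self.metrics_history) - self._last_flushed_index >= self.flush_interval:
            self._save({'metrics': self.metrics_history[self._last_flushed_index:]})
            self._last_flushed_index = len(self.metrics_history)

    def finish(self) -> None:
        # Mirrors the final-save hunk: ship whatever the periodic path missed
        if len(self.metrics_history) > self._last_flushed_index:
            self._save({'metrics': self.metrics_history[self._last_flushed_index:]})
            self._last_flushed_index = len(self.metrics_history)

f = DeltaFlusher(flush_interval=3)
for step in range(7):
    f.log({'step': step, 'loss': 1.0 / (step + 1)})
f.finish()  # flushes the single leftover row

Each flush sends a disjoint slice, so a retry or crash can at worst re-send or miss one tail; it can never rewrite earlier history, which is what the in-diff comment means by "merge code will union-append".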
templates/spaces/trackio/dataset_utils.py (CHANGED)

Same change as src/dataset_utils.py, mirrored into the Trackio Space template; only the line numbers differ.

@@ -173,13 +173,24 @@ class TrackioDatasetManager:
 
         # Load existing experiments for union merge
         existing = {}
+        dataset_exists = self.check_dataset_exists()
         try:
-            for row in self.load_existing_experiments():
+            existing_list = self.load_existing_experiments()
+            for row in existing_list:
                 exp_id = row.get('experiment_id')
                 if exp_id:
                     existing[exp_id] = row
         except Exception:
             existing = {}
+
+        # Safety guard: avoid destructive overwrite if dataset exists but
+        # we failed to read any existing records (e.g., transient HF issue)
+        if dataset_exists and len(existing) == 0 and len(experiments) <= 3:
+            logger.error(
+                "❌ Refusing to overwrite dataset: existing records could not be loaded "
+                "but repository exists. Skipping save to prevent data loss."
+            )
+            return False
 
         # Validate and merge
         merged_map: Dict[str, Dict[str, Any]] = {}