Spaces:

Tonic
/

SmolFactory

Running

App Files Files Community

Tonic committed 15 days ago

Commit

c560f4f

1 Parent(s): 0f12d91

adds experiment id fix

Browse files

Files changed (3) hide show

src/trackio.py +18 -7
src/trainer.py +16 -16
templates/spaces/trackio/app.py +23 -3

src/trackio.py CHANGED Viewed

@@ -65,15 +65,12 @@ def init(
             hf_token=hf_token,
             dataset_repo=dataset_repo
         )
-        # Generate experiment ID - use the same format as our monitoring system
-        experiment_id = f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
-        _monitor.experiment_id = experiment_id
         logger.info(f"Trackio initialized for experiment: {exp_name}")
         logger.info(f"Experiment ID: {experiment_id}")
-        return experiment_id
     except Exception as e:
         logger.error(f"Failed to initialize trackio: {e}")
@@ -128,6 +125,20 @@ def finish():
     except Exception as e:
         logger.error(f"Failed to finish trackio experiment: {e}")
 def log_config(config: Dict[str, Any]):
     """
     Log configuration to trackio (TRL interface)

             hf_token=hf_token,
             dataset_repo=dataset_repo
         )
+        # The monitor constructor creates the experiment remotely and sets
+        # `experiment_id`. Do NOT overwrite it with a locally generated ID.
+        experiment_id = getattr(_monitor, "experiment_id", None)
         logger.info(f"Trackio initialized for experiment: {exp_name}")
         logger.info(f"Experiment ID: {experiment_id}")
+        return experiment_id or f"exp_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
     except Exception as e:
         logger.error(f"Failed to initialize trackio: {e}")
     except Exception as e:
         logger.error(f"Failed to finish trackio experiment: {e}")
+def set_monitor(monitor: SmolLM3Monitor) -> None:
+    """Set the shared monitor instance used by this module.
+
+    This allows external code (e.g., our trainer) to create a
+    `SmolLM3Monitor` once and have `trackio.log/finish` operate on
+    the exact same object, preventing mismatched experiment IDs.
+
+    Args:
+        monitor: The monitor instance to store in the module-level
+            `_monitor` global, replacing whatever was stored before.
+    """
+    global _monitor
+    _monitor = monitor
+    # Best-effort log of the adopted experiment ID: `getattr` tolerates a
+    # monitor that has no `experiment_id` attribute, and any failure while
+    # logging is deliberately ignored so it cannot break the caller.
+    try:
+        logger.info("trackio monitor set: experiment_id=%s", getattr(monitor, "experiment_id", None))
+    except Exception:
+        pass
 def log_config(config: Dict[str, Any]):
     """
     Log configuration to trackio (TRL interface)

src/trainer.py CHANGED Viewed

@@ -158,17 +158,23 @@ class SmolLM3Trainer:
         logger.info("Total callbacks: %d", len(callbacks))
-        # Initialize trackio for TRL compatibility
         try:
             import trackio
-            # Initialize trackio with our configuration and use the same experiment ID
-            if self.monitor and self.monitor.experiment_id:
-                # Use the experiment ID from our monitor
-                experiment_id = self.monitor.experiment_id
-                logger.info(f"Using existing experiment ID: {experiment_id}")
             else:
-                # Initialize trackio with our configuration
-                experiment_id = trackio.init(
                     project_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
                     experiment_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
                     trackio_url=getattr(self.config, 'trackio_url', None),
@@ -176,15 +182,9 @@ class SmolLM3Trainer:
                     hf_token=getattr(self.config, 'hf_token', None),
                     dataset_repo=getattr(self.config, 'dataset_repo', None)
                 )
-                logger.info(f"Trackio initialized with experiment ID: {experiment_id}")
-                # Update our monitor with the same experiment ID
-                if self.monitor:
-                    self.monitor.experiment_id = experiment_id
-                    logger.info(f"Updated monitor with experiment ID: {experiment_id}")
         except Exception as e:
-            logger.warning(f"Failed to initialize trackio: {e}")
-            logger.info("Continuing without trackio integration")
         # Try SFTTrainer first (better for instruction tuning)
         logger.info("Creating SFTTrainer with training arguments...")

         logger.info("Total callbacks: %d", len(callbacks))
+        # Initialize trackio for TRL compatibility without creating a second experiment
         try:
             import trackio
+            if self.monitor:
+                # Share the same monitor/experiment with the trackio shim
+                try:
+                    trackio.set_monitor(self.monitor)  # type: ignore[attr-defined]
+                except Exception:
+                    # Fallback: ensure the shim at least knows the current ID
+                    pass
+                logger.info(
+                    "Using shared Trackio monitor with experiment ID: %s",
+                    getattr(self.monitor, 'experiment_id', None)
+                )
             else:
+                # Last resort: initialize via shim
+                _ = trackio.init(
                     project_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
                     experiment_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
                     trackio_url=getattr(self.config, 'trackio_url', None),
                     hf_token=getattr(self.config, 'hf_token', None),
                     dataset_repo=getattr(self.config, 'dataset_repo', None)
                 )
         except Exception as e:
+            logger.warning(f"Failed to wire trackio shim: {e}")
+            logger.info("Continuing without trackio shim integration")
         # Try SFTTrainer first (better for instruction tuning)
         logger.info("Creating SFTTrainer with training arguments...")

templates/spaces/trackio/app.py CHANGED Viewed

@@ -1143,12 +1143,25 @@ def create_metrics_plot(experiment_id: str, metric_name: str = "loss") -> go.Fig
             )
             return fig
         fig = px.line(df, x='step', y=metric_name, title=f'{metric_name} over time')
         fig.update_layout(
             xaxis_title="Training Step",
             yaxis_title=metric_name.title(),
             hovermode='x unified'
         )
         return fig
     except Exception as e:
@@ -1530,16 +1543,23 @@ def create_combined_metrics_plot(experiment_id: str) -> go.Figure:
                 col = (i % n_cols) + 1
                 color = colors[i % len(colors)]
                 fig.add_trace(
                     go.Scatter(
-                        x=df['step'].tolist(),
-                        y=df[metric].tolist(),
                         mode='lines+markers',
                         name=metric,
                         line=dict(width=2, color=color),
                         marker=dict(size=4, color=color),
                         showlegend=False,
-                        connectgaps=True
                     ),
                     row=row, col=col
                 )

             )
             return fig
+        # Ensure steps are numeric and monotonically increasing to avoid zig-zag lines
+        try:
+            df = df.copy()
+            df['step'] = pd.to_numeric(df['step'], errors='coerce').fillna(-1)
+            df.sort_values('step', inplace=True)
+        except Exception:
+            pass
         fig = px.line(df, x='step', y=metric_name, title=f'{metric_name} over time')
         fig.update_layout(
             xaxis_title="Training Step",
             yaxis_title=metric_name.title(),
             hovermode='x unified'
         )
+        # Avoid interpolating across missing steps which can create odd visuals
+        try:
+            for trace in fig.data:
+                trace.connectgaps = False
+        except Exception:
+            pass
         return fig
     except Exception as e:
                 col = (i % n_cols) + 1
                 color = colors[i % len(colors)]
+                # Clean steps for each subplot too
+                try:
+                    df_sub = df.copy()
+                    df_sub['step'] = pd.to_numeric(df_sub['step'], errors='coerce').fillna(-1)
+                    df_sub.sort_values('step', inplace=True)
+                except Exception:
+                    df_sub = df
                 fig.add_trace(
                     go.Scatter(
+                        x=df_sub['step'].tolist(),
+                        y=df_sub[metric].tolist(),
                         mode='lines+markers',
                         name=metric,
                         line=dict(width=2, color=color),
                         marker=dict(size=4, color=color),
                         showlegend=False,
+                        connectgaps=False
                     ),
                     row=row, col=col
                 )