Tonic committed on
Commit
987a674
·
verified ·
1 Parent(s): e99b183

solves monitoring integration

Browse files
add_demo_data.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Add demo training data to an existing experiment
4
+ This will populate the experiment with realistic training metrics for visualization
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ import numpy as np
10
+ from datetime import datetime
11
+ from trackio_api_client import TrackioAPIClient
12
+
13
+ # Setup logging
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
def _simulate_demo_metrics(step: int) -> dict:
    """Build one synthetic metrics record for training step *step*.

    The values are random demo data, not measurements. Random draws happen
    in a fixed order: loss noise, accuracy noise, GPU memory, step time,
    throughput.
    """
    # Loss decays exponentially with the step, plus uniform noise, floored at 0.1.
    loss = max(0.1, 2.0 * np.exp(-step / 500) + 0.1 * np.random.random())
    # Accuracy rises from 0.3 towards 0.9, plus noise, capped at 0.95.
    accuracy = min(
        0.95,
        0.3 + 0.6 * (1 - np.exp(-step / 300)) + 0.05 * np.random.random(),
    )
    # Staircase decay: the learning rate is multiplied by 0.9 every 200 steps.
    lr = 3.5e-6 * (0.9 ** (step // 200))
    gpu_memory = 20 + 5 * np.random.random()
    training_time = 0.5 + 0.2 * np.random.random()

    # float() keeps the payload made of plain Python numbers for serialization.
    return {
        "loss": round(float(loss), 4),
        "accuracy": round(float(accuracy), 4),
        "learning_rate": round(lr, 8),
        "gpu_memory_gb": round(float(gpu_memory), 2),
        "training_time_per_step": round(float(training_time), 3),
        "epoch": step // 100 + 1,
        "samples_per_second": round(float(50 + 20 * np.random.random()), 1),
    }


def add_demo_training_data(
    experiment_id: str,
    num_steps: int = 50,
    trackio_url: str = "https://tonic-test-trackio-test.hf.space",
    client=None,
) -> int:
    """Log synthetic training metrics to an existing experiment.

    One metrics record is sent every 25 training steps, ``num_steps`` records
    in total (steps 0, 25, 50, ...).

    Args:
        experiment_id: ID of the experiment that receives the metrics.
        num_steps: Number of metric records to send.
        trackio_url: Base URL of the Trackio Space; used only when ``client``
            is not supplied.
        client: Optional object exposing
            ``log_metrics(experiment_id, metrics, step)``. When omitted, a
            ``TrackioAPIClient`` for ``trackio_url`` is created.

    Returns:
        The number of records whose ``log_metrics`` result contained the
        ``"success"`` key.
    """
    if client is None:
        client = TrackioAPIClient(trackio_url)

    print(f"🎯 Adding demo training data to experiment: {experiment_id}")
    print(f"📊 Will add {num_steps} metric entries...")

    steps = range(0, num_steps * 25, 25)  # one record every 25 steps
    logged = 0
    for step in steps:
        metrics = _simulate_demo_metrics(step)
        result = client.log_metrics(experiment_id, metrics, step)

        if "success" in result:
            logged += 1
            print(
                f"✅ Step {step}: Loss={metrics['loss']:.4f}, "
                f"Accuracy={metrics['accuracy']:.4f}"
            )
        else:
            print(f"❌ Step {step}: Failed to log metrics - {result}")

    # Report what actually happened instead of claiming success unconditionally.
    if logged == len(steps):
        print("\n🎉 Demo data added successfully!")
    else:
        print(f"\n⚠️ Demo data only partially added ({len(steps) - logged} step(s) failed)")
    print(f"📊 Total steps logged: {logged}/{len(steps)}")
    print(f"🔗 View in Trackio Space: {trackio_url}")
    print(f"📈 Go to 'Visualizations' tab and select experiment: {experiment_id}")
    return logged
67
+
68
def main():
    """Populate one hard-coded experiment with demo metrics and print usage hints."""
    # Single source of truth for the values echoed in the messages below, so
    # the printed instructions cannot drift from what was actually targeted.
    trackio_url = "https://tonic-test-trackio-test.hf.space"
    experiment_id = "exp_20250720_101955"
    experiment_name = "petit-elle-l-aime-3-balanced"
    separator = "=" * 50

    print("🚀 Trackio Demo Data Generator")
    print(separator)

    print(f"📋 Target experiment: {experiment_id}")
    print(f"📝 Experiment name: {experiment_name}")

    # Add demo data
    add_demo_training_data(experiment_id, num_steps=50)

    print("\n" + separator)
    print("🎯 Next Steps:")
    print(f"1. Go to {trackio_url}")
    print("2. Click on '📊 Visualizations' tab")
    print(f"3. Enter your experiment ID: {experiment_id}")
    print("4. Select a metric (loss, accuracy, etc.)")
    print("5. Click 'Create Plot' to see the training curves!")
    print(separator)


if __name__ == "__main__":
    main()
app.py CHANGED
@@ -25,7 +25,38 @@ class TrackioSpace:
25
  def __init__(self):
26
  self.experiments = {}
27
  self.current_experiment = None
 
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
30
  """Create a new experiment"""
31
  experiment_id = f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
@@ -44,6 +75,7 @@ class TrackioSpace:
44
 
45
  self.experiments[experiment_id] = experiment
46
  self.current_experiment = experiment_id
 
47
 
48
  logger.info(f"Created experiment: {experiment_id} - {name}")
49
  return experiment
@@ -60,6 +92,7 @@ class TrackioSpace:
60
  }
61
 
62
  self.experiments[experiment_id]['metrics'].append(metric_entry)
 
63
  logger.info(f"Logged metrics for experiment {experiment_id}: {metrics}")
64
 
65
  def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]):
@@ -68,6 +101,7 @@ class TrackioSpace:
68
  raise ValueError(f"Experiment {experiment_id} not found")
69
 
70
  self.experiments[experiment_id]['parameters'].update(parameters)
 
71
  logger.info(f"Logged parameters for experiment {experiment_id}: {parameters}")
72
 
73
  def log_artifact(self, experiment_id: str, artifact_name: str, artifact_data: str):
@@ -82,6 +116,7 @@ class TrackioSpace:
82
  }
83
 
84
  self.experiments[experiment_id]['artifacts'].append(artifact_entry)
 
85
  logger.info(f"Logged artifact for experiment {experiment_id}: {artifact_name}")
86
 
87
  def get_experiment(self, experiment_id: str) -> Optional[Dict[str, Any]]:
@@ -100,6 +135,7 @@ class TrackioSpace:
100
  """Update experiment status"""
101
  if experiment_id in self.experiments:
102
  self.experiments[experiment_id]['status'] = status
 
103
  logger.info(f"Updated experiment {experiment_id} status to {status}")
104
 
105
  def get_metrics_dataframe(self, experiment_id: str) -> pd.DataFrame:
@@ -127,6 +163,87 @@ class TrackioSpace:
127
  # Initialize Trackio space
128
  trackio_space = TrackioSpace()
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def create_experiment_interface(name: str, description: str) -> str:
131
  """Create a new experiment"""
132
  try:
@@ -236,7 +353,7 @@ def update_experiment_status_interface(experiment_id: str, status: str) -> str:
236
  def create_metrics_plot(experiment_id: str, metric_name: str = "loss") -> go.Figure:
237
  """Create a plot for a specific metric"""
238
  try:
239
- df = trackio_space.get_metrics_dataframe(experiment_id)
240
  if df.empty:
241
  # Return empty plot
242
  fig = go.Figure()
@@ -283,7 +400,7 @@ def create_experiment_comparison(experiment_ids: str) -> go.Figure:
283
  fig = go.Figure()
284
 
285
  for exp_id in exp_ids:
286
- df = trackio_space.get_metrics_dataframe(exp_id)
287
  if not df.empty and 'loss' in df.columns:
288
  fig.add_trace(go.Scatter(
289
  x=df['step'],
@@ -335,6 +452,35 @@ def simulate_training_data(experiment_id: str):
335
  except Exception as e:
336
  return f"❌ Error simulating data: {str(e)}"
337
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  # Create Gradio interface
339
  with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as demo:
340
  gr.Markdown("# πŸš€ Trackio Experiment Tracking & Monitoring")
@@ -518,11 +664,12 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
518
  placeholder="exp_20231201_143022"
519
  )
520
  demo_btn = gr.Button("Generate Demo Data", variant="primary")
 
521
 
522
  with gr.Column():
523
  demo_output = gr.Textbox(
524
  label="Result",
525
- lines=3,
526
  interactive=False
527
  )
528
 
@@ -531,6 +678,12 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
531
  inputs=[demo_exp_id],
532
  outputs=demo_output
533
  )
 
 
 
 
 
 
534
 
535
  # Update Status Tab
536
  with gr.Tab("Update Status"):
 
25
  def __init__(self):
26
  self.experiments = {}
27
  self.current_experiment = None
28
+ self.data_file = "trackio_experiments.json"
29
+ self._load_experiments()
30
 
31
def _load_experiments(self):
    """Load persisted experiments from ``self.data_file`` (TrackioSpace method).

    Reads the JSON document and restores ``self.experiments`` and
    ``self.current_experiment`` from its ``experiments`` and
    ``current_experiment`` keys. A missing file is not an error. On any read
    or parse failure both attributes are reset, so no half-loaded state
    survives.
    """
    try:
        if not os.path.exists(self.data_file):
            logger.info("No existing experiment data found, starting fresh")
            return

        with open(self.data_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Valid JSON can still have the wrong shape (e.g. a bare list).
        if not isinstance(data, dict) or not isinstance(data.get('experiments', {}), dict):
            raise ValueError("unexpected structure in experiment data file")

        self.experiments = data.get('experiments', {})
        self.current_experiment = data.get('current_experiment')
        logger.info(f"Loaded {len(self.experiments)} experiments from {self.data_file}")
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        logger.error(f"Failed to load experiments: {e}")
        self.experiments = {}
        # Do not keep a pointer to an experiment that was not loaded.
        self.current_experiment = None
45
+
46
def _save_experiments(self):
    """Persist all experiments to ``self.data_file`` (TrackioSpace method).

    The document is written to a sibling temporary file and then moved over
    the real file, so an interrupted save cannot leave a truncated store.
    Failures are logged, not raised: persistence is best-effort and must not
    break the logging call that triggered it.
    """
    import os  # local import: this method is the only user of os.replace

    data = {
        'experiments': self.experiments,
        'current_experiment': self.current_experiment,
        'last_updated': datetime.now().isoformat()
    }
    tmp_path = f"{self.data_file}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            # NOTE: default=str stringifies any value json cannot encode.
            json.dump(data, f, indent=2, default=str)
        os.replace(tmp_path, self.data_file)
        logger.debug(f"Saved {len(self.experiments)} experiments to {self.data_file}")
    except (OSError, TypeError, ValueError) as e:
        logger.error(f"Failed to save experiments: {e}")
59
+
60
  def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
61
  """Create a new experiment"""
62
  experiment_id = f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
 
75
 
76
  self.experiments[experiment_id] = experiment
77
  self.current_experiment = experiment_id
78
+ self._save_experiments()
79
 
80
  logger.info(f"Created experiment: {experiment_id} - {name}")
81
  return experiment
 
92
  }
93
 
94
  self.experiments[experiment_id]['metrics'].append(metric_entry)
95
+ self._save_experiments()
96
  logger.info(f"Logged metrics for experiment {experiment_id}: {metrics}")
97
 
98
  def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]):
 
101
  raise ValueError(f"Experiment {experiment_id} not found")
102
 
103
  self.experiments[experiment_id]['parameters'].update(parameters)
104
+ self._save_experiments()
105
  logger.info(f"Logged parameters for experiment {experiment_id}: {parameters}")
106
 
107
  def log_artifact(self, experiment_id: str, artifact_name: str, artifact_data: str):
 
116
  }
117
 
118
  self.experiments[experiment_id]['artifacts'].append(artifact_entry)
119
+ self._save_experiments()
120
  logger.info(f"Logged artifact for experiment {experiment_id}: {artifact_name}")
121
 
122
  def get_experiment(self, experiment_id: str) -> Optional[Dict[str, Any]]:
 
135
  """Update experiment status"""
136
  if experiment_id in self.experiments:
137
  self.experiments[experiment_id]['status'] = status
138
+ self._save_experiments()
139
  logger.info(f"Updated experiment {experiment_id} status to {status}")
140
 
141
  def get_metrics_dataframe(self, experiment_id: str) -> pd.DataFrame:
 
163
  # Initialize Trackio space
164
  trackio_space = TrackioSpace()
165
 
166
+ # Initialize API client for remote data
167
+ api_client = None
168
+ try:
169
+ from trackio_api_client import TrackioAPIClient
170
+ api_client = TrackioAPIClient("https://tonic-test-trackio-test.hf.space")
171
+ logger.info("βœ… API client initialized for remote data access")
172
+ except ImportError:
173
+ logger.warning("⚠️ API client not available, using local data only")
174
+
175
def get_remote_experiment_data(experiment_id: str) -> Optional[Dict[str, Any]]:
    """Fetch experiment details through the module-level ``api_client``.

    Args:
        experiment_id: ID of the experiment to look up remotely.

    Returns:
        ``{"remote": True, "data": <details>}`` when the API result contains
        both a ``"success"`` and a ``"data"`` key; ``None`` when no client is
        configured, the call raises, or the result is not usable.
    """
    if api_client is None:
        return None

    try:
        # Get experiment details from API
        details_result = api_client.get_experiment_details(experiment_id)
    except Exception as e:
        # Boundary with a remote service: any failure means "no remote data".
        logger.error(f"Error getting remote data: {e}")
        return None

    if (
        isinstance(details_result, dict)
        and "success" in details_result
        and "data" in details_result
    ):
        return {"remote": True, "data": details_result["data"]}

    logger.warning(f"Failed to get remote data for {experiment_id}: {details_result}")
    return None
191
+
192
def parse_remote_metrics_data(experiment_details: str) -> pd.DataFrame:
    """Parse metric rows out of a remote experiment-details text.

    Only lines containing ``Step:`` followed by ``Metrics:`` are considered.
    The text between the markers must be an integer step, and the text after
    ``Metrics:`` must be a JSON object. Unparseable lines are logged and
    skipped without discarding the rows that did parse.

    Args:
        experiment_details: Multi-line details text returned by the API.

    Returns:
        A DataFrame with ``step`` and ``timestamp`` columns plus one column
        per metric key; an empty DataFrame when nothing could be parsed.
    """
    if not isinstance(experiment_details, str):
        logger.error(
            f"Error parsing remote metrics: expected str, got {type(experiment_details).__name__}"
        )
        return pd.DataFrame()

    # The remote text's timestamps are not parsed here; stamp every row with
    # the parse time, computed once so the whole batch is consistent.
    parsed_at = datetime.now().isoformat()
    metrics_data = []

    for line in experiment_details.split('\n'):
        _, step_marker, after_step = line.partition('Step:')
        step_text, metrics_marker, metrics_text = after_step.partition('Metrics:')
        if not (step_marker and metrics_marker):
            continue

        try:
            step = int(step_text.strip())
            metrics = json.loads(metrics_text.strip())
        except (ValueError, json.JSONDecodeError) as e:
            logger.warning(f"Failed to parse metrics line: {line} - {e}")
            continue

        if not isinstance(metrics, dict):
            # Valid JSON but not an object (list, number): skip this line only.
            logger.warning(f"Failed to parse metrics line: {line} - metrics is not a JSON object")
            continue

        row = {'step': step, 'timestamp': parsed_at}
        row.update(metrics)
        # The step parsed from the line wins over a 'step' key inside the metrics.
        row['step'] = step
        metrics_data.append(row)

    return pd.DataFrame(metrics_data) if metrics_data else pd.DataFrame()
228
+
229
def get_metrics_dataframe(experiment_id: str) -> pd.DataFrame:
    """Return the metrics of an experiment as a DataFrame for plotting.

    The remote API is consulted first; the local ``trackio_space`` store is
    used when no remote data is available or the remote data has no metrics.
    """
    remote_payload = get_remote_experiment_data(experiment_id)

    if remote_payload:
        logger.info(f"Using remote data for {experiment_id}")
        # Extract metric rows from the remote details text.
        remote_frame = parse_remote_metrics_data(remote_payload["data"])
        if remote_frame.empty:
            logger.warning(f"No metrics found in remote data for {experiment_id}")
        else:
            logger.info(f"Found {len(remote_frame)} metrics entries from remote data")
            return remote_frame

    # Nothing usable remotely: fall back to the local store.
    logger.info(f"Using local data for {experiment_id}")
    local_frame = trackio_space.get_metrics_dataframe(experiment_id)
    return local_frame
246
+
247
  def create_experiment_interface(name: str, description: str) -> str:
248
  """Create a new experiment"""
249
  try:
 
353
  def create_metrics_plot(experiment_id: str, metric_name: str = "loss") -> go.Figure:
354
  """Create a plot for a specific metric"""
355
  try:
356
+ df = get_metrics_dataframe(experiment_id)
357
  if df.empty:
358
  # Return empty plot
359
  fig = go.Figure()
 
400
  fig = go.Figure()
401
 
402
  for exp_id in exp_ids:
403
+ df = get_metrics_dataframe(exp_id)
404
  if not df.empty and 'loss' in df.columns:
405
  fig.add_trace(go.Scatter(
406
  x=df['step'],
 
452
  except Exception as e:
453
  return f"❌ Error simulating data: {str(e)}"
454
 
455
def create_demo_experiment():
    """Create a demo experiment, attach demo parameters and simulated metrics.

    Returns:
        A status string for the UI: a success message containing the new
        experiment ID, or an error message when any step fails.
    """
    try:
        # Create demo experiment
        experiment = trackio_space.create_experiment(
            "demo_smollm3_training",
            "Demo experiment with simulated training data"
        )
        experiment_id = experiment['id']

        # Add some demo parameters
        parameters = {
            "model_name": "HuggingFaceTB/SmolLM3-3B",
            "batch_size": 8,
            "learning_rate": 3.5e-6,
            "max_iters": 18000,
            "mixed_precision": "bf16",
            "dataset": "legmlai/openhermes-fr"
        }
        trackio_space.log_parameters(experiment_id, parameters)

        # simulate_training_data reports failure through its return string
        # rather than by raising, so the result must be inspected.
        simulation_result = simulate_training_data(experiment_id)
        if isinstance(simulation_result, str) and simulation_result.startswith("❌"):
            return (
                f"❌ Error creating demo experiment: experiment {experiment_id} was created "
                f"but demo data failed - {simulation_result}"
            )

        return f"✅ Demo experiment created: {experiment_id}\nYou can now test the visualization with this experiment!"
    except Exception as e:
        return f"❌ Error creating demo experiment: {str(e)}"
483
+
484
  # Create Gradio interface
485
  with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as demo:
486
  gr.Markdown("# πŸš€ Trackio Experiment Tracking & Monitoring")
 
664
  placeholder="exp_20231201_143022"
665
  )
666
  demo_btn = gr.Button("Generate Demo Data", variant="primary")
667
+ create_demo_btn = gr.Button("Create Demo Experiment", variant="secondary")
668
 
669
  with gr.Column():
670
  demo_output = gr.Textbox(
671
  label="Result",
672
+ lines=5,
673
  interactive=False
674
  )
675
 
 
678
  inputs=[demo_exp_id],
679
  outputs=demo_output
680
  )
681
+
682
+ create_demo_btn.click(
683
+ create_demo_experiment,
684
+ inputs=[],
685
+ outputs=demo_output
686
+ )
687
 
688
  # Update Status Tab
689
  with gr.Tab("Update Status"):
test_monitoring_integration.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test monitoring integration for real experiment
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ # Add the current directory to the path for imports
11
+ sys.path.insert(0, str(Path(__file__).parent))
12
+
13
def test_monitoring_setup():
    """Check that the monitoring stack can be imported and configured.

    Runs five checks in order: import of the monitoring module, import of
    the Trackio API client, loading of the balanced A100 configuration,
    creation of a monitor from that configuration, and creation of the
    monitoring callback. Progress is printed to stdout.

    Returns:
        ``True`` when every check passes, ``False`` as soon as one fails.
    """
    separator = "=" * 50

    print("🔍 Testing Monitoring Integration")
    print(separator)

    # Test 1: Check if monitoring module can be imported
    try:
        from monitoring import SmolLM3Monitor, create_monitor_from_config
        print("✅ Monitoring module imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import monitoring module: {e}")
        return False

    # Test 2: Check if API client can be imported
    try:
        from trackio_api_client import TrackioAPIClient
        print("✅ Trackio API client imported successfully")
    except ImportError as e:
        print(f"❌ Failed to import Trackio API client: {e}")
        return False

    # Test 3: Test configuration loading
    try:
        from config.train_smollm3_openhermes_fr_a100_balanced import get_config
        config = get_config("config/train_smollm3_openhermes_fr_a100_balanced.py")
        print("✅ Configuration loaded successfully")
        print(f" Model: {config.model_name}")
        print(f" Batch size: {config.batch_size}")
        print(f" Max iterations: {config.max_iters}")
        print(f" Enable tracking: {config.enable_tracking}")
        print(f" Trackio URL: {config.trackio_url}")
    except Exception as e:
        print(f"❌ Failed to load configuration: {e}")
        return False

    # Test 4: Test monitor creation
    try:
        # Point the loaded configuration at the test Space for this check.
        config.trackio_url = "https://tonic-test-trackio-test.hf.space"
        config.experiment_name = "test_monitoring_integration"

        monitor = create_monitor_from_config(config)
        print("✅ Monitor created successfully")
        print(f" Experiment name: {monitor.experiment_name}")
        print(f" Enable tracking: {monitor.enable_tracking}")
        print(f" Log metrics: {monitor.log_metrics}")
        print(f" Log artifacts: {monitor.log_artifacts}")

        if monitor.enable_tracking and monitor.trackio_client:
            print("✅ Trackio client initialized")
            if monitor.experiment_id:
                print(f" Experiment ID: {monitor.experiment_id}")
            else:
                print(" ⚠️ No experiment ID (will be created during training)")
        else:
            print(" ⚠️ Trackio client not initialized")
    except Exception as e:
        print(f"❌ Failed to create monitor: {e}")
        return False

    # Test 5: Test callback creation
    try:
        callback = monitor.create_monitoring_callback()
        if callback:
            print("✅ Monitoring callback created successfully")
        else:
            print(" ⚠️ No monitoring callback (tracking disabled)")
    except Exception as e:
        print(f"❌ Failed to create callback: {e}")
        return False

    print("\n" + separator)
    print("🎯 Monitoring Integration Test Complete")
    print(separator)

    return True
91
+
92
def test_real_experiment_command():
    """Print the command line for the real training run and what to expect.

    Nothing is executed; the command and the expected training parameters
    are only printed.
    """
    import shlex  # local import: only needed to quote the displayed command

    separator = "=" * 50

    print("\n🚀 Testing Real Experiment Command")
    print(separator)

    # Build the command
    cmd = [
        "python", "run_a100_large_experiment.py",
        "--config", "config/train_smollm3_openhermes_fr_a100_balanced.py",
        "--experiment-name", "petit-elle-l-aime-3-balanced-real",
        "--output-dir", "./outputs/balanced-real",
        "--trackio-url", "https://tonic-test-trackio-test.hf.space"
    ]

    print("Command to run:")
    # Quote each argument so the printed line stays copy-pasteable into a
    # shell even if an argument later gains spaces or special characters.
    print(" ".join(shlex.quote(part) for part in cmd))

    print("\nThis command will:")
    print("✅ Load the balanced A100 configuration")
    print("✅ Create a real experiment in Trackio")
    print("✅ Log real training metrics every 25 steps")
    print("✅ Save checkpoints every 2000 steps")
    print("✅ Monitor progress in real-time")

    print("\nExpected training parameters:")
    print(" Model: HuggingFaceTB/SmolLM3-3B")
    print(" Batch size: 8")
    print(" Gradient accumulation: 16")
    print(" Effective batch size: 128")
    print(" Learning rate: 3.5e-6")
    print(" Max iterations: 18000")
    print(" Mixed precision: bf16")
    print(" Max sequence length: 12288")

    print("\n" + separator)
    print("🎯 Ready to run real experiment!")
    print(separator)


if __name__ == "__main__":
    # Show the real experiment command only when the setup checks pass.
    if test_monitoring_setup():
        test_real_experiment_command()
    else:
        print("\n❌ Monitoring integration test failed. Please fix issues before running real experiment.")
test_persistence.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test data persistence in Trackio Space
4
+ """
5
+
6
+ import requests
7
+ import json
8
+ import time
9
+ import re
10
+
11
def _post_gradio_call(url, data, failure_label, timeout):
    """POST ``data`` to a Gradio call endpoint.

    Returns the decoded JSON body on HTTP 200, otherwise prints a failure
    line and returns ``None``.
    """
    try:
        response = requests.post(url, json={'data': data}, timeout=timeout)
    except requests.RequestException as exc:
        print(f'❌ Failed to {failure_label}: {exc}')
        return None
    if response.status_code != 200:
        print(f'❌ Failed to {failure_label}: {response.status_code}')
        return None
    try:
        return response.json()
    except ValueError as exc:
        print(f'❌ Failed to {failure_label}: invalid JSON response ({exc})')
        return None


def _fetch_gradio_result(url, event_id, failure_label, timeout, wait_seconds=2):
    """GET the result text of a queued Gradio call, or ``None`` on failure."""
    time.sleep(wait_seconds)  # give the Space time to process the queued event
    try:
        response = requests.get(f'{url}/{event_id}', timeout=timeout)
    except requests.RequestException as exc:
        print(f'❌ Failed to {failure_label}: {exc}')
        return None
    if response.status_code != 200:
        print(f'❌ Failed to {failure_label}: {response.status_code}')
        return None
    return response.text


def _run_persistence_checks(base_url, timeout):
    """Create an experiment, log one metric and read the details back."""
    # Test creating an experiment via API
    create_url = f'{base_url}/gradio_api/call/create_experiment_interface'
    created = _post_gradio_call(
        create_url, ['test_persistence', 'Testing data persistence'],
        'create experiment', timeout,
    )
    if created is None:
        return
    if 'event_id' not in created:
        print('❌ No event_id in response')
        return
    event_id = created['event_id']
    print(f'✅ Experiment created with event ID: {event_id}')

    result = _fetch_gradio_result(create_url, event_id, 'get result', timeout)
    if result is None:
        return
    print(f'✅ Experiment creation result: {result[:200]}...')

    # Extract experiment ID
    match = re.search(r'exp_\d{8}_\d{6}', result)
    if not match:
        print('❌ Could not extract experiment ID')
        return
    experiment_id = match.group()
    print(f'📋 Experiment ID: {experiment_id}')

    # Test logging metrics (only the HTTP status of the POST is checked)
    metrics_url = f'{base_url}/gradio_api/call/log_metrics_interface'
    logged = _post_gradio_call(
        metrics_url, [experiment_id, '{"loss": 1.5, "accuracy": 0.8}', '100'],
        'log metrics', timeout,
    )
    if logged is None:
        return
    print('✅ Metrics logged successfully')

    # Test getting experiment details
    details_url = f'{base_url}/gradio_api/call/get_experiment_details'
    details = _post_gradio_call(
        details_url, [experiment_id], 'get experiment details', timeout,
    )
    if details is None:
        return
    if 'event_id' not in details:
        print('❌ No event_id in details response')
        return

    details_result = _fetch_gradio_result(
        details_url, details['event_id'], 'get details result', timeout,
    )
    if details_result is None:
        return
    print(f'✅ Experiment details retrieved: {details_result[:200]}...')

    if 'metrics' in details_result.lower():
        print('✅ Found metrics in experiment details')
    else:
        print('❌ No metrics found in experiment details')


def test_persistence():
    """Exercise the Trackio Space API end to end and print the outcome.

    Creates an experiment, logs one metrics record and fetches the
    experiment details, printing a status line per step and stopping at the
    first failure. The closing "Next Steps" hints are always printed. Every
    HTTP request uses a timeout so an unresponsive Space cannot hang the
    script.
    """
    base_url = 'https://tonic-test-trackio-test.hf.space'
    separator = "=" * 50

    print("🔍 Testing Data Persistence")
    print(separator)

    _run_persistence_checks(base_url, timeout=30)

    print("\n" + separator)
    print("🎯 Next Steps:")
    print(f"1. Check the Trackio Space: {base_url}")
    print("2. Go to '📊 Visualizations' tab")
    print("3. Enter the experiment ID above")
    print("4. Test if the visualization shows data")
    print(separator)


if __name__ == "__main__":
    test_persistence()
test_real_data.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test real training data logging and retrieval
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ from trackio_api_client import TrackioAPIClient
9
+
10
+ # Setup logging
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+
14
def _print_preview(label, data, limit):
    """Print ``label`` with the first ``limit`` characters of ``data``.

    If the console cannot encode the text, characters it cannot represent
    are replaced so the preview is still shown.
    """
    import sys  # local import: only needed for the console-encoding fallback

    text = f"{label}: {str(data)[:limit]}..."
    try:
        print(text)
    except UnicodeEncodeError:
        encoding = getattr(sys.stdout, "encoding", None) or "ascii"
        print(text.encode(encoding, errors="replace").decode(encoding))


def test_real_training_data():
    """Query the Trackio Space for one hard-coded experiment and print results.

    Calls, in order: experiment details, training metrics, metrics history
    and the experiment list. A call counts as successful when its result
    contains a ``"success"`` key. Every outcome is printed; nothing is
    returned or asserted.
    """
    client = TrackioAPIClient("https://tonic-test-trackio-test.hf.space")

    # Your experiment ID
    experiment_id = "exp_20250720_101955"
    separator = "=" * 50

    print("🔍 Testing Real Training Data")
    print(separator)

    # 1. Test getting experiment details
    print(f"\n1. Getting experiment details for {experiment_id}...")
    details_result = client.get_experiment_details(experiment_id)

    if "success" in details_result:
        print("✅ Experiment details retrieved successfully")
        details_text = str(details_result.get('data', ''))
        _print_preview("Details", details_text, 200)

        # Look for metrics in the details
        if "metrics" in details_text.lower():
            print("✅ Found metrics in experiment details")
        else:
            print("❌ No metrics found in experiment details")
    else:
        print(f"❌ Failed to get experiment details: {details_result}")

    # 2. Test getting training metrics specifically
    print(f"\n2. Getting training metrics for {experiment_id}...")
    metrics_result = client.get_training_metrics(experiment_id)

    if "success" in metrics_result:
        print("✅ Training metrics retrieved successfully")
        _print_preview("Metrics", metrics_result.get('data', ''), 200)
    else:
        print(f"❌ Failed to get training metrics: {metrics_result}")

    # 3. Test getting metrics history
    print(f"\n3. Getting metrics history for {experiment_id}...")
    history_result = client.get_experiment_metrics_history(experiment_id)

    if "success" in history_result:
        print("✅ Metrics history retrieved successfully")
        _print_preview("History", history_result.get('data', ''), 200)
    else:
        print(f"❌ Failed to get metrics history: {history_result}")

    # 4. List all experiments to see what's available
    print("\n4. Listing all experiments...")
    list_result = client.list_experiments()

    if "success" in list_result:
        print("✅ Experiments listed successfully")
        _print_preview("Response", list_result.get('data', ''), 300)
    else:
        print(f"❌ Failed to list experiments: {list_result}")

    print("\n" + separator)
    print("🎯 Analysis Complete")
    print(separator)
82
+
83
def log_real_training_step(experiment_id: str, step: int):
    """Log one fixed metrics record to an experiment, as a logging smoke test.

    Args:
        experiment_id: ID of the experiment that receives the record.
        step: Training step number to attach to the record.
    """
    client = TrackioAPIClient("https://tonic-test-trackio-test.hf.space")

    # Fixed placeholder values with the same keys a training run would log;
    # they are not measurements.
    metrics = {
        "loss": 1.2345,
        "accuracy": 0.8567,
        "learning_rate": 3.5e-6,
        "gpu_memory_gb": 22.5,
        "training_time_per_step": 0.8,
        "epoch": 1,
        "samples_per_second": 45.2
    }

    print(f"📊 Logging real training step {step}...")
    result = client.log_metrics(experiment_id, metrics, step)

    if "success" in result:
        print(f"✅ Step {step} logged successfully")
        print(f"Metrics: {metrics}")
    else:
        print(f"❌ Failed to log step {step}: {result}")


if __name__ == "__main__":
    separator = "=" * 50

    # Test existing data
    test_real_training_data()

    # Optionally log a test step
    print("\n" + separator)
    print("🧪 Testing Real Data Logging")
    print(separator)

    experiment_id = "exp_20250720_101955"
    log_real_training_step(experiment_id, 1000)

    print("\n" + separator)
    print("🎯 Next Steps:")
    print("1. Run your actual training: python run_a100_large_experiment.py")
    print("2. The training will log real metrics every 25 steps")
    print("3. Check the visualization tab in your Trackio Space")
    print("4. Real training data should appear as training progresses")
    print(separator)
trackio_api_client.py CHANGED
@@ -258,6 +258,32 @@ class TrackioAPIClient:
258
  else:
259
  logger.error(f"Failed to simulate training data: {result}")
260
  return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
 
262
  def test_simple_connection():
263
  """Test basic connectivity to the Space"""
 
258
  else:
259
  logger.error(f"Failed to simulate training data: {result}")
260
  return result
261
+
262
def get_training_metrics(self, experiment_id: str) -> Dict[str, Any]:
    """Get training metrics for an experiment (TrackioAPIClient method).

    Calls the ``get_training_metrics`` endpoint through
    ``self._make_api_call``.

    Args:
        experiment_id: ID of the experiment to query.

    Returns:
        The API call result, unchanged. It is treated as successful when it
        contains a ``"success"`` key.
    """
    logger.info(f"Getting training metrics for experiment {experiment_id}")

    result = self._make_api_call("get_training_metrics", [experiment_id])

    if "success" in result:
        # str() + .get(): the preview must never raise, even if "data" is
        # missing or is not a sliceable value.
        preview = str(result.get('data', ''))[:100]
        logger.info(f"Training metrics retrieved: {preview}...")
    else:
        logger.error(f"Failed to get training metrics: {result}")
    return result
274
+
275
def get_experiment_metrics_history(self, experiment_id: str) -> Dict[str, Any]:
    """Get the complete metrics history of an experiment (TrackioAPIClient method).

    Calls the ``get_metrics_history`` endpoint through
    ``self._make_api_call``.

    Args:
        experiment_id: ID of the experiment to query.

    Returns:
        The API call result, unchanged. It is treated as successful when it
        contains a ``"success"`` key.
    """
    logger.info(f"Getting metrics history for experiment {experiment_id}")

    result = self._make_api_call("get_metrics_history", [experiment_id])

    if "success" in result:
        # str() + .get(): the preview must never raise, even if "data" is
        # missing or is not a sliceable value.
        preview = str(result.get('data', ''))[:100]
        logger.info(f"Metrics history retrieved: {preview}...")
    else:
        logger.error(f"Failed to get metrics history: {result}")
    return result
287
 
288
  def test_simple_connection():
289
  """Test basic connectivity to the Space"""