Tonic committed on
Commit
e99b183
·
verified ·
1 Parent(s): 6f0279c

monitor experiment script

Browse files
Files changed (3) hide show
  1. monitoring.py +80 -59
  2. test_trackio_connection.py +158 -0
  3. trackio_api_client.py +461 -0
monitoring.py CHANGED
@@ -11,13 +11,13 @@ from datetime import datetime
11
  import torch
12
  from pathlib import Path
13
 
 
14
  try:
15
- import trackio
16
- from trackio import TrackioClient
17
  TRACKIO_AVAILABLE = True
18
  except ImportError:
19
  TRACKIO_AVAILABLE = False
20
- print("Warning: Trackio not available. Install with: pip install trackio")
21
 
22
  logger = logging.getLogger(__name__)
23
 
@@ -40,7 +40,7 @@ class SmolLM3Monitor:
40
  self.log_metrics = log_metrics
41
  self.log_config = log_config
42
 
43
- # Initialize Trackio client
44
  self.trackio_client = None
45
  if self.enable_tracking:
46
  self._setup_trackio(trackio_url, trackio_token)
@@ -54,32 +54,41 @@ class SmolLM3Monitor:
54
  logger.info(f"Initialized monitoring for experiment: {experiment_name}")
55
 
56
  def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
57
- """Setup Trackio client"""
58
  try:
59
  # Get Trackio configuration from environment or parameters
60
  url = trackio_url or os.getenv('TRACKIO_URL')
61
- token = trackio_token or os.getenv('TRACKIO_TOKEN')
62
 
63
  if not url:
64
  logger.warning("Trackio URL not provided. Set TRACKIO_URL environment variable.")
65
  self.enable_tracking = False
66
  return
67
 
68
- self.trackio_client = TrackioClient(
69
- url=url,
70
- token=token
71
- )
72
 
73
- # Create or get experiment
74
- self.experiment_id = self.trackio_client.create_experiment(
75
  name=self.experiment_name,
76
  description=f"SmolLM3 fine-tuning experiment started at {self.start_time}"
77
  )
78
 
79
- logger.info(f"Trackio client initialized. Experiment ID: {self.experiment_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  except Exception as e:
82
- logger.error(f"Failed to initialize Trackio: {e}")
83
  self.enable_tracking = False
84
 
85
  def log_config(self, config: Dict[str, Any]):
@@ -89,18 +98,21 @@ class SmolLM3Monitor:
89
 
90
  try:
91
  # Log configuration as parameters
92
- self.trackio_client.log_parameters(
93
  experiment_id=self.experiment_id,
94
  parameters=config
95
  )
96
 
97
- # Also save config locally
98
- config_path = f"config_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
99
- with open(config_path, 'w') as f:
100
- json.dump(config, f, indent=2, default=str)
101
-
102
- self.artifacts.append(config_path)
103
- logger.info(f"Configuration logged to Trackio and saved to {config_path}")
 
 
 
104
 
105
  except Exception as e:
106
  logger.error(f"Failed to log configuration: {e}")
@@ -117,16 +129,18 @@ class SmolLM3Monitor:
117
  metrics['step'] = step
118
 
119
  # Log to Trackio
120
- self.trackio_client.log_metrics(
121
  experiment_id=self.experiment_id,
122
  metrics=metrics,
123
  step=step
124
  )
125
 
126
- # Store locally
127
- self.metrics_history.append(metrics)
128
-
129
- logger.debug(f"Metrics logged: {metrics}")
 
 
130
 
131
  except Exception as e:
132
  logger.error(f"Failed to log metrics: {e}")
@@ -137,15 +151,24 @@ class SmolLM3Monitor:
137
  return
138
 
139
  try:
140
- # Log checkpoint as artifact
141
- self.trackio_client.log_artifact(
 
 
 
 
 
 
 
142
  experiment_id=self.experiment_id,
143
- file_path=checkpoint_path,
144
- artifact_name=f"checkpoint_step_{step}" if step else "checkpoint"
145
  )
146
 
147
- self.artifacts.append(checkpoint_path)
148
- logger.info(f"Checkpoint logged: {checkpoint_path}")
 
 
 
149
 
150
  except Exception as e:
151
  logger.error(f"Failed to log checkpoint: {e}")
@@ -210,18 +233,21 @@ class SmolLM3Monitor:
210
  summary['experiment_duration_hours'] = duration / 3600
211
 
212
  # Log final summary
213
- self.trackio_client.log_parameters(
214
  experiment_id=self.experiment_id,
215
  parameters=summary
216
  )
217
 
218
- # Save summary locally
219
- summary_path = f"training_summary_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
220
- with open(summary_path, 'w') as f:
221
- json.dump(summary, f, indent=2, default=str)
222
-
223
- self.artifacts.append(summary_path)
224
- logger.info(f"Training summary logged and saved to {summary_path}")
 
 
 
225
 
226
  except Exception as e:
227
  logger.error(f"Failed to log training summary: {e}")
@@ -257,7 +283,7 @@ class SmolLM3Monitor:
257
  def get_experiment_url(self) -> Optional[str]:
258
  """Get the URL to view the experiment in Trackio"""
259
  if self.trackio_client and self.experiment_id:
260
- return f"{self.trackio_client.url}/experiments/{self.experiment_id}"
261
  return None
262
 
263
  def close(self):
@@ -265,11 +291,14 @@ class SmolLM3Monitor:
265
  if self.enable_tracking and self.trackio_client:
266
  try:
267
  # Mark experiment as completed
268
- self.trackio_client.update_experiment_status(
269
  experiment_id=self.experiment_id,
270
  status="completed"
271
  )
272
- logger.info("Monitoring session closed")
 
 
 
273
  except Exception as e:
274
  logger.error(f"Failed to close monitoring session: {e}")
275
 
@@ -277,22 +306,14 @@ class SmolLM3Monitor:
277
  def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
278
  """Create a monitor instance from configuration"""
279
  if experiment_name is None:
280
- experiment_name = f"smollm3_finetune_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
281
-
282
- # Extract monitoring configuration
283
- trackio_url = getattr(config, 'trackio_url', None)
284
- trackio_token = getattr(config, 'trackio_token', None)
285
- enable_tracking = getattr(config, 'enable_tracking', True)
286
- log_artifacts = getattr(config, 'log_artifacts', True)
287
- log_metrics = getattr(config, 'log_metrics', True)
288
- log_config = getattr(config, 'log_config', True)
289
 
290
  return SmolLM3Monitor(
291
  experiment_name=experiment_name,
292
- trackio_url=trackio_url,
293
- trackio_token=trackio_token,
294
- enable_tracking=enable_tracking,
295
- log_artifacts=log_artifacts,
296
- log_metrics=log_metrics,
297
- log_config=log_config
298
  )
 
11
  import torch
12
  from pathlib import Path
13
 
14
+ # Import the real API client
15
  try:
16
+ from trackio_api_client import TrackioAPIClient
 
17
  TRACKIO_AVAILABLE = True
18
  except ImportError:
19
  TRACKIO_AVAILABLE = False
20
+ print("Warning: Trackio API client not available. Install with: pip install requests")
21
 
22
  logger = logging.getLogger(__name__)
23
 
 
40
  self.log_metrics = log_metrics
41
  self.log_config = log_config
42
 
43
+ # Initialize Trackio API client
44
  self.trackio_client = None
45
  if self.enable_tracking:
46
  self._setup_trackio(trackio_url, trackio_token)
 
54
  logger.info(f"Initialized monitoring for experiment: {experiment_name}")
55
 
56
  def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
57
+ """Setup Trackio API client"""
58
  try:
59
  # Get Trackio configuration from environment or parameters
60
  url = trackio_url or os.getenv('TRACKIO_URL')
 
61
 
62
  if not url:
63
  logger.warning("Trackio URL not provided. Set TRACKIO_URL environment variable.")
64
  self.enable_tracking = False
65
  return
66
 
67
+ self.trackio_client = TrackioAPIClient(url)
 
 
 
68
 
69
+ # Create experiment
70
+ create_result = self.trackio_client.create_experiment(
71
  name=self.experiment_name,
72
  description=f"SmolLM3 fine-tuning experiment started at {self.start_time}"
73
  )
74
 
75
+ if "success" in create_result:
76
+ # Extract experiment ID from response
77
+ import re
78
+ response_text = create_result['data']
79
+ match = re.search(r'exp_\d{8}_\d{6}', response_text)
80
+ if match:
81
+ self.experiment_id = match.group()
82
+ logger.info(f"Trackio API client initialized. Experiment ID: {self.experiment_id}")
83
+ else:
84
+ logger.error("Could not extract experiment ID from response")
85
+ self.enable_tracking = False
86
+ else:
87
+ logger.error(f"Failed to create experiment: {create_result}")
88
+ self.enable_tracking = False
89
 
90
  except Exception as e:
91
+ logger.error(f"Failed to initialize Trackio API: {e}")
92
  self.enable_tracking = False
93
 
94
  def log_config(self, config: Dict[str, Any]):
 
98
 
99
  try:
100
  # Log configuration as parameters
101
+ result = self.trackio_client.log_parameters(
102
  experiment_id=self.experiment_id,
103
  parameters=config
104
  )
105
 
106
+ if "success" in result:
107
+ # Also save config locally
108
+ config_path = f"config_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
109
+ with open(config_path, 'w') as f:
110
+ json.dump(config, f, indent=2, default=str)
111
+
112
+ self.artifacts.append(config_path)
113
+ logger.info(f"Configuration logged to Trackio and saved to {config_path}")
114
+ else:
115
+ logger.error(f"Failed to log configuration: {result}")
116
 
117
  except Exception as e:
118
  logger.error(f"Failed to log configuration: {e}")
 
129
  metrics['step'] = step
130
 
131
  # Log to Trackio
132
+ result = self.trackio_client.log_metrics(
133
  experiment_id=self.experiment_id,
134
  metrics=metrics,
135
  step=step
136
  )
137
 
138
+ if "success" in result:
139
+ # Store locally
140
+ self.metrics_history.append(metrics)
141
+ logger.debug(f"Metrics logged: {metrics}")
142
+ else:
143
+ logger.error(f"Failed to log metrics: {result}")
144
 
145
  except Exception as e:
146
  logger.error(f"Failed to log metrics: {e}")
 
151
  return
152
 
153
  try:
154
+ # For now, just log the checkpoint path as a parameter
155
+ # The actual file upload would need additional API endpoints
156
+ checkpoint_info = {
157
+ "checkpoint_path": checkpoint_path,
158
+ "checkpoint_step": step,
159
+ "checkpoint_size": os.path.getsize(checkpoint_path) if os.path.exists(checkpoint_path) else 0
160
+ }
161
+
162
+ result = self.trackio_client.log_parameters(
163
  experiment_id=self.experiment_id,
164
+ parameters=checkpoint_info
 
165
  )
166
 
167
+ if "success" in result:
168
+ self.artifacts.append(checkpoint_path)
169
+ logger.info(f"Checkpoint logged: {checkpoint_path}")
170
+ else:
171
+ logger.error(f"Failed to log checkpoint: {result}")
172
 
173
  except Exception as e:
174
  logger.error(f"Failed to log checkpoint: {e}")
 
233
  summary['experiment_duration_hours'] = duration / 3600
234
 
235
  # Log final summary
236
+ result = self.trackio_client.log_parameters(
237
  experiment_id=self.experiment_id,
238
  parameters=summary
239
  )
240
 
241
+ if "success" in result:
242
+ # Save summary locally
243
+ summary_path = f"training_summary_{self.experiment_name}_{self.start_time.strftime('%Y%m%d_%H%M%S')}.json"
244
+ with open(summary_path, 'w') as f:
245
+ json.dump(summary, f, indent=2, default=str)
246
+
247
+ self.artifacts.append(summary_path)
248
+ logger.info(f"Training summary logged and saved to {summary_path}")
249
+ else:
250
+ logger.error(f"Failed to log training summary: {result}")
251
 
252
  except Exception as e:
253
  logger.error(f"Failed to log training summary: {e}")
 
283
  def get_experiment_url(self) -> Optional[str]:
284
  """Get the URL to view the experiment in Trackio"""
285
  if self.trackio_client and self.experiment_id:
286
+ return f"{self.trackio_client.space_url}?tab=view_experiments"
287
  return None
288
 
289
  def close(self):
 
291
  if self.enable_tracking and self.trackio_client:
292
  try:
293
  # Mark experiment as completed
294
+ result = self.trackio_client.update_experiment_status(
295
  experiment_id=self.experiment_id,
296
  status="completed"
297
  )
298
+ if "success" in result:
299
+ logger.info("Monitoring session closed")
300
+ else:
301
+ logger.error(f"Failed to close monitoring session: {result}")
302
  except Exception as e:
303
  logger.error(f"Failed to close monitoring session: {e}")
304
 
 
306
  def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
307
  """Create a monitor instance from configuration"""
308
  if experiment_name is None:
309
+ experiment_name = getattr(config, 'experiment_name', 'smollm3_experiment')
 
 
 
 
 
 
 
 
310
 
311
  return SmolLM3Monitor(
312
  experiment_name=experiment_name,
313
+ trackio_url=getattr(config, 'trackio_url', None),
314
+ trackio_token=getattr(config, 'trackio_token', None),
315
+ enable_tracking=getattr(config, 'enable_tracking', True),
316
+ log_artifacts=getattr(config, 'log_artifacts', True),
317
+ log_metrics=getattr(config, 'log_metrics', True),
318
+ log_config=getattr(config, 'log_config', True)
319
  )
test_trackio_connection.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to check Trackio Space connection
4
+ """
5
+
6
+ import requests
7
+ import json
8
+ from datetime import datetime
9
+
10
def test_trackio_space_connection(trackio_url: str = "https://tonic-test-trackio-test.hf.space"):
    """Check whether the Trackio Space answers a plain HTTP GET.

    Args:
        trackio_url: Base URL of the Space to probe. Defaults to the test
            Space this script was written against.

    Returns:
        True when the Space responds with HTTP 200; False on any other status
        code or when the request itself fails.
    """
    print("🔍 Testing Trackio Space Connection")
    print("=" * 50)

    try:
        # Test basic connectivity
        print(f"1. Testing basic connectivity to {trackio_url}")
        response = requests.get(trackio_url, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"❌ Cannot connect to Trackio Space: {e}")
        print(" This means your training script cannot send data to the Space")
        return False

    if response.status_code != 200:
        print(f"❌ Space returned status code: {response.status_code}")
        return False
    print("✅ Space is accessible")

    print("\n2. Testing experiment creation...")

    # NOTE: no experiment is actually created here; this step only prints
    # manual instructions. The Space may need a different approach for
    # programmatic creation.
    print("✅ Basic connectivity test passed")
    print("\n📋 Next Steps:")
    print("1. Visit the Trackio Space manually")
    print("2. Create an experiment using the interface")
    print("3. Log some metrics manually")
    print("4. Check if experiments appear in the list")

    return True
47
+
48
def check_local_files():
    """Report training artifacts found in the current working directory.

    Looks for ``config_*.json``, ``training.log``, the ``./outputs/balanced``
    directory, ``eval_results_*.json`` and ``training_summary_*.json``, prints
    what it finds, and returns the collected paths.

    Returns:
        A list of the paths found, grouped in the order listed above. Glob
        matches are sorted so the result is deterministic. The output
        directory is reported as ``"./outputs/balanced/"``.
    """
    import glob
    import os

    print("\n📁 Checking Local Training Files")
    print("=" * 50)

    # Config files
    local_files = sorted(glob.glob("config_*.json"))

    # Training log
    if os.path.exists("training.log"):
        local_files.append("training.log")

    # Output directory: require a real directory, because the entry is stored
    # with a trailing slash and a plain file of that name would make the size
    # lookup below fail.
    if os.path.isdir("./outputs/balanced"):
        local_files.append("./outputs/balanced/")

    # Evaluation results, then training summaries
    local_files.extend(sorted(glob.glob("eval_results_*.json")))
    local_files.extend(sorted(glob.glob("training_summary_*.json")))

    if not local_files:
        print("❌ No local training files found")
        print(" This suggests training didn't start or failed early")
        return local_files

    print("✅ Found local training files:")
    for path in local_files:
        size = "directory" if os.path.isdir(path) else f"{os.path.getsize(path)} bytes"
        print(f" 📄 {path} ({size})")

    return local_files
93
+
94
def provide_solutions():
    """Print a fixed troubleshooting checklist for experiments not showing up.

    The checklist has four numbered sections, each followed by lettered
    steps. Nothing is returned; the function only writes to stdout.
    """
    print("\n🛠️ Solutions for Experiment Visibility")
    print("=" * 50)

    # (section heading, lettered steps) in display order.
    sections = (
        (
            "1. IMMEDIATE SOLUTION - Use Manual Interface:",
            (
                " a) Visit: https://tonic-test-trackio-test.hf.space",
                " b) Go to 'Create Experiment' tab",
                " c) Create experiment: 'petit-elle-l-aime-3-balanced'",
                " d) Copy the experiment ID",
                " e) Go to 'Log Metrics' tab",
                " f) Enter metrics manually as training progresses",
            ),
        ),
        (
            "2. CHECK TRAINING STATUS:",
            (
                " a) Check if training is actually running",
                " b) Look for local files being created",
                " c) Check training logs for errors",
            ),
        ),
        (
            "3. ALTERNATIVE - Use Local Monitoring:",
            (
                " a) Check local files for training progress",
                " b) Use local logs to monitor training",
                " c) Trackio Space is for visualization only",
            ),
        ),
        (
            "4. DEBUG TRAINING SCRIPT:",
            (
                " a) Check if Trackio client is working",
                " b) Verify experiment creation in training logs",
                " c) Look for connection errors",
            ),
        ),
    )

    for heading, steps in sections:
        # Each section is preceded by a blank line.
        print("\n" + heading)
        for step in steps:
            print(step)
122
+
123
def main():
    """Run the connectivity check and the local-file scan, then summarise.

    Calls ``test_trackio_space_connection``, ``check_local_files`` and
    ``provide_solutions`` in that order, then prints one status line for the
    connection result and one for the local-file result, followed by a fixed
    recommendation. Returns nothing.
    """
    print("🚀 Trackio Space Connection Test")
    print("=" * 60)

    # Test connection
    connection_ok = test_trackio_space_connection()

    # Check local files
    local_files = check_local_files()

    # Provide solutions
    provide_solutions()

    banner = "=" * 60
    print("\n" + banner)
    print("📊 SUMMARY")
    print(banner)

    if connection_ok:
        print("✅ Trackio Space is accessible")
    else:
        print("❌ Trackio Space connection failed")

    if local_files:
        print("✅ Local training files found")
    else:
        print("❌ No local training files found")

    print("\n🎯 RECOMMENDATION:")
    print("Use the Trackio Space manually to create and monitor experiments")
    print("The training script will save data locally, but the Space")
    print("needs manual interaction for now.")
156
+
157
+ if __name__ == "__main__":
158
+ main()
trackio_api_client.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Trackio API Client for Hugging Face Spaces
4
+ Connects to the Trackio Space using the actual API endpoints
5
+ """
6
+
7
+ import requests
8
+ import json
9
+ import time
10
+ import logging
11
+ from typing import Dict, Any, Optional
12
+ from datetime import datetime
13
+
14
+ # Setup logging
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
class TrackioAPIClient:
    """Client for a Trackio Space reached through its ``gradio_api/call`` routes.

    Every call is a two-step exchange: a POST of the arguments to
    ``<space>/gradio_api/call/<endpoint>`` yields an event id, and a GET of
    ``.../<endpoint>/<event_id>`` yields the result, either as JSON or as a
    Server-Sent-Events (SSE) text stream.

    All public methods return a dict. On success it holds ``"success": True``
    and ``"data"`` (the first element of the returned data array). On failure
    it holds ``"error"`` and sometimes ``"raw"``. HTTP and parsing failures
    are reported through that dict rather than raised.
    """

    def __init__(self, space_url: str):
        """Store the Space URL (trailing slashes removed) and the call base URL."""
        self.space_url = space_url.rstrip('/')
        self.base_url = f"{self.space_url}/gradio_api/call"

    @staticmethod
    def _parse_sse_text(response_text: str) -> Dict[str, Any]:
        """Extract the result payload from an SSE-formatted response body.

        The payload following an ``event: complete`` line is preferred. If no
        such event is present, the first ``data:`` line is used, unless the
        stream contains an ``event: error`` line, which is reported as a
        failure.
        """
        import ast

        current_event = None
        first_payload = None
        complete_payload = None
        saw_error_event = False

        for line in response_text.splitlines():
            if line.startswith("event:"):
                current_event = line[len("event:"):].strip()
                if current_event == "error":
                    saw_error_event = True
            elif line.startswith("data:"):
                payload = line[len("data:"):].strip()
                if first_payload is None:
                    first_payload = payload
                if current_event == "complete":
                    complete_payload = payload

        if complete_payload is None and saw_error_event:
            return {"error": "Server reported an error event", "raw": response_text}

        data_line = complete_payload if complete_payload is not None else first_payload
        if not data_line:
            return {"error": "No data line in SSE response", "raw": response_text}

        try:
            # The payload is JSON, so null/true/false must be accepted.
            data_array = json.loads(data_line)
        except ValueError as json_error:
            try:
                # Fallback kept for payloads written as Python literals.
                data_array = ast.literal_eval(data_line)
            except (ValueError, SyntaxError):
                return {"error": f"Failed to parse SSE data: {json_error}"}

        if isinstance(data_array, list) and data_array:
            return {"success": True, "data": data_array[0]}
        return {"error": "Invalid SSE data format", "raw": data_array}

    @staticmethod
    def _parse_result_text(response_text: str) -> Dict[str, Any]:
        """Turn the GET response body into a success or error dict.

        The body is tried as JSON first (expects an object with a non-empty
        ``"data"`` list); anything that is not valid JSON is handed to the
        SSE parser.
        """
        try:
            result_data = json.loads(response_text)
        except ValueError:
            return TrackioAPIClient._parse_sse_text(response_text)

        data = result_data.get("data") if isinstance(result_data, dict) else None
        if isinstance(data, list) and data:
            return {"success": True, "data": data[0]}
        return {"error": "No data in JSON response", "raw": result_data}

    def _attempt_call(self, url: str, payload: Dict[str, Any], attempt: int):
        """Perform one POST + GET round trip.

        Returns:
            A ``(result, retryable)`` tuple. ``retryable`` is False only when
            the POST response carries no event id, which is not retried.
        """
        logger.debug(f"Attempt {attempt + 1}: Making POST request to {url}")

        # POST request to get EVENT_ID
        response = requests.post(
            url,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=30
        )
        if response.status_code != 200:
            logger.error(f"POST request failed: {response.status_code} - {response.text}")
            return {"error": f"POST failed: {response.status_code}"}, True

        response_data = response.json()
        logger.debug(f"POST response: {response_data}")

        # The id is read from "event_id", falling back to "hash".
        event_id = None
        if isinstance(response_data, dict):
            event_id = response_data.get("event_id", response_data.get("hash"))
        if event_id is None:
            logger.error(f"No event_id or hash in response: {response_data}")
            return {"error": "No EVENT_ID in response"}, False

        # GET request to get results
        get_url = f"{url}/{event_id}"
        logger.debug(f"Making GET request to: {get_url}")

        # Wait a bit for the processing to complete
        time.sleep(1)

        get_response = requests.get(get_url, timeout=30)
        if get_response.status_code != 200:
            logger.error(f"GET request failed: {get_response.status_code} - {get_response.text}")
            return {"error": f"GET failed: {get_response.status_code}"}, True

        if not get_response.content:
            logger.warning(f"Empty response from GET request (attempt {attempt + 1})")
            return {"error": "Empty response from server"}, True

        response_text = get_response.text.strip()
        logger.debug(f"Raw response: {response_text}")

        result = self._parse_result_text(response_text)
        if "success" in result:
            logger.debug(f"Parsed result: {result['data']}")
        else:
            logger.warning(f"{result['error']} (attempt {attempt + 1})")
        return result, True

    def _make_api_call(self, endpoint: str, data: list, max_retries: int = 3) -> Dict[str, Any]:
        """Make an API call to the Trackio Space, retrying on failure.

        Args:
            endpoint: Name of the route under ``gradio_api/call``.
            data: Positional arguments, sent as the ``"data"`` array.
            max_retries: Maximum number of attempts. Failed attempts are
                separated by an exponential backoff of ``2 ** attempt`` seconds.

        Returns:
            A success dict, or the error dict of the last attempt.
        """
        url = f"{self.base_url}/{endpoint}"
        payload = {"data": data}

        for attempt in range(max_retries):
            try:
                result, retryable = self._attempt_call(url, payload, attempt)
            except requests.exceptions.RequestException as e:
                logger.error(f"API call failed (attempt {attempt + 1}): {e}")
                result, retryable = {"error": f"Request failed: {e}"}, True
            except Exception as e:
                # Best-effort client: unexpected failures are reported, not raised.
                logger.error(f"Unexpected error (attempt {attempt + 1}): {e}")
                result, retryable = {"error": f"Unexpected error: {e}"}, True

            if "success" in result or not retryable or attempt >= max_retries - 1:
                return result
            time.sleep(2 ** attempt)  # Exponential backoff

        return {"error": f"Failed after {max_retries} attempts"}

    def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
        """Create a new experiment via ``create_experiment_interface``."""
        logger.info(f"Creating experiment: {name}")

        result = self._make_api_call("create_experiment_interface", [name, description])

        if "success" in result:
            logger.info(f"Experiment created successfully: {result['data']}")
        else:
            logger.error(f"Failed to create experiment: {result}")
        return result

    def log_metrics(self, experiment_id: str, metrics: Dict[str, Any], step: Optional[int] = None) -> Dict[str, Any]:
        """Log metrics for an experiment.

        ``metrics`` is sent as a JSON string; ``step`` is sent as a string,
        or as an empty string when it is None.
        """
        metrics_json = json.dumps(metrics)
        step_str = str(step) if step is not None else ""

        logger.info(f"Logging metrics for experiment {experiment_id} at step {step}")

        result = self._make_api_call("log_metrics_interface", [experiment_id, metrics_json, step_str])

        if "success" in result:
            logger.info(f"Metrics logged successfully: {result['data']}")
        else:
            logger.error(f"Failed to log metrics: {result}")
        return result

    def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Log parameters for an experiment; ``parameters`` is sent as a JSON string."""
        parameters_json = json.dumps(parameters)

        logger.info(f"Logging parameters for experiment {experiment_id}")

        result = self._make_api_call("log_parameters_interface", [experiment_id, parameters_json])

        if "success" in result:
            logger.info(f"Parameters logged successfully: {result['data']}")
        else:
            logger.error(f"Failed to log parameters: {result}")
        return result

    def get_experiment_details(self, experiment_id: str) -> Dict[str, Any]:
        """Get experiment details via ``get_experiment_details``."""
        logger.info(f"Getting details for experiment {experiment_id}")

        result = self._make_api_call("get_experiment_details", [experiment_id])

        if "success" in result:
            # str() keeps the preview safe when the data is not a string.
            logger.info(f"Experiment details retrieved: {str(result['data'])[:100]}...")
        else:
            logger.error(f"Failed to get experiment details: {result}")
        return result

    def list_experiments(self) -> Dict[str, Any]:
        """List all experiments via ``list_experiments_interface``."""
        logger.info("Listing all experiments")

        result = self._make_api_call("list_experiments_interface", [])

        if "success" in result:
            # str() keeps the preview safe when the data is not a string.
            logger.info(f"Experiments listed: {str(result['data'])[:100]}...")
        else:
            logger.error(f"Failed to list experiments: {result}")
        return result

    def update_experiment_status(self, experiment_id: str, status: str) -> Dict[str, Any]:
        """Update experiment status via ``update_experiment_status_interface``."""
        logger.info(f"Updating experiment {experiment_id} status to {status}")

        result = self._make_api_call("update_experiment_status_interface", [experiment_id, status])

        if "success" in result:
            logger.info(f"Status updated successfully: {result['data']}")
        else:
            logger.error(f"Failed to update status: {result}")
        return result

    def simulate_training_data(self, experiment_id: str) -> Dict[str, Any]:
        """Ask the Space to simulate training data via ``simulate_training_data``."""
        logger.info(f"Simulating training data for experiment {experiment_id}")

        result = self._make_api_call("simulate_training_data", [experiment_id])

        if "success" in result:
            logger.info(f"Training data simulated: {result['data']}")
        else:
            logger.error(f"Failed to simulate training data: {result}")
        return result
261
+
262
def test_simple_connection(space_url: str = "https://tonic-test-trackio-test.hf.space"):
    """Test basic connectivity to the Space with a single HTTP GET.

    Args:
        space_url: URL to probe. Defaults to the test Space used by this module.

    Returns:
        True when the Space answers with HTTP 200; False on any other status
        or when the request raises.
    """
    print("🔍 Testing Basic Space Connectivity")
    print("=" * 50)

    try:
        # Test basic connectivity
        response = requests.get(space_url, timeout=10)
    except Exception as e:
        # Deliberately broad: this is a diagnostic helper that must not crash.
        print(f"❌ Cannot connect to Space: {e}")
        return False

    if response.status_code == 200:
        print("✅ Space is accessible")
        return True

    print(f"❌ Space returned status: {response.status_code}")
    return False
279
+
280
def test_api_connection():
    """Exercise the Trackio API end to end against the test Space.

    After a basic connectivity check, creates an experiment, extracts its id
    from the response text, and, if an id was found, logs parameters, logs
    metrics, lists experiments and fetches the experiment details. Each step
    prints a pass/fail line. Returns early (without the closing banner) when
    the Space is unreachable; otherwise returns None after the banner.
    """
    import re

    print("🔍 Testing Trackio API Connection")
    print("=" * 50)

    # First test basic connectivity
    if not test_simple_connection():
        return

    # Initialize client
    client = TrackioAPIClient("https://tonic-test-trackio-test.hf.space")

    def show_preview(data):
        """Print a short preview of *data*, degrading to ASCII if needed."""
        text = str(data)
        try:
            print(f" Response: {text[:200]}...")
        except UnicodeEncodeError:
            # A UTF-8 round trip would leave the text unchanged and fail
            # again; replacing non-ASCII characters is what makes this
            # printable on a console with a limited encoding.
            safe = text[:100].encode('ascii', errors='replace').decode('ascii')
            print(f" Response: {safe}...")

    # Test 1: Create experiment
    print("\n1. Testing experiment creation...")
    create_result = client.create_experiment(
        "test_experiment_api",
        "Test experiment created via API"
    )

    experiment_id = None
    if "success" not in create_result:
        print(f"❌ Failed to create experiment: {create_result}")
    else:
        print("✅ Experiment created successfully")

        # Extract experiment ID from the response; str() guards against a
        # non-string payload.
        response_text = str(create_result['data'])
        if "exp_" not in response_text:
            print("❌ No experiment ID found in response")
        else:
            match = re.search(r'exp_\d{8}_\d{6}', response_text)
            if match:
                experiment_id = match.group()
                print(f" Experiment ID: {experiment_id}")
            else:
                print("❌ Could not extract experiment ID from response")

    if experiment_id is not None:
        # Test 2: Log parameters
        print("\n2. Testing parameter logging...")
        parameters = {
            "model_name": "HuggingFaceTB/SmolLM3-3B",
            "batch_size": 8,
            "learning_rate": 3.5e-6,
            "max_iters": 18000
        }

        param_result = client.log_parameters(experiment_id, parameters)
        if "success" in param_result:
            print("✅ Parameters logged successfully")
        else:
            print(f"❌ Failed to log parameters: {param_result}")

        # Test 3: Log metrics
        print("\n3. Testing metrics logging...")
        metrics = {
            "loss": 0.5234,
            "accuracy": 0.8567,
            "learning_rate": 3.5e-6,
            "gpu_memory_gb": 22.5
        }

        metrics_result = client.log_metrics(experiment_id, metrics, 100)
        if "success" in metrics_result:
            print("✅ Metrics logged successfully")
        else:
            print(f"❌ Failed to log metrics: {metrics_result}")

        # Test 4: List experiments
        print("\n4. Testing experiment listing...")
        list_result = client.list_experiments()
        if "success" in list_result:
            print("✅ Experiments listed successfully")
            show_preview(list_result['data'])
        else:
            print(f"❌ Failed to list experiments: {list_result}")

        # Test 5: Get experiment details
        print("\n5. Testing experiment details...")
        details_result = client.get_experiment_details(experiment_id)
        if "success" in details_result:
            print("✅ Experiment details retrieved successfully")
            show_preview(details_result['data'])
        else:
            print(f"❌ Failed to get experiment details: {details_result}")

    print("\n" + "=" * 50)
    print("🎯 API Test Complete")
    print("=" * 50)
379
+
380
def create_real_experiment():
    """Create the ``petit-elle-l-aime-3-balanced`` experiment on the test Space.

    Creates the experiment, extracts its id (``exp_<8 digits>_<6 digits>``)
    from the response text, and logs a fixed set of initial training
    parameters.

    Returns:
        The experiment id string, or None when creation fails or no id can be
        extracted. A failure to log the parameters is printed but still
        returns the id.
    """
    import re

    print("🚀 Creating Real Experiment for Training")
    print("=" * 50)

    client = TrackioAPIClient("https://tonic-test-trackio-test.hf.space")

    # Create experiment
    create_result = client.create_experiment(
        "petit-elle-l-aime-3-balanced",
        "SmolLM3 fine-tuning on OpenHermes-FR dataset with balanced A100 configuration"
    )

    if "success" not in create_result:
        print(f"❌ Failed to create experiment: {create_result}")
        return None

    print("✅ Experiment created successfully")
    print(f"Response: {create_result['data']}")

    # Extract experiment ID; str() avoids a TypeError on non-string payloads.
    match = re.search(r'exp_\d{8}_\d{6}', str(create_result['data']))
    if not match:
        print("❌ Could not extract experiment ID")
        return None

    experiment_id = match.group()
    print(f"📋 Experiment ID: {experiment_id}")

    # Log initial parameters
    parameters = {
        "model_name": "HuggingFaceTB/SmolLM3-3B",
        "dataset_name": "legmlai/openhermes-fr",
        "batch_size": 8,
        "gradient_accumulation_steps": 16,
        "effective_batch_size": 128,
        "learning_rate": 3.5e-6,
        "max_iters": 18000,
        "max_seq_length": 12288,
        "mixed_precision": "bf16",
        "use_flash_attention": True,
        "optimizer": "adamw_torch",
        "scheduler": "cosine",
        "warmup_steps": 1200,
        "save_steps": 2000,
        "eval_steps": 1000,
        "logging_steps": 25,
        "no_think_system_message": True
    }

    param_result = client.log_parameters(experiment_id, parameters)
    if "success" in param_result:
        print("✅ Initial parameters logged")
    else:
        print(f"❌ Failed to log parameters: {param_result}")

    return experiment_id
438
+
439
if __name__ == "__main__":
    # Script entry point: run the API smoke test, then create the real
    # experiment and print how to use it.
    test_api_connection()

    print("\n" + "=" * 60)
    print("🎯 CREATING REAL EXPERIMENT")
    print("=" * 60)

    # Create real experiment
    experiment_id = create_real_experiment()

    if experiment_id:
        print("\n✅ SUCCESS! Your experiment is ready:")
        print(f" Experiment ID: {experiment_id}")
        print(" Trackio Space: https://tonic-test-trackio-test.hf.space")
        print(" View experiments in the 'View Experiments' tab")

        print("\n📋 Next steps:")
        print("1. Use this experiment ID in your training script")
        print("2. Monitor progress in the Trackio Space")
        print("3. Log metrics as training progresses")
    else:
        print("\n❌ Failed to create experiment")