Tonic committed on
Commit
08ed534
·
verified ·
1 Parent(s): c3f29a5

matches experiment id for all metrics

Browse files
Files changed (3) hide show
  1. src/trackio.py +4 -4
  2. src/trainer.py +21 -10
  3. tests/test_experiment_id_fix.py +123 -0
src/trackio.py CHANGED
@@ -61,8 +61,8 @@ def init(
61
  dataset_repo=dataset_repo
62
  )
63
 
64
- # Generate experiment ID
65
- experiment_id = f"trl_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
66
  _monitor.experiment_id = experiment_id
67
 
68
  logger.info(f"Trackio initialized for experiment: {exp_name}")
@@ -72,8 +72,8 @@ def init(
72
 
73
  except Exception as e:
74
  logger.error(f"Failed to initialize trackio: {e}")
75
- # Return a fallback experiment ID
76
- return f"trl_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
77
 
78
  def log(
79
  metrics: Dict[str, Any],
 
61
  dataset_repo=dataset_repo
62
  )
63
 
64
+ # Generate experiment ID - use the same format as our monitoring system
65
+ experiment_id = f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
66
  _monitor.experiment_id = experiment_id
67
 
68
  logger.info(f"Trackio initialized for experiment: {exp_name}")
 
72
 
73
  except Exception as e:
74
  logger.error(f"Failed to initialize trackio: {e}")
75
+ # Return a fallback experiment ID - use the same format as our monitoring system
76
+ return f"exp_fallback_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
77
 
78
  def log(
79
  metrics: Dict[str, Any],
src/trainer.py CHANGED
@@ -138,16 +138,27 @@ class SmolLM3Trainer:
138
  # Initialize trackio for TRL compatibility
139
  try:
140
  import trackio
141
- # Initialize trackio with our configuration
142
- experiment_id = trackio.init(
143
- project_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
144
- experiment_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
145
- trackio_url=getattr(self.config, 'trackio_url', None),
146
- trackio_token=getattr(self.config, 'trackio_token', None),
147
- hf_token=getattr(self.config, 'hf_token', None),
148
- dataset_repo=getattr(self.config, 'dataset_repo', None)
149
- )
150
- logger.info(f"Trackio initialized with experiment ID: {experiment_id}")
 
 
 
 
 
 
 
 
 
 
 
151
  except Exception as e:
152
  logger.warning(f"Failed to initialize trackio: {e}")
153
  logger.info("Continuing without trackio integration")
 
138
  # Initialize trackio for TRL compatibility
139
  try:
140
  import trackio
141
+ # Initialize trackio with our configuration and use the same experiment ID
142
+ if self.monitor and self.monitor.experiment_id:
143
+ # Use the experiment ID from our monitor
144
+ experiment_id = self.monitor.experiment_id
145
+ logger.info(f"Using existing experiment ID: {experiment_id}")
146
+ else:
147
+ # Initialize trackio with our configuration
148
+ experiment_id = trackio.init(
149
+ project_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
150
+ experiment_name=getattr(self.config, 'experiment_name', 'smollm3_experiment'),
151
+ trackio_url=getattr(self.config, 'trackio_url', None),
152
+ trackio_token=getattr(self.config, 'trackio_token', None),
153
+ hf_token=getattr(self.config, 'hf_token', None),
154
+ dataset_repo=getattr(self.config, 'dataset_repo', None)
155
+ )
156
+ logger.info(f"Trackio initialized with experiment ID: {experiment_id}")
157
+
158
+ # Update our monitor with the same experiment ID
159
+ if self.monitor:
160
+ self.monitor.experiment_id = experiment_id
161
+ logger.info(f"Updated monitor with experiment ID: {experiment_id}")
162
  except Exception as e:
163
  logger.warning(f"Failed to initialize trackio: {e}")
164
  logger.info("Continuing without trackio integration")
tests/test_experiment_id_fix.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to verify that both monitoring systems use the same experiment ID format
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ import logging
9
+
10
+ # Add the project root to the path
11
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
12
+
13
+ from src.monitoring import SmolLM3Monitor
14
+ from src.trackio import init as trackio_init
15
+
16
+ # Setup logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+ def test_experiment_id_consistency():
21
+ """Test that both monitoring systems use the same experiment ID format"""
22
+ print("🔧 Testing experiment ID consistency...")
23
+
24
+ # Test 1: SmolLM3Monitor experiment ID format
25
+ print("\n1️⃣ Testing SmolLM3Monitor experiment ID format...")
26
+ monitor = SmolLM3Monitor(
27
+ experiment_name="test_experiment_id_consistency",
28
+ enable_tracking=True
29
+ )
30
+
31
+ print(f"SmolLM3Monitor experiment ID: {monitor.experiment_id}")
32
+
33
+ if monitor.experiment_id and monitor.experiment_id.startswith('exp_'):
34
+ print("✅ SmolLM3Monitor uses correct experiment ID format (exp_)")
35
+ else:
36
+ print("❌ SmolLM3Monitor uses incorrect experiment ID format")
37
+ return False
38
+
39
+ # Test 2: Trackio experiment ID format
40
+ print("\n2️⃣ Testing Trackio experiment ID format...")
41
+ trackio_experiment_id = trackio_init(
42
+ project_name="test_experiment_id_consistency",
43
+ experiment_name="test_experiment_id_consistency"
44
+ )
45
+
46
+ print(f"Trackio experiment ID: {trackio_experiment_id}")
47
+
48
+ if trackio_experiment_id and trackio_experiment_id.startswith('exp_'):
49
+ print("✅ Trackio uses correct experiment ID format (exp_)")
50
+ else:
51
+ print("❌ Trackio uses incorrect experiment ID format")
52
+ return False
53
+
54
+ # Test 3: Verify both use the same format
55
+ print("\n3️⃣ Testing experiment ID format consistency...")
56
+ if monitor.experiment_id.startswith('exp_') and trackio_experiment_id.startswith('exp_'):
57
+ print("✅ Both monitoring systems use the same experiment ID format")
58
+ return True
59
+ else:
60
+ print("❌ Monitoring systems use different experiment ID formats")
61
+ return False
62
+
63
+ def test_monitoring_integration():
64
+ """Test that both monitoring systems can work together"""
65
+ print("\n🔧 Testing monitoring integration...")
66
+
67
+ try:
68
+ # Create monitor
69
+ monitor = SmolLM3Monitor(
70
+ experiment_name="test_monitoring_integration",
71
+ enable_tracking=True
72
+ )
73
+
74
+ print(f"✅ Monitor created with experiment ID: {monitor.experiment_id}")
75
+
76
+ # Initialize trackio with the same experiment ID
77
+ trackio_experiment_id = trackio_init(
78
+ project_name="test_monitoring_integration",
79
+ experiment_name="test_monitoring_integration"
80
+ )
81
+
82
+ print(f"✅ Trackio initialized with experiment ID: {trackio_experiment_id}")
83
+
84
+ # Test logging metrics to both systems
85
+ metrics = {"loss": 1.234, "accuracy": 0.85}
86
+
87
+ # Log to monitor
88
+ monitor.log_metrics(metrics, step=100)
89
+ print("✅ Metrics logged to monitor")
90
+
91
+ # Log to trackio
92
+ from src.trackio import log as trackio_log
93
+ trackio_log(metrics, step=100)
94
+ print("✅ Metrics logged to trackio")
95
+
96
+ print("🎉 Monitoring integration test passed!")
97
+ return True
98
+
99
+ except Exception as e:
100
+ print(f"❌ Monitoring integration test failed: {e}")
101
+ return False
102
+
103
+ if __name__ == "__main__":
104
+ print("🚀 Starting Experiment ID Consistency Tests")
105
+ print("=" * 60)
106
+
107
+ # Test 1: Experiment ID format consistency
108
+ format_consistency = test_experiment_id_consistency()
109
+
110
+ # Test 2: Monitoring integration
111
+ integration_success = test_monitoring_integration()
112
+
113
+ print("\n" + "=" * 60)
114
+ print("📊 Test Results Summary:")
115
+ print(f"Experiment ID Format Consistency: {'✅ PASSED' if format_consistency else '❌ FAILED'}")
116
+ print(f"Monitoring Integration: {'✅ PASSED' if integration_success else '❌ FAILED'}")
117
+
118
+ if format_consistency and integration_success:
119
+ print("\n🎉 All tests passed! Experiment ID conflict is resolved.")
120
+ sys.exit(0)
121
+ else:
122
+ print("\n❌ Some tests failed. Please check the errors above.")
123
+ sys.exit(1)