jbilcke-hf HF staff committed on
Commit
910a853
·
1 Parent(s): 246c64e

improve doc + investigate log parsing issues

Browse files
vms/ui/project/tabs/train_tab.py CHANGED
@@ -175,7 +175,7 @@ class TrainTab(BaseTab):
175
  value=DEFAULT_NB_LR_WARMUP_STEPS,
176
  minimum=0,
177
  precision=0,
178
- info="Number of warmup steps (typically 20-40% of total training steps)"
179
  )
180
  with gr.Column():
181
  with gr.Row():
 
175
  value=DEFAULT_NB_LR_WARMUP_STEPS,
176
  minimum=0,
177
  precision=0,
178
+ info="Number of warmup steps (typically 20-40% of total training steps). This helps reducing the impact of early training examples as well as giving time to optimizers to compute accurate statistics."
179
  )
180
  with gr.Column():
181
  with gr.Row():
vms/utils/training_log_parser.py CHANGED
@@ -21,10 +21,11 @@ class TrainingState:
21
  memory_reserved: float = 0.0
22
  start_time: Optional[datetime] = None
23
  last_step_time: Optional[datetime] = None
24
- estimated_remaining: Optional[timedelta] = None
25
  error_message: Optional[str] = None
26
  initialization_stage: str = ""
27
  download_progress: float = 0.0
 
28
 
29
  # New fields for current task tracking
30
  current_task: str = ""
@@ -50,12 +51,8 @@ class TrainingState:
50
 
51
  def to_dict(self) -> Dict[str, Any]:
52
  """Convert state to dictionary for UI updates"""
53
- # Calculate elapsed time only if training is active and we have a start time
54
- if self.start_time and self.status in ["training", "initializing"]:
55
- elapsed = str(datetime.now() - self.start_time)
56
- else:
57
- # Use the last known elapsed time or show 0
58
- elapsed = "0:00:00" if not self.last_step_time else str(self.last_step_time - self.start_time if self.start_time else "0:00:00")
59
 
60
  # Use precomputed remaining time from logs if available
61
  remaining = str(self.estimated_remaining) if self.estimated_remaining else "calculating..."
@@ -196,63 +193,81 @@ class TrainingLogParser:
196
  if len(self.state.recent_progress_lines) > self.max_recent_lines:
197
  self.state.recent_progress_lines.pop(0)
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  # Return updated state
200
  return self.state.to_dict()
201
-
202
- # Training step progress line example:
203
- # Training steps: 1%|▏ | 1/70 [00:14<16:11, 14.08s/it, grad_norm=0.00789, step_loss=0.555, lr=3e-7]
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  if ("Started training" in line) or ("Starting training" in line):
206
  self.state.status = "training"
207
-
208
- # Check for "Training steps:" which contains the progress information
209
- if "Training steps:" in line:
210
- # Set status to training if we see this
211
- self.state.status = "training"
212
-
213
  if not self.state.start_time:
214
  self.state.start_time = datetime.now()
215
-
216
- # Extract step numbers
217
- steps_match = re.search(r"(\d+)/(\d+)", line)
218
- if steps_match:
219
- self.state.current_step = int(steps_match.group(1))
220
- self.state.total_steps = int(steps_match.group(2))
221
-
222
- # Extract metrics
223
- for pattern, attr in [
224
- (r"step_loss=([0-9.e-]+)", "step_loss"),
225
- (r"lr=([0-9.e-]+)", "learning_rate"),
226
- (r"grad_norm=([0-9.e-]+)", "grad_norm")
227
- ]:
228
- match = re.search(pattern, line)
229
- if match:
230
- setattr(self.state, attr, float(match.group(1)))
231
-
232
- # Extract time remaining directly from the log
233
- # Format: [MM:SS<M:SS:SS, SS.SSs/it]
234
- time_remaining_match = re.search(r"<(\d+:\d+:\d+)", line)
235
- if time_remaining_match:
236
- remaining_str = time_remaining_match.group(1)
237
- # Store the string directly - no need to parse it
238
- self.state.estimated_remaining = remaining_str
239
-
240
- # If no direct time estimate, look for hour:min format
241
- if not time_remaining_match:
242
- hour_min_match = re.search(r"<(\d+h\s*\d+m)", line)
243
- if hour_min_match:
244
- self.state.estimated_remaining = hour_min_match.group(1)
245
-
246
- # Update last processing time
247
- self.state.last_step_time = datetime.now()
248
-
249
- logger.info(f"Updated training state: step={self.state.current_step}/{self.state.total_steps}, loss={self.state.step_loss}")
250
  return self.state.to_dict()
251
 
252
  # Epoch information
253
- # there is an issue with how epoch is reported because we display:
254
- # Progress: 96.9%, Step: 872/900, Epoch: 12/50
255
- # we should probably just show the steps
256
  epoch_match = re.search(r"Starting epoch \((\d+)/(\d+)\)", line)
257
  if epoch_match:
258
  self.state.current_epoch = int(epoch_match.group(1))
 
21
  memory_reserved: float = 0.0
22
  start_time: Optional[datetime] = None
23
  last_step_time: Optional[datetime] = None
24
+ estimated_remaining: Optional[str] = None
25
  error_message: Optional[str] = None
26
  initialization_stage: str = ""
27
  download_progress: float = 0.0
28
+ elapsed_time: str = "0:00:00"
29
 
30
  # New fields for current task tracking
31
  current_task: str = ""
 
51
 
52
  def to_dict(self) -> Dict[str, Any]:
53
  """Convert state to dictionary for UI updates"""
54
+ # Use the stored elapsed time directly if it exists
55
+ elapsed = self.elapsed_time
 
 
 
 
56
 
57
  # Use precomputed remaining time from logs if available
58
  remaining = str(self.estimated_remaining) if self.estimated_remaining else "calculating..."
 
193
  if len(self.state.recent_progress_lines) > self.max_recent_lines:
194
  self.state.recent_progress_lines.pop(0)
195
 
196
+ # Parse the Training steps line for additional information
197
+ if "Training steps:" in line:
198
+ # Set status to training if we see this
199
+ self.state.status = "training"
200
+
201
+ if not self.state.start_time:
202
+ self.state.start_time = datetime.now()
203
+
204
+ # Extract step numbers from the format: Training steps: 4%|▍ | 44/1000 [41:57<17:22:32, 65.43s/it]
205
+ steps_match = re.search(r"\|\s*(\d+)/(\d+)", line)
206
+ if steps_match:
207
+ self.state.current_step = int(steps_match.group(1))
208
+ self.state.total_steps = int(steps_match.group(2))
209
+
210
+ # Extract elapsed time - Format example: [41:57<17:22:32, 65.43s/it]
211
+ elapsed_match = re.search(r"\[(\d+:\d+)(:\d+)?<", line)
212
+ if elapsed_match:
213
+ if elapsed_match.group(2): # has hours:minutes:seconds format
214
+ self.state.elapsed_time = elapsed_match.group(1) + elapsed_match.group(2)
215
+ else: # has minutes:seconds format
216
+ self.state.elapsed_time = elapsed_match.group(1)
217
+
218
+ # Extract remaining time - Format example: [41:57<17:22:32, 65.43s/it]
219
+ remaining_match = re.search(r"<([\d:]+)", line)
220
+ if remaining_match:
221
+ self.state.estimated_remaining = remaining_match.group(1)
222
+
223
+ # Extract metrics with different patterns
224
+ # Pattern 1: grad_norm=0.113, global_avg_loss=0.15, global_max_loss=0.15
225
+ grad_norm_match = re.search(r"grad_norm=([0-9.e-]+)", line)
226
+ if grad_norm_match:
227
+ self.state.grad_norm = float(grad_norm_match.group(1))
228
+
229
+ # Try global_avg_loss as the main loss metric
230
+ loss_match = re.search(r"global_avg_loss=([0-9.e-]+)", line)
231
+ if loss_match:
232
+ self.state.step_loss = float(loss_match.group(1))
233
+ elif "step_loss=" in line:
234
+ # Fall back to step_loss if global_avg_loss not found
235
+ loss_match = re.search(r"step_loss=([0-9.e-]+)", line)
236
+ if loss_match:
237
+ self.state.step_loss = float(loss_match.group(1))
238
+
239
+ # Extract learning rate if available
240
+ lr_match = re.search(r"lr=([0-9.e-]+)", line)
241
+ if lr_match:
242
+ self.state.learning_rate = float(lr_match.group(1))
243
+
244
+ # Update last processing time
245
+ self.state.last_step_time = datetime.now()
246
+
247
  # Return updated state
248
  return self.state.to_dict()
 
 
 
249
 
250
+ # Parse "Starting training step" lines to extract step/total info if not already parsed
251
+ step_match = re.search(r"Starting training step \((\d+)/(\d+)\)", line)
252
+ if step_match:
253
+ current_step = int(step_match.group(1))
254
+ total_steps = int(step_match.group(2))
255
+
256
+ # Only update if we don't already have a value or if this is more recent
257
+ if self.state.total_steps == 0 or current_step > self.state.current_step:
258
+ self.state.current_step = current_step
259
+ self.state.total_steps = total_steps
260
+ self.state.status = "training" # Ensure status is set to training
261
+ logger.info(f"Updated training step: {current_step}/{total_steps}")
262
+ return self.state.to_dict()
263
+
264
  if ("Started training" in line) or ("Starting training" in line):
265
  self.state.status = "training"
 
 
 
 
 
 
266
  if not self.state.start_time:
267
  self.state.start_time = datetime.now()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  return self.state.to_dict()
269
 
270
  # Epoch information
 
 
 
271
  epoch_match = re.search(r"Starting epoch \((\d+)/(\d+)\)", line)
272
  if epoch_match:
273
  self.state.current_epoch = int(epoch_match.group(1))