Tonic committed
Commit c346dad · 1 Parent(s): 97dacc7

coerce all numeric config values to safe values
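The helpers this commit adds (`_as_int` / `_as_float`, see the second diff below) matter when config values arrive as strings, e.g. from YAML files or environment-variable overrides. A quick illustration of their behavior, with `_as_int` copied verbatim from the diff:

    def _as_int(value, default):
        if value is None:
            return int(default)
        try:
            return int(value)
        except Exception:
            return int(default)

    print(_as_int("8", 4))     # 8  -- numeric string is coerced
    print(_as_int(None, 4))    # 4  -- None falls back to the default
    print(_as_int("auto", 4))  # 4  -- unparsable value falls back too

Note that `int("2.5")` also raises, so a float-looking string for an int field falls back to the default rather than being truncated.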

scripts/trackio_tonic/deploy_trackio_space.py CHANGED
@@ -411,8 +411,8 @@ class TrackioSpaceDeployer:
 
         # Wait a bit for the space to build
         import time
-        print("Waiting 180 seconds for Space to build...")
-        time.sleep(180)
+        print("Waiting 120 seconds for Space to build...")
+        time.sleep(120)
 
         # Try to access the space
         response = requests.get(self.space_url, timeout=30)
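The fixed sleep (now 120 seconds instead of 180) is a heuristic, and Space build times vary. A minimal alternative sketch, not part of this commit (`wait_for_space` is a hypothetical helper), would poll the Space URL until it responds or a deadline passes:

    import time
    import requests

    def wait_for_space(url: str, timeout: float = 300, interval: float = 15) -> bool:
        """Poll `url` until it returns HTTP 200 or `timeout` seconds elapse."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                if requests.get(url, timeout=30).status_code == 200:
                    return True
            except requests.RequestException:
                pass  # Space still building or unreachable; keep waiting
            time.sleep(interval)
        return False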
scripts/training/train_gpt_oss.py CHANGED
@@ -345,38 +345,60 @@ def create_sft_config(config, output_dir):
 
     print("Creating enhanced SFT configuration...")
 
-    # Extract training parameters from config with enhanced defaults
-    num_train_epochs = getattr(config, 'num_train_epochs', 1.0)
-    max_steps = getattr(config, 'max_steps', None)
-    warmup_ratio = getattr(config, 'warmup_ratio', 0.03)
-    warmup_steps = getattr(config, 'warmup_steps', None)
+    # Helper coercion utilities to guarantee numeric types
+    def _as_int(value, default):
+        if value is None:
+            return int(default)
+        try:
+            return int(value)
+        except Exception:
+            return int(default)
+
+    def _as_float(value, default):
+        if value is None:
+            return float(default)
+        try:
+            return float(value)
+        except Exception:
+            return float(default)
+
+    # Extract training parameters from config with enhanced defaults and coercion
+    num_train_epochs = _as_float(getattr(config, 'num_train_epochs', 1.0), 1.0)
+    # Transformers expects max_steps default -1 (disabled). Some code compares > 0
+    raw_max_steps = getattr(config, 'max_steps', None)
+    max_steps = _as_int(raw_max_steps if raw_max_steps is not None else -1, -1)
+    warmup_ratio = _as_float(getattr(config, 'warmup_ratio', 0.03), 0.03)
+    # Ensure warmup_steps is an int; default 0 to avoid None comparisons in schedulers
+    warmup_steps = _as_int(getattr(config, 'warmup_steps', 0), 0)
 
     # Learning rate configuration
-    learning_rate = config.learning_rate
+    learning_rate = _as_float(getattr(config, 'learning_rate', 2e-4), 2e-4)
     lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
 
     # Batch configuration
-    per_device_train_batch_size = config.batch_size
-    per_device_eval_batch_size = getattr(config, 'eval_batch_size', config.batch_size)
-    gradient_accumulation_steps = config.gradient_accumulation_steps
+    per_device_train_batch_size = _as_int(getattr(config, 'batch_size', 2), 2)
+    per_device_eval_batch_size = _as_int(getattr(config, 'eval_batch_size', per_device_train_batch_size), per_device_train_batch_size)
+    gradient_accumulation_steps = _as_int(getattr(config, 'gradient_accumulation_steps', 1), 1)
 
     # Evaluation and logging
     eval_strategy = getattr(config, 'eval_strategy', 'steps')
-    eval_steps = getattr(config, 'eval_steps', 100)
-    logging_steps = getattr(config, 'logging_steps', 10)
+    eval_steps = _as_int(getattr(config, 'eval_steps', 100), 100)
+    eval_accumulation_steps = _as_int(getattr(config, 'eval_accumulation_steps', 1), 1)
+    logging_steps = _as_int(getattr(config, 'logging_steps', 10), 10)
 
     # Saving configuration
     save_strategy = getattr(config, 'save_strategy', 'steps')
-    save_steps = getattr(config, 'save_steps', 500)
-    save_total_limit = getattr(config, 'save_total_limit', 3)
+    save_steps = _as_int(getattr(config, 'save_steps', 500), 500)
+    save_total_limit = _as_int(getattr(config, 'save_total_limit', 3), 3)
 
     # Mixed precision
-    fp16 = getattr(config, 'fp16', False)
-    bf16 = getattr(config, 'bf16', True)
+    fp16 = bool(getattr(config, 'fp16', False))
+    bf16 = bool(getattr(config, 'bf16', True))
+    tf32 = bool(getattr(config, 'tf32', False))
 
     # Regularization
-    weight_decay = getattr(config, 'weight_decay', 0.01)
-    max_grad_norm = getattr(config, 'max_grad_norm', 1.0)
+    weight_decay = _as_float(getattr(config, 'weight_decay', 0.01), 0.01)
+    max_grad_norm = _as_float(getattr(config, 'max_grad_norm', 1.0), 1.0)
 
     # HuggingFace Hub integration
     push_to_hub = getattr(config, 'push_to_hub', False)
@@ -406,12 +428,15 @@ def create_sft_config(config, output_dir):
         # Mixed precision
         "fp16": fp16,
         "bf16": bf16,
+        # Some versions support tf32
+        "tf32": tf32 if 'tf32' in TrainingArguments.__init__.__code__.co_varnames else None,
         # Regularization
         "weight_decay": weight_decay,
         "max_grad_norm": max_grad_norm,
         # Evaluation (name may vary across versions)
         "evaluation_strategy": eval_strategy,
         "eval_steps": eval_steps,
+        "eval_accumulation_steps": eval_accumulation_steps,
         # Logging
         "logging_steps": logging_steps,
         # Saving
@@ -421,8 +446,10 @@ def create_sft_config(config, output_dir):
         # Output
         "output_dir": output_dir,
         # Data loading
-        "dataloader_num_workers": getattr(config, 'dataloader_num_workers', 4),
+        "dataloader_num_workers": _as_int(getattr(config, 'dataloader_num_workers', 4), 4),
         "dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),
+        # Optional in some versions
+        "dataloader_prefetch_factor": _as_int(getattr(config, 'dataloader_prefetch_factor', 2), 2),
         # Performance
         "group_by_length": getattr(config, 'group_by_length', True),
         "remove_unused_columns": getattr(config, 'remove_unused_columns', True),
@@ -432,6 +459,9 @@ def create_sft_config(config, output_dir):
         "report_to": ("trackio" if getattr(config, 'enable_tracking', False) else None),
     }
 
+    # Drop any None-valued kwargs
+    ta_kwargs = {k: v for k, v in ta_kwargs.items() if v is not None}
+
     # Adapt to transformers versions where 'evaluation_strategy' was renamed
     try:
         ta_sig = inspect.signature(TrainingArguments.__init__)
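The `co_varnames` membership test for `tf32` and the `inspect.signature` probe for `evaluation_strategy` are two flavors of the same version guard. A generalized sketch of that idea (`filter_supported_kwargs` is a hypothetical helper, and `ta_kwargs` stands in for the dict built above):

    import inspect
    from transformers import TrainingArguments

    def filter_supported_kwargs(cls, kwargs):
        # Keep only the keys this version's __init__ actually accepts.
        supported = set(inspect.signature(cls.__init__).parameters)
        return {k: v for k, v in kwargs.items() if k in supported}

    ta_kwargs = {"output_dir": "out", "bf16": True, "tf32": True}  # example subset
    training_args = TrainingArguments(**filter_supported_kwargs(TrainingArguments, ta_kwargs))

This subsumes both per-key checks: keys unknown to the installed transformers version, such as `tf32` or `dataloader_prefetch_factor` on older releases, are dropped instead of raising a TypeError.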