Spaces:
Running
Running
coerce all numeric config values to safe values
Browse files
scripts/trackio_tonic/deploy_trackio_space.py
CHANGED
@@ -411,8 +411,8 @@ class TrackioSpaceDeployer:
|
|
411 |
|
412 |
# Wait a bit for the space to build
|
413 |
import time
|
414 |
-
print("Waiting
|
415 |
-
time.sleep(
|
416 |
|
417 |
# Try to access the space
|
418 |
response = requests.get(self.space_url, timeout=30)
|
|
|
411 |
|
412 |
# Wait a bit for the space to build
|
413 |
import time
|
414 |
+
print("Waiting 120 seconds for Space to build...")
|
415 |
+
time.sleep(120)
|
416 |
|
417 |
# Try to access the space
|
418 |
response = requests.get(self.space_url, timeout=30)
|
scripts/training/train_gpt_oss.py
CHANGED
@@ -345,38 +345,60 @@ def create_sft_config(config, output_dir):
|
|
345 |
|
346 |
print("Creating enhanced SFT configuration...")
|
347 |
|
348 |
-
#
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
353 |
|
354 |
# Learning rate configuration
|
355 |
-
learning_rate = config
|
356 |
lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
|
357 |
|
358 |
# Batch configuration
|
359 |
-
per_device_train_batch_size = config
|
360 |
-
per_device_eval_batch_size = getattr(config, 'eval_batch_size',
|
361 |
-
gradient_accumulation_steps = config
|
362 |
|
363 |
# Evaluation and logging
|
364 |
eval_strategy = getattr(config, 'eval_strategy', 'steps')
|
365 |
-
eval_steps = getattr(config, 'eval_steps', 100)
|
366 |
-
|
|
|
367 |
|
368 |
# Saving configuration
|
369 |
save_strategy = getattr(config, 'save_strategy', 'steps')
|
370 |
-
save_steps = getattr(config, 'save_steps', 500)
|
371 |
-
save_total_limit = getattr(config, 'save_total_limit', 3)
|
372 |
|
373 |
# Mixed precision
|
374 |
-
fp16 = getattr(config, 'fp16', False)
|
375 |
-
bf16 = getattr(config, 'bf16', True)
|
|
|
376 |
|
377 |
# Regularization
|
378 |
-
weight_decay = getattr(config, 'weight_decay', 0.01)
|
379 |
-
max_grad_norm = getattr(config, 'max_grad_norm', 1.0)
|
380 |
|
381 |
# HuggingFace Hub integration
|
382 |
push_to_hub = getattr(config, 'push_to_hub', False)
|
@@ -406,12 +428,15 @@ def create_sft_config(config, output_dir):
|
|
406 |
# Mixed precision
|
407 |
"fp16": fp16,
|
408 |
"bf16": bf16,
|
|
|
|
|
409 |
# Regularization
|
410 |
"weight_decay": weight_decay,
|
411 |
"max_grad_norm": max_grad_norm,
|
412 |
# Evaluation (name may vary across versions)
|
413 |
"evaluation_strategy": eval_strategy,
|
414 |
"eval_steps": eval_steps,
|
|
|
415 |
# Logging
|
416 |
"logging_steps": logging_steps,
|
417 |
# Saving
|
@@ -421,8 +446,10 @@ def create_sft_config(config, output_dir):
|
|
421 |
# Output
|
422 |
"output_dir": output_dir,
|
423 |
# Data loading
|
424 |
-
"dataloader_num_workers": getattr(config, 'dataloader_num_workers', 4),
|
425 |
"dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),
|
|
|
|
|
426 |
# Performance
|
427 |
"group_by_length": getattr(config, 'group_by_length', True),
|
428 |
"remove_unused_columns": getattr(config, 'remove_unused_columns', True),
|
@@ -432,6 +459,9 @@ def create_sft_config(config, output_dir):
|
|
432 |
"report_to": ("trackio" if getattr(config, 'enable_tracking', False) else None),
|
433 |
}
|
434 |
|
|
|
|
|
|
|
435 |
# Adapt to transformers versions where 'evaluation_strategy' was renamed
|
436 |
try:
|
437 |
ta_sig = inspect.signature(TrainingArguments.__init__)
|
|
|
345 |
|
346 |
print("Creating enhanced SFT configuration...")
|
347 |
|
348 |
+
# Helper coercion utilities to guarantee numeric types
|
349 |
+
def _as_int(value, default):
|
350 |
+
if value is None:
|
351 |
+
return int(default)
|
352 |
+
try:
|
353 |
+
return int(value)
|
354 |
+
except Exception:
|
355 |
+
return int(default)
|
356 |
+
|
357 |
+
def _as_float(value, default):
|
358 |
+
if value is None:
|
359 |
+
return float(default)
|
360 |
+
try:
|
361 |
+
return float(value)
|
362 |
+
except Exception:
|
363 |
+
return float(default)
|
364 |
+
|
365 |
+
# Extract training parameters from config with enhanced defaults and coercion
|
366 |
+
num_train_epochs = _as_float(getattr(config, 'num_train_epochs', 1.0), 1.0)
|
367 |
+
# Transformers expects max_steps default -1 (disabled). Some code compares > 0
|
368 |
+
raw_max_steps = getattr(config, 'max_steps', None)
|
369 |
+
max_steps = _as_int(raw_max_steps if raw_max_steps is not None else -1, -1)
|
370 |
+
warmup_ratio = _as_float(getattr(config, 'warmup_ratio', 0.03), 0.03)
|
371 |
+
# Ensure warmup_steps is an int; default 0 to avoid None comparisons in schedulers
|
372 |
+
warmup_steps = _as_int(getattr(config, 'warmup_steps', 0), 0)
|
373 |
|
374 |
# Learning rate configuration
|
375 |
+
learning_rate = _as_float(getattr(config, 'learning_rate', 2e-4), 2e-4)
|
376 |
lr_scheduler_type = getattr(config, 'scheduler', 'cosine_with_min_lr')
|
377 |
|
378 |
# Batch configuration
|
379 |
+
per_device_train_batch_size = _as_int(getattr(config, 'batch_size', 2), 2)
|
380 |
+
per_device_eval_batch_size = _as_int(getattr(config, 'eval_batch_size', per_device_train_batch_size), per_device_train_batch_size)
|
381 |
+
gradient_accumulation_steps = _as_int(getattr(config, 'gradient_accumulation_steps', 1), 1)
|
382 |
|
383 |
# Evaluation and logging
|
384 |
eval_strategy = getattr(config, 'eval_strategy', 'steps')
|
385 |
+
eval_steps = _as_int(getattr(config, 'eval_steps', 100), 100)
|
386 |
+
eval_accumulation_steps = _as_int(getattr(config, 'eval_accumulation_steps', 1), 1)
|
387 |
+
logging_steps = _as_int(getattr(config, 'logging_steps', 10), 10)
|
388 |
|
389 |
# Saving configuration
|
390 |
save_strategy = getattr(config, 'save_strategy', 'steps')
|
391 |
+
save_steps = _as_int(getattr(config, 'save_steps', 500), 500)
|
392 |
+
save_total_limit = _as_int(getattr(config, 'save_total_limit', 3), 3)
|
393 |
|
394 |
# Mixed precision
|
395 |
+
fp16 = bool(getattr(config, 'fp16', False))
|
396 |
+
bf16 = bool(getattr(config, 'bf16', True))
|
397 |
+
tf32 = bool(getattr(config, 'tf32', False))
|
398 |
|
399 |
# Regularization
|
400 |
+
weight_decay = _as_float(getattr(config, 'weight_decay', 0.01), 0.01)
|
401 |
+
max_grad_norm = _as_float(getattr(config, 'max_grad_norm', 1.0), 1.0)
|
402 |
|
403 |
# HuggingFace Hub integration
|
404 |
push_to_hub = getattr(config, 'push_to_hub', False)
|
|
|
428 |
# Mixed precision
|
429 |
"fp16": fp16,
|
430 |
"bf16": bf16,
|
431 |
+
# Some versions support tf32
|
432 |
+
"tf32": tf32 if 'tf32' in TrainingArguments.__init__.__code__.co_varnames else None,
|
433 |
# Regularization
|
434 |
"weight_decay": weight_decay,
|
435 |
"max_grad_norm": max_grad_norm,
|
436 |
# Evaluation (name may vary across versions)
|
437 |
"evaluation_strategy": eval_strategy,
|
438 |
"eval_steps": eval_steps,
|
439 |
+
"eval_accumulation_steps": eval_accumulation_steps,
|
440 |
# Logging
|
441 |
"logging_steps": logging_steps,
|
442 |
# Saving
|
|
|
446 |
# Output
|
447 |
"output_dir": output_dir,
|
448 |
# Data loading
|
449 |
+
"dataloader_num_workers": _as_int(getattr(config, 'dataloader_num_workers', 4), 4),
|
450 |
"dataloader_pin_memory": getattr(config, 'dataloader_pin_memory', True),
|
451 |
+
# Optional in some versions
|
452 |
+
"dataloader_prefetch_factor": _as_int(getattr(config, 'dataloader_prefetch_factor', 2), 2),
|
453 |
# Performance
|
454 |
"group_by_length": getattr(config, 'group_by_length', True),
|
455 |
"remove_unused_columns": getattr(config, 'remove_unused_columns', True),
|
|
|
459 |
"report_to": ("trackio" if getattr(config, 'enable_tracking', False) else None),
|
460 |
}
|
461 |
|
462 |
+
# Drop any None-valued kwargs
|
463 |
+
ta_kwargs = {k: v for k, v in ta_kwargs.items() if v is not None}
|
464 |
+
|
465 |
# Adapt to transformers versions where 'evaluation_strategy' was renamed
|
466 |
try:
|
467 |
ta_sig = inspect.signature(TrainingArguments.__init__)
|