Spaces:
Running
Running
Commit
·
f1c60d3
1
Parent(s):
b7c0de1
fix
Browse files
vms/ui/app_ui.py
CHANGED
@@ -234,7 +234,7 @@ class AppUI:
|
|
234 |
self.project_tabs["train_tab"].components["num_gpus"],
|
235 |
self.project_tabs["train_tab"].components["precomputation_items"],
|
236 |
self.project_tabs["train_tab"].components["lr_warmup_steps"],
|
237 |
-
self.project_tabs["train_tab"].components["
|
238 |
]
|
239 |
)
|
240 |
|
|
|
234 |
self.project_tabs["train_tab"].components["num_gpus"],
|
235 |
self.project_tabs["train_tab"].components["precomputation_items"],
|
236 |
self.project_tabs["train_tab"].components["lr_warmup_steps"],
|
237 |
+
self.project_tabs["train_tab"].components["auto_resume"]
|
238 |
]
|
239 |
)
|
240 |
|
vms/ui/project/services/training.py
CHANGED
@@ -1078,7 +1078,8 @@ class TrainingService:
|
|
1078 |
"learning_rate": ui_state.get("learning_rate", DEFAULT_LEARNING_RATE),
|
1079 |
"save_iterations": ui_state.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
|
1080 |
"preset_name": ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0]),
|
1081 |
-
"repo_id": "" # Default empty repo ID
|
|
|
1082 |
}
|
1083 |
}
|
1084 |
logger.info("Created default session from UI state for recovery")
|
@@ -1150,7 +1151,7 @@ class TrainingService:
|
|
1150 |
"learning_rate": params.get('learning_rate', DEFAULT_LEARNING_RATE),
|
1151 |
"save_iterations": params.get('save_iterations', DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
|
1152 |
"training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0]),
|
1153 |
-
"
|
1154 |
})
|
1155 |
|
1156 |
# Check if we should auto-recover (immediate restart)
|
|
|
1078 |
"learning_rate": ui_state.get("learning_rate", DEFAULT_LEARNING_RATE),
|
1079 |
"save_iterations": ui_state.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
|
1080 |
"preset_name": ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0]),
|
1081 |
+
"repo_id": "",  # Default empty repo ID
|
1082 |
+
"auto_resume": ui_state.get("auto_resume", DEFAULT_AUTO_RESUME)
|
1083 |
}
|
1084 |
}
|
1085 |
logger.info("Created default session from UI state for recovery")
|
|
|
1151 |
"learning_rate": params.get('learning_rate', DEFAULT_LEARNING_RATE),
|
1152 |
"save_iterations": params.get('save_iterations', DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
|
1153 |
"training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0]),
|
1154 |
+
"auto_resume": params.get("auto_resume", DEFAULT_AUTO_RESUME)
|
1155 |
})
|
1156 |
|
1157 |
# Check if we should auto-recover (immediate restart)
|
vms/ui/project/tabs/train_tab.py
CHANGED
@@ -233,7 +233,7 @@ class TrainTab(BaseTab):
|
|
233 |
)
|
234 |
|
235 |
with gr.Row():
|
236 |
-
self.components["
|
237 |
label="Automatically continue training in case of server reboot.",
|
238 |
value=DEFAULT_AUTO_RESUME,
|
239 |
info="When enabled, training will automatically resume from the latest checkpoint after app restart"
|
@@ -389,9 +389,9 @@ class TrainTab(BaseTab):
|
|
389 |
]
|
390 |
)
|
391 |
|
392 |
-
self.components["
|
393 |
fn=lambda v: self.app.update_ui_state(auto_resume=v),
|
394 |
-
inputs=[self.components["
|
395 |
outputs=[]
|
396 |
)
|
397 |
|
|
|
233 |
)
|
234 |
|
235 |
with gr.Row():
|
236 |
+
self.components["auto_resume"] = gr.Checkbox(
|
237 |
label="Automatically continue training in case of server reboot.",
|
238 |
value=DEFAULT_AUTO_RESUME,
|
239 |
info="When enabled, training will automatically resume from the latest checkpoint after app restart"
|
|
|
389 |
]
|
390 |
)
|
391 |
|
392 |
+
self.components["auto_resume"].change(
|
393 |
fn=lambda v: self.app.update_ui_state(auto_resume=v),
|
394 |
+
inputs=[self.components["auto_resume"]],
|
395 |
outputs=[]
|
396 |
)
|
397 |
|