jbilcke-hf (HF Staff) committed
Commit c6546ad · 1 Parent(s): 38cfbff

cleaning code

vms/config.py CHANGED
@@ -58,9 +58,9 @@ JPEG_QUALITY = int(os.environ.get('JPEG_QUALITY', '97'))
 
 # Expanded model types to include Wan-2.1-T2V
 MODEL_TYPES = {
-    "HunyuanVideo (LoRA)": "hunyuan_video",
-    "LTX-Video (LoRA)": "ltx_video",
-    "Wan-2.1-T2V (LoRA)": "wan"
+    "HunyuanVideo": "hunyuan_video",
+    "LTX-Video": "ltx_video",
+    "Wan-2.1-T2V": "wan"
 }
 
 # Training types
@@ -69,6 +69,23 @@ TRAINING_TYPES = {
     "Full Finetune": "full-finetune"
 }
 
+DEFAULT_SEED = 42
+
+DEFAULT_NB_TRAINING_STEPS = 1000
+
+DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS = 200
+
+DEFAULT_LORA_RANK = 128
+DEFAULT_LORA_RANK_STR = str(DEFAULT_LORA_RANK)
+
+DEFAULT_LORA_ALPHA = 128
+DEFAULT_LORA_ALPHA_STR = str(DEFAULT_LORA_ALPHA)
+
+DEFAULT_CAPTION_DROPOUT_P = 0.05
+
+DEFAULT_BATCH_SIZE = 1
+
+DEFAULT_LEARNING_RATE = 3e-5
 
 # it is best to use resolutions that are powers of 8
 # The resolution should be divisible by 32
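Editor's note on the constants above: Gradio dropdowns traffic in strings while the training config wants integers, so the rank/alpha defaults exist in both forms, derived from one numeric source of truth. A minimal sketch of the round-trip (the conversion helper is illustrative, not part of the commit):

```python
DEFAULT_LORA_RANK = 128
DEFAULT_LORA_RANK_STR = str(DEFAULT_LORA_RANK)  # what the UI dropdown stores

def lora_rank_from_ui(value: str) -> int:
    """Hypothetical helper: convert the dropdown string back to an int."""
    return int(value)

assert lora_rank_from_ui(DEFAULT_LORA_RANK_STR) == DEFAULT_LORA_RANK
```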
@@ -87,39 +104,49 @@ MEDIUM_19_9_RATIO_HEIGHT = 512 # 32 * 16
 NB_FRAMES_1 = 1 # 1
 NB_FRAMES_9 = 8 + 1 # 8 + 1
 NB_FRAMES_17 = 8 * 2 + 1 # 16 + 1
-NB_FRAMES_32 = 8 * 4 + 1 # 32 + 1
-NB_FRAMES_48 = 8 * 6 + 1 # 48 + 1
-NB_FRAMES_64 = 8 * 8 + 1 # 64 + 1
-NB_FRAMES_80 = 8 * 10 + 1 # 80 + 1
-NB_FRAMES_96 = 8 * 12 + 1 # 96 + 1
-NB_FRAMES_112 = 8 * 14 + 1 # 112 + 1
-NB_FRAMES_128 = 8 * 16 + 1 # 128 + 1
-NB_FRAMES_144 = 8 * 18 + 1 # 144 + 1
-NB_FRAMES_160 = 8 * 20 + 1 # 160 + 1
-NB_FRAMES_176 = 8 * 22 + 1 # 176 + 1
-NB_FRAMES_192 = 8 * 24 + 1 # 192 + 1
-NB_FRAMES_224 = 8 * 28 + 1 # 224 + 1
-NB_FRAMES_256 = 8 * 32 + 1 # 256 + 1
+NB_FRAMES_33 = 8 * 4 + 1 # 32 + 1
+NB_FRAMES_49 = 8 * 6 + 1 # 48 + 1
+NB_FRAMES_65 = 8 * 8 + 1 # 64 + 1
+NB_FRAMES_81 = 8 * 10 + 1 # 80 + 1
+NB_FRAMES_97 = 8 * 12 + 1 # 96 + 1
+NB_FRAMES_113 = 8 * 14 + 1 # 112 + 1
+NB_FRAMES_129 = 8 * 16 + 1 # 128 + 1
+NB_FRAMES_145 = 8 * 18 + 1 # 144 + 1
+NB_FRAMES_161 = 8 * 20 + 1 # 160 + 1
+NB_FRAMES_177 = 8 * 22 + 1 # 176 + 1
+NB_FRAMES_193 = 8 * 24 + 1 # 192 + 1
+NB_FRAMES_225 = 8 * 28 + 1 # 224 + 1
+NB_FRAMES_257 = 8 * 32 + 1 # 256 + 1
 # 256 isn't a lot by the way, especially with 60 FPS videos..
 # can we crank it and put more frames in here?
 
+NB_FRAMES_273 = 8 * 34 + 1 # 272 + 1
+NB_FRAMES_289 = 8 * 36 + 1 # 288 + 1
+NB_FRAMES_305 = 8 * 38 + 1 # 304 + 1
+NB_FRAMES_321 = 8 * 40 + 1 # 320 + 1
+NB_FRAMES_337 = 8 * 42 + 1 # 336 + 1
+NB_FRAMES_353 = 8 * 44 + 1 # 352 + 1
+NB_FRAMES_369 = 8 * 46 + 1 # 368 + 1
+NB_FRAMES_385 = 8 * 48 + 1 # 384 + 1
+NB_FRAMES_401 = 8 * 50 + 1 # 400 + 1
+
 SMALL_TRAINING_BUCKETS = [
     (NB_FRAMES_1, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 1
     (NB_FRAMES_9, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 8 + 1
     (NB_FRAMES_17, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 16 + 1
-    (NB_FRAMES_32, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 32 + 1
-    (NB_FRAMES_48, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 48 + 1
-    (NB_FRAMES_64, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 64 + 1
-    (NB_FRAMES_80, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 80 + 1
-    (NB_FRAMES_96, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 96 + 1
-    (NB_FRAMES_112, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 112 + 1
-    (NB_FRAMES_128, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 128 + 1
-    (NB_FRAMES_144, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 144 + 1
-    (NB_FRAMES_160, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 160 + 1
-    (NB_FRAMES_176, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 176 + 1
-    (NB_FRAMES_192, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 192 + 1
-    (NB_FRAMES_224, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 224 + 1
-    (NB_FRAMES_256, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 256 + 1
+    (NB_FRAMES_33, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 32 + 1
+    (NB_FRAMES_49, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 48 + 1
+    (NB_FRAMES_65, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 64 + 1
+    (NB_FRAMES_81, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 80 + 1
+    (NB_FRAMES_97, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 96 + 1
+    (NB_FRAMES_113, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 112 + 1
+    (NB_FRAMES_129, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 128 + 1
+    (NB_FRAMES_145, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 144 + 1
+    (NB_FRAMES_161, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 160 + 1
+    (NB_FRAMES_177, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 176 + 1
+    (NB_FRAMES_193, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 192 + 1
+    (NB_FRAMES_225, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 224 + 1
+    (NB_FRAMES_257, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 256 + 1
 ]
 
 MEDIUM_19_9_RATIO_WIDTH = 928 # 32 * 29
@@ -129,19 +156,19 @@ MEDIUM_19_9_RATIO_BUCKETS = [
     (NB_FRAMES_1, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 1
     (NB_FRAMES_9, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 8 + 1
     (NB_FRAMES_17, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 16 + 1
-    (NB_FRAMES_32, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 32 + 1
-    (NB_FRAMES_48, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 48 + 1
-    (NB_FRAMES_64, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 64 + 1
-    (NB_FRAMES_80, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 80 + 1
-    (NB_FRAMES_96, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 96 + 1
-    (NB_FRAMES_112, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 112 + 1
-    (NB_FRAMES_128, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 128 + 1
-    (NB_FRAMES_144, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 144 + 1
-    (NB_FRAMES_160, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 160 + 1
-    (NB_FRAMES_176, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 176 + 1
-    (NB_FRAMES_192, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 192 + 1
-    (NB_FRAMES_224, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 224 + 1
-    (NB_FRAMES_256, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 256 + 1
+    (NB_FRAMES_33, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 32 + 1
+    (NB_FRAMES_49, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 48 + 1
+    (NB_FRAMES_65, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 64 + 1
+    (NB_FRAMES_81, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 80 + 1
+    (NB_FRAMES_97, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 96 + 1
+    (NB_FRAMES_113, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 112 + 1
+    (NB_FRAMES_129, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 128 + 1
+    (NB_FRAMES_145, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 144 + 1
+    (NB_FRAMES_161, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 160 + 1
+    (NB_FRAMES_177, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 176 + 1
+    (NB_FRAMES_193, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 192 + 1
+    (NB_FRAMES_225, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 224 + 1
+    (NB_FRAMES_257, MEDIUM_19_9_RATIO_HEIGHT, MEDIUM_19_9_RATIO_WIDTH), # 256 + 1
 ]
 
 # Updated training presets to include Wan-2.1-T2V and support both LoRA and full-finetune
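Editor's note: the renames above (NB_FRAMES_32 to NB_FRAMES_33, and so on) make each name match its actual value. The comments in config.py imply a causal video VAE with 8x temporal compression, so valid clip lengths have the form 8k + 1: one leading frame plus k groups of 8. A small sketch of that arithmetic, under that factor-8 assumption:

```python
def valid_frame_count(k: int) -> int:
    """Pixel frames decoded from 1 + k latent frames under 8x temporal
    compression: 1, 9, 17, 33, ..."""
    return 8 * k + 1

def snap_down_to_bucket(n_frames: int) -> int:
    """Largest valid clip length <= n_frames, e.g. for trimming a video."""
    return 8 * ((n_frames - 1) // 8) + 1

assert [valid_frame_count(k) for k in (0, 1, 2, 4)] == [1, 9, 17, 33]
assert snap_down_to_bucket(50) == 49  # a 50-frame clip trims to 49
```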
@@ -149,24 +176,24 @@ TRAINING_PRESETS = {
     "HunyuanVideo (normal)": {
         "model_type": "hunyuan_video",
         "training_type": "lora",
-        "lora_rank": "128",
-        "lora_alpha": "128",
-        "num_epochs": 70,
-        "batch_size": 1,
+        "lora_rank": DEFAULT_LORA_RANK_STR,
+        "lora_alpha": DEFAULT_LORA_ALPHA_STR,
+        "train_steps": DEFAULT_NB_TRAINING_STEPS,
+        "batch_size": DEFAULT_BATCH_SIZE,
         "learning_rate": 2e-5,
-        "save_iterations": 500,
+        "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
         "training_buckets": SMALL_TRAINING_BUCKETS,
         "flow_weighting_scheme": "none"
     },
     "LTX-Video (normal)": {
         "model_type": "ltx_video",
         "training_type": "lora",
-        "lora_rank": "128",
-        "lora_alpha": "128",
-        "num_epochs": 70,
-        "batch_size": 1,
-        "learning_rate": 3e-5,
-        "save_iterations": 500,
+        "lora_rank": DEFAULT_LORA_RANK_STR,
+        "lora_alpha": DEFAULT_LORA_ALPHA_STR,
+        "train_steps": DEFAULT_NB_TRAINING_STEPS,
+        "batch_size": DEFAULT_BATCH_SIZE,
+        "learning_rate": DEFAULT_LEARNING_RATE,
+        "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
         "training_buckets": SMALL_TRAINING_BUCKETS,
         "flow_weighting_scheme": "logit_normal"
     },
@@ -174,21 +201,21 @@ TRAINING_PRESETS = {
         "model_type": "ltx_video",
         "training_type": "lora",
         "lora_rank": "256",
-        "lora_alpha": "128",
-        "num_epochs": 50,
-        "batch_size": 1,
-        "learning_rate": 3e-5,
-        "save_iterations": 200,
+        "lora_alpha": DEFAULT_LORA_ALPHA_STR,
+        "train_steps": DEFAULT_NB_TRAINING_STEPS,
+        "batch_size": DEFAULT_BATCH_SIZE,
+        "learning_rate": DEFAULT_LEARNING_RATE,
+        "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
         "training_buckets": MEDIUM_19_9_RATIO_BUCKETS,
         "flow_weighting_scheme": "logit_normal"
     },
     "LTX-Video (Full Finetune)": {
         "model_type": "ltx_video",
         "training_type": "full-finetune",
-        "num_epochs": 30,
-        "batch_size": 1,
-        "learning_rate": 1e-5,
-        "save_iterations": 300,
+        "train_steps": DEFAULT_NB_TRAINING_STEPS,
+        "batch_size": DEFAULT_BATCH_SIZE,
+        "learning_rate": DEFAULT_LEARNING_RATE,
+        "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
         "training_buckets": SMALL_TRAINING_BUCKETS,
         "flow_weighting_scheme": "logit_normal"
     },
@@ -197,10 +224,10 @@ TRAINING_PRESETS = {
         "training_type": "lora",
         "lora_rank": "32",
         "lora_alpha": "32",
-        "num_epochs": 70,
-        "batch_size": 1,
+        "train_steps": DEFAULT_NB_TRAINING_STEPS,
+        "batch_size": DEFAULT_BATCH_SIZE,
         "learning_rate": 5e-5,
-        "save_iterations": 500,
+        "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
         "training_buckets": SMALL_TRAINING_BUCKETS,
         "flow_weighting_scheme": "logit_normal"
     },
@@ -209,10 +236,10 @@ TRAINING_PRESETS = {
         "training_type": "lora",
         "lora_rank": "64",
         "lora_alpha": "64",
-        "num_epochs": 50,
-        "batch_size": 1,
-        "learning_rate": 3e-5,
-        "save_iterations": 200,
+        "train_steps": DEFAULT_NB_TRAINING_STEPS,
+        "batch_size": DEFAULT_BATCH_SIZE,
+        "learning_rate": DEFAULT_LEARNING_RATE,
+        "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
         "training_buckets": MEDIUM_19_9_RATIO_BUCKETS,
         "flow_weighting_scheme": "logit_normal"
     }
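For orientation, each preset above is a flat dict of UI defaults plus a bucket list; nothing interprets it until the UI and TrainingService.start_training read individual keys. A hedged sketch of a lookup (resolve_preset is illustrative and not part of the commit):

```python
from typing import Any, Dict, Tuple

def resolve_preset(presets: Dict[str, Dict[str, Any]], name: str) -> Tuple[Dict[str, Any], list]:
    """Split a preset into UI-facing fields and the resolution buckets
    that go straight into the training config."""
    preset = dict(presets[name])            # copy so callers can't mutate it
    buckets = preset.pop("training_buckets")
    return preset, buckets

# e.g. ui_fields, buckets = resolve_preset(TRAINING_PRESETS, "LTX-Video (normal)")
```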
@@ -244,7 +271,7 @@ class TrainingConfig:
     id_token: Optional[str] = None
     video_resolution_buckets: List[Tuple[int, int, int]] = field(default_factory=lambda: SMALL_TRAINING_BUCKETS)
     video_reshape_mode: str = "center"
-    caption_dropout_p: float = 0.05
+    caption_dropout_p: float = DEFAULT_CAPTION_DROPOUT_P
     caption_dropout_technique: str = "empty"
     precompute_conditions: bool = False
 
@@ -257,16 +284,16 @@ class TrainingConfig:
 
     # Training arguments
     training_type: str = "lora"
-    seed: int = 42
+    seed: int = DEFAULT_SEED
     mixed_precision: str = "bf16"
     batch_size: int = 1
-    train_epochs: int = 70
-    lora_rank: int = 128
-    lora_alpha: int = 128
+    train_steps: int = DEFAULT_NB_TRAINING_STEPS
+    lora_rank: int = DEFAULT_LORA_RANK
+    lora_alpha: int = DEFAULT_LORA_ALPHA
     target_modules: List[str] = field(default_factory=lambda: ["to_q", "to_k", "to_v", "to_out.0"])
     gradient_accumulation_steps: int = 1
    gradient_checkpointing: bool = True
-    checkpointing_steps: int = 500
+    checkpointing_steps: int = DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS
     checkpointing_limit: Optional[int] = 2
     resume_from_checkpoint: Optional[str] = None
     enable_slicing: bool = True
@@ -300,15 +327,15 @@ class TrainingConfig:
             data_root=data_path,
             output_dir=output_path,
             batch_size=1,
-            train_epochs=70,
+            train_steps=DEFAULT_NB_TRAINING_STEPS,
             lr=2e-5,
             gradient_checkpointing=True,
             id_token="afkx",
             gradient_accumulation_steps=1,
-            lora_rank=128,
-            lora_alpha=128,
+            lora_rank=DEFAULT_LORA_RANK,
+            lora_alpha=DEFAULT_LORA_ALPHA,
             video_resolution_buckets=buckets or SMALL_TRAINING_BUCKETS,
-            caption_dropout_p=0.05,
+            caption_dropout_p=DEFAULT_CAPTION_DROPOUT_P,
             flow_weighting_scheme="none", # Hunyuan specific
             training_type="lora"
         )
@@ -322,15 +349,15 @@ class TrainingConfig:
             data_root=data_path,
             output_dir=output_path,
             batch_size=1,
-            train_epochs=40,
-            lr=3e-5,
+            train_steps=DEFAULT_NB_TRAINING_STEPS,
+            lr=DEFAULT_LEARNING_RATE,
             gradient_checkpointing=True,
             id_token="BW_STYLE",
             gradient_accumulation_steps=4,
-            lora_rank=128,
-            lora_alpha=128,
+            lora_rank=DEFAULT_LORA_RANK,
+            lora_alpha=DEFAULT_LORA_ALPHA,
             video_resolution_buckets=buckets or SMALL_TRAINING_BUCKETS,
-            caption_dropout_p=0.05,
+            caption_dropout_p=DEFAULT_CAPTION_DROPOUT_P,
             flow_weighting_scheme="logit_normal", # LTX specific
             training_type="lora"
         )
@@ -344,13 +371,13 @@ class TrainingConfig:
             data_root=data_path,
             output_dir=output_path,
             batch_size=1,
-            train_epochs=30,
+            train_steps=DEFAULT_NB_TRAINING_STEPS,
             lr=1e-5,
             gradient_checkpointing=True,
             id_token="BW_STYLE",
             gradient_accumulation_steps=1,
             video_resolution_buckets=buckets or SMALL_TRAINING_BUCKETS,
-            caption_dropout_p=0.05,
+            caption_dropout_p=DEFAULT_CAPTION_DROPOUT_P,
             flow_weighting_scheme="logit_normal", # LTX specific
             training_type="full-finetune"
         )
@@ -364,7 +391,7 @@ class TrainingConfig:
             data_root=data_path,
             output_dir=output_path,
             batch_size=1,
-            train_epochs=70,
+            train_steps=DEFAULT_NB_TRAINING_STEPS,
             lr=5e-5,
             gradient_checkpointing=True,
             id_token=None, # Default is no ID token for Wan
@@ -373,7 +400,7 @@ class TrainingConfig:
             lora_alpha=32,
             target_modules=["blocks.*(to_q|to_k|to_v|to_out.0)"], # Wan-specific target modules
             video_resolution_buckets=buckets or SMALL_TRAINING_BUCKETS,
-            caption_dropout_p=0.05,
+            caption_dropout_p=DEFAULT_CAPTION_DROPOUT_P,
             flow_weighting_scheme="logit_normal", # Wan specific
             training_type="lora"
         )
@@ -428,7 +455,7 @@ class TrainingConfig:
         #args.extend(["--mixed_precision", self.mixed_precision])
 
         args.extend(["--batch_size", str(self.batch_size)])
-        args.extend(["--train_steps", str(self.train_epochs * 1000)]) # Convert epochs to steps for compatibility
+        args.extend(["--train_steps", str(self.train_steps)])
 
         # LoRA specific arguments
         if self.training_type == "lora":
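The last hunk is the behavioral core of the commit: the CLI used to receive a synthetic `train_epochs * 1000` step count while the UI labeled the field "epochs"; now the user-specified step count is passed through verbatim. For anyone who still thinks in epochs, the honest conversion is steps-per-epoch times epochs (a sketch, not code from the repo):

```python
import math

def epochs_to_steps(num_epochs: int, num_videos: int, batch_size: int) -> int:
    """One epoch = one pass over the dataset."""
    steps_per_epoch = math.ceil(max(1, num_videos) / batch_size)
    return num_epochs * steps_per_epoch

# 70 epochs over 100 videos at batch size 1 is 7,000 steps -- not the
# 70 * 1000 = 70,000 the old `--train_steps` conversion would have requested.
assert epochs_to_steps(70, 100, 1) == 7000
```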
 
vms/services/trainer.py CHANGED
@@ -23,7 +23,12 @@ from huggingface_hub import upload_folder, create_repo
 from ..config import (
     TrainingConfig, TRAINING_PRESETS, LOG_FILE_PATH, TRAINING_VIDEOS_PATH,
     STORAGE_PATH, TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN,
-    MODEL_TYPES, TRAINING_TYPES
+    MODEL_TYPES, TRAINING_TYPES,
+    DEFAULT_NB_TRAINING_STEPS, DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+    DEFAULT_BATCH_SIZE, DEFAULT_CAPTION_DROPOUT_P,
+    DEFAULT_LEARNING_RATE, DEFAULT_SEED,
+    DEFAULT_LORA_RANK, DEFAULT_LORA_ALPHA,
+    DEFAULT_LORA_RANK_STR, DEFAULT_LORA_ALPHA_STR
 )
 from ..utils import make_archive, parse_training_log, is_image_file, is_video_file, prepare_finetrainers_dataset, copy_files_to_training_dir
 
@@ -111,18 +116,19 @@ class TrainingService:
         except Exception as e:
             logger.error(f"Error saving UI state: {str(e)}")
 
+    # Additional fix for the load_ui_state method in trainer.py to clean up old values
     def load_ui_state(self) -> Dict[str, Any]:
         """Load saved UI state"""
         ui_state_file = OUTPUT_PATH / "ui_state.json"
         default_state = {
             "model_type": list(MODEL_TYPES.keys())[0],
             "training_type": list(TRAINING_TYPES.keys())[0],
-            "lora_rank": "128",
-            "lora_alpha": "128",
-            "num_epochs": 50,
-            "batch_size": 1,
-            "learning_rate": 3e-5,
-            "save_iterations": 200,
+            "lora_rank": DEFAULT_LORA_RANK_STR,
+            "lora_alpha": DEFAULT_LORA_ALPHA_STR,
+            "train_steps": DEFAULT_NB_TRAINING_STEPS,
+            "batch_size": DEFAULT_BATCH_SIZE,
+            "learning_rate": DEFAULT_LEARNING_RATE,
+            "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
             "training_preset": list(TRAINING_PRESETS.keys())[0]
         }
 
@@ -145,9 +151,14 @@ class TrainingService:
 
             saved_state = json.loads(file_content)
 
+            # Clean up model type if it contains " (LoRA)" suffix
+            if "model_type" in saved_state and " (LoRA)" in saved_state["model_type"]:
+                saved_state["model_type"] = saved_state["model_type"].replace(" (LoRA)", "")
+                logger.info(f"Removed (LoRA) suffix from saved model type: {saved_state['model_type']}")
+
             # Convert numeric values to appropriate types
-            if "num_epochs" in saved_state:
-                saved_state["num_epochs"] = int(saved_state["num_epochs"])
+            if "train_steps" in saved_state:
+                saved_state["train_steps"] = int(saved_state["train_steps"])
             if "batch_size" in saved_state:
                 saved_state["batch_size"] = int(saved_state["batch_size"])
             if "learning_rate" in saved_state:
@@ -158,6 +169,40 @@ class TrainingService:
             # Make sure we have all keys (in case structure changed)
             merged_state = default_state.copy()
             merged_state.update(saved_state)
+
+            # Validate model_type is in available choices
+            if merged_state["model_type"] not in MODEL_TYPES:
+                # Try to map from internal name
+                model_found = False
+                for display_name, internal_name in MODEL_TYPES.items():
+                    if internal_name == merged_state["model_type"]:
+                        merged_state["model_type"] = display_name
+                        model_found = True
+                        break
+                # If still not found, use default
+                if not model_found:
+                    merged_state["model_type"] = default_state["model_type"]
+                    logger.warning(f"Invalid model type in saved state, using default")
+
+            # Validate training_type is in available choices
+            if merged_state["training_type"] not in TRAINING_TYPES:
+                # Try to map from internal name
+                training_found = False
+                for display_name, internal_name in TRAINING_TYPES.items():
+                    if internal_name == merged_state["training_type"]:
+                        merged_state["training_type"] = display_name
+                        training_found = True
+                        break
+                # If still not found, use default
+                if not training_found:
+                    merged_state["training_type"] = default_state["training_type"]
+                    logger.warning(f"Invalid training type in saved state, using default")
+
+            # Validate training_preset is in available choices
+            if merged_state["training_preset"] not in TRAINING_PRESETS:
+                merged_state["training_preset"] = default_state["training_preset"]
+                logger.warning(f"Invalid training preset in saved state, using default")
+
             return merged_state
         except json.JSONDecodeError as e:
             logger.error(f"Error parsing UI state JSON: {str(e)}")
@@ -176,12 +221,12 @@ class TrainingService:
             default_state = {
                 "model_type": list(MODEL_TYPES.keys())[0],
                 "training_type": list(TRAINING_TYPES.keys())[0],
-                "lora_rank": "128",
-                "lora_alpha": "128",
-                "num_epochs": 50,
-                "batch_size": 1,
-                "learning_rate": 3e-5,
-                "save_iterations": 200,
+                "lora_rank": DEFAULT_LORA_RANK_STR,
+                "lora_alpha": DEFAULT_LORA_ALPHA_STR,
+                "train_steps": DEFAULT_NB_TRAINING_STEPS,
+                "batch_size": DEFAULT_BATCH_SIZE,
+                "learning_rate": DEFAULT_LEARNING_RATE,
+                "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
                 "training_preset": list(TRAINING_PRESETS.keys())[0]
             }
             self.save_ui_state(default_state)
@@ -209,12 +254,12 @@ class TrainingService:
             default_state = {
                 "model_type": list(MODEL_TYPES.keys())[0],
                 "training_type": list(TRAINING_TYPES.keys())[0],
-                "lora_rank": "128",
-                "lora_alpha": "128",
-                "num_epochs": 50,
-                "batch_size": 1,
-                "learning_rate": 3e-5,
-                "save_iterations": 200,
+                "lora_rank": DEFAULT_LORA_RANK_STR,
+                "lora_alpha": DEFAULT_LORA_ALPHA_STR,
+                "train_steps": DEFAULT_NB_TRAINING_STEPS,
+                "batch_size": DEFAULT_BATCH_SIZE,
+                "learning_rate": DEFAULT_LEARNING_RATE,
+                "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
                 "training_preset": list(TRAINING_PRESETS.keys())[0]
             }
             self.save_ui_state(default_state)
@@ -361,7 +406,7 @@ class TrainingService:
         model_type: str,
         lora_rank: str,
         lora_alpha: str,
-        num_epochs: int,
+        train_steps: int,
         batch_size: int,
         learning_rate: float,
         save_iterations: int,
@@ -508,7 +553,7 @@ class TrainingService:
             return error_msg, "Unsupported model"
 
         # Update with UI parameters
-        config.train_epochs = int(num_epochs)
+        config.train_steps = int(train_steps)
         config.batch_size = int(batch_size)
         config.lr = float(learning_rate)
         config.checkpointing_steps = int(save_iterations)
@@ -530,11 +575,11 @@ class TrainingService:
 
         # Common settings for both models
         config.mixed_precision = "bf16"
-        config.seed = 42
+        config.seed = DEFAULT_SEED
         config.gradient_checkpointing = True
         config.enable_slicing = True
         config.enable_tiling = True
-        config.caption_dropout_p = 0.05
+        config.caption_dropout_p = DEFAULT_CAPTION_DROPOUT_P
 
         validation_error = self.validate_training_config(config, model_type)
         if validation_error:
@@ -626,7 +671,7 @@ class TrainingService:
             "training_type": training_type,
             "lora_rank": lora_rank,
             "lora_alpha": lora_alpha,
-            "num_epochs": num_epochs,
+            "train_steps": train_steps,
             "batch_size": batch_size,
             "learning_rate": learning_rate,
             "save_iterations": save_iterations,
@@ -635,14 +680,12 @@ class TrainingService:
         })
 
         # Update initial training status
-        total_steps = num_epochs * (max(1, video_count) // batch_size)
+        total_steps = int(train_steps)
         self.save_status(
             state='training',
-            epoch=0,
             step=0,
             total_steps=total_steps,
             loss=0.0,
-            total_epochs=num_epochs,
             message='Training started',
             repo_id=repo_id,
             model_type=model_type,
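With epochs gone, the reported total is simply the requested step count, and the checkpoint cadence follows from the defaults introduced in config.py: 1000 steps with a save every 200 steps writes five checkpoints, of which `checkpointing_limit = 2` keeps only the most recent two. The arithmetic:

```python
DEFAULT_NB_TRAINING_STEPS = 1000
DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS = 200
CHECKPOINTING_LIMIT = 2  # from TrainingConfig.checkpointing_limit

written = DEFAULT_NB_TRAINING_STEPS // DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS
kept = min(written, CHECKPOINTING_LIMIT)
assert (written, kept) == (5, 2)
```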
@@ -789,12 +832,12 @@ class TrainingService:
             "params": {
                 "model_type": MODEL_TYPES.get(ui_state.get("model_type", list(MODEL_TYPES.keys())[0])),
                 "training_type": TRAINING_TYPES.get(ui_state.get("training_type", list(TRAINING_TYPES.keys())[0])),
-                "lora_rank": ui_state.get("lora_rank", "128"),
-                "lora_alpha": ui_state.get("lora_alpha", "128"),
-                "num_epochs": ui_state.get("num_epochs", 70),
-                "batch_size": ui_state.get("batch_size", 1),
-                "learning_rate": ui_state.get("learning_rate", 3e-5),
-                "save_iterations": ui_state.get("save_iterations", 500),
+                "lora_rank": ui_state.get("lora_rank", DEFAULT_LORA_RANK_STR),
+                "lora_alpha": ui_state.get("lora_alpha", DEFAULT_LORA_ALPHA_STR),
+                "train_steps": ui_state.get("train_steps", DEFAULT_NB_TRAINING_STEPS),
+                "batch_size": ui_state.get("batch_size", DEFAULT_BATCH_SIZE),
+                "learning_rate": ui_state.get("learning_rate", DEFAULT_LEARNING_RATE),
+                "save_iterations": ui_state.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
                 "preset_name": ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0]),
                 "repo_id": "" # Default empty repo ID
             }
@@ -853,12 +896,12 @@ class TrainingService:
             ui_updates.update({
                 "model_type": model_type_display, # Use the display name for the UI dropdown
                 "training_type": training_type_display, # Use the display name for training type
-                "lora_rank": params.get('lora_rank', "128"),
-                "lora_alpha": params.get('lora_alpha', "128"),
-                "num_epochs": params.get('num_epochs', 70),
-                "batch_size": params.get('batch_size', 1),
-                "learning_rate": params.get('learning_rate', 3e-5),
-                "save_iterations": params.get('save_iterations', 500),
+                "lora_rank": params.get('lora_rank', DEFAULT_LORA_RANK_STR),
+                "lora_alpha": params.get('lora_alpha', DEFAULT_LORA_ALPHA_STR),
+                "train_steps": params.get('train_steps', DEFAULT_NB_TRAINING_STEPS),
+                "batch_size": params.get('batch_size', DEFAULT_BATCH_SIZE),
+                "learning_rate": params.get('learning_rate', DEFAULT_LEARNING_RATE),
+                "save_iterations": params.get('save_iterations', DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
                 "training_preset": params.get('preset_name', list(TRAINING_PRESETS.keys())[0])
             })
 
@@ -872,12 +915,12 @@ class TrainingService:
             # But keep model_type_display for the UI
             result = self.start_training(
                 model_type=model_type_internal,
-                lora_rank=params.get('lora_rank', "128"),
-                lora_alpha=params.get('lora_alpha', "128"),
-                num_epochs=params.get('num_epochs', 70),
-                batch_size=params.get('batch_size', 1),
-                learning_rate=params.get('learning_rate', 3e-5),
-                save_iterations=params.get('save_iterations', 500),
+                lora_rank=params.get('lora_rank', DEFAULT_LORA_RANK_STR),
+                lora_alpha=params.get('lora_alpha', DEFAULT_LORA_ALPHA_STR),
+                train_steps=params.get('train_steps', DEFAULT_NB_TRAINING_STEPS),
+                batch_size=params.get('batch_size', DEFAULT_BATCH_SIZE),
+                learning_rate=params.get('learning_rate', DEFAULT_LEARNING_RATE),
+                save_iterations=params.get('save_iterations', DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
                 repo_id=params.get('repo_id', ''),
                 preset_name=params.get('preset_name', list(TRAINING_PRESETS.keys())[0]),
                 training_type=training_type_internal,
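Everything above reads and writes the same persisted state; after this commit, a fresh ui_state.json under OUTPUT_PATH would look roughly like the following (field values illustrative, shape inferred from default_state):

```python
import json
from pathlib import Path

ui_state = {
    "model_type": "HunyuanVideo",     # display name, not "hunyuan_video"
    "training_type": "LoRA Finetune",
    "lora_rank": "128",
    "lora_alpha": "128",
    "train_steps": 1000,              # replaces the old "num_epochs"
    "batch_size": 1,
    "learning_rate": 3e-5,
    "save_iterations": 200,
    "training_preset": "HunyuanVideo (normal)",
}
path = Path("ui_state.json")
path.write_text(json.dumps(ui_state, indent=2))
assert json.loads(path.read_text())["train_steps"] == 1000
```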
 
vms/tabs/train_tab.py CHANGED
@@ -9,7 +9,14 @@ from typing import Dict, Any, List, Optional, Tuple
 from pathlib import Path
 
 from .base_tab import BaseTab
-from ..config import TRAINING_PRESETS, OUTPUT_PATH, MODEL_TYPES, ASK_USER_TO_DUPLICATE_SPACE, SMALL_TRAINING_BUCKETS, TRAINING_TYPES
+from ..config import (
+    TRAINING_PRESETS, OUTPUT_PATH, MODEL_TYPES, ASK_USER_TO_DUPLICATE_SPACE, SMALL_TRAINING_BUCKETS, TRAINING_TYPES,
+    DEFAULT_NB_TRAINING_STEPS, DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+    DEFAULT_BATCH_SIZE, DEFAULT_CAPTION_DROPOUT_P,
+    DEFAULT_LEARNING_RATE,
+    DEFAULT_LORA_RANK, DEFAULT_LORA_ALPHA,
+    DEFAULT_LORA_RANK_STR, DEFAULT_LORA_ALPHA_STR
+)
 
 logger = logging.getLogger(__name__)
 
@@ -63,20 +70,20 @@ class TrainTab(BaseTab):
         self.components["lora_rank"] = gr.Dropdown(
             label="LoRA Rank",
             choices=["16", "32", "64", "128", "256", "512", "1024"],
-            value="128",
+            value=DEFAULT_LORA_RANK_STR,
             type="value"
         )
         self.components["lora_alpha"] = gr.Dropdown(
             label="LoRA Alpha",
             choices=["16", "32", "64", "128", "256", "512", "1024"],
-            value="128",
+            value=DEFAULT_LORA_ALPHA_STR,
             type="value"
         )
 
         with gr.Row():
-            self.components["num_epochs"] = gr.Number(
-                label="Number of Epochs",
-                value=70,
+            self.components["train_steps"] = gr.Number(
+                label="Number of Training Steps",
+                value=DEFAULT_NB_TRAINING_STEPS,
                 minimum=1,
                 precision=0
             )
@@ -89,13 +96,13 @@ class TrainTab(BaseTab):
         with gr.Row():
             self.components["learning_rate"] = gr.Number(
                 label="Learning Rate",
-                value=2e-5,
-                minimum=1e-7
+                value=DEFAULT_LEARNING_RATE,
+                minimum=1e-8
             )
             self.components["save_iterations"] = gr.Number(
                 label="Save checkpoint every N iterations",
-                value=500,
-                minimum=50,
+                value=DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+                minimum=1,
                 precision=0,
                 info="Model will be saved periodically after these many steps"
             )
@@ -170,7 +177,7 @@ class TrainTab(BaseTab):
 
         return {
             self.components["model_info"]: info,
-            self.components["num_epochs"]: params["num_epochs"],
+            self.components["train_steps"]: params["train_steps"],
             self.components["batch_size"]: params["batch_size"],
             self.components["learning_rate"]: params["learning_rate"],
             self.components["save_iterations"]: params["save_iterations"],
@@ -186,7 +193,7 @@ class TrainTab(BaseTab):
             inputs=[self.components["model_type"], self.components["training_type"]],
             outputs=[
                 self.components["model_info"],
-                self.components["num_epochs"],
+                self.components["train_steps"],
                 self.components["batch_size"],
                 self.components["learning_rate"],
                 self.components["save_iterations"],
@@ -204,7 +211,7 @@ class TrainTab(BaseTab):
             inputs=[self.components["model_type"], self.components["training_type"]],
             outputs=[
                 self.components["model_info"],
-                self.components["num_epochs"],
+                self.components["train_steps"],
                 self.components["batch_size"],
                 self.components["learning_rate"],
                 self.components["save_iterations"],
@@ -225,9 +232,9 @@ class TrainTab(BaseTab):
             outputs=[]
         )
 
-        self.components["num_epochs"].change(
-            fn=lambda v: self.app.update_ui_state(num_epochs=v),
-            inputs=[self.components["num_epochs"]],
+        self.components["train_steps"].change(
+            fn=lambda v: self.app.update_ui_state(train_steps=v),
+            inputs=[self.components["train_steps"]],
             outputs=[]
         )
 
@@ -262,7 +269,7 @@ class TrainTab(BaseTab):
             self.components["training_type"],
             self.components["lora_rank"],
             self.components["lora_alpha"],
-            self.components["num_epochs"],
+            self.components["train_steps"],
             self.components["batch_size"],
             self.components["learning_rate"],
             self.components["save_iterations"],
@@ -280,7 +287,7 @@ class TrainTab(BaseTab):
             self.components["training_type"],
             self.components["lora_rank"],
             self.components["lora_alpha"],
-            self.components["num_epochs"],
+            self.components["train_steps"],
             self.components["batch_size"],
             self.components["learning_rate"],
             self.components["save_iterations"],
@@ -290,27 +297,20 @@ class TrainTab(BaseTab):
                 self.components["status_box"],
                 self.components["log_box"]
             ]
-        ).success(
-            fn=self.get_latest_status_message_logs_and_button_labels,
-            outputs=[
-                self.components["status_box"],
-                self.components["log_box"],
-                self.components["start_btn"],
-                self.components["stop_btn"],
-                self.components["pause_resume_btn"],
-                self.components["current_task_box"] # Include new component
-            ]
         )
 
+        # Use simplified event handlers for pause/resume and stop
+        third_btn = self.components["delete_checkpoints_btn"] if "delete_checkpoints_btn" in self.components else self.components["pause_resume_btn"]
+
         self.components["pause_resume_btn"].click(
             fn=self.handle_pause_resume,
             outputs=[
                 self.components["status_box"],
                 self.components["log_box"],
+                self.components["current_task_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
-                self.components["pause_resume_btn"],
-                self.components["current_task_box"] # Include new component
+                third_btn
             ]
         )
 
@@ -319,10 +319,10 @@ class TrainTab(BaseTab):
             outputs=[
                 self.components["status_box"],
                self.components["log_box"],
+                self.components["current_task_box"],
                 self.components["start_btn"],
                 self.components["stop_btn"],
-                self.components["pause_resume_btn"],
-                self.components["current_task_box"] # Include new component
+                third_btn
             ]
         )
 
@@ -330,16 +330,6 @@ class TrainTab(BaseTab):
         self.components["delete_checkpoints_btn"].click(
             fn=lambda: self.app.trainer.delete_all_checkpoints(),
             outputs=[self.components["status_box"]]
-        ).then(
-            fn=self.get_latest_status_message_logs_and_button_labels,
-            outputs=[
-                self.components["status_box"],
-                self.components["log_box"],
-                self.components["start_btn"],
-                self.components["stop_btn"],
-                self.components["delete_checkpoints_btn"],
-                self.components["current_task_box"] # Include new component
-            ]
         )
 
     def handle_training_start(self, preset, model_type, training_type, *args):
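The `third_btn` indirection above exists because Gradio matches a handler's return values to its `outputs` list positionally: `handle_pause_resume` must return exactly one value per output component, whichever button happens to occupy the last slot. A standalone illustration of that contract (not the app's actual handler):

```python
def handle_pause_resume():
    status_box = "paused"
    log_box = "...recent log lines..."
    current_task_box = "waiting for task information"
    start_btn, stop_btn, third_btn = "Start Training", "Stop Training", "Delete All Checkpoints"
    # Order must mirror outputs=[status_box, log_box, current_task_box,
    # start_btn, stop_btn, third_btn] in the .click() wiring above.
    return status_box, log_box, current_task_box, start_btn, stop_btn, third_btn

assert len(handle_pause_resume()) == 6  # one value per output component
```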
@@ -391,7 +381,7 @@ class TrainTab(BaseTab):
 
     def get_model_info(self, model_type: str, training_type: str) -> str:
         """Get information about the selected model type and training method"""
-        if model_type == "HunyuanVideo (LoRA)":
+        if model_type == "HunyuanVideo":
             base_info = """### HunyuanVideo
     - Required VRAM: ~48GB minimum
     - Recommended batch size: 1-2
@@ -403,7 +393,7 @@ class TrainTab(BaseTab):
             else:
                 return base_info + "\n- Required VRAM: ~48GB minimum\n- **Full finetune not recommended due to VRAM requirements**"
 
-        elif model_type == "LTX-Video (LoRA)":
+        elif model_type == "LTX-Video":
             base_info = """### LTX-Video
     - Recommended batch size: 1-4
     - Typical training time: 1-3 hours
@@ -414,14 +404,14 @@ class TrainTab(BaseTab):
             else:
                 return base_info + "\n- Required VRAM: ~21GB minimum\n- Full model size: ~8GB"
 
-        elif model_type == "Wan-2.1-T2V (LoRA)":
+        elif model_type == "Wan-2.1-T2V":
             base_info = """### Wan-2.1-T2V
-    - Recommended batch size: 1-2
-    - Typical training time: 1-3 hours
+    - Recommended batch size: ?
+    - Typical training time: ? hours
     - Default resolution: 49x512x768"""
 
             if training_type == "LoRA Finetune":
-                return base_info + "\n- Required VRAM: ~16GB minimum\n- Default LoRA rank: 32 (~120 MB)"
+                return base_info + "\n- Required VRAM: ?GB minimum\n- Default LoRA rank: 32 (~120 MB)"
             else:
                 return base_info + "\n- **Full finetune not recommended due to VRAM requirements**"
 
@@ -440,51 +430,51 @@ class TrainTab(BaseTab):
             # Use the first matching preset
             preset = matching_presets[0]
             return {
-                "num_epochs": preset.get("num_epochs", 70),
-                "batch_size": preset.get("batch_size", 1),
-                "learning_rate": preset.get("learning_rate", 3e-5),
-                "save_iterations": preset.get("save_iterations", 500),
-                "lora_rank": preset.get("lora_rank", "128"),
-                "lora_alpha": preset.get("lora_alpha", "128")
+                "train_steps": preset.get("train_steps", DEFAULT_NB_TRAINING_STEPS),
+                "batch_size": preset.get("batch_size", DEFAULT_BATCH_SIZE),
+                "learning_rate": preset.get("learning_rate", DEFAULT_LEARNING_RATE),
+                "save_iterations": preset.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
+                "lora_rank": preset.get("lora_rank", DEFAULT_LORA_RANK_STR),
+                "lora_alpha": preset.get("lora_alpha", DEFAULT_LORA_ALPHA_STR)
             }
 
         # Default fallbacks
         if model_type == "hunyuan_video":
             return {
-                "num_epochs": 70,
-                "batch_size": 1,
+                "train_steps": DEFAULT_NB_TRAINING_STEPS,
+                "batch_size": DEFAULT_BATCH_SIZE,
                 "learning_rate": 2e-5,
-                "save_iterations": 500,
-                "lora_rank": "128",
-                "lora_alpha": "128"
+                "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+                "lora_rank": DEFAULT_LORA_RANK_STR,
+                "lora_alpha": DEFAULT_LORA_ALPHA_STR
             }
         elif model_type == "ltx_video":
             return {
-                "num_epochs": 70,
-                "batch_size": 1,
-                "learning_rate": 3e-5,
-                "save_iterations": 500,
-                "lora_rank": "128",
-                "lora_alpha": "128"
+                "train_steps": DEFAULT_NB_TRAINING_STEPS,
+                "batch_size": DEFAULT_BATCH_SIZE,
+                "learning_rate": DEFAULT_LEARNING_RATE,
+                "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+                "lora_rank": DEFAULT_LORA_RANK_STR,
+                "lora_alpha": DEFAULT_LORA_ALPHA_STR
             }
         elif model_type == "wan":
             return {
-                "num_epochs": 70,
-                "batch_size": 1,
+                "train_steps": DEFAULT_NB_TRAINING_STEPS,
+                "batch_size": DEFAULT_BATCH_SIZE,
                 "learning_rate": 5e-5,
-                "save_iterations": 500,
+                "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
                 "lora_rank": "32",
                 "lora_alpha": "32"
             }
         else:
             # Generic defaults
             return {
-                "num_epochs": 70,
-                "batch_size": 1,
-                "learning_rate": 3e-5,
-                "save_iterations": 500,
-                "lora_rank": "128",
-                "lora_alpha": "128"
+                "train_steps": DEFAULT_NB_TRAINING_STEPS,
+                "batch_size": DEFAULT_BATCH_SIZE,
+                "learning_rate": DEFAULT_LEARNING_RATE,
+                "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+                "lora_rank": DEFAULT_LORA_RANK_STR,
+                "lora_alpha": DEFAULT_LORA_ALPHA_STR
             }
@@ -522,12 +512,12 @@ class TrainTab(BaseTab):
         show_lora_params = preset["training_type"] == "lora"
 
         # Use preset defaults but preserve user-modified values if they exist
-        lora_rank_val = current_state.get("lora_rank") if current_state.get("lora_rank") != preset.get("lora_rank", "128") else preset.get("lora_rank", "128")
-        lora_alpha_val = current_state.get("lora_alpha") if current_state.get("lora_alpha") != preset.get("lora_alpha", "128") else preset.get("lora_alpha", "128")
-        num_epochs_val = current_state.get("num_epochs") if current_state.get("num_epochs") != preset.get("num_epochs", 70) else preset.get("num_epochs", 70)
-        batch_size_val = current_state.get("batch_size") if current_state.get("batch_size") != preset.get("batch_size", 1) else preset.get("batch_size", 1)
-        learning_rate_val = current_state.get("learning_rate") if current_state.get("learning_rate") != preset.get("learning_rate", 3e-5) else preset.get("learning_rate", 3e-5)
-        save_iterations_val = current_state.get("save_iterations") if current_state.get("save_iterations") != preset.get("save_iterations", 500) else preset.get("save_iterations", 500)
+        lora_rank_val = current_state.get("lora_rank") if current_state.get("lora_rank") != preset.get("lora_rank", DEFAULT_LORA_RANK_STR) else preset.get("lora_rank", DEFAULT_LORA_RANK_STR)
+        lora_alpha_val = current_state.get("lora_alpha") if current_state.get("lora_alpha") != preset.get("lora_alpha", DEFAULT_LORA_ALPHA_STR) else preset.get("lora_alpha", DEFAULT_LORA_ALPHA_STR)
+        train_steps_val = current_state.get("train_steps") if current_state.get("train_steps") != preset.get("train_steps", DEFAULT_NB_TRAINING_STEPS) else preset.get("train_steps", DEFAULT_NB_TRAINING_STEPS)
+        batch_size_val = current_state.get("batch_size") if current_state.get("batch_size") != preset.get("batch_size", DEFAULT_BATCH_SIZE) else preset.get("batch_size", DEFAULT_BATCH_SIZE)
+        learning_rate_val = current_state.get("learning_rate") if current_state.get("learning_rate") != preset.get("learning_rate", DEFAULT_LEARNING_RATE) else preset.get("learning_rate", DEFAULT_LEARNING_RATE)
+        save_iterations_val = current_state.get("save_iterations") if current_state.get("save_iterations") != preset.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS) else preset.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS)
 
         # Return values in the same order as the output components
         return (
@@ -535,7 +525,7 @@ class TrainTab(BaseTab):
             training_display_name,
             lora_rank_val,
             lora_alpha_val,
-            num_epochs_val,
+            train_steps_val,
             batch_size_val,
             learning_rate_val,
             save_iterations_val,
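One observation on the "preserve user-modified values" conditionals above: `x if x != default else default` returns x in every case where x is set (when the two sides are equal, returning the default returns the same value), so each line only differs from a plain lookup when the key is missing and `current_state.get(...)` yields None. If a preset fallback is the intent, a simpler equivalent would be (a sketch, not the commit's code):

```python
def merged_value(current_state: dict, preset: dict, key: str, default):
    """Prefer the user's current value; fall back to the preset, then to
    the global default -- and never propagate a bare None."""
    value = current_state.get(key)
    return value if value is not None else preset.get(key, default)
```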
@@ -543,66 +533,6 @@ class TrainTab(BaseTab):
             gr.Row(visible=show_lora_params)
         )
 
-    def update_training_ui(self, training_state: Dict[str, Any]):
-        """Update UI components based on training state"""
-        updates = {}
-
-        # Update status box with high-level information
-        status_text = []
-        if training_state["status"] != "idle":
-            status_text.extend([
-                f"Status: {training_state['status']}",
-                f"Progress: {training_state['progress']}",
-                f"Step: {training_state['current_step']}/{training_state['total_steps']}",
-                f"Time elapsed: {training_state['elapsed']}",
-                f"Estimated remaining: {training_state['remaining']}",
-                "",
-                f"Current loss: {training_state['step_loss']}",
-                f"Learning rate: {training_state['learning_rate']}",
-                f"Gradient norm: {training_state['grad_norm']}",
-                f"Memory usage: {training_state['memory']}"
-            ])
-
-        if training_state["error_message"]:
-            status_text.append(f"\nError: {training_state['error_message']}")
-
-        updates["status_box"] = "\n".join(status_text)
-
-        # Add current task information to the dedicated box
-        if training_state.get("current_task"):
-            updates["current_task_box"] = training_state["current_task"]
-        else:
-            updates["current_task_box"] = "No active task" if training_state["status"] != "training" else "Waiting for task information..."
-
-        # Update button states
-        updates["start_btn"] = gr.Button(
-            "Start training",
-            interactive=(training_state["status"] in ["idle", "completed", "error", "stopped"]),
-            variant="primary" if training_state["status"] == "idle" else "secondary"
-        )
-
-        updates["stop_btn"] = gr.Button(
-            "Stop training",
-            interactive=(training_state["status"] in ["training", "initializing"]),
-            variant="stop"
-        )
-
-        return updates
-
-    def handle_pause_resume(self):
-        status, _, _ = self.get_latest_status_message_and_logs()
-
-        if status == "paused":
-            self.app.trainer.resume_training()
-        else:
-            self.app.trainer.pause_training()
-
-        return self.get_latest_status_message_logs_and_button_labels()
-
-    def handle_stop(self):
-        self.app.trainer.stop_training()
-        return self.get_latest_status_message_logs_and_button_labels()
-
     def get_latest_status_message_and_logs(self) -> Tuple[str, str, str]:
         """Get latest status message, log content, and status code in a safer way"""
         state = self.app.trainer.get_status()
@@ -663,61 +593,107 @@ class TrainTab(BaseTab):
 
         return (state["status"], state["message"], logs)
 
-    def get_latest_status_message_logs_and_button_labels(self) -> Tuple:
-        """Get latest status message, logs and button states"""
+    def get_status_updates(self):
+        """Get status updates for text components (no variant property)"""
         status, message, logs = self.get_latest_status_message_and_logs()
 
-        # Add checkpoints detection
-        has_checkpoints = len(list(OUTPUT_PATH.glob("checkpoint-*"))) > 0
-
-        button_updates = self.update_training_buttons(status, has_checkpoints).values()
-
         # Get current task if available
         current_task = ""
         if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
             current_task = self.app.log_parser.get_current_task_display()
 
-        # Return in order expected by timer (added current_task)
-        return (message, logs, *button_updates, current_task)
-
-    def update_training_buttons(self, status: str, has_checkpoints: bool = None) -> Dict:
-        """Update training control buttons based on state"""
-        if has_checkpoints is None:
-            has_checkpoints = len(list(OUTPUT_PATH.glob("checkpoint-*"))) > 0
-
+        return message, logs, current_task
+
+    def get_button_updates(self):
+        """Get button updates (with variant property)"""
+        status, _, _ = self.get_latest_status_message_and_logs()
+
+        # Add checkpoints detection
+        has_checkpoints = len(list(OUTPUT_PATH.glob("checkpoint-*"))) > 0
+
         is_training = status in ["training", "initializing"]
         is_completed = status in ["completed", "error", "stopped"]
 
         start_text = "Continue Training" if has_checkpoints else "Start Training"
 
-        # Only include buttons that we know exist in components
-        result = {
-            "start_btn": gr.Button(
-                value=start_text,
-                interactive=not is_training,
-                variant="primary" if not is_training else "secondary",
-            ),
-            "stop_btn": gr.Button(
-                value="Stop at Last Checkpoint",
-                interactive=is_training,
-                variant="primary" if is_training else "secondary",
-            )
-        }
-
-        # Add delete_checkpoints_btn only if it exists in components
         if "delete_checkpoints_btn" in self.components:
-            result["delete_checkpoints_btn"] = gr.Button(
-                value="Delete All Checkpoints",
                 interactive=has_checkpoints and not is_training,
-                variant="stop",
             )
         else:
-            # Add pause_resume_btn as fallback
-            result["pause_resume_btn"] = gr.Button(
-                value="Resume Training" if status == "paused" else "Pause Training",
                 interactive=(is_training or status == "paused") and not is_completed,
                 variant="secondary",
                 visible=False
             )
 
-        return result
9
  from pathlib import Path
10
 
11
  from .base_tab import BaseTab
12
+ from ..config import (
13
+ TRAINING_PRESETS, OUTPUT_PATH, MODEL_TYPES, ASK_USER_TO_DUPLICATE_SPACE, SMALL_TRAINING_BUCKETS, TRAINING_TYPES,
14
+ DEFAULT_NB_TRAINING_STEPS, DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
15
+ DEFAULT_BATCH_SIZE, DEFAULT_CAPTION_DROPOUT_P,
16
+ DEFAULT_LEARNING_RATE,
17
+ DEFAULT_LORA_RANK, DEFAULT_LORA_ALPHA,
18
+ DEFAULT_LORA_RANK_STR, DEFAULT_LORA_ALPHA_STR
19
+ )
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
70
  self.components["lora_rank"] = gr.Dropdown(
71
  label="LoRA Rank",
72
  choices=["16", "32", "64", "128", "256", "512", "1024"],
73
+ value=DEFAULT_LORA_RANK_STR,
74
  type="value"
75
  )
76
  self.components["lora_alpha"] = gr.Dropdown(
77
  label="LoRA Alpha",
78
  choices=["16", "32", "64", "128", "256", "512", "1024"],
79
+ value=DEFAULT_LORA_ALPHA_STR,
80
  type="value"
81
  )
82
 
83
  with gr.Row():
84
+ self.components["train_steps"] = gr.Number(
85
+ label="Number of Training Steps",
86
+ value=DEFAULT_NB_TRAINING_STEPS,
87
  minimum=1,
88
  precision=0
89
  )
 
96
  with gr.Row():
97
  self.components["learning_rate"] = gr.Number(
98
  label="Learning Rate",
99
+ value=DEFAULT_LEARNING_RATE,
100
+ minimum=1e-8
101
  )
102
  self.components["save_iterations"] = gr.Number(
103
  label="Save checkpoint every N iterations",
104
+ value=DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
105
+ minimum=1,
106
  precision=0,
107
  info="Model will be saved periodically after these many steps"
108
  )
 
177
 
178
  return {
179
  self.components["model_info"]: info,
180
+ self.components["train_steps"]: params["train_steps"],
181
  self.components["batch_size"]: params["batch_size"],
182
  self.components["learning_rate"]: params["learning_rate"],
183
  self.components["save_iterations"]: params["save_iterations"],
 
193
  inputs=[self.components["model_type"], self.components["training_type"]],
194
  outputs=[
195
  self.components["model_info"],
196
+ self.components["train_steps"],
197
  self.components["batch_size"],
198
  self.components["learning_rate"],
199
  self.components["save_iterations"],
 
211
  inputs=[self.components["model_type"], self.components["training_type"]],
212
  outputs=[
213
  self.components["model_info"],
214
+ self.components["train_steps"],
215
  self.components["batch_size"],
216
  self.components["learning_rate"],
217
  self.components["save_iterations"],
 
232
  outputs=[]
233
  )
234
 
235
+ self.components["train_steps"].change(
236
+ fn=lambda v: self.app.update_ui_state(train_steps=v),
237
+ inputs=[self.components["train_steps"]],
238
  outputs=[]
239
  )
240
 
 
269
  self.components["training_type"],
270
  self.components["lora_rank"],
271
  self.components["lora_alpha"],
272
+ self.components["train_steps"],
273
  self.components["batch_size"],
274
  self.components["learning_rate"],
275
  self.components["save_iterations"],
 
287
  self.components["training_type"],
288
  self.components["lora_rank"],
289
  self.components["lora_alpha"],
290
+ self.components["train_steps"],
291
  self.components["batch_size"],
292
  self.components["learning_rate"],
293
  self.components["save_iterations"],
 
297
  self.components["status_box"],
298
  self.components["log_box"]
299
  ]
 
 
 
 
 
 
 
 
 
 
300
  )
301
 
302
+ # Use simplified event handlers for pause/resume and stop
303
+ third_btn = self.components["delete_checkpoints_btn"] if "delete_checkpoints_btn" in self.components else self.components["pause_resume_btn"]
304
+
305
  self.components["pause_resume_btn"].click(
306
  fn=self.handle_pause_resume,
307
  outputs=[
308
  self.components["status_box"],
309
  self.components["log_box"],
310
+ self.components["current_task_box"],
311
  self.components["start_btn"],
312
  self.components["stop_btn"],
313
+ third_btn
 
314
  ]
315
  )
316
 
 
319
  outputs=[
320
  self.components["status_box"],
321
  self.components["log_box"],
322
+ self.components["current_task_box"],
323
  self.components["start_btn"],
324
  self.components["stop_btn"],
325
+ third_btn
 
326
  ]
327
  )
328
 
 
330
  self.components["delete_checkpoints_btn"].click(
331
  fn=lambda: self.app.trainer.delete_all_checkpoints(),
332
  outputs=[self.components["status_box"]]
 
 
 
 
 
 
 
 
 
 
333
  )
334
 
335
  def handle_training_start(self, preset, model_type, training_type, *args):
 
381
 
382
  def get_model_info(self, model_type: str, training_type: str) -> str:
383
  """Get information about the selected model type and training method"""
384
+ if model_type == "HunyuanVideo":
385
  base_info = """### HunyuanVideo
386
  - Required VRAM: ~48GB minimum
387
  - Recommended batch size: 1-2
 
393
  else:
394
  return base_info + "\n- Required VRAM: ~48GB minimum\n- **Full finetune not recommended due to VRAM requirements**"
395
 
396
+ elif model_type == "LTX-Video":
397
  base_info = """### LTX-Video
398
  - Recommended batch size: 1-4
399
  - Typical training time: 1-3 hours
 
404
  else:
405
  return base_info + "\n- Required VRAM: ~21GB minimum\n- Full model size: ~8GB"
406
 
407
+ elif model_type == "Wan-2.1-T2V":
408
  base_info = """### Wan-2.1-T2V
409
+ - Recommended batch size: ?
410
+ - Typical training time: ? hours
411
  - Default resolution: 49x512x768"""
412
 
413
  if training_type == "LoRA Finetune":
414
+ return base_info + "\n- Required VRAM: ?GB minimum\n- Default LoRA rank: 32 (~120 MB)"
415
  else:
416
  return base_info + "\n- **Full finetune not recommended due to VRAM requirements**"
417
 
 
  # Use the first matching preset
  preset = matching_presets[0]
  return {
+     "train_steps": preset.get("train_steps", DEFAULT_NB_TRAINING_STEPS),
+     "batch_size": preset.get("batch_size", DEFAULT_BATCH_SIZE),
+     "learning_rate": preset.get("learning_rate", DEFAULT_LEARNING_RATE),
+     "save_iterations": preset.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS),
+     "lora_rank": preset.get("lora_rank", DEFAULT_LORA_RANK_STR),
+     "lora_alpha": preset.get("lora_alpha", DEFAULT_LORA_ALPHA_STR)
  }

  # Default fallbacks
  if model_type == "hunyuan_video":
      return {
+         "train_steps": DEFAULT_NB_TRAINING_STEPS,
+         "batch_size": DEFAULT_BATCH_SIZE,
          "learning_rate": 2e-5,
+         "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+         "lora_rank": DEFAULT_LORA_RANK_STR,
+         "lora_alpha": DEFAULT_LORA_ALPHA_STR
      }
  elif model_type == "ltx_video":
      return {
+         "train_steps": DEFAULT_NB_TRAINING_STEPS,
+         "batch_size": DEFAULT_BATCH_SIZE,
+         "learning_rate": DEFAULT_LEARNING_RATE,
+         "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+         "lora_rank": DEFAULT_LORA_RANK_STR,
+         "lora_alpha": DEFAULT_LORA_ALPHA_STR
      }
  elif model_type == "wan":
      return {
+         "train_steps": DEFAULT_NB_TRAINING_STEPS,
+         "batch_size": DEFAULT_BATCH_SIZE,
          "learning_rate": 5e-5,
+         "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
          "lora_rank": "32",
          "lora_alpha": "32"
      }
  else:
      # Generic defaults
      return {
+         "train_steps": DEFAULT_NB_TRAINING_STEPS,
+         "batch_size": DEFAULT_BATCH_SIZE,
+         "learning_rate": DEFAULT_LEARNING_RATE,
+         "save_iterations": DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+         "lora_rank": DEFAULT_LORA_RANK_STR,
+         "lora_alpha": DEFAULT_LORA_ALPHA_STR
      }
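The `preset.get(key, DEFAULT_*)` pattern above means a preset only has to list the parameters it overrides. A hypothetical `TRAINING_PRESETS` entry (illustrative keys and values, not from this repo) showing the fallback:

    # Only learning_rate and train_steps are overridden here; batch_size,
    # save_iterations, lora_rank and lora_alpha all fall back to the
    # DEFAULT_* constants through preset.get(...).
    example_preset = {
        "model_type": "wan",
        "training_type": "lora",
        "learning_rate": 5e-5,
        "train_steps": 1000,
    }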

  def update_training_params(self, preset_name: str) -> Tuple:
 
      show_lora_params = preset["training_type"] == "lora"

      # Use preset defaults but preserve user-modified values if they exist
+     lora_rank_val = current_state.get("lora_rank") if current_state.get("lora_rank") != preset.get("lora_rank", DEFAULT_LORA_RANK_STR) else preset.get("lora_rank", DEFAULT_LORA_RANK_STR)
+     lora_alpha_val = current_state.get("lora_alpha") if current_state.get("lora_alpha") != preset.get("lora_alpha", DEFAULT_LORA_ALPHA_STR) else preset.get("lora_alpha", DEFAULT_LORA_ALPHA_STR)
+     train_steps_val = current_state.get("train_steps") if current_state.get("train_steps") != preset.get("train_steps", DEFAULT_NB_TRAINING_STEPS) else preset.get("train_steps", DEFAULT_NB_TRAINING_STEPS)
+     batch_size_val = current_state.get("batch_size") if current_state.get("batch_size") != preset.get("batch_size", DEFAULT_BATCH_SIZE) else preset.get("batch_size", DEFAULT_BATCH_SIZE)
+     learning_rate_val = current_state.get("learning_rate") if current_state.get("learning_rate") != preset.get("learning_rate", DEFAULT_LEARNING_RATE) else preset.get("learning_rate", DEFAULT_LEARNING_RATE)
+     save_iterations_val = current_state.get("save_iterations") if current_state.get("save_iterations") != preset.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS) else preset.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS)

      # Return values in the same order as the output components
      return (

          training_display_name,
          lora_rank_val,
          lora_alpha_val,
+         train_steps_val,
          batch_size_val,
          learning_rate_val,
          save_iterations_val,

          gr.Row(visible=show_lora_params)
      )
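One nit in the block above: an expression of the form `a if a != b else b` always evaluates to `a` (when the two sides are equal, returning `b` yields the same value), so each of these assignments reduces to plain `current_state.get(...)` and the preset value can never win, despite the comment. A sketch of logic that matches the comment's intent (None-safe; hypothetical helper, not part of this commit):

    def pick(field, default, current_state, preset):
        """Prefer the preset's value unless the user set something different."""
        preset_val = preset.get(field, default)
        current_val = current_state.get(field)
        # Keep the user's edit only when it exists and differs from the preset.
        return current_val if current_val not in (None, preset_val) else preset_val

    lora_rank_val = pick("lora_rank", DEFAULT_LORA_RANK_STR, current_state, preset)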

  def get_latest_status_message_and_logs(self) -> Tuple[str, str, str]:
      """Get latest status message, log content, and status code in a safer way"""
      state = self.app.trainer.get_status()

      return (state["status"], state["message"], logs)

+ def get_status_updates(self):
+     """Get status updates for text components (no variant property)"""
      status, message, logs = self.get_latest_status_message_and_logs()

      # Get current task if available
      current_task = ""
      if hasattr(self.app, 'log_parser') and self.app.log_parser is not None:
          current_task = self.app.log_parser.get_current_task_display()

+     return message, logs, current_task
+
+ def get_button_updates(self):
+     """Get button updates (with variant property)"""
+     status, _, _ = self.get_latest_status_message_and_logs()
+
+     # Add checkpoints detection
+     has_checkpoints = len(list(OUTPUT_PATH.glob("checkpoint-*"))) > 0
+
      is_training = status in ["training", "initializing"]
      is_completed = status in ["completed", "error", "stopped"]

      start_text = "Continue Training" if has_checkpoints else "Start Training"

+     # Create button updates
+     start_btn = gr.Button(
+         value=start_text,
+         interactive=not is_training,
+         variant="primary" if not is_training else "secondary"
+     )

+     stop_btn = gr.Button(
+         value="Stop at Last Checkpoint",
+         interactive=is_training,
+         variant="primary" if is_training else "secondary"
+     )
+
+     # Add delete_checkpoints_btn or pause_resume_btn
      if "delete_checkpoints_btn" in self.components:
+         third_btn = gr.Button(
+             "Delete All Checkpoints",
              interactive=has_checkpoints and not is_training,
+             variant="stop"
          )
      else:
+         third_btn = gr.Button(
+             "Resume Training" if status == "paused" else "Pause Training",
              interactive=(is_training or status == "paused") and not is_completed,
              variant="secondary",
              visible=False
          )

+     return start_btn, stop_btn, third_btn
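Note that `get_button_updates` always returns three updates (start, stop, third), so any event wired to it must supply three button components in `outputs`; if neither `delete_checkpoints_btn` nor `pause_resume_btn` exists, a two-element list would leave Gradio with more return values than outputs. A cheap wiring-time guard (sketch, not part of this commit):

    assert len(button_outputs) == 3, (
        f"get_button_updates returns 3 updates, got {len(button_outputs)} outputs"
    )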
+
+ def update_training_ui(self, training_state: Dict[str, Any]):
+     """Update UI components based on training state"""
+     updates = {}
+
+     # Update status box with high-level information
+     status_text = []
+     if training_state["status"] != "idle":
+         status_text.extend([
+             f"Status: {training_state['status']}",
+             f"Progress: {training_state['progress']}",
+             f"Step: {training_state['current_step']}/{training_state['total_steps']}",
+             f"Time elapsed: {training_state['elapsed']}",
+             f"Estimated remaining: {training_state['remaining']}",
+             "",
+             f"Current loss: {training_state['step_loss']}",
+             f"Learning rate: {training_state['learning_rate']}",
+             f"Gradient norm: {training_state['grad_norm']}",
+             f"Memory usage: {training_state['memory']}"
+         ])
+
+     if training_state["error_message"]:
+         status_text.append(f"\nError: {training_state['error_message']}")
+
+     updates["status_box"] = "\n".join(status_text)
+
+     # Add current task information to the dedicated box
+     if training_state.get("current_task"):
+         updates["current_task_box"] = training_state["current_task"]
+     else:
+         updates["current_task_box"] = "No active task" if training_state["status"] != "training" else "Waiting for task information..."
+
+     return updates
+
+ def handle_pause_resume(self):
+     """Handle pause/resume button click"""
+     status, _, _ = self.get_latest_status_message_and_logs()
+
+     if status == "paused":
+         self.app.trainer.resume_training()
+     else:
+         self.app.trainer.pause_training()
+
+     # Return the updates separately for text and buttons
+     return (*self.get_status_updates(), *self.get_button_updates())
+
+ def handle_stop(self):
+     """Handle stop button click"""
+     self.app.trainer.stop_training()
+
+     # Return the updates separately for text and buttons
+     return (*self.get_status_updates(), *self.get_button_updates())
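Both handlers splice the text updates and the button updates into a single flat tuple, so any `outputs` list wired to them must be ordered text components first (status, logs, current task), then the three buttons. Shape check (illustrative; `tab` is a hypothetical TrainTab instance):

    updates = (*tab.get_status_updates(), *tab.get_button_updates())
    # (message, logs, current_task, start_btn, stop_btn, third_btn)
    assert len(updates) == 6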
vms/ui/video_trainer_ui.py CHANGED
@@ -9,7 +9,12 @@ from ..services import TrainingService, CaptioningService, SplittingService, Imp
  from ..config import (
      STORAGE_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, OUTPUT_PATH,
      TRAINING_PATH, LOG_FILE_PATH, TRAINING_PRESETS, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH,
-     MODEL_TYPES, SMALL_TRAINING_BUCKETS, TRAINING_TYPES
+     MODEL_TYPES, SMALL_TRAINING_BUCKETS, TRAINING_TYPES,
+     DEFAULT_NB_TRAINING_STEPS, DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS,
+     DEFAULT_BATCH_SIZE, DEFAULT_CAPTION_DROPOUT_P,
+     DEFAULT_LEARNING_RATE,
+     DEFAULT_LORA_RANK, DEFAULT_LORA_ALPHA,
+     DEFAULT_LORA_RANK_STR, DEFAULT_LORA_ALPHA_STR
  )
  from ..utils import count_media_files, format_media_title, TrainingLogParser
  from ..tabs import ImportTab, SplitTab, CaptionTab, TrainTab, ManageTab
@@ -92,7 +97,7 @@ class VideoTrainerUI:
      self.tabs["train_tab"].components["training_type"],
      self.tabs["train_tab"].components["lora_rank"],
      self.tabs["train_tab"].components["lora_alpha"],
-     self.tabs["train_tab"].components["num_epochs"],
+     self.tabs["train_tab"].components["train_steps"],
      self.tabs["train_tab"].components["batch_size"],
      self.tabs["train_tab"].components["learning_rate"],
      self.tabs["train_tab"].components["save_iterations"],
@@ -104,31 +109,33 @@ class VideoTrainerUI:
  def _add_timers(self):
      """Add auto-refresh timers to the UI"""
-     # Status update timer (every 1 second)
+     # Status update timer for text components (every 1 second)
      status_timer = gr.Timer(value=1)
+     status_timer.tick(
+         fn=self.tabs["train_tab"].get_status_updates,  # Use a new function that returns appropriate updates
+         outputs=[
+             self.tabs["train_tab"].components["status_box"],
+             self.tabs["train_tab"].components["log_box"],
+             self.tabs["train_tab"].components["current_task_box"] if "current_task_box" in self.tabs["train_tab"].components else None
+         ]
+     )

-     # Use a safer approach - check if the component exists before using it
-     outputs = [
-         self.tabs["train_tab"].components["status_box"],
-         self.tabs["train_tab"].components["log_box"],
+     # Button update timer for button components (every 1 second)
+     button_timer = gr.Timer(value=1)
+     button_outputs = [
          self.tabs["train_tab"].components["start_btn"],
          self.tabs["train_tab"].components["stop_btn"]
      ]

-     # Add current_task_box component
-     if "current_task_box" in self.tabs["train_tab"].components:
-         outputs.append(self.tabs["train_tab"].components["current_task_box"])
-
-     # Add delete_checkpoints_btn only if it exists
+     # Add delete_checkpoints_btn or pause_resume_btn as the third button
      if "delete_checkpoints_btn" in self.tabs["train_tab"].components:
-         outputs.append(self.tabs["train_tab"].components["delete_checkpoints_btn"])
-     else:
-         # Add pause_resume_btn as fallback
-         outputs.append(self.tabs["train_tab"].components["pause_resume_btn"])
+         button_outputs.append(self.tabs["train_tab"].components["delete_checkpoints_btn"])
+     elif "pause_resume_btn" in self.tabs["train_tab"].components:
+         button_outputs.append(self.tabs["train_tab"].components["pause_resume_btn"])

-     status_timer.tick(
-         fn=self.tabs["train_tab"].get_latest_status_message_logs_and_button_labels,
-         outputs=outputs
+     button_timer.tick(
+         fn=self.tabs["train_tab"].get_button_updates,  # Use a new function for button-specific updates
+         outputs=button_outputs
      )

      # Dataset refresh timer (every 5 seconds)
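A detail to watch in the hunk above: the inline conditional leaves a literal `None` in `outputs` when `current_task_box` is missing, and `None` is not a valid output component. Building the list conditionally is safer (sketch, not part of this commit):

    text_outputs = [
        self.tabs["train_tab"].components["status_box"],
        self.tabs["train_tab"].components["log_box"],
    ]
    # Only append the task box when the component actually exists.
    if "current_task_box" in self.tabs["train_tab"].components:
        text_outputs.append(self.tabs["train_tab"].components["current_task_box"])
    status_timer.tick(fn=self.tabs["train_tab"].get_status_updates, outputs=text_outputs)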
@@ -175,6 +182,11 @@ class VideoTrainerUI:
  if "model_type" in recovery_ui:
      model_type_value = recovery_ui["model_type"]

+     # Remove " (LoRA)" suffix if present
+     if " (LoRA)" in model_type_value:
+         model_type_value = model_type_value.replace(" (LoRA)", "")
+         logger.info(f"Removed (LoRA) suffix from model type: {model_type_value}")
+
      # If it's an internal name, convert to display name
      if model_type_value not in MODEL_TYPES:
          # Find the display name for this internal model type
@@ -201,7 +213,7 @@ class VideoTrainerUI:
          ui_state["training_type"] = training_type_value

      # Copy other parameters
-     for param in ["lora_rank", "lora_alpha", "num_epochs",
+     for param in ["lora_rank", "lora_alpha", "train_steps",
                    "batch_size", "learning_rate", "save_iterations", "training_preset"]:
          if param in recovery_ui:
              ui_state[param] = recovery_ui[param]
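This commit replaces the epoch-based `num_epochs` setting with an absolute `train_steps` count throughout. For anyone migrating a saved value, the usual equivalence (assuming one optimizer step per batch) is steps = epochs * ceil(dataset_size / batch_size), e.g.:

    import math

    num_epochs = 70        # old persisted setting
    dataset_size = 100     # number of training clips (illustrative)
    batch_size = 1
    train_steps = num_epochs * math.ceil(dataset_size / batch_size)  # 7000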
@@ -216,31 +228,55 @@ class VideoTrainerUI:
  # Load values (potentially with recovery updates applied)
  ui_state = self.load_ui_values()

- # Ensure model_type is a display name, not internal name
+ # Ensure model_type is a valid display name
  model_type_val = ui_state.get("model_type", list(MODEL_TYPES.keys())[0])
+ # Remove " (LoRA)" suffix if present
+ if " (LoRA)" in model_type_val:
+     model_type_val = model_type_val.replace(" (LoRA)", "")
+     logger.info(f"Removed (LoRA) suffix from model type: {model_type_val}")
+
+ # Ensure it's a valid model type in the dropdown
  if model_type_val not in MODEL_TYPES:
-     # Convert from internal to display name
+     # Convert from internal to display name or use default
+     model_type_found = False
      for display_name, internal_name in MODEL_TYPES.items():
          if internal_name == model_type_val:
              model_type_val = display_name
+             model_type_found = True
              break
+     # If still not found, use the first model type
+     if not model_type_found:
+         model_type_val = list(MODEL_TYPES.keys())[0]
+         logger.warning(f"Invalid model type '{model_type_val}', using default: {model_type_val}")

- # Ensure training_type is a display name, not internal name
+ # Ensure training_type is a valid display name
  training_type_val = ui_state.get("training_type", list(TRAINING_TYPES.keys())[0])
  if training_type_val not in TRAINING_TYPES:
-     # Convert from internal to display name
+     # Convert from internal to display name or use default
+     training_type_found = False
      for display_name, internal_name in TRAINING_TYPES.items():
          if internal_name == training_type_val:
              training_type_val = display_name
+             training_type_found = True
              break
+     # If still not found, use the first training type
+     if not training_type_found:
+         training_type_val = list(TRAINING_TYPES.keys())[0]
+         logger.warning(f"Invalid training type '{training_type_val}', using default: {training_type_val}")

+ # Validate training preset
  training_preset = ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0])
- lora_rank_val = ui_state.get("lora_rank", "128")
- lora_alpha_val = ui_state.get("lora_alpha", "128")
- num_epochs_val = int(ui_state.get("num_epochs", 70))
- batch_size_val = int(ui_state.get("batch_size", 1))
- learning_rate_val = float(ui_state.get("learning_rate", 3e-5))
- save_iterations_val = int(ui_state.get("save_iterations", 500))
+ if training_preset not in TRAINING_PRESETS:
+     training_preset = list(TRAINING_PRESETS.keys())[0]
+     logger.warning(f"Invalid training preset '{training_preset}', using default: {training_preset}")
+
+ # Rest of the function remains unchanged
+ lora_rank_val = ui_state.get("lora_rank", DEFAULT_LORA_RANK_STR)
+ lora_alpha_val = ui_state.get("lora_alpha", DEFAULT_LORA_ALPHA_STR)
+ train_steps_val = int(ui_state.get("train_steps", DEFAULT_NB_TRAINING_STEPS))
+ batch_size_val = int(ui_state.get("batch_size", DEFAULT_BATCH_SIZE))
+ learning_rate_val = float(ui_state.get("learning_rate", DEFAULT_LEARNING_RATE))
+ save_iterations_val = int(ui_state.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS))

  # Initial current task value
  current_task_val = ""
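Small logging nit in the fallback branches above: `model_type_val` (and likewise `training_type_val` and `training_preset`) is overwritten before the warning is formatted, so both placeholders print the default rather than the rejected value. Capturing the bad value first fixes the message (sketch):

    if not model_type_found:
        invalid_value = model_type_val                 # keep the rejected value
        model_type_val = list(MODEL_TYPES.keys())[0]   # then fall back
        logger.warning(f"Invalid model type '{invalid_value}', using default: {model_type_val}")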
@@ -259,7 +295,7 @@ class VideoTrainerUI:
      training_type_val,
      lora_rank_val,
      lora_alpha_val,
-     num_epochs_val,
+     train_steps_val,
      batch_size_val,
      learning_rate_val,
      save_iterations_val,
@@ -275,12 +311,12 @@ class VideoTrainerUI:
      ui_state.get("training_preset", list(TRAINING_PRESETS.keys())[0]),
      ui_state.get("model_type", list(MODEL_TYPES.keys())[0]),
      ui_state.get("training_type", list(TRAINING_TYPES.keys())[0]),
-     ui_state.get("lora_rank", "128"),
-     ui_state.get("lora_alpha", "128"),
-     ui_state.get("num_epochs", 70),
-     ui_state.get("batch_size", 1),
-     ui_state.get("learning_rate", 3e-5),
-     ui_state.get("save_iterations", 500)
+     ui_state.get("lora_rank", DEFAULT_LORA_RANK_STR),
+     ui_state.get("lora_alpha", DEFAULT_LORA_ALPHA_STR),
+     ui_state.get("train_steps", DEFAULT_NB_TRAINING_STEPS),
+     ui_state.get("batch_size", DEFAULT_BATCH_SIZE),
+     ui_state.get("learning_rate", DEFAULT_LEARNING_RATE),
+     ui_state.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS)
  )

  def update_ui_state(self, **kwargs):
@@ -296,12 +332,12 @@ class VideoTrainerUI:
      ui_state = self.trainer.load_ui_state()

      # Ensure proper type conversion for numeric values
-     ui_state["lora_rank"] = ui_state.get("lora_rank", "128")
-     ui_state["lora_alpha"] = ui_state.get("lora_alpha", "128")
-     ui_state["num_epochs"] = int(ui_state.get("num_epochs", 70))
-     ui_state["batch_size"] = int(ui_state.get("batch_size", 1))
-     ui_state["learning_rate"] = float(ui_state.get("learning_rate", 3e-5))
-     ui_state["save_iterations"] = int(ui_state.get("save_iterations", 500))
+     ui_state["lora_rank"] = ui_state.get("lora_rank", DEFAULT_LORA_RANK_STR)
+     ui_state["lora_alpha"] = ui_state.get("lora_alpha", DEFAULT_LORA_ALPHA_STR)
+     ui_state["train_steps"] = int(ui_state.get("train_steps", DEFAULT_NB_TRAINING_STEPS))
+     ui_state["batch_size"] = int(ui_state.get("batch_size", DEFAULT_BATCH_SIZE))
+     ui_state["learning_rate"] = float(ui_state.get("learning_rate", DEFAULT_LEARNING_RATE))
+     ui_state["save_iterations"] = int(ui_state.get("save_iterations", DEFAULT_SAVE_CHECKPOINT_EVERY_N_STEPS))

      return ui_state
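The `int(...)`/`float(...)` coercions above raise `ValueError` if a persisted UI-state file carries a malformed value. A tolerant variant would fall back to the default instead (hypothetical helper, not part of this commit):

    def as_int(value, default: int) -> int:
        """Coerce persisted state to int, falling back on bad input."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return default

    ui_state["train_steps"] = as_int(ui_state.get("train_steps"), DEFAULT_NB_TRAINING_STEPS)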
 
 