Commit 947f205 by jbilcke-hf · Parent(s): 32b4f0f

ready for the demo
Files changed (5)
  1. .gitignore +1 -0
  2. app.py +21 -13
  3. finetrainers_utils.py +7 -3
  4. training_log_parser.py +1 -1
  5. training_service.py +72 -13
.gitignore CHANGED
@@ -6,3 +6,4 @@ __pycache__
 *.mp4
 *.zip
 training_service.log
+wandb/
app.py CHANGED
@@ -125,8 +125,6 @@ class VideoTrainerUI:
             # Stop captioning if running
             if self.captioner:
                 self.captioner.stop_captioning()
-                #self.captioner.close()
-                #self.captioner = None
                 status_messages["captioning"] = "Captioning stopped"
 
             # Stop scene detection if running
@@ -134,6 +132,12 @@
                 self.splitter.processing = False
                 status_messages["splitting"] = "Scene detection stopped"
 
+            # Properly close logging before clearing log file
+            if self.trainer.file_handler:
+                self.trainer.file_handler.close()
+                logger.removeHandler(self.trainer.file_handler)
+                self.trainer.file_handler = None
+
             if LOG_FILE_PATH.exists():
                 LOG_FILE_PATH.unlink()
 
@@ -153,6 +157,9 @@
             self._should_stop_captioning = True
             self.splitter.processing = False
 
+            # Recreate logging setup
+            self.trainer.setup_logging()
+
             return {
                 "status": "All processes stopped and data cleared",
                 "details": status_messages
@@ -163,7 +170,7 @@
                 "status": f"Error during cleanup: {str(e)}",
                 "details": status_messages
             }
-
+
     def update_titles(self) -> Tuple[Any]:
         """Update all dynamic titles with current counts
 
@@ -664,20 +671,20 @@
             with gr.TabItem("1️⃣ Import", id="import_tab"):
 
                 with gr.Row():
-                    gr.Markdown("## Optional: automated data cleaning")
+                    gr.Markdown("## Automatic splitting and captioning")
 
                 with gr.Row():
                     enable_automatic_video_split = gr.Checkbox(
                         label="Automatically split videos into smaller clips",
                         info="Note: a clip is a single camera shot, usually a few seconds",
                         value=True,
-                        visible=False
+                        visible=True
                     )
                     enable_automatic_content_captioning = gr.Checkbox(
                         label="Automatically caption photos and videos",
                         info="Note: this uses LlaVA and takes some extra time to load and process",
                         value=False,
-                        visible=False,
+                        visible=True,
                     )
 
                 with gr.Row():
@@ -889,13 +896,14 @@
                         interactive=False,
                         lines=4
                     )
-                    log_box = gr.TextArea(
-                        label="Training Logs",
-                        interactive=False,
-                        lines=10,
-                        max_lines=40,
-                        autoscroll=True
-                    )
+                    with gr.Accordion("See training logs"):
+                        log_box = gr.TextArea(
+                            label="Finetrainers output (see HF Space logs for more details)",
+                            interactive=False,
+                            lines=40,
+                            max_lines=200,
+                            autoscroll=True
+                        )
 
             with gr.TabItem("5️⃣ Manage"):
 
 
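For context, a minimal, self-contained sketch of the collapsible log panel pattern that app.py now uses; the log path, the read_logs helper and the refresh button are illustrative assumptions, not code from this commit:

import gradio as gr
from pathlib import Path

LOG_FILE = Path("training_service.log")  # hypothetical location for this demo

def read_logs() -> str:
    # Return the current log contents, or a placeholder when nothing has been written yet
    return LOG_FILE.read_text() if LOG_FILE.exists() else "(no logs yet)"

with gr.Blocks() as demo:
    with gr.Accordion("See training logs", open=False):
        log_box = gr.TextArea(
            label="Finetrainers output",
            interactive=False,
            lines=40,
            max_lines=200,
            autoscroll=True,
        )
    refresh_btn = gr.Button("Refresh logs")
    refresh_btn.click(fn=read_logs, outputs=log_box)

if __name__ == "__main__":
    demo.launch()
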
finetrainers_utils.py CHANGED
@@ -115,9 +115,13 @@ def copy_files_to_training_dir(prompt_prefix: str) -> int:
 
         # make sure we only copy over VALID pairs
         if caption:
-            target_caption_path.write_text(caption)
-            shutil.copy2(file_path, target_file_path)
-            nb_copied_pairs += 1
+            try:
+                target_caption_path.write_text(caption)
+                shutil.copy2(file_path, target_file_path)
+                nb_copied_pairs += 1
+            except Exception as e:
+                print(f"failed to copy one of the pairs: {e}")
+                pass
 
     prepare_finetrainers_dataset()
 
 
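The change above wraps each caption/media copy in a try/except so that one unreadable file no longer aborts the whole import. A hedged, standalone sketch of that "copy only valid pairs, skip failures" idea (directory layout and function name are assumptions, not the repo's code):

import shutil
from pathlib import Path

def copy_valid_pairs(src_dir: Path, dst_dir: Path) -> int:
    """Copy each video together with its .txt caption; skip pairs that fail."""
    dst_dir.mkdir(parents=True, exist_ok=True)
    nb_copied_pairs = 0
    for video_path in src_dir.glob("*.mp4"):
        caption_path = video_path.with_suffix(".txt")
        if not caption_path.exists():
            continue  # no caption -> not a valid pair
        try:
            (dst_dir / caption_path.name).write_text(caption_path.read_text())
            shutil.copy2(video_path, dst_dir / video_path.name)
            nb_copied_pairs += 1
        except Exception as e:
            # a single bad file should not abort the whole copy
            print(f"failed to copy one of the pairs: {e}")
    return nb_copied_pairs
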
training_log_parser.py CHANGED
@@ -71,7 +71,7 @@ class TrainingLogParser:
         # Training step progress line example:
         # Training steps: 1%|▏ | 1/70 [00:14<16:11, 14.08s/it, grad_norm=0.00789, step_loss=0.555, lr=3e-7]
 
-        if ("Started training" in line) or (("Starting training" in line):
+        if ("Started training" in line) or ("Starting training" in line):
             self.state.status = "training"
 
         if "Training steps:" in line:
training_service.py CHANGED
@@ -23,15 +23,6 @@ from config import TrainingConfig, LOG_FILE_PATH, TRAINING_VIDEOS_PATH, STORAGE_
 from utils import make_archive, parse_training_log, is_image_file, is_video_file
 from finetrainers_utils import prepare_finetrainers_dataset, copy_files_to_training_dir
 
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.StreamHandler(sys.stdout),
-        logging.FileHandler(str(LOG_FILE_PATH))
-    ]
-)
 logger = logging.getLogger(__name__)
 
 class TrainingService:
@@ -41,8 +32,69 @@ class TrainingService:
         self.status_file = OUTPUT_PATH / "status.json"
         self.pid_file = OUTPUT_PATH / "training.pid"
         self.log_file = OUTPUT_PATH / "training.log"
+
+        self.file_handler = None
+        self.setup_logging()
+
         logger.info("Training service initialized")
 
+    def setup_logging(self):
+        """Set up logging with proper handler management"""
+        global logger
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.INFO)
+
+        # Remove any existing handlers to avoid duplicates
+        logger.handlers.clear()
+
+        # Add stdout handler
+        stdout_handler = logging.StreamHandler(sys.stdout)
+        stdout_handler.setFormatter(logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        ))
+        logger.addHandler(stdout_handler)
+
+        # Add file handler if log file is accessible
+        try:
+            # Close existing file handler if it exists
+            if self.file_handler:
+                self.file_handler.close()
+                logger.removeHandler(self.file_handler)
+
+            self.file_handler = logging.FileHandler(str(LOG_FILE_PATH))
+            self.file_handler.setFormatter(logging.Formatter(
+                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+            ))
+            logger.addHandler(self.file_handler)
+        except Exception as e:
+            logger.warning(f"Could not set up log file: {e}")
+
+    def clear_logs(self) -> None:
+        """Clear log file with proper handler cleanup"""
+        try:
+            # Remove and close the file handler
+            if self.file_handler:
+                logger.removeHandler(self.file_handler)
+                self.file_handler.close()
+                self.file_handler = None
+
+            # Delete the file if it exists
+            if LOG_FILE_PATH.exists():
+                LOG_FILE_PATH.unlink()
+
+            # Recreate logging setup
+            self.setup_logging()
+            self.append_log("Log file cleared and recreated")
+
+        except Exception as e:
+            logger.error(f"Error clearing logs: {e}")
+            raise
+
+    def __del__(self):
+        """Cleanup when the service is destroyed"""
+        if self.file_handler:
+            self.file_handler.close()
+
     def save_session(self, params: Dict) -> None:
         """Save training session parameters"""
         session_data = {
@@ -73,7 +125,7 @@
         try:
             with open(self.status_file, 'r') as f:
                 status = json.load(f)
-                print("status found in the json:", status)
+                #print("status found in the json:", status)
 
             # Check if process is actually running
             if self.pid_file.exists():
@@ -81,7 +133,7 @@
                     pid = int(f.read().strip())
                     if not psutil.pid_exists(pid):
                         # Process died unexpectedly
-                        if status['status'] == 'running':
+                        if status['status'] == 'training':
                             status['status'] = 'error'
                             status['message'] = 'Training process terminated unexpectedly'
                             self.append_log("Training process terminated unexpectedly")
@@ -302,7 +354,7 @@
         # Update initial training status
         total_steps = num_epochs * (max(1, video_count) // batch_size)
         self.save_status(
-            state='running',
+            state='training',
             epoch=0,
             step=0,
             total_steps=total_steps,
@@ -389,7 +441,7 @@
 
             if psutil.pid_exists(pid):
                 os.kill(pid, signal.SIGUSR2)  # Signal to resume
-                self.save_status(state='running', message='Training resumed')
+                self.save_status(state='training', message='Training resumed')
                 self.append_log("Training resumed")
 
                 return "Training resumed", self.get_logs()
@@ -437,6 +489,13 @@
             'timestamp': datetime.now().isoformat(),
             **kwargs
        }
+        if state == "Training started" or state == "initializing":
+            gr.Info("Initializing model and dataset..")
+        elif state == "training":
+            gr.Info("Training started!")
+        elif state == "completed":
+            gr.Info("Training completed!")
+
         with open(self.status_file, 'w') as f:
             json.dump(status, f, indent=2)
 
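Besides the logging rework, the diff renames the in-progress state from 'running' to 'training'. A hedged sketch of the dead-process check that relies on this state (file names and layout are assumptions made for illustration):

import json
from pathlib import Path

import psutil

STATUS_FILE = Path("output/status.json")  # assumed location
PID_FILE = Path("output/training.pid")    # assumed location

def check_status() -> dict:
    """Flag an error when the recorded training process is no longer alive."""
    status = json.loads(STATUS_FILE.read_text())
    if PID_FILE.exists():
        pid = int(PID_FILE.read_text().strip())
        # Only report an error if we believed training was still in progress
        if not psutil.pid_exists(pid) and status["status"] == "training":
            status["status"] = "error"
            status["message"] = "Training process terminated unexpectedly"
            STATUS_FILE.write_text(json.dumps(status, indent=2))
    return status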