Spaces:

jbilcke-hf
/

VideoModelStudio

Running

App Files Files Community

jbilcke-hf HF Staff committed on Feb 26

Commit

dd52c86

1 Parent(s): 82a6631

working on better captioning status tracking

Browse files

Files changed (8) hide show

app.py +119 -23
vms/captioning_service.py +3 -3
vms/config.py +1 -1
vms/finetrainers_utils.py +3 -2
vms/image_preprocessing.py +2 -1
vms/import_service.py +3 -3
vms/splitting_service.py +4 -4
vms/training_service.py +4 -4

app.py CHANGED Viewed

@@ -60,6 +60,71 @@ class VideoTrainerUI:
         self._should_stop_captioning = False
         self.log_parser = TrainingLogParser()
     def update_training_ui(self, training_state: Dict[str, Any]):
         """Update UI components based on training state"""
         updates = {}
@@ -221,48 +286,75 @@ class VideoTrainerUI:
             # Initialize captioner if not already done
             self._should_stop_captioning = False
             async for rows in self.captioner.start_caption_generation(captioning_bot_instructions, prompt_prefix):
                 # Yield UI update
                 yield gr.update(
-                    value=rows,
                     headers=["name", "status"]
                 )
-            # Final update after completion
             yield gr.update(
                 value=self.list_training_files_to_caption(),
                 headers=["name", "status"]
             )
         except Exception as e:
             yield gr.update(
-                value=[[str(e), "error"]],
                 headers=["name", "status"]
             )
     def list_training_files_to_caption(self) -> List[List[str]]:
         """List all clips and images - both pending and captioned"""
         files = []
-        already_listed: Dict[str, bool] = {}
-        # Check files in STAGING_PATH
         for file in STAGING_PATH.glob("*.*"):
             if is_video_file(file) or is_image_file(file):
                 txt_file = file.with_suffix('.txt')
-                status = "captioned" if txt_file.exists() else "no caption"
                 file_type = "video" if is_video_file(file) else "image"
                 files.append([file.name, f"{status} ({file_type})", str(file)])
-                already_listed[str(file.name)] = True
-        # Check files in TRAINING_VIDEOS_PATH
         for file in TRAINING_VIDEOS_PATH.glob("*.*"):
-            if not str(file.name) in already_listed:
-                if is_video_file(file) or is_image_file(file):
-                    txt_file = file.with_suffix('.txt')
-                    if txt_file.exists():
-                        file_type = "video" if is_video_file(file) else "image"
-                        files.append([file.name, f"captioned ({file_type})", str(file)])
         # Sort by filename
         files.sort(key=lambda x: x[0])
@@ -1106,24 +1198,28 @@ class VideoTrainerUI:
                 }
             run_autocaption_btn.click(
                 fn=self.start_caption_generation,
                 inputs=[captioning_bot_instructions, custom_prompt_prefix],
                 outputs=[training_dataset],
             ).then(
-                fn=lambda: update_button_states(True),
-                outputs=[run_autocaption_btn, stop_autocaption_btn]
             )
             copy_files_to_training_dir_btn.click(
                 fn=self.copy_files_to_training_dir,
                 inputs=[custom_prompt_prefix]
             )
             stop_autocaption_btn.click(
-                fn=lambda: (self.captioner.stop_captioning() if self.captioner else None, update_button_states(False)),
-                outputs=[run_autocaption_btn, stop_autocaption_btn]
             )
             training_dataset.select(
                 fn=self.handle_training_dataset_select,
                 outputs=[preview_image, preview_video, preview_caption, preview_status]

         self._should_stop_captioning = False
         self.log_parser = TrainingLogParser()
+    def update_captioning_buttons_start(self):
+        """Return the button states to display while captioning is running.
+
+        The values are returned as a tuple in the same order as the event's
+        ``outputs`` list: (run_autocaption_btn, stop_autocaption_btn,
+        copy_files_to_training_dir_btn). Gradio only maps dict returns whose
+        keys are the component objects themselves, so a dict keyed by name
+        strings cannot be spread over the three outputs.
+        """
+        return (
+            # run_autocaption_btn: disabled while a run is in progress
+            gr.Button(
+                interactive=False,
+                variant="secondary",
+            ),
+            # stop_autocaption_btn: the only enabled action during a run
+            gr.Button(
+                interactive=True,
+                variant="stop",
+            ),
+            # copy_files_to_training_dir_btn: disabled until captioning ends
+            gr.Button(
+                interactive=False,
+                variant="secondary",
+            ),
+        )
+    def update_captioning_buttons_end(self):
+        """Return the button states to display once captioning has finished.
+
+        The values are returned as a tuple in the same order as the event's
+        ``outputs`` list: (run_autocaption_btn, stop_autocaption_btn,
+        copy_files_to_training_dir_btn). Gradio only maps dict returns whose
+        keys are the component objects themselves, so a dict keyed by name
+        strings cannot be spread over the three outputs.
+        """
+        return (
+            # run_autocaption_btn: available again for a new run
+            gr.Button(
+                interactive=True,
+                variant="primary",
+            ),
+            # stop_autocaption_btn: nothing left to stop
+            gr.Button(
+                interactive=False,
+                variant="secondary",
+            ),
+            # copy_files_to_training_dir_btn: re-enabled after the run
+            gr.Button(
+                interactive=True,
+                variant="primary",
+            ),
+        )
+    def show_refreshing_status(self) -> List[List[str]]:
+        """Build the single placeholder row shown in the dataframe during a refresh."""
+        placeholder_row = ["Refreshing...", "please wait"]
+        return [placeholder_row]
+    def stop_captioning(self):
+        """Stop the ongoing captioning process and reset the UI state.
+
+        Sets the stop flag, asks the captioner (when one exists) to stop, and
+        reloads the file list. Any exception is logged and reported as a
+        single error row instead of being raised.
+
+        Returns a tuple in the same order as the event's ``outputs`` list:
+        (training_dataset, run_autocaption_btn, stop_autocaption_btn,
+        copy_files_to_training_dir_btn). Gradio only maps dict returns whose
+        keys are the component objects themselves, so a dict keyed by name
+        strings cannot be spread over the four outputs.
+        """
+        try:
+            # Set flag to stop captioning
+            self._should_stop_captioning = True
+            # Call stop method on captioner
+            if self.captioner:
+                self.captioner.stop_captioning()
+            # Get updated file list
+            dataset_update = gr.update(value=self.list_training_files_to_caption())
+        except Exception as e:
+            logger.error(f"Error stopping captioning: {str(e)}")
+            dataset_update = gr.update(value=[[f"Error stopping captioning: {str(e)}", "error"]])
+        # Success and failure both leave the buttons in the same idle state
+        return (
+            dataset_update,
+            gr.Button(interactive=True, variant="primary"),
+            gr.Button(interactive=False, variant="secondary"),
+            gr.Button(interactive=True, variant="primary"),
+        )
     def update_training_ui(self, training_state: Dict[str, Any]):
         """Update UI components based on training state"""
         updates = {}
             # Initialize captioner if not already done
             self._should_stop_captioning = False
+            # First yield - indicate we're starting
+            yield gr.update(
+                value=[["Starting captioning service...", "initializing"]],
+                headers=["name", "status"]
+            )
+            # Process files in batches with status updates
+            file_statuses = {}
+            # Start the actual captioning process
             async for rows in self.captioner.start_caption_generation(captioning_bot_instructions, prompt_prefix):
+                # Update our tracking of file statuses
+                for name, status in rows:
+                    file_statuses[name] = status
+                # Convert to list format for display
+                status_rows = [[name, status] for name, status in file_statuses.items()]
+                # Sort by name for consistent display
+                status_rows.sort(key=lambda x: x[0])
                 # Yield UI update
                 yield gr.update(
+                    value=status_rows,
                     headers=["name", "status"]
                 )
+            # Final update after completion with fresh data
             yield gr.update(
                 value=self.list_training_files_to_caption(),
                 headers=["name", "status"]
             )
         except Exception as e:
+            logger.error(f"Error in captioning: {str(e)}")
             yield gr.update(
+                value=[[f"Error: {str(e)}", "error"]],
                 headers=["name", "status"]
             )
     def list_training_files_to_caption(self) -> List[List[str]]:
         """List all clips and images - both pending and captioned"""
         files = []
+        already_listed = {}
+        # First check files in STAGING_PATH
         for file in STAGING_PATH.glob("*.*"):
             if is_video_file(file) or is_image_file(file):
                 txt_file = file.with_suffix('.txt')
+                # Check if caption file exists and has content
+                has_caption = txt_file.exists() and txt_file.stat().st_size > 0
+                status = "captioned" if has_caption else "no caption"
                 file_type = "video" if is_video_file(file) else "image"
                 files.append([file.name, f"{status} ({file_type})", str(file)])
+                already_listed[file.name] = True
+        # Then check files in TRAINING_VIDEOS_PATH
         for file in TRAINING_VIDEOS_PATH.glob("*.*"):
+            if (is_video_file(file) or is_image_file(file)) and file.name not in already_listed:
+                txt_file = file.with_suffix('.txt')
+                # Only include files with captions
+                if txt_file.exists() and txt_file.stat().st_size > 0:
+                    file_type = "video" if is_video_file(file) else "image"
+                    files.append([file.name, f"captioned ({file_type})", str(file)])
+                    already_listed[file.name] = True
         # Sort by filename
         files.sort(key=lambda x: x[0])
                 }
             run_autocaption_btn.click(
+                fn=self.show_refreshing_status,
+                outputs=[training_dataset]
+            ).then(
+                fn=lambda: self.update_captioning_buttons_start(),
+                outputs=[run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
+            ).then(
                 fn=self.start_caption_generation,
                 inputs=[captioning_bot_instructions, custom_prompt_prefix],
                 outputs=[training_dataset],
             ).then(
+                fn=lambda: self.update_captioning_buttons_end(),
+                outputs=[run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
             )
             copy_files_to_training_dir_btn.click(
                 fn=self.copy_files_to_training_dir,
                 inputs=[custom_prompt_prefix]
             )
             stop_autocaption_btn.click(
+                fn=self.stop_captioning,
+                outputs=[training_dataset, run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
             )
             training_dataset.select(
                 fn=self.handle_training_dataset_select,
                 outputs=[preview_image, preview_video, preview_caption, preview_status]

vms/captioning_service.py CHANGED Viewed

@@ -17,9 +17,9 @@ from llava.mm_utils import tokenizer_image_token
 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 from llava.conversation import conv_templates, SeparatorStyle
-from config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
-from utils import extract_scene_info, is_image_file, is_video_file
-from finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
 logger = logging.getLogger(__name__)

 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 from llava.conversation import conv_templates, SeparatorStyle
+from .config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
+from .utils import extract_scene_info, is_image_file, is_video_file
+from .finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
 logger = logging.getLogger(__name__)

vms/config.py CHANGED Viewed

@@ -29,7 +29,7 @@ DEFAULT_PROMPT_PREFIX = "In the style of TOK, "
 # This is only use to debug things in local
 USE_MOCK_CAPTIONING_MODEL = parse_bool_env(os.environ.get('USE_MOCK_CAPTIONING_MODEL'))
-DEFAULT_CAPTIONING_BOT_INSTRUCTIONS = "Please write a full description of the following video: camera (close-up shot, medium-shot..), genre (music video, horror movie scene, video game footage, go pro footage, japanese anime, noir film, science-fiction, action movie, documentary..), characters (physical appearance, look, skin, facial features, haircut, clothing), scene (action, positions, movements), location (indoor, outdoor, place, building, country..), time and lighting (natural, golden hour, night time, LED lights, kelvin temperature etc), weather and climate (dusty, rainy, fog, haze, snowing..), era/settings"
 # Create directories
 STORAGE_PATH.mkdir(parents=True, exist_ok=True)

 # This is only use to debug things in local
 USE_MOCK_CAPTIONING_MODEL = parse_bool_env(os.environ.get('USE_MOCK_CAPTIONING_MODEL'))
+# Default instructions sent to the captioning model. The inner double quotes
+# are escaped so they stay part of the string instead of terminating it.
+DEFAULT_CAPTIONING_BOT_INSTRUCTIONS = "Please write a full video description.  Be synthetic, don't say things like \"this video features..\" etc. Instead, methodically list camera (close-up shot, medium-shot..), genre (music video, horror movie scene, video game footage, go pro footage, japanese anime, noir film, science-fiction, action movie, documentary..), characters (physical appearance, look, skin, facial features, haircut, clothing), scene (action, positions, movements), location (indoor, outdoor, place, building, country..), time and lighting (natural, golden hour, night time, LED lights, kelvin temperature etc), weather and climate (dusty, rainy, fog, haze, snowing..), era/settings."
 # Create directories
 STORAGE_PATH.mkdir(parents=True, exist_ok=True)

vms/finetrainers_utils.py CHANGED Viewed

@@ -3,8 +3,9 @@ from pathlib import Path
 import logging
 import shutil
 from typing import Any, Optional, Dict, List, Union, Tuple
-from config import STORAGE_PATH, TRAINING_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
-from utils import get_video_fps, extract_scene_info, make_archive, is_image_file, is_video_file
 logger = logging.getLogger(__name__)

 import logging
 import shutil
 from typing import Any, Optional, Dict, List, Union, Tuple
+from .config import STORAGE_PATH, TRAINING_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
+from .utils import get_video_fps, extract_scene_info, make_archive, is_image_file, is_video_file
 logger = logging.getLogger(__name__)

vms/image_preprocessing.py CHANGED Viewed

@@ -4,7 +4,8 @@ from pathlib import Path
 from PIL import Image
 import pillow_avif
 import logging
-from config import NORMALIZE_IMAGES_TO, JPEG_QUALITY
 logger = logging.getLogger(__name__)

 from PIL import Image
 import pillow_avif
 import logging
+from .config import NORMALIZE_IMAGES_TO, JPEG_QUALITY
 logger = logging.getLogger(__name__)

vms/import_service.py CHANGED Viewed

@@ -7,10 +7,10 @@ from pathlib import Path
 from typing import List, Dict, Optional, Tuple
 from pytubefix import YouTube
 import logging
-from utils import is_image_file, is_video_file, add_prefix_to_caption
-from image_preprocessing import normalize_image
-from config import NORMALIZE_IMAGES_TO, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, TRAINING_PATH, DEFAULT_PROMPT_PREFIX
 logger = logging.getLogger(__name__)

 from typing import List, Dict, Optional, Tuple
 from pytubefix import YouTube
 import logging
+from .utils import is_image_file, is_video_file, add_prefix_to_caption
+from .image_preprocessing import normalize_image
+from .config import NORMALIZE_IMAGES_TO, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, TRAINING_PATH, DEFAULT_PROMPT_PREFIX
 logger = logging.getLogger(__name__)

vms/splitting_service.py CHANGED Viewed

@@ -12,11 +12,11 @@ import gradio as gr
 from scenedetect import detect, ContentDetector, SceneManager, open_video
 from scenedetect.video_splitter import split_video_ffmpeg
-from config import TRAINING_PATH, STORAGE_PATH, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, DEFAULT_PROMPT_PREFIX
-from image_preprocessing import detect_black_bars
-from video_preprocessing import remove_black_bars
-from utils import extract_scene_info, is_video_file, is_image_file, add_prefix_to_caption
 logger = logging.getLogger(__name__)

 from scenedetect import detect, ContentDetector, SceneManager, open_video
 from scenedetect.video_splitter import split_video_ffmpeg
+from .config import TRAINING_PATH, STORAGE_PATH, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, DEFAULT_PROMPT_PREFIX
+from .image_preprocessing import detect_black_bars
+from .video_preprocessing import remove_black_bars
+from .utils import extract_scene_info, is_video_file, is_image_file, add_prefix_to_caption
 logger = logging.getLogger(__name__)

vms/training_service.py CHANGED Viewed

@@ -18,10 +18,10 @@ import select
 from typing import Any, Optional, Dict, List, Union, Tuple
-from huggingface_hub import upload_folder, create_repo
-from config import TrainingConfig, TRAINING_PRESETS,  LOG_FILE_PATH, TRAINING_VIDEOS_PATH, STORAGE_PATH, TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
-from utils import make_archive, parse_training_log, is_image_file, is_video_file
-from finetrainers_utils import prepare_finetrainers_dataset, copy_files_to_training_dir
 logger = logging.getLogger(__name__)

 from typing import Any, Optional, Dict, List, Union, Tuple
+# huggingface_hub is a third-party package, not a module of this package,
+# so it must stay an absolute import.
+from huggingface_hub import upload_folder, create_repo
+from .config import TrainingConfig, TRAINING_PRESETS,  LOG_FILE_PATH, TRAINING_VIDEOS_PATH, STORAGE_PATH, TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
+from .utils import make_archive, parse_training_log, is_image_file, is_video_file
+from .finetrainers_utils import prepare_finetrainers_dataset, copy_files_to_training_dir
 logger = logging.getLogger(__name__)