jbilcke-hf HF Staff committed on
Commit f577b1e · 1 Parent(s): df0584b

investigating captioning issues

Files changed (2)
  1. app.py +3 -3
  2. captioning_service.py +67 -52
app.py CHANGED
@@ -263,15 +263,15 @@ class VideoTrainerUI:
         is_completed = training_state["status"] in ["completed", "error", "stopped"]

         return {
-            start_btn: gr.Button(
+            "start_btn": gr.Button(
                 interactive=not is_training and not is_paused,
                 variant="primary" if not is_training else "secondary",
             ),
-            stop_btn: gr.Button(
+            "stop_btn": gr.Button(
                 interactive=is_training or is_paused,
                 variant="stop",
             ),
-            pause_resume_btn: gr.Button(
+            "pause_resume_btn": gr.Button(
                 value="Resume Training" if is_paused else "Pause Training",
                 interactive=(is_training or is_paused) and not is_completed,
                 variant="secondary",
captioning_service.py CHANGED
@@ -2,8 +2,6 @@ import logging
 import torch
 import shutil
 import gradio as gr
-from llava.model.builder import load_pretrained_model
-from llava.mm_utils import tokenizer_image_token
 import numpy as np
 from decord import VideoReader, cpu
 from pathlib import Path
@@ -12,6 +10,10 @@ import asyncio
 from dataclasses import dataclass
 from datetime import datetime
 import cv2
+import copy
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from llava.conversation import conv_templates, SeparatorStyle
+
 from config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
 from utils import extract_scene_info, is_image_file, is_video_file
 from finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
@@ -142,12 +144,21 @@ class CaptioningService:
         self.model.eval()

     def _load_video(self, video_path: Path, max_frames_num: int = 64, fps: int = 1, force_sample: bool = True) -> tuple[np.ndarray, str, float]:
-        """Load and preprocess video frames"""
-
-        video_path_str = str(video_path) if hasattr(video_path, '__fspath__') else video_path
-
+        """Load and preprocess video frames with strict limits
+
+        Args:
+            video_path: Path to video file
+            max_frames_num: Maximum number of frames to extract (default: 64)
+            fps: Frames per second to sample (default: 1)
+            force_sample: Whether to force uniform sampling (default: True)
+
+        Returns:
+            Tuple of (frames, frame_times_str, video_time)
+        """
+        video_path_str = str(video_path)
         logger.debug(f"Loading video: {video_path_str}")

+        # Handle empty video case
         if max_frames_num == 0:
             return np.zeros((1, 336, 336, 3)), "", 0

@@ -155,17 +166,18 @@ class CaptioningService:
         total_frame_num = len(vr)
         video_time = total_frame_num / vr.get_avg_fps()

-        # Calculate frame indices
+        # Calculate frame indices with uniform sampling
         fps = round(vr.get_avg_fps()/fps)
         frame_idx = [i for i in range(0, len(vr), fps)]
         frame_time = [i/fps for i in frame_idx]

+        # Force uniform sampling if too many frames
         if len(frame_idx) > max_frames_num or force_sample:
             sample_fps = max_frames_num
             uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
             frame_idx = uniform_sampled_frames.tolist()
             frame_time = [i/vr.get_avg_fps() for i in frame_idx]
-
+
         frame_time_str = ",".join([f"{i:.2f}s" for i in frame_time])

         try:
@@ -181,7 +193,7 @@ class CaptioningService:
             video_name = video_path.name
             logger.info(f"Starting processing of video: {video_name}")

-            # Load video metadata
+            # Load video metadata with strict frame limits
             logger.debug(f"Loading video metadata for {video_name}")
             loop = asyncio.get_event_loop()
             vr = await loop.run_in_executor(None, lambda: VideoReader(str(video_path), ctx=cpu(0)))
@@ -201,28 +213,21 @@ class CaptioningService:
             parent_caption = ""
             if "___" in video_path.stem:
                 parent_name, _ = extract_scene_info(video_path.stem)
-                #print(f"parent_name is {parent_name}")
                 parent_txt_path = VIDEOS_TO_SPLIT_PATH / f"{parent_name}.txt"
                 if parent_txt_path.exists():
-                    logger.debug(f"Found parent caption file: {parent_txt_path}")
                     parent_caption = parent_txt_path.read_text().strip()

             # Ensure model is loaded before processing
             await self.ensure_model_loaded()

             if USE_MOCK_CAPTIONING_MODEL:
-
                 # Even in mock mode, we'll generate a caption that shows we processed parent info
                 clip_caption = f"This is a test caption for {video_name}"

                 # Combine clip caption with parent caption
-                if parent_caption and not full_caption.endswith(parent_caption):
-                    #print(f"we have parent_caption, so we define the full_caption as {clip_caption}\n{parent_caption}")
-
+                if parent_caption:
                     full_caption = f"{clip_caption}\n{parent_caption}"
                 else:
-                    #print(f"we don't have a parent_caption, so we define the full_caption as {clip_caption}")
-
                     full_caption = clip_caption

                 if prompt_prefix and not full_caption.startswith(prompt_prefix):
@@ -238,13 +243,12 @@ class CaptioningService:
                 progress.processed_frames = total_frames
                 progress.completed_at = datetime.now()
                 yield progress, full_caption
-
             else:
-                # Process frames in batches
-                max_frames_num = 64
+                # Process frames with strict limits
+                max_frames_num = 64  # Maximum frames supported by the model
                 frames, frame_times_str, video_time = await loop.run_in_executor(
                     None,
-                    lambda: self._load_video(video_path, max_frames_num)
+                    lambda: self._load_video(video_path, max_frames_num, fps=1, force_sample=True)
                 )

                 # Process all frames at once using the image processor
@@ -264,16 +268,27 @@ class CaptioningService:
                 # Move processed frames to GPU
                 video_tensor = processed_frames.to('cuda').bfloat16()

+                # Use proper conversation template and tokens
+                conv_template = "qwen_1_5"
                 time_instruction = (f"The video lasts for {video_time:.2f} seconds, and {len(frames)} "
                                     f"frames are uniformly sampled from it. These frames are located at {frame_times_str}.")
-                full_prompt = f"<image>{time_instruction}\n{prompt}"
+
+                full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{prompt}"
+
+                conv = copy.deepcopy(conv_templates[conv_template])
+                conv.append_message(conv.roles[0], full_question)
+                conv.append_message(conv.roles[1], None)
+                prompt_question = conv.get_prompt()
+
+                # Cap the output length to prevent hallucination
+                max_new_tokens = 512  # Reasonable limit for caption length

                 input_ids = await loop.run_in_executor(
                     None,
-                    lambda: tokenizer_image_token(full_prompt, self.tokenizer, return_tensors="pt").unsqueeze(0).to('cuda')
+                    lambda: tokenizer_image_token(prompt_question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
                 )

-                # Generate caption
+                # Generate caption with controlled parameters
                 with torch.no_grad():
                     output = await loop.run_in_executor(
                         None,
@@ -283,45 +298,45 @@ class CaptioningService:
                             modalities=["video"],
                             do_sample=False,
                             temperature=0,
-                            max_new_tokens=4096,
+                            max_new_tokens=max_new_tokens,
                         )
                     )

-                clip_caption = await loop.run_in_executor(
-                    None,
-                    lambda: self.tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
-                )
-
-                # Combine clip caption with parent caption
-                if parent_caption:
-                    print(f"we have parent_caption, so we define the full_caption as {clip_caption}\n{parent_caption}")
-
-                    full_caption = f"{clip_caption}\n{parent_caption}"
-                else:
-                    print(f"we don't have a parent_caption, so we define the full_caption as {clip_caption}")
-
-                    full_caption = clip_caption
+                clip_caption = await loop.run_in_executor(
+                    None,
+                    lambda: self.tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
+                )

-                if prompt_prefix:
-                    full_caption = f"{prompt_prefix}{full_caption}"
-
+                # Remove the instruction/question part from the response
+                if time_instruction in clip_caption:
+                    clip_caption = clip_caption.split(time_instruction)[1].strip()
+                if prompt in clip_caption:
+                    clip_caption = clip_caption.split(prompt)[1].strip()

-                # Write the caption file
-                txt_path = video_path.with_suffix('.txt')
-                txt_path.write_text(full_caption)
-
-                progress.status = "completed"
-                progress.completed_at = datetime.now()
-                gr.Info(f"Successfully generated caption for {video_name}")
-                yield progress, full_caption
+                # Combine captions with proper formatting
+                if parent_caption:
+                    full_caption = f"{clip_caption}\n{parent_caption}"
+                else:
+                    full_caption = clip_caption
+
+                if prompt_prefix and not full_caption.startswith(prompt_prefix):
+                    full_caption = f"{prompt_prefix}{full_caption}"
+
+                # Write caption
+                txt_path = video_path.with_suffix('.txt')
+                txt_path.write_text(full_caption)
+
+                progress.status = "completed"
+                progress.completed_at = datetime.now()
+                yield progress, full_caption

         except Exception as e:
             progress.status = "error"
             progress.error = str(e)
             progress.completed_at = datetime.now()
             yield progress, None
-            raise gr.Error(f"Error processing video: {str(e)}")
-
+            raise
+
     async def process_image(self, image_path: Path, prompt: str, prompt_prefix: str = "") -> AsyncGenerator[tuple[CaptioningProgress, Optional[str]], None]:
         """Process a single image for captioning"""
         try:
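
For reference, the generation path this diff switches to follows the usual LLaVA-NeXT (LLaVA-Video) pattern: wrap the question in a conversation template with an image token, tokenize with tokenizer_image_token, and generate with modalities=["video"]. Below is a minimal sketch of that flow, not part of the commit; it assumes the llava package imported above, that tokenizer_image_token is still importable from llava.mm_utils, and the helper names and the tokenizer/model/video_tensor arguments are illustrative stand-ins for the attributes the service already loads.

import copy
import torch
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token


def build_video_caption_prompt(prompt: str, video_time: float, frame_times_str: str, num_frames: int) -> str:
    """Wrap the user prompt in the qwen_1_5 conversation template with an image token."""
    time_instruction = (f"The video lasts for {video_time:.2f} seconds, and {num_frames} "
                        f"frames are uniformly sampled from it. These frames are located at {frame_times_str}.")
    conv = copy.deepcopy(conv_templates["qwen_1_5"])
    conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{prompt}")
    conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
    return conv.get_prompt()


def generate_caption(model, tokenizer, video_tensor, prompt_question: str, max_new_tokens: int = 512) -> str:
    """Tokenize the templated prompt and greedily decode a caption for one video tensor."""
    input_ids = tokenizer_image_token(
        prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
    ).unsqueeze(0).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            images=[video_tensor],       # preprocessed frames, e.g. bfloat16 on GPU
            modalities=["video"],
            do_sample=False,
            temperature=0,
            max_new_tokens=max_new_tokens,  # capped, as in the diff, to keep captions short
        )
    return tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()

Stripping the instruction text back out of the decoded string, as the diff does, is only needed when the decode includes the prompt tokens; the caption itself comes from the assistant turn appended last.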