jbilcke-hf HF Staff committed on
Commit f577b1e · 1 Parent(s): df0584b

investigating captioning issues

Files changed (2)
  1. app.py +3 -3
  2. captioning_service.py +67 -52
app.py CHANGED
@@ -263,15 +263,15 @@ class VideoTrainerUI:
         is_completed = training_state["status"] in ["completed", "error", "stopped"]

         return {
-            start_btn: gr.Button(
+            "start_btn": gr.Button(
                 interactive=not is_training and not is_paused,
                 variant="primary" if not is_training else "secondary",
             ),
-            stop_btn: gr.Button(
+            "stop_btn": gr.Button(
                 interactive=is_training or is_paused,
                 variant="stop",
             ),
-            pause_resume_btn: gr.Button(
+            "pause_resume_btn": gr.Button(
                 value="Resume Training" if is_paused else "Pause Training",
                 interactive=(is_training or is_paused) and not is_completed,
                 variant="secondary",
captioning_service.py CHANGED
@@ -2,8 +2,6 @@ import logging
 import torch
 import shutil
 import gradio as gr
-from llava.model.builder import load_pretrained_model
-from llava.mm_utils import tokenizer_image_token
 import numpy as np
 from decord import VideoReader, cpu
 from pathlib import Path
@@ -12,6 +10,10 @@ import asyncio
 from dataclasses import dataclass
 from datetime import datetime
 import cv2
+import copy
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from llava.conversation import conv_templates, SeparatorStyle
+
 from config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
 from utils import extract_scene_info, is_image_file, is_video_file
 from finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
@@ -142,12 +144,21 @@ class CaptioningService:
         self.model.eval()

     def _load_video(self, video_path: Path, max_frames_num: int = 64, fps: int = 1, force_sample: bool = True) -> tuple[np.ndarray, str, float]:
-        """Load and preprocess video frames"""
-
-        video_path_str = str(video_path) if hasattr(video_path, '__fspath__') else video_path
-
+        """Load and preprocess video frames with strict limits
+
+        Args:
+            video_path: Path to video file
+            max_frames_num: Maximum number of frames to extract (default: 64)
+            fps: Frames per second to sample (default: 1)
+            force_sample: Whether to force uniform sampling (default: True)
+
+        Returns:
+            Tuple of (frames, frame_times_str, video_time)
+        """
+        video_path_str = str(video_path)
         logger.debug(f"Loading video: {video_path_str}")

+        # Handle empty video case
         if max_frames_num == 0:
             return np.zeros((1, 336, 336, 3)), "", 0

@@ -155,17 +166,18 @@ class CaptioningService:
         total_frame_num = len(vr)
         video_time = total_frame_num / vr.get_avg_fps()

-        # Calculate frame indices
+        # Calculate frame indices with uniform sampling
         fps = round(vr.get_avg_fps()/fps)
         frame_idx = [i for i in range(0, len(vr), fps)]
         frame_time = [i/fps for i in frame_idx]

+        # Force uniform sampling if too many frames
         if len(frame_idx) > max_frames_num or force_sample:
             sample_fps = max_frames_num
             uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
             frame_idx = uniform_sampled_frames.tolist()
             frame_time = [i/vr.get_avg_fps() for i in frame_idx]
-
+
         frame_time_str = ",".join([f"{i:.2f}s" for i in frame_time])

         try:
@@ -181,7 +193,7 @@ class CaptioningService:
             video_name = video_path.name
             logger.info(f"Starting processing of video: {video_name}")

-            # Load video metadata
+            # Load video metadata with strict frame limits
             logger.debug(f"Loading video metadata for {video_name}")
             loop = asyncio.get_event_loop()
             vr = await loop.run_in_executor(None, lambda: VideoReader(str(video_path), ctx=cpu(0)))
@@ -201,28 +213,21 @@ class CaptioningService:
             parent_caption = ""
             if "___" in video_path.stem:
                 parent_name, _ = extract_scene_info(video_path.stem)
-                #print(f"parent_name is {parent_name}")
                 parent_txt_path = VIDEOS_TO_SPLIT_PATH / f"{parent_name}.txt"
                 if parent_txt_path.exists():
-                    logger.debug(f"Found parent caption file: {parent_txt_path}")
                     parent_caption = parent_txt_path.read_text().strip()

             # Ensure model is loaded before processing
             await self.ensure_model_loaded()

             if USE_MOCK_CAPTIONING_MODEL:
-
                 # Even in mock mode, we'll generate a caption that shows we processed parent info
                 clip_caption = f"This is a test caption for {video_name}"

                 # Combine clip caption with parent caption
-                if parent_caption and not full_caption.endswith(parent_caption):
-                    #print(f"we have parent_caption, so we define the full_caption as {clip_caption}\n{parent_caption}")
-
+                if parent_caption:
                     full_caption = f"{clip_caption}\n{parent_caption}"
                 else:
-                    #print(f"we don't have a parent_caption, so we define the full_caption as {clip_caption}")
-
                     full_caption = clip_caption

                 if prompt_prefix and not full_caption.startswith(prompt_prefix):
@@ -238,13 +243,12 @@ class CaptioningService:
                 progress.processed_frames = total_frames
                 progress.completed_at = datetime.now()
                 yield progress, full_caption
-
             else:
-                # Process frames in batches
-                max_frames_num = 64
+                # Process frames with strict limits
+                max_frames_num = 64  # Maximum frames supported by the model
                 frames, frame_times_str, video_time = await loop.run_in_executor(
                     None,
-                    lambda: self._load_video(video_path, max_frames_num)
+                    lambda: self._load_video(video_path, max_frames_num, fps=1, force_sample=True)
                 )

                 # Process all frames at once using the image processor
@@ -264,16 +268,27 @@ class CaptioningService:
                 # Move processed frames to GPU
                 video_tensor = processed_frames.to('cuda').bfloat16()

+                # Use proper conversation template and tokens
+                conv_template = "qwen_1_5"
                 time_instruction = (f"The video lasts for {video_time:.2f} seconds, and {len(frames)} "
                                     f"frames are uniformly sampled from it. These frames are located at {frame_times_str}.")
-                full_prompt = f"<image>{time_instruction}\n{prompt}"
+
+                full_question = DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{prompt}"
+
+                conv = copy.deepcopy(conv_templates[conv_template])
+                conv.append_message(conv.roles[0], full_question)
+                conv.append_message(conv.roles[1], None)
+                prompt_question = conv.get_prompt()
+
+                # Cap the output length to prevent hallucination
+                max_new_tokens = 512  # Reasonable limit for caption length

                 input_ids = await loop.run_in_executor(
                     None,
-                    lambda: tokenizer_image_token(full_prompt, self.tokenizer, return_tensors="pt").unsqueeze(0).to('cuda')
+                    lambda: tokenizer_image_token(prompt_question, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
                 )

-                # Generate caption
+                # Generate caption with controlled parameters
                 with torch.no_grad():
                     output = await loop.run_in_executor(
                         None,
@@ -283,45 +298,45 @@ class CaptioningService:
                             modalities=["video"],
                             do_sample=False,
                             temperature=0,
-                            max_new_tokens=4096,
+                            max_new_tokens=max_new_tokens,
                         )
                     )

-                clip_caption = await loop.run_in_executor(
-                    None,
-                    lambda: self.tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
-                )
-
-                # Combine clip caption with parent caption
-                if parent_caption:
-                    print(f"we have parent_caption, so we define the full_caption as {clip_caption}\n{parent_caption}")
-
-                    full_caption = f"{clip_caption}\n{parent_caption}"
-                else:
-                    print(f"we don't have a parent_caption, so we define the full_caption as {clip_caption}")
-
-                    full_caption = clip_caption
+                clip_caption = await loop.run_in_executor(
+                    None,
+                    lambda: self.tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()
+                )

-                if prompt_prefix:
-                    full_caption = f"{prompt_prefix}{full_caption}"
-
+                # Remove the instruction/question part from the response
+                if time_instruction in clip_caption:
+                    clip_caption = clip_caption.split(time_instruction)[1].strip()
+                if prompt in clip_caption:
+                    clip_caption = clip_caption.split(prompt)[1].strip()

-                # Write the caption file
-                txt_path = video_path.with_suffix('.txt')
-                txt_path.write_text(full_caption)
-
-                progress.status = "completed"
-                progress.completed_at = datetime.now()
-                gr.Info(f"Successfully generated caption for {video_name}")
-                yield progress, full_caption
+                # Combine captions with proper formatting
+                if parent_caption:
+                    full_caption = f"{clip_caption}\n{parent_caption}"
+                else:
+                    full_caption = clip_caption
+
+                if prompt_prefix and not full_caption.startswith(prompt_prefix):
+                    full_caption = f"{prompt_prefix}{full_caption}"
+
+                # Write caption
+                txt_path = video_path.with_suffix('.txt')
+                txt_path.write_text(full_caption)
+
+                progress.status = "completed"
+                progress.completed_at = datetime.now()
+                yield progress, full_caption

         except Exception as e:
             progress.status = "error"
             progress.error = str(e)
             progress.completed_at = datetime.now()
             yield progress, None
-            raise gr.Error(f"Error processing video: {str(e)}")
-
+            raise
+
     async def process_image(self, image_path: Path, prompt: str, prompt_prefix: str = "") -> AsyncGenerator[tuple[CaptioningProgress, Optional[str]], None]:
         """Process a single image for captioning"""
         try:
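
For reference, the generation path this diff switches to follows the usual LLaVA-NeXT (LLaVA-Video) pattern: wrap the question in a conversation template with an image token, tokenize with tokenizer_image_token, and generate with modalities=["video"]. Below is a minimal sketch of that flow, not part of the commit; it assumes the llava package imported above, that tokenizer_image_token is still importable from llava.mm_utils, and the helper names and the tokenizer/model/video_tensor arguments are illustrative stand-ins for the attributes the service already loads.

import copy
import torch
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token


def build_video_caption_prompt(prompt: str, video_time: float, frame_times_str: str, num_frames: int) -> str:
    """Wrap the user prompt in the qwen_1_5 conversation template with an image token."""
    time_instruction = (f"The video lasts for {video_time:.2f} seconds, and {num_frames} "
                        f"frames are uniformly sampled from it. These frames are located at {frame_times_str}.")
    conv = copy.deepcopy(conv_templates["qwen_1_5"])
    conv.append_message(conv.roles[0], DEFAULT_IMAGE_TOKEN + f"{time_instruction}\n{prompt}")
    conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
    return conv.get_prompt()


def generate_caption(model, tokenizer, video_tensor, prompt_question: str, max_new_tokens: int = 512) -> str:
    """Tokenize the templated prompt and greedily decode a caption for one video tensor."""
    input_ids = tokenizer_image_token(
        prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
    ).unsqueeze(0).to(model.device)
    with torch.no_grad():
        output = model.generate(
            input_ids,
            images=[video_tensor],       # preprocessed frames, e.g. bfloat16 on GPU
            modalities=["video"],
            do_sample=False,
            temperature=0,
            max_new_tokens=max_new_tokens,  # capped, as in the diff, to keep captions short
        )
    return tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()

Stripping the instruction text back out of the decoded string, as the diff does, is only needed when the decode includes the prompt tokens; the caption itself comes from the assistant turn appended last.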