Commit dd52c86 by jbilcke-hf (HF Staff)
1 Parent(s): 82a6631

working on better captioning status tracking

app.py CHANGED
@@ -60,6 +60,71 @@ class VideoTrainerUI:
         self._should_stop_captioning = False
         self.log_parser = TrainingLogParser()
 
+    def update_captioning_buttons_start(self):
+        return (
+            gr.Button(
+                interactive=False,
+                variant="secondary",
+            ),
+            gr.Button(
+                interactive=True,
+                variant="stop",
+            ),
+            gr.Button(
+                interactive=False,
+                variant="secondary",
+            )
+        )
+
+    def update_captioning_buttons_end(self):
+        return (
+            gr.Button(
+                interactive=True,
+                variant="primary",
+            ),
+            gr.Button(
+                interactive=False,
+                variant="secondary",
+            ),
+            gr.Button(
+                interactive=True,
+                variant="primary",
+            )
+        )
+
+    def show_refreshing_status(self) -> List[List[str]]:
+        """Show a 'Refreshing...' status in the dataframe"""
+        return [["Refreshing...", "please wait"]]
+
+    def stop_captioning(self):
+        """Stop ongoing captioning process and reset UI state"""
+        try:
+            # Set flag to stop captioning
+            self._should_stop_captioning = True
+
+            # Call stop method on captioner
+            if self.captioner:
+                self.captioner.stop_captioning()
+
+            # Get updated file list
+            updated_list = self.list_training_files_to_caption()
+
+            # Return positional updates for [training_dataset, run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
+            return (
+                gr.update(value=updated_list),
+                gr.Button(interactive=True, variant="primary"),
+                gr.Button(interactive=False, variant="secondary"),
+                gr.Button(interactive=True, variant="primary"),
+            )
+        except Exception as e:
+            logger.error(f"Error stopping captioning: {str(e)}")
+            return (
+                gr.update(value=[[f"Error stopping captioning: {str(e)}", "error"]]),
+                gr.Button(interactive=True, variant="primary"),
+                gr.Button(interactive=False, variant="secondary"),
+                gr.Button(interactive=True, variant="primary"),
+            )
+
     def update_training_ui(self, training_state: Dict[str, Any]):
         """Update UI components based on training state"""
         updates = {}
@@ -221,48 +286,75 @@ class VideoTrainerUI:
             # Initialize captioner if not already done
            self._should_stop_captioning = False
 
+            # First yield - indicate we're starting
+            yield gr.update(
+                value=[["Starting captioning service...", "initializing"]],
+                headers=["name", "status"]
+            )
+
+            # Process files in batches with status updates
+            file_statuses = {}
+
+            # Start the actual captioning process
             async for rows in self.captioner.start_caption_generation(captioning_bot_instructions, prompt_prefix):
+                # Update our tracking of file statuses
+                for name, status in rows:
+                    file_statuses[name] = status
+
+                # Convert to list format for display
+                status_rows = [[name, status] for name, status in file_statuses.items()]
+
+                # Sort by name for consistent display
+                status_rows.sort(key=lambda x: x[0])
+
                 # Yield UI update
                 yield gr.update(
-                    value=rows,
+                    value=status_rows,
                     headers=["name", "status"]
                 )
 
-            # Final update after completion
+            # Final update after completion with fresh data
             yield gr.update(
                 value=self.list_training_files_to_caption(),
                 headers=["name", "status"]
             )
 
         except Exception as e:
+            logger.error(f"Error in captioning: {str(e)}")
             yield gr.update(
-                value=[[str(e), "error"]],
+                value=[[f"Error: {str(e)}", "error"]],
                 headers=["name", "status"]
             )
 
     def list_training_files_to_caption(self) -> List[List[str]]:
         """List all clips and images - both pending and captioned"""
         files = []
-        already_listed: Dict[str, bool] = {}
+        already_listed = {}
 
-        # Check files in STAGING_PATH
+        # First check files in STAGING_PATH
         for file in STAGING_PATH.glob("*.*"):
             if is_video_file(file) or is_image_file(file):
                 txt_file = file.with_suffix('.txt')
-                status = "captioned" if txt_file.exists() else "no caption"
+
+                # Check if caption file exists and has content
+                has_caption = txt_file.exists() and txt_file.stat().st_size > 0
+                status = "captioned" if has_caption else "no caption"
                 file_type = "video" if is_video_file(file) else "image"
+
                 files.append([file.name, f"{status} ({file_type})", str(file)])
-                already_listed[str(file.name)] = True
-
-        # Check files in TRAINING_VIDEOS_PATH
+                already_listed[file.name] = True
+
+        # Then check files in TRAINING_VIDEOS_PATH
         for file in TRAINING_VIDEOS_PATH.glob("*.*"):
-            if not str(file.name) in already_listed:
-                if is_video_file(file) or is_image_file(file):
-                    txt_file = file.with_suffix('.txt')
-                    if txt_file.exists():
-                        file_type = "video" if is_video_file(file) else "image"
-                        files.append([file.name, f"captioned ({file_type})", str(file)])
-
+            if (is_video_file(file) or is_image_file(file)) and file.name not in already_listed:
+                txt_file = file.with_suffix('.txt')
+
+                # Only include files with captions
+                if txt_file.exists() and txt_file.stat().st_size > 0:
+                    file_type = "video" if is_video_file(file) else "image"
+                    files.append([file.name, f"captioned ({file_type})", str(file)])
+                    already_listed[file.name] = True
+
         # Sort by filename
         files.sort(key=lambda x: x[0])
 
@@ -1106,24 +1198,28 @@ class VideoTrainerUI:
             }
 
             run_autocaption_btn.click(
+                fn=self.show_refreshing_status,
+                outputs=[training_dataset]
+            ).then(
+                fn=lambda: self.update_captioning_buttons_start(),
+                outputs=[run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
+            ).then(
                 fn=self.start_caption_generation,
                 inputs=[captioning_bot_instructions, custom_prompt_prefix],
                 outputs=[training_dataset],
             ).then(
-                fn=lambda: update_button_states(True),
-                outputs=[run_autocaption_btn, stop_autocaption_btn]
+                fn=lambda: self.update_captioning_buttons_end(),
+                outputs=[run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
             )
 
             copy_files_to_training_dir_btn.click(
                 fn=self.copy_files_to_training_dir,
                 inputs=[custom_prompt_prefix]
             )
 
             stop_autocaption_btn.click(
-                fn=lambda: (self.captioner.stop_captioning() if self.captioner else None, update_button_states(False)),
-                outputs=[run_autocaption_btn, stop_autocaption_btn]
+                fn=self.stop_captioning,
+                outputs=[training_dataset, run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
             )
 
             training_dataset.select(
                 fn=self.handle_training_dataset_select,
                 outputs=[preview_image, preview_video, preview_caption, preview_status]
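
The heart of the new status tracking is the merge-then-yield loop above: each batch of (name, status) pairs streamed by the captioner is folded into a dict, so the table always shows the latest known status for every file seen so far. Below is a minimal standalone sketch of that pattern; fake_captioner and its filenames are illustrative stand-ins, not the app's actual captioner API.

import asyncio
from typing import AsyncIterator, List, Tuple

async def fake_captioner() -> AsyncIterator[List[Tuple[str, str]]]:
    # Illustrative stand-in for captioner.start_caption_generation(),
    # which yields batches of (filename, status) pairs as work progresses.
    yield [("b.mp4", "captioning")]
    yield [("a.mp4", "captioning"), ("b.mp4", "captioned")]
    yield [("a.mp4", "captioned")]

async def stream_status_table() -> None:
    file_statuses = {}  # filename -> latest reported status
    async for rows in fake_captioner():
        for name, status in rows:
            file_statuses[name] = status  # newest status wins
        # Rebuild the full, sorted table on every batch; in the app this
        # becomes `yield gr.update(value=table, headers=["name", "status"])`.
        table = sorted([name, status] for name, status in file_statuses.items())
        print(table)

asyncio.run(stream_status_table())

In the UI, the `.click(...).then(...)` chain wires this flow together: show a placeholder row, disable the run/copy buttons, stream statuses into the dataframe, then restore the buttons when the generator finishes. The button helpers return their updates positionally, in the same order as the corresponding `outputs=[...]` list, which is what Gradio expects from a handler with multiple outputs.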
vms/captioning_service.py CHANGED
@@ -17,9 +17,9 @@ from llava.mm_utils import tokenizer_image_token
 from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
 from llava.conversation import conv_templates, SeparatorStyle
 
-from config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
-from utils import extract_scene_info, is_image_file, is_video_file
-from finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
+from .config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
+from .utils import extract_scene_info, is_image_file, is_video_file
+from .finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
 
 logger = logging.getLogger(__name__)
 
vms/config.py CHANGED
@@ -29,7 +29,7 @@ DEFAULT_PROMPT_PREFIX = "In the style of TOK, "
 # This is only used to debug things locally
 USE_MOCK_CAPTIONING_MODEL = parse_bool_env(os.environ.get('USE_MOCK_CAPTIONING_MODEL'))
 
-DEFAULT_CAPTIONING_BOT_INSTRUCTIONS = "Please write a full description of the following video: camera (close-up shot, medium-shot..), genre (music video, horror movie scene, video game footage, go pro footage, japanese anime, noir film, science-fiction, action movie, documentary..), characters (physical appearance, look, skin, facial features, haircut, clothing), scene (action, positions, movements), location (indoor, outdoor, place, building, country..), time and lighting (natural, golden hour, night time, LED lights, kelvin temperature etc), weather and climate (dusty, rainy, fog, haze, snowing..), era/settings"
+DEFAULT_CAPTIONING_BOT_INSTRUCTIONS = "Please write a full video description. Be concise; don't say things like \"this video features..\" etc. Instead, methodically list camera (close-up shot, medium-shot..), genre (music video, horror movie scene, video game footage, go pro footage, japanese anime, noir film, science-fiction, action movie, documentary..), characters (physical appearance, look, skin, facial features, haircut, clothing), scene (action, positions, movements), location (indoor, outdoor, place, building, country..), time and lighting (natural, golden hour, night time, LED lights, kelvin temperature etc), weather and climate (dusty, rainy, fog, haze, snowing..), era/settings."
 
 # Create directories
 STORAGE_PATH.mkdir(parents=True, exist_ok=True)
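
`parse_bool_env` is defined elsewhere in the repo and does not appear in this diff. A plausible minimal implementation of such a helper, shown here purely as an assumption about its behavior:

import os

def parse_bool_env(value, default=False):
    # Hypothetical sketch: treat common truthy strings as True,
    # anything else (including None/unset) as the default.
    if value is None:
        return default
    return str(value).strip().lower() in ("1", "true", "yes", "on")

USE_MOCK_CAPTIONING_MODEL = parse_bool_env(os.environ.get("USE_MOCK_CAPTIONING_MODEL"))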
vms/finetrainers_utils.py CHANGED
@@ -3,8 +3,9 @@ from pathlib import Path
 import logging
 import shutil
 from typing import Any, Optional, Dict, List, Union, Tuple
-from config import STORAGE_PATH, TRAINING_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
-from utils import get_video_fps, extract_scene_info, make_archive, is_image_file, is_video_file
+
+from .config import STORAGE_PATH, TRAINING_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
+from .utils import get_video_fps, extract_scene_info, make_archive, is_image_file, is_video_file
 
 logger = logging.getLogger(__name__)
 
vms/image_preprocessing.py CHANGED
@@ -4,7 +4,8 @@ from pathlib import Path
 from PIL import Image
 import pillow_avif
 import logging
-from config import NORMALIZE_IMAGES_TO, JPEG_QUALITY
+
+from .config import NORMALIZE_IMAGES_TO, JPEG_QUALITY
 
 logger = logging.getLogger(__name__)
 
vms/import_service.py CHANGED
@@ -7,10 +7,10 @@ from pathlib import Path
 from typing import List, Dict, Optional, Tuple
 from pytubefix import YouTube
 import logging
-from utils import is_image_file, is_video_file, add_prefix_to_caption
-from image_preprocessing import normalize_image
 
-from config import NORMALIZE_IMAGES_TO, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, TRAINING_PATH, DEFAULT_PROMPT_PREFIX
+from .utils import is_image_file, is_video_file, add_prefix_to_caption
+from .image_preprocessing import normalize_image
+from .config import NORMALIZE_IMAGES_TO, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, TRAINING_PATH, DEFAULT_PROMPT_PREFIX
 
 logger = logging.getLogger(__name__)
 
vms/splitting_service.py CHANGED
@@ -12,11 +12,11 @@ import gradio as gr
 from scenedetect import detect, ContentDetector, SceneManager, open_video
 from scenedetect.video_splitter import split_video_ffmpeg
 
-from config import TRAINING_PATH, STORAGE_PATH, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, DEFAULT_PROMPT_PREFIX
+from .config import TRAINING_PATH, STORAGE_PATH, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, DEFAULT_PROMPT_PREFIX
 
-from image_preprocessing import detect_black_bars
-from video_preprocessing import remove_black_bars
-from utils import extract_scene_info, is_video_file, is_image_file, add_prefix_to_caption
+from .image_preprocessing import detect_black_bars
+from .video_preprocessing import remove_black_bars
+from .utils import extract_scene_info, is_video_file, is_image_file, add_prefix_to_caption
 
 logger = logging.getLogger(__name__)
 
vms/training_service.py CHANGED
@@ -18,10 +18,10 @@ import select
 
 from typing import Any, Optional, Dict, List, Union, Tuple
 
 from huggingface_hub import upload_folder, create_repo
-from config import TrainingConfig, TRAINING_PRESETS, LOG_FILE_PATH, TRAINING_VIDEOS_PATH, STORAGE_PATH, TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
-from utils import make_archive, parse_training_log, is_image_file, is_video_file
-from finetrainers_utils import prepare_finetrainers_dataset, copy_files_to_training_dir
+from .config import TrainingConfig, TRAINING_PRESETS, LOG_FILE_PATH, TRAINING_VIDEOS_PATH, STORAGE_PATH, TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
+from .utils import make_archive, parse_training_log, is_image_file, is_video_file
+from .finetrainers_utils import prepare_finetrainers_dataset, copy_files_to_training_dir
 
 logger = logging.getLogger(__name__)
 
27