Spaces:
Running
Running
Commit · dd52c86
1 Parent(s): 82a6631
working on better captioning status tracking
Browse files
- app.py +119 -23
- vms/captioning_service.py +3 -3
- vms/config.py +1 -1
- vms/finetrainers_utils.py +3 -2
- vms/image_preprocessing.py +2 -1
- vms/import_service.py +3 -3
- vms/splitting_service.py +4 -4
- vms/training_service.py +4 -4
app.py
CHANGED
@@ -60,6 +60,71 @@ class VideoTrainerUI:
|
|
60 |
self._should_stop_captioning = False
|
61 |
self.log_parser = TrainingLogParser()
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
def update_training_ui(self, training_state: Dict[str, Any]):
|
64 |
"""Update UI components based on training state"""
|
65 |
updates = {}
|
@@ -221,48 +286,75 @@ class VideoTrainerUI:
|
|
221 |
# Initialize captioner if not already done
|
222 |
self._should_stop_captioning = False
|
223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
async for rows in self.captioner.start_caption_generation(captioning_bot_instructions, prompt_prefix):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
# Yield UI update
|
226 |
yield gr.update(
|
227 |
-
value=
|
228 |
headers=["name", "status"]
|
229 |
)
|
230 |
|
231 |
-
# Final update after completion
|
232 |
yield gr.update(
|
233 |
value=self.list_training_files_to_caption(),
|
234 |
headers=["name", "status"]
|
235 |
)
|
236 |
|
237 |
except Exception as e:
|
|
|
238 |
yield gr.update(
|
239 |
-
value=[[str(e), "error"]],
|
240 |
headers=["name", "status"]
|
241 |
)
|
242 |
|
243 |
def list_training_files_to_caption(self) -> List[List[str]]:
|
244 |
"""List all clips and images - both pending and captioned"""
|
245 |
files = []
|
246 |
-
already_listed
|
247 |
|
248 |
-
#
|
249 |
for file in STAGING_PATH.glob("*.*"):
|
250 |
if is_video_file(file) or is_image_file(file):
|
251 |
txt_file = file.with_suffix('.txt')
|
252 |
-
|
|
|
|
|
|
|
253 |
file_type = "video" if is_video_file(file) else "image"
|
|
|
254 |
files.append([file.name, f"{status} ({file_type})", str(file)])
|
255 |
-
already_listed[
|
256 |
-
|
257 |
-
#
|
258 |
for file in TRAINING_VIDEOS_PATH.glob("*.*"):
|
259 |
-
if
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
|
|
|
|
266 |
# Sort by filename
|
267 |
files.sort(key=lambda x: x[0])
|
268 |
|
@@ -1106,24 +1198,28 @@ class VideoTrainerUI:
|
|
1106 |
}
|
1107 |
|
1108 |
run_autocaption_btn.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
1109 |
fn=self.start_caption_generation,
|
1110 |
inputs=[captioning_bot_instructions, custom_prompt_prefix],
|
1111 |
outputs=[training_dataset],
|
1112 |
).then(
|
1113 |
-
fn=lambda:
|
1114 |
-
outputs=[run_autocaption_btn, stop_autocaption_btn]
|
1115 |
)
|
1116 |
-
|
1117 |
copy_files_to_training_dir_btn.click(
|
1118 |
fn=self.copy_files_to_training_dir,
|
1119 |
inputs=[custom_prompt_prefix]
|
1120 |
)
|
1121 |
-
|
1122 |
stop_autocaption_btn.click(
|
1123 |
-
fn=
|
1124 |
-
outputs=[run_autocaption_btn, stop_autocaption_btn]
|
1125 |
)
|
1126 |
-
|
1127 |
training_dataset.select(
|
1128 |
fn=self.handle_training_dataset_select,
|
1129 |
outputs=[preview_image, preview_video, preview_caption, preview_status]
|
|
|
60 |
self._should_stop_captioning = False
|
61 |
self.log_parser = TrainingLogParser()
|
62 |
|
63 |
+
def update_captioning_buttons_start(self):
|
64 |
+
return {
|
65 |
+
"run_autocaption_btn": gr.Button(
|
66 |
+
interactive=False,
|
67 |
+
variant="secondary",
|
68 |
+
),
|
69 |
+
"stop_autocaption_btn": gr.Button(
|
70 |
+
interactive=True,
|
71 |
+
variant="stop",
|
72 |
+
),
|
73 |
+
"copy_files_to_training_dir_btn": gr.Button(
|
74 |
+
interactive=False,
|
75 |
+
variant="secondary",
|
76 |
+
)
|
77 |
+
}
|
78 |
+
|
79 |
+
def update_captioning_buttons_end(self):
|
80 |
+
return {
|
81 |
+
"run_autocaption_btn": gr.Button(
|
82 |
+
interactive=True,
|
83 |
+
variant="primary",
|
84 |
+
),
|
85 |
+
"stop_autocaption_btn": gr.Button(
|
86 |
+
interactive=False,
|
87 |
+
variant="secondary",
|
88 |
+
),
|
89 |
+
"copy_files_to_training_dir_btn": gr.Button(
|
90 |
+
interactive=True,
|
91 |
+
variant="primary",
|
92 |
+
)
|
93 |
+
}
|
94 |
+
|
95 |
+
def show_refreshing_status(self) -> List[List[str]]:
|
96 |
+
"""Show a 'Refreshing...' status in the dataframe"""
|
97 |
+
return [["Refreshing...", "please wait"]]
|
98 |
+
|
99 |
+
def stop_captioning(self):
|
100 |
+
"""Stop ongoing captioning process and reset UI state"""
|
101 |
+
try:
|
102 |
+
# Set flag to stop captioning
|
103 |
+
self._should_stop_captioning = True
|
104 |
+
|
105 |
+
# Call stop method on captioner
|
106 |
+
if self.captioner:
|
107 |
+
self.captioner.stop_captioning()
|
108 |
+
|
109 |
+
# Get updated file list
|
110 |
+
updated_list = self.list_training_files_to_caption()
|
111 |
+
|
112 |
+
# Return updated list and button states
|
113 |
+
return {
|
114 |
+
"training_dataset": gr.update(value=updated_list),
|
115 |
+
"run_autocaption_btn": gr.Button(interactive=True, variant="primary"),
|
116 |
+
"stop_autocaption_btn": gr.Button(interactive=False, variant="secondary"),
|
117 |
+
"copy_files_to_training_dir_btn": gr.Button(interactive=True, variant="primary")
|
118 |
+
}
|
119 |
+
except Exception as e:
|
120 |
+
logger.error(f"Error stopping captioning: {str(e)}")
|
121 |
+
return {
|
122 |
+
"training_dataset": gr.update(value=[[f"Error stopping captioning: {str(e)}", "error"]]),
|
123 |
+
"run_autocaption_btn": gr.Button(interactive=True, variant="primary"),
|
124 |
+
"stop_autocaption_btn": gr.Button(interactive=False, variant="secondary"),
|
125 |
+
"copy_files_to_training_dir_btn": gr.Button(interactive=True, variant="primary")
|
126 |
+
}
|
127 |
+
|
128 |
def update_training_ui(self, training_state: Dict[str, Any]):
|
129 |
"""Update UI components based on training state"""
|
130 |
updates = {}
|
|
|
286 |
# Initialize captioner if not already done
|
287 |
self._should_stop_captioning = False
|
288 |
|
289 |
+
# First yield - indicate we're starting
|
290 |
+
yield gr.update(
|
291 |
+
value=[["Starting captioning service...", "initializing"]],
|
292 |
+
headers=["name", "status"]
|
293 |
+
)
|
294 |
+
|
295 |
+
# Process files in batches with status updates
|
296 |
+
file_statuses = {}
|
297 |
+
|
298 |
+
# Start the actual captioning process
|
299 |
async for rows in self.captioner.start_caption_generation(captioning_bot_instructions, prompt_prefix):
|
300 |
+
# Update our tracking of file statuses
|
301 |
+
for name, status in rows:
|
302 |
+
file_statuses[name] = status
|
303 |
+
|
304 |
+
# Convert to list format for display
|
305 |
+
status_rows = [[name, status] for name, status in file_statuses.items()]
|
306 |
+
|
307 |
+
# Sort by name for consistent display
|
308 |
+
status_rows.sort(key=lambda x: x[0])
|
309 |
+
|
310 |
# Yield UI update
|
311 |
yield gr.update(
|
312 |
+
value=status_rows,
|
313 |
headers=["name", "status"]
|
314 |
)
|
315 |
|
316 |
+
# Final update after completion with fresh data
|
317 |
yield gr.update(
|
318 |
value=self.list_training_files_to_caption(),
|
319 |
headers=["name", "status"]
|
320 |
)
|
321 |
|
322 |
except Exception as e:
|
323 |
+
logger.error(f"Error in captioning: {str(e)}")
|
324 |
yield gr.update(
|
325 |
+
value=[[f"Error: {str(e)}", "error"]],
|
326 |
headers=["name", "status"]
|
327 |
)
|
328 |
|
329 |
def list_training_files_to_caption(self) -> List[List[str]]:
|
330 |
"""List all clips and images - both pending and captioned"""
|
331 |
files = []
|
332 |
+
already_listed = {}
|
333 |
|
334 |
+
# First check files in STAGING_PATH
|
335 |
for file in STAGING_PATH.glob("*.*"):
|
336 |
if is_video_file(file) or is_image_file(file):
|
337 |
txt_file = file.with_suffix('.txt')
|
338 |
+
|
339 |
+
# Check if caption file exists and has content
|
340 |
+
has_caption = txt_file.exists() and txt_file.stat().st_size > 0
|
341 |
+
status = "captioned" if has_caption else "no caption"
|
342 |
file_type = "video" if is_video_file(file) else "image"
|
343 |
+
|
344 |
files.append([file.name, f"{status} ({file_type})", str(file)])
|
345 |
+
already_listed[file.name] = True
|
346 |
+
|
347 |
+
# Then check files in TRAINING_VIDEOS_PATH
|
348 |
for file in TRAINING_VIDEOS_PATH.glob("*.*"):
|
349 |
+
if (is_video_file(file) or is_image_file(file)) and file.name not in already_listed:
|
350 |
+
txt_file = file.with_suffix('.txt')
|
351 |
+
|
352 |
+
# Only include files with captions
|
353 |
+
if txt_file.exists() and txt_file.stat().st_size > 0:
|
354 |
+
file_type = "video" if is_video_file(file) else "image"
|
355 |
+
files.append([file.name, f"captioned ({file_type})", str(file)])
|
356 |
+
already_listed[file.name] = True
|
357 |
+
|
358 |
# Sort by filename
|
359 |
files.sort(key=lambda x: x[0])
|
360 |
|
|
|
1198 |
}
|
1199 |
|
1200 |
run_autocaption_btn.click(
|
1201 |
+
fn=self.show_refreshing_status,
|
1202 |
+
outputs=[training_dataset]
|
1203 |
+
).then(
|
1204 |
+
fn=lambda: self.update_captioning_buttons_start(),
|
1205 |
+
outputs=[run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
|
1206 |
+
).then(
|
1207 |
fn=self.start_caption_generation,
|
1208 |
inputs=[captioning_bot_instructions, custom_prompt_prefix],
|
1209 |
outputs=[training_dataset],
|
1210 |
).then(
|
1211 |
+
fn=lambda: self.update_captioning_buttons_end(),
|
1212 |
+
outputs=[run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
|
1213 |
)
|
1214 |
+
|
1215 |
copy_files_to_training_dir_btn.click(
|
1216 |
fn=self.copy_files_to_training_dir,
|
1217 |
inputs=[custom_prompt_prefix]
|
1218 |
)
|
|
|
1219 |
stop_autocaption_btn.click(
|
1220 |
+
fn=self.stop_captioning,
|
1221 |
+
outputs=[training_dataset, run_autocaption_btn, stop_autocaption_btn, copy_files_to_training_dir_btn]
|
1222 |
)
|
|
|
1223 |
training_dataset.select(
|
1224 |
fn=self.handle_training_dataset_select,
|
1225 |
outputs=[preview_image, preview_video, preview_caption, preview_status]
|
vms/captioning_service.py
CHANGED
@@ -17,9 +17,9 @@ from llava.mm_utils import tokenizer_image_token
|
|
17 |
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
|
18 |
from llava.conversation import conv_templates, SeparatorStyle
|
19 |
|
20 |
-
from config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
|
21 |
-
from utils import extract_scene_info, is_image_file, is_video_file
|
22 |
-
from finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
|
23 |
|
24 |
logger = logging.getLogger(__name__)
|
25 |
|
|
|
17 |
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
|
18 |
from llava.conversation import conv_templates, SeparatorStyle
|
19 |
|
20 |
+
from .config import TRAINING_VIDEOS_PATH, STAGING_PATH, PRELOAD_CAPTIONING_MODEL, CAPTIONING_MODEL, USE_MOCK_CAPTIONING_MODEL, DEFAULT_CAPTIONING_BOT_INSTRUCTIONS, VIDEOS_TO_SPLIT_PATH, DEFAULT_PROMPT_PREFIX
|
21 |
+
from .utils import extract_scene_info, is_image_file, is_video_file
|
22 |
+
from .finetrainers_utils import copy_files_to_training_dir, prepare_finetrainers_dataset
|
23 |
|
24 |
logger = logging.getLogger(__name__)
|
25 |
|
vms/config.py
CHANGED
@@ -29,7 +29,7 @@ DEFAULT_PROMPT_PREFIX = "In the style of TOK, "
|
|
29 |
# This is only use to debug things in local
|
30 |
USE_MOCK_CAPTIONING_MODEL = parse_bool_env(os.environ.get('USE_MOCK_CAPTIONING_MODEL'))
|
31 |
|
32 |
-
DEFAULT_CAPTIONING_BOT_INSTRUCTIONS = "Please write a full description
|
33 |
|
34 |
# Create directories
|
35 |
STORAGE_PATH.mkdir(parents=True, exist_ok=True)
|
|
|
29 |
# This is only use to debug things in local
|
30 |
USE_MOCK_CAPTIONING_MODEL = parse_bool_env(os.environ.get('USE_MOCK_CAPTIONING_MODEL'))
|
31 |
|
32 |
+
DEFAULT_CAPTIONING_BOT_INSTRUCTIONS = "Please write a full video description. Be synthetic, don't say things like "this video features.." etc. Instead, methodically list camera (close-up shot, medium-shot..), genre (music video, horror movie scene, video game footage, go pro footage, japanese anime, noir film, science-fiction, action movie, documentary..), characters (physical appearance, look, skin, facial features, haircut, clothing), scene (action, positions, movements), location (indoor, outdoor, place, building, country..), time and lighting (natural, golden hour, night time, LED lights, kelvin temperature etc), weather and climate (dusty, rainy, fog, haze, snowing..), era/settings."
|
33 |
|
34 |
# Create directories
|
35 |
STORAGE_PATH.mkdir(parents=True, exist_ok=True)
|
vms/finetrainers_utils.py
CHANGED
@@ -3,8 +3,9 @@ from pathlib import Path
|
|
3 |
import logging
|
4 |
import shutil
|
5 |
from typing import Any, Optional, Dict, List, Union, Tuple
|
6 |
-
|
7 |
-
from
|
|
|
8 |
|
9 |
logger = logging.getLogger(__name__)
|
10 |
|
|
|
3 |
import logging
|
4 |
import shutil
|
5 |
from typing import Any, Optional, Dict, List, Union, Tuple
|
6 |
+
|
7 |
+
from .config import STORAGE_PATH, TRAINING_PATH, STAGING_PATH, TRAINING_VIDEOS_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
|
8 |
+
from .utils import get_video_fps, extract_scene_info, make_archive, is_image_file, is_video_file
|
9 |
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
vms/image_preprocessing.py
CHANGED
@@ -4,7 +4,8 @@ from pathlib import Path
|
|
4 |
from PIL import Image
|
5 |
import pillow_avif
|
6 |
import logging
|
7 |
-
|
|
|
8 |
|
9 |
logger = logging.getLogger(__name__)
|
10 |
|
|
|
4 |
from PIL import Image
|
5 |
import pillow_avif
|
6 |
import logging
|
7 |
+
|
8 |
+
from .config import NORMALIZE_IMAGES_TO, JPEG_QUALITY
|
9 |
|
10 |
logger = logging.getLogger(__name__)
|
11 |
|
vms/import_service.py
CHANGED
@@ -7,10 +7,10 @@ from pathlib import Path
|
|
7 |
from typing import List, Dict, Optional, Tuple
|
8 |
from pytubefix import YouTube
|
9 |
import logging
|
10 |
-
from utils import is_image_file, is_video_file, add_prefix_to_caption
|
11 |
-
from image_preprocessing import normalize_image
|
12 |
|
13 |
-
from
|
|
|
|
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
|
|
7 |
from typing import List, Dict, Optional, Tuple
|
8 |
from pytubefix import YouTube
|
9 |
import logging
|
|
|
|
|
10 |
|
11 |
+
from .utils import is_image_file, is_video_file, add_prefix_to_caption
|
12 |
+
from .image_preprocessing import normalize_image
|
13 |
+
from .config import NORMALIZE_IMAGES_TO, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, TRAINING_PATH, DEFAULT_PROMPT_PREFIX
|
14 |
|
15 |
logger = logging.getLogger(__name__)
|
16 |
|
vms/splitting_service.py
CHANGED
@@ -12,11 +12,11 @@ import gradio as gr
|
|
12 |
from scenedetect import detect, ContentDetector, SceneManager, open_video
|
13 |
from scenedetect.video_splitter import split_video_ffmpeg
|
14 |
|
15 |
-
from config import TRAINING_PATH, STORAGE_PATH, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, DEFAULT_PROMPT_PREFIX
|
16 |
|
17 |
-
from image_preprocessing import detect_black_bars
|
18 |
-
from video_preprocessing import remove_black_bars
|
19 |
-
from utils import extract_scene_info, is_video_file, is_image_file, add_prefix_to_caption
|
20 |
|
21 |
logger = logging.getLogger(__name__)
|
22 |
|
|
|
12 |
from scenedetect import detect, ContentDetector, SceneManager, open_video
|
13 |
from scenedetect.video_splitter import split_video_ffmpeg
|
14 |
|
15 |
+
from .config import TRAINING_PATH, STORAGE_PATH, TRAINING_VIDEOS_PATH, VIDEOS_TO_SPLIT_PATH, STAGING_PATH, DEFAULT_PROMPT_PREFIX
|
16 |
|
17 |
+
from .image_preprocessing import detect_black_bars
|
18 |
+
from .video_preprocessing import remove_black_bars
|
19 |
+
from .utils import extract_scene_info, is_video_file, is_image_file, add_prefix_to_caption
|
20 |
|
21 |
logger = logging.getLogger(__name__)
|
22 |
|
vms/training_service.py
CHANGED
@@ -18,10 +18,10 @@ import select
|
|
18 |
|
19 |
from typing import Any, Optional, Dict, List, Union, Tuple
|
20 |
|
21 |
-
from huggingface_hub import upload_folder, create_repo
|
22 |
-
from config import TrainingConfig, TRAINING_PRESETS, LOG_FILE_PATH, TRAINING_VIDEOS_PATH, STORAGE_PATH, TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
|
23 |
-
from utils import make_archive, parse_training_log, is_image_file, is_video_file
|
24 |
-
from finetrainers_utils import prepare_finetrainers_dataset, copy_files_to_training_dir
|
25 |
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
|
|
18 |
|
19 |
from typing import Any, Optional, Dict, List, Union, Tuple
|
20 |
|
21 |
+
from .huggingface_hub import upload_folder, create_repo
|
22 |
+
from .config import TrainingConfig, TRAINING_PRESETS, LOG_FILE_PATH, TRAINING_VIDEOS_PATH, STORAGE_PATH, TRAINING_PATH, MODEL_PATH, OUTPUT_PATH, HF_API_TOKEN, MODEL_TYPES
|
23 |
+
from .utils import make_archive, parse_training_log, is_image_file, is_video_file
|
24 |
+
from .finetrainers_utils import prepare_finetrainers_dataset, copy_files_to_training_dir
|
25 |
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|