feat(ui, core): Implement advanced grouped image backgrounds
This major refactoring introduces a new feature that lets users assign specific sets of images to designated groups of tracks. It turns the application from a linear visualizer into a tool for creating dynamic, context-aware videos with thematic sections.
The implementation required a complete overhaul of the UI for image uploads and a fundamental rewrite of the backend logic for track processing and image distribution.
Dynamic Group Management:
The single image uploader has been replaced with a dynamic interface for defining up to 10 distinct image groups.
Users can now click "+ Add Image Group" and "- Remove Last Group" buttons to manage the number of visible group definitions.
Dynamic group creation is simulated by pre-building a fixed maximum number of groups and toggling their visibility, so the Gradio event wiring stays static while the interface still feels dynamic to the user.
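For illustration, a minimal, self-contained sketch of that show/hide pattern (component names here are hypothetical; the real wiring in the diff below uses `update_group_visibility`, `visible_groups_state`, and paired add/remove buttons):

```python
import gradio as gr

MAX_GROUPS = 10  # fixed pool of pre-built groups; only the first is visible at start

with gr.Blocks() as demo:
    visible_count = gr.State(1)

    accordions = []
    for i in range(MAX_GROUPS):
        # All groups exist from the start; most begin hidden.
        with gr.Accordion(f"Image Group {i + 1}", visible=(i == 0)) as acc:
            gr.Textbox(label=f"Tracks for Group {i + 1}")
        accordions.append(acc)

    add_btn = gr.Button("+ Add Image Group")

    def show_one_more(count):
        count = min(count + 1, MAX_GROUPS)
        # One visibility update per accordion, plus the new count for the State
        return [count] + [gr.update(visible=(i < count)) for i in range(MAX_GROUPS)]

    add_btn.click(show_one_more, inputs=visible_count,
                  outputs=[visible_count] + accordions)

demo.launch()
```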
Group Definition:
Each group consists of a Textbox for defining track ranges (e.g., "1-4, 7, 10-13") and a dedicated Files uploader for that group's specific images.
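The range grammar is plain comma-separated tokens, each either a single track number or an inclusive "start-end" span; the diff below implements it as `parse_track_ranges`. A condensed equivalent (minus the real helper's warning messages) behaves like this:

```python
# Condensed equivalent of parse_track_ranges from the diff below:
# comma-separated tokens, each a track number or an inclusive "start-end" span.
def expand(range_str: str) -> set:
    tracks = set()
    for token in (t.strip() for t in range_str.split(',')):
        if not token:
            continue
        if '-' in token:
            lo, hi = map(int, token.split('-'))
            tracks.update(range(lo, hi + 1))
        else:
            tracks.add(int(token))
    return tracks

assert expand("1-4, 7, 10-13") == {1, 2, 3, 4, 7, 10, 11, 12, 13}
```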
Fallback Images:
A separate "Fallback / Default Images" uploader is provided for any tracks that are not explicitly assigned to a group.
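Resolution order, as implemented in Stage 2 of the diff: the first group whose range contains a track wins, and anything unmatched falls back to the default set. A tiny worked example with hypothetical filenames:

```python
# Hypothetical groups illustrating the fallback rule from Stage 2 of the diff:
# the first matching group wins; unmatched tracks get the fallback images.
groups = [
    {"tracks": {1, 2, 3, 4}, "images": ["intro_a.png", "intro_b.png"]},
    {"tracks": {7},          "images": ["interlude.png"]},
]
fallback_images = ["default.png"]

def images_for(track_number):
    for group in groups:
        if track_number in group["tracks"]:
            return group["images"]
    return fallback_images

assert images_for(2) == ["intro_a.png", "intro_b.png"]
assert images_for(5) == ["default.png"]  # track 5 is in no group -> fallback
```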
@@ -9,7 +9,7 @@ import subprocess
 import soundfile as sf
 import matplotlib.font_manager as fm
 from PIL import ImageFont
-from typing import Tuple, List, Dict
+from typing import Tuple, List, Dict, Set
 from mutagen.flac import FLAC
 from moviepy import CompositeVideoClip, TextClip, VideoClip, AudioFileClip, ImageClip
 
@@ -28,7 +28,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
         elif platform_id == 1 and encoding_id == 0: # Macintosh, Roman
             return name_bytes.decode('mac_roman').strip('\x00')
         elif platform_id == 0: # Unicode
-…
+            return name_bytes.decode('utf_16_be').strip('\x00')
         else: # Fallback
             return name_bytes.decode('utf_8', errors='ignore').strip('\x00')
     except Exception:
@@ -36,9 +36,10 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
 
     try:
         with open(font_path, 'rb') as f: data = f.read()
-        def read_ushort(offset):
-…
+        def read_ushort(offset):
+            return struct.unpack('>H', data[offset:offset+2])[0]
+        def read_ulong(offset):
+            return struct.unpack('>I', data[offset:offset+4])[0]
         font_offsets = [0]
         # Check for TTC (TrueType Collection) header
         if data[:4] == b'ttcf':
@@ -47,7 +48,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
 
         # For simplicity, we only parse the first font in a TTC
         font_offset = font_offsets[0]
-…
+
         num_tables = read_ushort(font_offset + 4)
         name_table_offset = -1
         # Locate the 'name' table
@@ -55,38 +56,50 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
             entry_offset = font_offset + 12 + i * 16
             tag = data[entry_offset:entry_offset+4]
             if tag == b'name':
-                name_table_offset = read_ulong(entry_offset + 8)
-…
+                name_table_offset = read_ulong(entry_offset + 8)
+                break
+
+        if name_table_offset == -1:
+            return None, None
+
         count, string_offset = read_ushort(name_table_offset + 2), read_ushort(name_table_offset + 4)
         name_candidates = {}
         # Iterate through all name records
         for i in range(count):
             rec_offset = name_table_offset + 6 + i * 12
             platform_id, encoding_id, language_id, name_id, length, offset = struct.unpack('>HHHHHH', data[rec_offset:rec_offset+12])
-…
+
             if name_id == 4: # We only care about the "Full Font Name"
                 string_pos = name_table_offset + string_offset + offset
                 value = decode_name_string(data[string_pos : string_pos + length], platform_id, encoding_id)
-…
+
                 if value:
                     # Store candidates based on language ID
-                    if language_id in [1028, 2052, 3076, 4100, 5124]:
-…
+                    if language_id in [1028, 2052, 3076, 4100, 5124]:
+                        name_candidates["zh"] = value
+                    elif language_id == 1041:
+                        name_candidates["ja"] = value
+                    elif language_id == 1042:
+                        name_candidates["ko"] = value
+                    elif language_id in [1033, 0]:
+                        name_candidates["en"] = value
                     else:
-                        if "other" not in name_candidates:
-…
+                        if "other" not in name_candidates:
+                            name_candidates["other"] = value
+
         # Return the best candidate based on language priority
-        if name_candidates.get("zh"):
-…
+        if name_candidates.get("zh"):
+            return name_candidates.get("zh"), "zh"
+        if name_candidates.get("ja"):
+            return name_candidates.get("ja"), "ja"
+        if name_candidates.get("ko"):
+            return name_candidates.get("ko"), "ko"
+        if name_candidates.get("other"):
+            return name_candidates.get("other"), "other"
+        if name_candidates.get("en"):
+            return name_candidates.get("en"), "en"
         return None, None
-…
+
     except Exception:
         return None, None
 
@@ -106,22 +119,22 @@ def get_font_data() -> Tuple[Dict[str, str], List[str]]:
     for path in all_font_files:
         display_name, lang_tag = get_font_display_name(path)
         is_fallback = display_name is None
-…
+
         if is_fallback:
             # Create a fallback name from the filename
             display_name = os.path.splitext(os.path.basename(path))[0].replace('-', ' ').replace('_', ' ').title()
             lang_tag = 'fallback'
-…
+
         if display_name and display_name not in font_map:
             font_map[display_name] = path
             found_names.append((display_name, is_fallback, lang_tag))
-…
+
     # Define sort priority for languages
     sort_order = {'zh': 0, 'ja': 1, 'ko': 2, 'en': 3, 'other': 4, 'fallback': 5}
 
     # Sort by priority, then alphabetically
     found_names.sort(key=lambda x: (sort_order.get(x[2], 99), x[0]))
-…
+
     sorted_display_names = [name for name, _, _ in found_names]
     return font_map, sorted_display_names
 
@@ -188,7 +201,7 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
         '-c:a', 'copy', # Copy audio without re-encoding
         output_path
     ]
-…
+
    try:
         # Execute the command
         # Using capture_output to hide ffmpeg logs from the main console unless an error occurs
@@ -203,24 +216,75 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
         raise gr.Error(f"FFmpeg failed to increase the framerate. See console for details. Error: {e.stderr}")
 
 
+# --- HELPER FUNCTION for parsing track ranges ---
+def parse_track_ranges(range_str: str) -> Set[int]:
+    """Parses a string like '1-4, 7, 10-13' into a set of integers."""
+    if not range_str:
+        return set()
+
+    indices = set()
+    parts = range_str.split(',')
+    for part in parts:
+        part = part.strip()
+        if not part:
+            continue
+        if '-' in part:
+            try:
+                start, end = map(int, part.split('-'))
+                indices.update(range(start, end + 1))
+            except ValueError:
+                print(f"Warning: Could not parse range '{part}'. Skipping.")
+        else:
+            try:
+                indices.add(int(part))
+            except ValueError:
+                print(f"Warning: Could not parse track number '{part}'. Skipping.")
+    return indices
+
+
 # --- Main Processing Function ---
-def process_audio_to_video(
-…
+def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
+    # --- Correctly unpack all arguments from *args using slicing ---
+    MAX_GROUPS = 10 # This MUST match the UI definition
+
+    # Define the structure of the *args tuple based on the `all_inputs` list
+    audio_files = args[0]
+
+    # Slice the args tuple to get the continuous blocks of inputs
+    all_track_strs = args[1 : 1 + MAX_GROUPS]
+    all_image_lists = args[1 + MAX_GROUPS : 1 + MAX_GROUPS * 2]
+
+    # Group inputs are packed in pairs (track_str, image_list)
+    group_definitions = []
+    for i in range(MAX_GROUPS):
+        group_definitions.append({
+            "tracks_str": all_track_strs[i],
+            "images": all_image_lists[i]
+        })
+
+    # Unpack the remaining arguments with correct indexing
+    arg_offset = 1 + MAX_GROUPS * 2
+    fallback_images = args[arg_offset]
+    format_double_digits = args[arg_offset + 1]
+    video_width = args[arg_offset + 2]
+    video_height = args[arg_offset + 3]
+    spec_fg_color = args[arg_offset + 4]
+    spec_bg_color = args[arg_offset + 5]
+    font_name = args[arg_offset + 6]
+    font_size = args[arg_offset + 7]
+    font_color = args[arg_offset + 8]
+    font_bg_color = args[arg_offset + 9]
+    font_bg_alpha = args[arg_offset + 10]
+    pos_h = args[arg_offset + 11]
+    pos_v = args[arg_offset + 12]
+
     if not audio_files:
         raise gr.Error("Please upload at least one audio file.")
     if not font_name:
         raise gr.Error("Please select a font from the list.")
 
     progress(0, desc="Initializing...")
-…
+
     # Define paths for temporary and final files
     timestamp = int(time.time())
     temp_fps1_path = f"temp_{timestamp}_fps1.mp4"
@@ -254,7 +318,7 @@ def process_audio_to_video(
             raise ValueError(f"Could not parse rgb color string: {color_str}")
         else:
             raise ValueError(f"Unknown color format: {color_str}")
-…
+
     # Use the new robust parser for all color inputs
     fg_rgb, bg_rgb = parse_color_to_rgb(spec_fg_color), parse_color_to_rgb(spec_bg_color)
     grid_rgb = tuple(min(c + 40, 255) for c in bg_rgb)
@@ -264,11 +328,9 @@ def process_audio_to_video(
     # --- Define total steps for the progress bar ---
     TOTAL_STEPS = 5
 
-    # --- 1…
-…
-    total_duration = 0.0
-    y_accumulator = []
-    current_sr = None
+    # --- Stage 1: Audio Processing & Master Track List Creation ---
+    master_track_list, y_accumulator, current_sr = [], [], None
+    total_duration, global_track_counter = 0.0, 0
 
     # --- Use `progress.tqdm` to create a progress bar for this loop ---
     for file_idx, audio_path in enumerate(progress.tqdm(audio_files, desc=f"Stage 1/{TOTAL_STEPS}: Analyzing Audio Files")):
@@ -301,48 +363,20 @@ def process_audio_to_video(
 
             print(f"Successfully parsed {len(cue_tracks)} tracks from CUE sheet.")
         except Exception as e:
-            print(f"Warning: Could not …
-…
-                        "end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
-                        "number_str": number_str
-                    })
-            else: # Scenario 2: Multiple files, this one has NO CUE
-                number_str = f"{file_num:02d}" if format_double_digits else str(file_num)
-                all_tracks_info.append({
-                    "title": os.path.splitext(os.path.basename(audio_path))[0],
-                    "start_time": total_duration, "end_time": total_duration + file_duration,
-                    "number_str": number_str
-                })
-        else: # Scenario 1: Single file upload
-            if cue_tracks: # With CUE
-                for track_idx, track in enumerate(cue_tracks):
-                    track_num = track_idx + 1
-                    number_str = f"{track_num:02d}" if format_double_digits else str(track_num)
-                    all_tracks_info.append({
-                        "title": track.get('title', 'Unknown Track'),
-                        "start_time": total_duration + track.get('start_time', 0),
-                        "end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
-                        "number_str": f"{number_str}." # Add a dot for single file CUE tracks
-                    })
-            else: # No CUE
-                all_tracks_info.append({
-                    "title": os.path.splitext(os.path.basename(audio_path))[0],
-                    "start_time": total_duration, "end_time": total_duration + file_duration,
-                    "number_str": None # Signal to not show any number
-                })
-…
+            print(f"Warning: Could not parse CUE sheet for {os.path.basename(audio_path)}: {e}")
+
+        if cue_tracks:
+            for track_idx, track in enumerate(cue_tracks):
+                global_track_counter += 1
+                start_time = track.get('start_time', 0)
+                end_time = cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration
+                master_track_list.append({"global_index": global_track_counter, "title": track.get('title', 'Unknown'), "start_time": total_duration + start_time, "end_time": total_duration + end_time})
+        else:
+            global_track_counter += 1
+            master_track_list.append({"global_index": global_track_counter, "title": os.path.splitext(os.path.basename(audio_path))[0], "start_time": total_duration, "end_time": total_duration + file_duration})
+
         total_duration += file_duration
-…
+
     # --- Concatenate along the time axis (axis=1) for stereo arrays ---
     y_combined = np.concatenate(y_accumulator, axis=1)
     duration = total_duration
@@ -350,116 +384,128 @@ def process_audio_to_video(
     # --- Transpose the array for soundfile to write stereo correctly ---
     sf.write(temp_audio_path, y_combined.T, current_sr)
     print(f"Combined all audio files into one. Total duration: {duration:.2f}s")
-…
+
     # --- Update progress to the next stage, use fractional progress (current/total) ---
-    progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}: …
+    progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}: Mapping Images to Tracks")
 
-    # --- 2…
-…
+    # --- Stage 2: Map Tracks to Image Groups ---
+    parsed_groups = [parse_track_ranges(g['tracks_str']) for g in group_definitions]
+    track_to_images_map = {}
+    for track_info in master_track_list:
+        track_idx = track_info['global_index']
+        assigned = False
+        for i, group_indices in enumerate(parsed_groups):
+            if track_idx in group_indices:
+                track_to_images_map[track_idx] = group_definitions[i]['images']
+                assigned = True
+                break
+        if not assigned:
+            track_to_images_map[track_idx] = fallback_images
+
+    # --- Stage 3: Generate ImageClips based on contiguous blocks ---
+    image_clips = []
+    if any(track_to_images_map.values()):
+        current_track_cursor = 0
+        while current_track_cursor < len(master_track_list):
+            start_track_info = master_track_list[current_track_cursor]
+            image_set_for_block = track_to_images_map.get(start_track_info['global_index'])
+
+            # Find the end of the contiguous block of tracks that use the same image set
+            end_track_cursor = current_track_cursor
+            while (end_track_cursor + 1 < len(master_track_list) and
+                   track_to_images_map.get(master_track_list[end_track_cursor + 1]['global_index']) == image_set_for_block):
+                end_track_cursor += 1
+
+            end_track_info = master_track_list[end_track_cursor]
+
+            block_start_time = start_track_info['start_time']
+            block_end_time = end_track_info['end_time']
+            block_duration = block_end_time - block_start_time
+
+            if image_set_for_block and block_duration > 0:
+                print(f"Creating image block for tracks {start_track_info['global_index']}-{end_track_info['global_index']} (Time: {block_start_time:.2f}s - {block_end_time:.2f}s)")
+                time_per_image = block_duration / len(image_set_for_block)
+                for i, img_path in enumerate(image_set_for_block):
+                    def create_image_layer(path, start, dur):
+                        try:
+                            img = ImageClip(path)
+                            scale = min(WIDTH/img.w, HEIGHT/img.h)
+                            resized_img = img.resized(scale)
+                            return CompositeVideoClip([resized_img.with_position("center")], size=(WIDTH, HEIGHT)).with_duration(dur).with_start(start)
+                        except Exception as e:
+                            print(f"Warning: Failed to process image '{path}'. Skipping. Error: {e}")
+                            return None
+
+                    clip = create_image_layer(img_path, block_start_time + i * time_per_image, time_per_image)
+                    if clip:
+                        image_clips.append(clip)
+
+            current_track_cursor = end_track_cursor + 1
+
+    progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Text & Spectrogram")
+
+    # --- Stage 4: Generate Text and Spectrogram ---
+    # --- Text Overlay Logic using the aggregated track info
+    text_clips = [] # Text clips are now simpler as they don't depend on complex file logic anymore
+
+    font_path = SYSTEM_FONTS_MAP.get(font_name)
+    if not font_path:
+        raise gr.Error(f"Font path for '{font_name}' not found!")
+
+    # Use the robust parser for text colors as well
+    font_bg_rgb = parse_color_to_rgb(font_bg_color)
 
-…
+    position = (pos_h.lower(), pos_v.lower())
+
+    print(f"Using font: {font_name}, Size: {font_size}, Position: {position}")
 
-…
+    # Create the RGBA tuple for the background color.
+    # The alpha value is converted from a 0.0-1.0 float to a 0-255 integer.
+    bg_color_tuple = (font_bg_rgb[0], font_bg_rgb[1], font_bg_rgb[2], int(font_bg_alpha * 255))
+
+    # 1. Define a maximum width for the caption. 90% of the video width is a good choice.
+    caption_width = int(WIDTH * 0.9)
 
-…
+    # --- Get font metrics to calculate dynamic padding ---
+    try:
+        # Load the font with Pillow to access its metrics
+        pil_font = ImageFont.truetype(font_path, size=font_size)
+        _, descent = pil_font.getmetrics()
+        # Calculate a bottom margin to compensate for the font's descent.
+        # A small constant is added as a safety buffer.
+        # This prevents clipping on fonts with large descenders (like 'g', 'p').
+        bottom_margin = int(descent * 0.5) + 2
+        print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
+    except Exception as e:
+        # Fallback in case of any font loading error
+        print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
+        bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
 
-…
-        _, descent = pil_font.getmetrics()
-        # Calculate a bottom margin to compensate for the font's descent.
-        # A small constant is added as a safety buffer.
-        # This prevents clipping on fonts with large descenders (like 'g', 'p').
-        bottom_margin = int(descent * 0.5) + 2
-        print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
-    except Exception as e:
-        # Fallback in case of any font loading error
-        print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
-        bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
+    for track in master_track_list:
+        text_duration = track['end_time'] - track['start_time']
+        if text_duration <= 0:
+            continue
 
-…
-            continue
-…
-        # Construct display text based on pre-formatted number string
-        display_text = f"{track['number_str']} {track['title']}" if track['number_str'] else track['title']
-…
-        # 1. Create the TextClip first without positioning to get its size
-        txt_clip = TextClip(
-            text=display_text.strip(),
-            font_size=font_size,
-            color=font_color,
-            font=font_path,
-            bg_color=bg_color_tuple,
-            method='caption', # <-- Set method to caption
-            size=(caption_width, None), # <-- Provide size for wrapping
-            margin=(0, 0, 0, bottom_margin)
-        ).with_position(position).with_duration(text_duration).with_start(track['start_time'])
-…
-        text_clips.append(txt_clip)
-…
-    # --- Update progress to the next stage, use fractional progress (current/total) ---
-    progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Visual Layers")
+        # Construct display text based on pre-formatted number string
+        num_str = f"{track['global_index']:02d}" if format_double_digits else str(track['global_index'])
+        display_text = f"{num_str}. {track['title']}"
 
-    # --- 3. Image and Spectrogram Logic ---
-    image_clips = []
-    if image_paths and len(image_paths) > 0:
-        print(f"Found {len(image_paths)} images to process.")
 
-        #…
-…
-                # 1. Calculate scaling factor to "contain" the image (fit inside).
-                # We use min() to find the ratio that requires the most shrinkage,
-                # ensuring the whole image fits without being cropped.
-                scale_factor = min(WIDTH / img_clip_raw.w, HEIGHT / img_clip_raw.h)
-…
-                # 2. Resize the image so it fits perfectly within the video dimensions.
-                resized_clip = img_clip_raw.resized(scale_factor)
-…
-                # 3. Create a composite clip to position the resized image on a
-                # correctly-sized transparent canvas. This is the key to preventing overflow.
-                final_layer = CompositeVideoClip(
-                    [resized_clip.with_position("center")],
-                    size=(WIDTH, HEIGHT)
-                )
-…
-                # 4. Set the timing on the final composite layer.
-                return final_layer.with_duration(dur).with_start(start)
-            except Exception as e:
-                print(f"Warning: Failed to process image '{img_path}'. Skipping. Error: {e}")
-                return None
-…
-        # Create an ImageClip for the duration of the track.
-        clip = create_image_layer(img_path, i * img_duration, img_duration)
-        if clip:
-            image_clips.append(clip)
+        # 1. Create the TextClip first without positioning to get its size
+        txt_clip = TextClip(
+            text=display_text.strip(),
+            font_size=font_size,
+            color=font_color,
+            font=font_path,
+            bg_color=bg_color_tuple,
+            method='caption', # <-- Set method to caption
+            size=(caption_width, None), # <-- Provide size for wrapping
+            margin=(0, 0, 0, bottom_margin)
+        ).with_position(position).with_duration(text_duration).with_start(track['start_time'])
+
+        text_clips.append(txt_clip)
+
 
     N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
     MIN_DB, MAX_DB = -80.0, 0.0
@@ -506,16 +552,16 @@ def process_audio_to_video(
 
     video_clip = VideoClip(frame_function=frame_generator, duration=duration)
 
-    # ---…
+    # --- Set Spectrogram Opacity ---
     # If image clips were created, make the spectrogram layer 50% transparent.
     if image_clips:
         print("Applying 50% opacity to spectrogram layer.")
         video_clip = video_clip.with_opacity(0.5)
-…
+
     # --- Use fractional progress (current/total) ---
-    progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video…
+    progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video")
 
-    # ---…
+    # --- Composition and Rendering ---
     audio_clip = AudioFileClip(temp_audio_path)
 
     # --- Clip Composition ---
@@ -542,7 +588,7 @@ def process_audio_to_video(
                           audio_bitrate="320k", fps=RENDER_FPS,
                           logger='bar', threads=os.cpu_count(), preset='ultrafast')
     print("High-quality AAC audio encoding complete.")
-…
+
     final_clip.close()
 
     # Step 2: Use FFmpeg to quickly increase the framerate to 24 FPS
@@ -550,8 +596,8 @@ def process_audio_to_video(
 
     # --- Use fractional progress (current/total) ---
     progress(4 / TOTAL_STEPS, desc=f"Stage 5/{TOTAL_STEPS}: Finalizing Video")
-…
-    # ---…
+
+    # --- Finalizing ---
     increase_video_framerate(temp_fps1_path, final_output_path, target_fps=PLAYBACK_FPS)
 
     return final_output_path
@@ -573,29 +619,63 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
         with gr.Column(scale=1):
             # --- Changed to gr.Files for multi-upload ---
             audio_inputs = gr.Files(
-                label="Upload Audio File(s)",
+                label="Upload Audio File(s)",
                 file_count="multiple",
                 file_types=["audio"]
             )
 
-            # --- Image…
-            gr.…
-…
+            # --- Grouped Image Section ---
+            with gr.Accordion("Grouped Image Backgrounds (Advanced)", open=False):
+                gr.Markdown("Define groups of tracks and assign specific images to them. Tracks are numbered globally starting from 1 across all uploaded files.")
+
+                MAX_GROUPS = 10
+                group_track_inputs = []
+                group_image_inputs = []
+                group_accordions = []
+
+                # --- Create a centralized update function ---
+                def update_group_visibility(target_count: int):
+                    """Updates the visibility of all group accordions and the state of the control buttons."""
+                    # Clamp the target count to be within bounds
+                    target_count = max(1, min(target_count, MAX_GROUPS))
+
+                    updates = {visible_groups_state: target_count}
+                    # Update visibility for each accordion
+                    for i in range(MAX_GROUPS):
+                        updates[group_accordions[i]] = gr.update(visible=(i < target_count))
+
+                    # Update button states
+                    updates[add_group_btn] = gr.update(visible=(target_count < MAX_GROUPS))
+                    updates[remove_group_btn] = gr.update(interactive=(target_count > 1))
+
+                    return updates
+
+                # --- Create simple wrapper functions for adding and removing ---
+                def add_group(current_count: int):
+                    return update_group_visibility(current_count + 1)
+
+                def remove_group(current_count: int):
+                    return update_group_visibility(current_count - 1)
+
+                # Pre-build all group components
+                for i in range(MAX_GROUPS):
+                    with gr.Accordion(f"Image Group {i+1}", visible=(i==0)) as acc:
+                        track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
+                        image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
+                    group_track_inputs.append(track_input)
+                    group_image_inputs.append(image_input)
+                    group_accordions.append(acc)
+
+                visible_groups_state = gr.State(1)
+                # --- Add a remove button and put both in a row ---
+                with gr.Row():
+                    remove_group_btn = gr.Button("- Remove Last Group", variant="secondary", interactive=False)
+                    add_group_btn = gr.Button("+ Add Image Group", variant="secondary")
+
+            with gr.Accordion("Fallback / Default Images", open=True):
+                gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
+                fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
+
             with gr.Accordion("Visualizer Options", open=True):
                 with gr.Row():
                     width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
@@ -611,7 +691,7 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                 # --- Checkbox for number formatting ---
                 format_double_digits_checkbox = gr.Checkbox(label="Format track numbers as double digits (e.g., 01, 05-09)", value=True)
                 gr.Markdown("If the CUE sheet or filenames contain non-English characters, please select a compatible font.")
-…
+
                 # Define a priority list for default fonts, starting with common Japanese ones.
                 # This list can include multiple names for the same font to improve matching.
                 preferred_fonts = [
@@ -634,15 +714,15 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                     default_font = FONT_DISPLAY_NAMES[0]
 
                 font_name_dd = gr.Dropdown(choices=FONT_DISPLAY_NAMES, value=default_font, label="Font Family")
-…
+
                 with gr.Row():
                     font_size_slider = gr.Slider(minimum=12, maximum=256, value=80, step=1, label="Font Size")
                     font_color_picker = gr.ColorPicker(value="#FFFFFF", label="Font Color")
-…
+
                 with gr.Row():
                     font_bg_color_picker = gr.ColorPicker(value="#000000", label="Text BG Color")
                     font_bg_alpha_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Text BG Opacity")
-…
+
                 gr.Markdown("Text Position")
                 with gr.Row():
                     pos_h_radio = gr.Radio(["left", "center", "right"], value="center", label="Horizontal Align")
@@ -652,20 +732,39 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
 
         with gr.Column(scale=2):
             video_output = gr.Video(label="Generated Video")
-…
-            # ---…
+
+    # --- Define the full list of outputs for the update functions ---
+    group_update_outputs = [visible_groups_state, add_group_btn, remove_group_btn] + group_accordions
+
+    # Connect the "Add Group" button to its update function
+    add_group_btn.click(
+        fn=add_group,
+        inputs=visible_groups_state,
+        outputs=group_update_outputs
+    )
+
+    remove_group_btn.click(
+        fn=remove_group,
+        inputs=visible_groups_state,
+        outputs=group_update_outputs
+    )
+
+    # --- Define the master list of all inputs for the main button ---
+    all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
+        fallback_image_input,
+        format_double_digits_checkbox,
+        width_input, height_input,
+        fg_color, bg_color,
+        font_name_dd, font_size_slider, font_color_picker,
+        font_bg_color_picker, font_bg_alpha_slider,
+        pos_h_radio, pos_v_radio
+    ]
+
     submit_btn.click(
         fn=process_audio_to_video,
-        inputs=…
-…
-            width_input, height_input,
-            fg_color, bg_color,
-            font_name_dd, font_size_slider, font_color_picker,
-            font_bg_color_picker, font_bg_alpha_slider,
-            pos_h_radio, pos_v_radio
-        ],
-        outputs=video_output
+        inputs=all_inputs,
+        outputs=video_output,
+        show_progress="full"
     )
 
 if __name__ == "__main__":