avans06 committed
Commit c6d7d8f · Parent: 2c95cb8

feat(ui, core): Implement advanced grouped image backgrounds


This refactoring introduces a new feature that lets users assign specific sets of images to designated groups of tracks, turning the application from a linear visualizer into a tool for building dynamic, context-aware videos with thematic sections.

The implementation overhauls the UI for image uploads and rewrites the backend logic for track processing and image distribution.

Dynamic Group Management:
The single image uploader has been replaced with a dynamic interface for defining up to 10 distinct image groups.
Users click the "+ Add Image Group" and "- Remove Last Group" buttons to control how many group definitions are visible.
Dynamic creation is simulated by pre-building the maximum number of groups and toggling their visibility, so the interface feels dynamic while the Gradio event wiring stays static (see the sketch below).
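The show/hide behavior is a standard Gradio visibility pattern; here is a minimal sketch, using illustrative names rather than the app's actual components (the full wiring, including the remove button and button-state updates, is in the diff below):

    import gradio as gr

    MAX_GROUPS = 10

    with gr.Blocks() as demo:
        accordions = []
        for i in range(MAX_GROUPS):
            # All groups are pre-built; only the first is visible initially.
            with gr.Accordion(f"Image Group {i + 1}", visible=(i == 0)) as acc:
                gr.Textbox(label=f"Tracks for Group {i + 1}")
            accordions.append(acc)

        count_state = gr.State(1)  # number of currently visible groups
        add_btn = gr.Button("+ Add Image Group")

        def show_one_more(count):
            count = min(count + 1, MAX_GROUPS)
            # "Adding" a group just reveals the next pre-built accordion.
            return [count] + [gr.update(visible=(i < count)) for i in range(MAX_GROUPS)]

        add_btn.click(show_one_more, inputs=count_state, outputs=[count_state] + accordions)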

Group Definition:
Each group consists of a Textbox for defining track ranges (e.g., "1-4, 7, 10-13") and a dedicated Files uploader for that group's specific images.
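The range string is parsed by the new parse_track_ranges helper (see the diff below), which expands it into a set of global track numbers and skips unparseable fragments with a warning:

    parse_track_ranges("1-4, 7, 10-13")   # -> {1, 2, 3, 4, 7, 10, 11, 12, 13}
    parse_track_ranges("")                # -> set()
    parse_track_ranges("3, 1-2, oops")    # -> {1, 2, 3} ('oops' is skipped with a warning)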

Fallback Images:
A separate "Fallback / Default Images" uploader is provided for any tracks that are not explicitly assigned to a group.

Files changed (1): app.py (+329 −230)
app.py CHANGED
@@ -9,7 +9,7 @@ import subprocess
 import soundfile as sf
 import matplotlib.font_manager as fm
 from PIL import ImageFont
-from typing import Tuple, List, Dict
+from typing import Tuple, List, Dict, Set
 from mutagen.flac import FLAC
 from moviepy import CompositeVideoClip, TextClip, VideoClip, AudioFileClip, ImageClip
 
@@ -28,7 +28,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
         elif platform_id == 1 and encoding_id == 0: # Macintosh, Roman
             return name_bytes.decode('mac_roman').strip('\x00')
         elif platform_id == 0: # Unicode
-            return name_bytes.decode('utf_16_be').strip('\x00')
+            return name_bytes.decode('utf_16_be').strip('\x00')
         else: # Fallback
             return name_bytes.decode('utf_8', errors='ignore').strip('\x00')
     except Exception:
@@ -36,9 +36,10 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
 
     try:
         with open(font_path, 'rb') as f: data = f.read()
-        def read_ushort(offset): return struct.unpack('>H', data[offset:offset+2])[0]
-        def read_ulong(offset): return struct.unpack('>I', data[offset:offset+4])[0]
-
+        def read_ushort(offset):
+            return struct.unpack('>H', data[offset:offset+2])[0]
+        def read_ulong(offset):
+            return struct.unpack('>I', data[offset:offset+4])[0]
         font_offsets = [0]
         # Check for TTC (TrueType Collection) header
         if data[:4] == b'ttcf':
@@ -47,7 +48,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
 
         # For simplicity, we only parse the first font in a TTC
         font_offset = font_offsets[0]
-
+
        num_tables = read_ushort(font_offset + 4)
         name_table_offset = -1
         # Locate the 'name' table
@@ -55,38 +56,50 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
             entry_offset = font_offset + 12 + i * 16
             tag = data[entry_offset:entry_offset+4]
             if tag == b'name':
-                name_table_offset = read_ulong(entry_offset + 8); break
-
-        if name_table_offset == -1: return None, None
-
+                name_table_offset = read_ulong(entry_offset + 8)
+                break
+
+        if name_table_offset == -1:
+            return None, None
+
         count, string_offset = read_ushort(name_table_offset + 2), read_ushort(name_table_offset + 4)
         name_candidates = {}
         # Iterate through all name records
         for i in range(count):
             rec_offset = name_table_offset + 6 + i * 12
             platform_id, encoding_id, language_id, name_id, length, offset = struct.unpack('>HHHHHH', data[rec_offset:rec_offset+12])
-
+
             if name_id == 4: # We only care about the "Full Font Name"
                 string_pos = name_table_offset + string_offset + offset
                 value = decode_name_string(data[string_pos : string_pos + length], platform_id, encoding_id)
-
+
                 if value:
                     # Store candidates based on language ID
-                    if language_id in [1028, 2052, 3076, 4100, 5124]: name_candidates["zh"] = value # Chinese
-                    elif language_id == 1041: name_candidates["ja"] = value # Japanese
-                    elif language_id == 1042: name_candidates["ko"] = value # Korean
-                    elif language_id in [1033, 0]: name_candidates["en"] = value # English
+                    if language_id in [1028, 2052, 3076, 4100, 5124]:
+                        name_candidates["zh"] = value
+                    elif language_id == 1041:
+                        name_candidates["ja"] = value
+                    elif language_id == 1042:
+                        name_candidates["ko"] = value
+                    elif language_id in [1033, 0]:
+                        name_candidates["en"] = value
                     else:
-                        if "other" not in name_candidates: name_candidates["other"] = value
-
+                        if "other" not in name_candidates:
+                            name_candidates["other"] = value
+
         # Return the best candidate based on language priority
-        if name_candidates.get("zh"): return name_candidates.get("zh"), "zh"
-        if name_candidates.get("ja"): return name_candidates.get("ja"), "ja"
-        if name_candidates.get("ko"): return name_candidates.get("ko"), "ko"
-        if name_candidates.get("other"): return name_candidates.get("other"), "other"
-        if name_candidates.get("en"): return name_candidates.get("en"), "en"
+        if name_candidates.get("zh"):
+            return name_candidates.get("zh"), "zh"
+        if name_candidates.get("ja"):
+            return name_candidates.get("ja"), "ja"
+        if name_candidates.get("ko"):
+            return name_candidates.get("ko"), "ko"
+        if name_candidates.get("other"):
+            return name_candidates.get("other"), "other"
+        if name_candidates.get("en"):
+            return name_candidates.get("en"), "en"
         return None, None
-
+
     except Exception:
         return None, None
 
@@ -106,22 +119,22 @@ def get_font_data() -> Tuple[Dict[str, str], List[str]]:
     for path in all_font_files:
         display_name, lang_tag = get_font_display_name(path)
         is_fallback = display_name is None
-
+
         if is_fallback:
             # Create a fallback name from the filename
             display_name = os.path.splitext(os.path.basename(path))[0].replace('-', ' ').replace('_', ' ').title()
             lang_tag = 'fallback'
-
+
         if display_name and display_name not in font_map:
             font_map[display_name] = path
             found_names.append((display_name, is_fallback, lang_tag))
-
+
     # Define sort priority for languages
     sort_order = {'zh': 0, 'ja': 1, 'ko': 2, 'en': 3, 'other': 4, 'fallback': 5}
 
     # Sort by priority, then alphabetically
     found_names.sort(key=lambda x: (sort_order.get(x[2], 99), x[0]))
-
+
     sorted_display_names = [name for name, _, _ in found_names]
     return font_map, sorted_display_names
 
@@ -188,7 +201,7 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
         '-c:a', 'copy', # Copy audio without re-encoding
         output_path
     ]
-
+
    try:
         # Execute the command
         # Using capture_output to hide ffmpeg logs from the main console unless an error occurs
@@ -203,24 +216,75 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
         raise gr.Error(f"FFmpeg failed to increase the framerate. See console for details. Error: {e.stderr}")
 
 
+# --- HELPER FUNCTION for parsing track ranges ---
+def parse_track_ranges(range_str: str) -> Set[int]:
+    """Parses a string like '1-4, 7, 10-13' into a set of integers."""
+    if not range_str:
+        return set()
+
+    indices = set()
+    parts = range_str.split(',')
+    for part in parts:
+        part = part.strip()
+        if not part:
+            continue
+        if '-' in part:
+            try:
+                start, end = map(int, part.split('-'))
+                indices.update(range(start, end + 1))
+            except ValueError:
+                print(f"Warning: Could not parse range '{part}'. Skipping.")
+        else:
+            try:
+                indices.add(int(part))
+            except ValueError:
+                print(f"Warning: Could not parse track number '{part}'. Skipping.")
+    return indices
+
+
 # --- Main Processing Function ---
-def process_audio_to_video(
-    audio_files: List[str], image_paths: List[str],
-    format_double_digits: bool,
-    video_width: int, video_height: int,
-    spec_fg_color: str, spec_bg_color: str,
-    font_name: str, font_size: int, font_color: str,
-    font_bg_color: str, font_bg_alpha: float,
-    pos_h: str, pos_v: str,
-    progress=gr.Progress(track_tqdm=True)
-) -> str:
+def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
+    # --- Correctly unpack all arguments from *args using slicing ---
+    MAX_GROUPS = 10 # This MUST match the UI definition
+
+    # Define the structure of the *args tuple based on the `all_inputs` list
+    audio_files = args[0]
+
+    # Slice the args tuple to get the continuous blocks of inputs
+    all_track_strs = args[1 : 1 + MAX_GROUPS]
+    all_image_lists = args[1 + MAX_GROUPS : 1 + MAX_GROUPS * 2]
+
+    # Group inputs are packed in pairs (track_str, image_list)
+    group_definitions = []
+    for i in range(MAX_GROUPS):
+        group_definitions.append({
+            "tracks_str": all_track_strs[i],
+            "images": all_image_lists[i]
+        })
+
+    # Unpack the remaining arguments with correct indexing
+    arg_offset = 1 + MAX_GROUPS * 2
+    fallback_images = args[arg_offset]
+    format_double_digits = args[arg_offset + 1]
+    video_width = args[arg_offset + 2]
+    video_height = args[arg_offset + 3]
+    spec_fg_color = args[arg_offset + 4]
+    spec_bg_color = args[arg_offset + 5]
+    font_name = args[arg_offset + 6]
+    font_size = args[arg_offset + 7]
+    font_color = args[arg_offset + 8]
+    font_bg_color = args[arg_offset + 9]
+    font_bg_alpha = args[arg_offset + 10]
+    pos_h = args[arg_offset + 11]
+    pos_v = args[arg_offset + 12]
+
     if not audio_files:
         raise gr.Error("Please upload at least one audio file.")
     if not font_name:
         raise gr.Error("Please select a font from the list.")
 
     progress(0, desc="Initializing...")
-
+
     # Define paths for temporary and final files
     timestamp = int(time.time())
     temp_fps1_path = f"temp_{timestamp}_fps1.mp4"
@@ -254,7 +318,7 @@ def process_audio_to_video(
             raise ValueError(f"Could not parse rgb color string: {color_str}")
         else:
             raise ValueError(f"Unknown color format: {color_str}")
-
+
     # Use the new robust parser for all color inputs
     fg_rgb, bg_rgb = parse_color_to_rgb(spec_fg_color), parse_color_to_rgb(spec_bg_color)
     grid_rgb = tuple(min(c + 40, 255) for c in bg_rgb)
@@ -264,11 +328,9 @@ def process_audio_to_video(
     # --- Define total steps for the progress bar ---
     TOTAL_STEPS = 5
 
-    # --- 1. Audio Processing & Track Info Aggregation ---
-    all_tracks_info = []
-    total_duration = 0.0
-    y_accumulator = []
-    current_sr = None
+    # --- Stage 1: Audio Processing & Master Track List Creation ---
+    master_track_list, y_accumulator, current_sr = [], [], None
+    total_duration, global_track_counter = 0.0, 0
 
     # --- Use `progress.tqdm` to create a progress bar for this loop ---
     for file_idx, audio_path in enumerate(progress.tqdm(audio_files, desc=f"Stage 1/{TOTAL_STEPS}: Analyzing Audio Files")):
@@ -301,48 +363,20 @@ def process_audio_to_video(
 
                 print(f"Successfully parsed {len(cue_tracks)} tracks from CUE sheet.")
             except Exception as e:
-                print(f"Warning: Could not read or parse CUE sheet for {os.path.basename(audio_path)}: {e}")
-
-        # --- Apply New Numbering Logic ---
-        file_num = file_idx + 1 # File numbering starts from 1
-        if len(audio_files) > 1:
-            if cue_tracks: # Scenario 3: Multiple files, this one has CUE
-                for track_idx, track in enumerate(cue_tracks):
-                    track_num = track_idx + 1 # Track numbering starts from 1
-                    number_str = f"{file_num:02d}-{track_num:02d}" if format_double_digits else f"{file_num}-{track_num}"
-                    all_tracks_info.append({
-                        "title": track.get('title', 'Unknown Track'),
-                        "start_time": total_duration + track.get('start_time', 0),
-                        "end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
-                        "number_str": number_str
-                    })
-            else: # Scenario 2: Multiple files, this one has NO CUE
-                number_str = f"{file_num:02d}" if format_double_digits else str(file_num)
-                all_tracks_info.append({
-                    "title": os.path.splitext(os.path.basename(audio_path))[0],
-                    "start_time": total_duration, "end_time": total_duration + file_duration,
-                    "number_str": number_str
-                })
-        else: # Scenario 1: Single file upload
-            if cue_tracks: # With CUE
-                for track_idx, track in enumerate(cue_tracks):
-                    track_num = track_idx + 1
-                    number_str = f"{track_num:02d}" if format_double_digits else str(track_num)
-                    all_tracks_info.append({
-                        "title": track.get('title', 'Unknown Track'),
-                        "start_time": total_duration + track.get('start_time', 0),
-                        "end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
-                        "number_str": f"{number_str}." # Add a dot for single file CUE tracks
-                    })
-            else: # No CUE
-                all_tracks_info.append({
-                    "title": os.path.splitext(os.path.basename(audio_path))[0],
-                    "start_time": total_duration, "end_time": total_duration + file_duration,
-                    "number_str": None # Signal to not show any number
-                })
-
+                print(f"Warning: Could not parse CUE sheet for {os.path.basename(audio_path)}: {e}")
+
+        if cue_tracks:
+            for track_idx, track in enumerate(cue_tracks):
+                global_track_counter += 1
+                start_time = track.get('start_time', 0)
+                end_time = cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration
+                master_track_list.append({"global_index": global_track_counter, "title": track.get('title', 'Unknown'), "start_time": total_duration + start_time, "end_time": total_duration + end_time})
+        else:
+            global_track_counter += 1
+            master_track_list.append({"global_index": global_track_counter, "title": os.path.splitext(os.path.basename(audio_path))[0], "start_time": total_duration, "end_time": total_duration + file_duration})
+
         total_duration += file_duration
-
+
     # --- Concatenate along the time axis (axis=1) for stereo arrays ---
     y_combined = np.concatenate(y_accumulator, axis=1)
     duration = total_duration
@@ -350,116 +384,128 @@ def process_audio_to_video(
     # --- Transpose the array for soundfile to write stereo correctly ---
     sf.write(temp_audio_path, y_combined.T, current_sr)
     print(f"Combined all audio files into one. Total duration: {duration:.2f}s")
-
+
     # --- Update progress to the next stage, use fractional progress (current/total) ---
-    progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}: Generating Text Overlays")
+    progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}: Mapping Images to Tracks")
 
-    # --- 2. Text Overlay Logic using the aggregated track info
-    text_clips = []
-    if all_tracks_info:
-        font_path = SYSTEM_FONTS_MAP.get(font_name)
-        if not font_path: raise gr.Error(f"Font path for '{font_name}' not found!")
-
-        # Use the robust parser for text colors as well
-        font_bg_rgb = parse_color_to_rgb(font_bg_color)
-
-        position = (pos_h.lower(), pos_v.lower())
-
-        print(f"Using font: {font_name}, Size: {font_size}, Position: {position}")
-
-        # Create the RGBA tuple for the background color.
-        # The alpha value is converted from a 0.0-1.0 float to a 0-255 integer.
-        bg_color_tuple = (font_bg_rgb[0], font_bg_rgb[1], font_bg_rgb[2], int(font_bg_alpha * 255))
-
-        # 1. Define a maximum width for the caption. 90% of the video width is a good choice.
-        caption_width = int(WIDTH * 0.9)
-
-        # --- Get font metrics to calculate dynamic padding ---
-        try:
-            # Load the font with Pillow to access its metrics
-            pil_font = ImageFont.truetype(font_path, size=font_size)
-            _, descent = pil_font.getmetrics()
-            # Calculate a bottom margin to compensate for the font's descent.
-            # A small constant is added as a safety buffer.
-            # This prevents clipping on fonts with large descenders (like 'g', 'p').
-            bottom_margin = int(descent * 0.5) + 2
-            print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
-        except Exception as e:
-            # Fallback in case of any font loading error
-            print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
-            bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
-
-        for track in all_tracks_info:
-            text_duration = track['end_time'] - track['start_time']
-            if text_duration <= 0:
-                continue
-
-            # Construct display text based on pre-formatted number string
-            display_text = f"{track['number_str']} {track['title']}" if track['number_str'] else track['title']
-
-
-            # 1. Create the TextClip first without positioning to get its size
-            txt_clip = TextClip(
-                text=display_text.strip(),
-                font_size=font_size,
-                color=font_color,
-                font=font_path,
-                bg_color=bg_color_tuple,
-                method='caption', # <-- Set method to caption
-                size=(caption_width, None), # <-- Provide size for wrapping
-                margin=(0, 0, 0, bottom_margin)
-            ).with_position(position).with_duration(text_duration).with_start(track['start_time'])
-
-            text_clips.append(txt_clip)
-
-    # --- Update progress to the next stage, use fractional progress (current/total) ---
-    progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Visual Layers")
-
-    # --- 3. Image and Spectrogram Logic ---
-    image_clips = []
-    if image_paths and len(image_paths) > 0:
-        print(f"Found {len(image_paths)} images to process.")
-
-        # Simplified logic: calculate time per image, max 3 mins, and loop.
-        img_duration = duration / len(image_paths)
-        for i, img_path in enumerate(image_paths):
-
-            # --- HELPER FUNCTION FOR ROBUST IMAGE CLIPS ---
-            def create_image_layer(img_path, start, dur):
-                """
-                Creates an image layer that fits entirely within the video frame.
-                It scales the image down to fit and centers it on a transparent background.
-                """
-                # This function implements a "cover" scaling mode to ensure the image
-                # fills the entire video frame without leaving black bars.
-                try:
-                    img_clip_raw = ImageClip(img_path)
-
-                    # 1. Calculate scaling factor to "contain" the image (fit inside).
-                    # We use min() to find the ratio that requires the most shrinkage,
-                    # ensuring the whole image fits without being cropped.
-                    scale_factor = min(WIDTH / img_clip_raw.w, HEIGHT / img_clip_raw.h)
-
-                    # 2. Resize the image so it fits perfectly within the video dimensions.
-                    resized_clip = img_clip_raw.resized(scale_factor)
-
-                    # 3. Create a composite clip to position the resized image on a
-                    # correctly-sized transparent canvas. This is the key to preventing overflow.
-                    final_layer = CompositeVideoClip(
-                        [resized_clip.with_position("center")],
-                        size=(WIDTH, HEIGHT)
-                    )
-
-                    # 4. Set the timing on the final composite layer.
-                    return final_layer.with_duration(dur).with_start(start)
-                except Exception as e:
-                    print(f"Warning: Failed to process image '{img_path}'. Skipping. Error: {e}")
-                    return None
-
-            # Create an ImageClip for the duration of the track.
-            clip = create_image_layer(img_path, i * img_duration, img_duration)
-            if clip:
-                image_clips.append(clip)
+    # --- Stage 2: Map Tracks to Image Groups ---
+    parsed_groups = [parse_track_ranges(g['tracks_str']) for g in group_definitions]
+    track_to_images_map = {}
+    for track_info in master_track_list:
+        track_idx = track_info['global_index']
+        assigned = False
+        for i, group_indices in enumerate(parsed_groups):
+            if track_idx in group_indices:
+                track_to_images_map[track_idx] = group_definitions[i]['images']
+                assigned = True
+                break
+        if not assigned:
+            track_to_images_map[track_idx] = fallback_images
+
+    # --- Stage 3: Generate ImageClips based on contiguous blocks ---
+    image_clips = []
+    if any(track_to_images_map.values()):
+        current_track_cursor = 0
+        while current_track_cursor < len(master_track_list):
+            start_track_info = master_track_list[current_track_cursor]
+            image_set_for_block = track_to_images_map.get(start_track_info['global_index'])
+
+            # Find the end of the contiguous block of tracks that use the same image set
+            end_track_cursor = current_track_cursor
+            while (end_track_cursor + 1 < len(master_track_list) and
+                   track_to_images_map.get(master_track_list[end_track_cursor + 1]['global_index']) == image_set_for_block):
+                end_track_cursor += 1
+
+            end_track_info = master_track_list[end_track_cursor]
+
+            block_start_time = start_track_info['start_time']
+            block_end_time = end_track_info['end_time']
+            block_duration = block_end_time - block_start_time
+
+            if image_set_for_block and block_duration > 0:
+                print(f"Creating image block for tracks {start_track_info['global_index']}-{end_track_info['global_index']} (Time: {block_start_time:.2f}s - {block_end_time:.2f}s)")
+                time_per_image = block_duration / len(image_set_for_block)
+                for i, img_path in enumerate(image_set_for_block):
+                    def create_image_layer(path, start, dur):
+                        try:
+                            img = ImageClip(path)
+                            scale = min(WIDTH/img.w, HEIGHT/img.h)
+                            resized_img = img.resized(scale)
+                            return CompositeVideoClip([resized_img.with_position("center")], size=(WIDTH, HEIGHT)).with_duration(dur).with_start(start)
+                        except Exception as e:
+                            print(f"Warning: Failed to process image '{path}'. Skipping. Error: {e}")
+                            return None
+
+                    clip = create_image_layer(img_path, block_start_time + i * time_per_image, time_per_image)
+                    if clip:
+                        image_clips.append(clip)
+
+            current_track_cursor = end_track_cursor + 1
+
+    progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Text & Spectrogram")
+
+    # --- Stage 4: Generate Text and Spectrogram ---
+    # --- Text Overlay Logic using the aggregated track info
+    text_clips = [] # Text clips are now simpler as they don't depend on complex file logic anymore
+
+    font_path = SYSTEM_FONTS_MAP.get(font_name)
+    if not font_path:
+        raise gr.Error(f"Font path for '{font_name}' not found!")
+
+    # Use the robust parser for text colors as well
+    font_bg_rgb = parse_color_to_rgb(font_bg_color)
+
+    position = (pos_h.lower(), pos_v.lower())
+
+    print(f"Using font: {font_name}, Size: {font_size}, Position: {position}")
+
+    # Create the RGBA tuple for the background color.
+    # The alpha value is converted from a 0.0-1.0 float to a 0-255 integer.
+    bg_color_tuple = (font_bg_rgb[0], font_bg_rgb[1], font_bg_rgb[2], int(font_bg_alpha * 255))
+
+    # 1. Define a maximum width for the caption. 90% of the video width is a good choice.
+    caption_width = int(WIDTH * 0.9)
+
+    # --- Get font metrics to calculate dynamic padding ---
+    try:
+        # Load the font with Pillow to access its metrics
+        pil_font = ImageFont.truetype(font_path, size=font_size)
+        _, descent = pil_font.getmetrics()
+        # Calculate a bottom margin to compensate for the font's descent.
+        # A small constant is added as a safety buffer.
+        # This prevents clipping on fonts with large descenders (like 'g', 'p').
+        bottom_margin = int(descent * 0.5) + 2
+        print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
+    except Exception as e:
+        # Fallback in case of any font loading error
+        print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
+        bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
+
+    for track in master_track_list:
+        text_duration = track['end_time'] - track['start_time']
+        if text_duration <= 0:
+            continue
+
+        # Construct display text based on pre-formatted number string
+        num_str = f"{track['global_index']:02d}" if format_double_digits else str(track['global_index'])
+        display_text = f"{num_str}. {track['title']}"
+
+        # 1. Create the TextClip first without positioning to get its size
+        txt_clip = TextClip(
+            text=display_text.strip(),
+            font_size=font_size,
+            color=font_color,
+            font=font_path,
+            bg_color=bg_color_tuple,
+            method='caption', # <-- Set method to caption
+            size=(caption_width, None), # <-- Provide size for wrapping
+            margin=(0, 0, 0, bottom_margin)
+        ).with_position(position).with_duration(text_duration).with_start(track['start_time'])
+
+        text_clips.append(txt_clip)
+
+
 
     N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
     MIN_DB, MAX_DB = -80.0, 0.0
@@ -506,16 +552,16 @@ def process_audio_to_video(
 
     video_clip = VideoClip(frame_function=frame_generator, duration=duration)
 
-    # --- NEW: Set Spectrogram Opacity ---
+    # --- Set Spectrogram Opacity ---
     # If image clips were created, make the spectrogram layer 50% transparent.
     if image_clips:
         print("Applying 50% opacity to spectrogram layer.")
         video_clip = video_clip.with_opacity(0.5)
-
+
     # --- Use fractional progress (current/total) ---
-    progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video (this may take time)")
+    progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video")
 
-    # --- 4. Composition and Rendering ---
+    # --- Composition and Rendering ---
     audio_clip = AudioFileClip(temp_audio_path)
 
     # --- Clip Composition ---
@@ -542,7 +588,7 @@ def process_audio_to_video(
                     audio_bitrate="320k", fps=RENDER_FPS,
                     logger='bar', threads=os.cpu_count(), preset='ultrafast')
     print("High-quality AAC audio encoding complete.")
-
+
     final_clip.close()
 
     # Step 2: Use FFmpeg to quickly increase the framerate to 24 FPS
@@ -550,8 +596,8 @@ def process_audio_to_video(
 
     # --- Use fractional progress (current/total) ---
     progress(4 / TOTAL_STEPS, desc=f"Stage 5/{TOTAL_STEPS}: Finalizing Video")
-
-    # --- 5. Finalizing ---
+
+    # --- Finalizing ---
    increase_video_framerate(temp_fps1_path, final_output_path, target_fps=PLAYBACK_FPS)
 
     return final_output_path
@@ -573,29 +619,63 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
         with gr.Column(scale=1):
             # --- Changed to gr.Files for multi-upload ---
            audio_inputs = gr.Files(
-                label="Upload Audio File(s)",
+                label="Upload Audio File(s)",
                file_count="multiple",
                 file_types=["audio"]
             )
 
-            # --- Image Upload Component ---
-            gr.Markdown("### Background Image Options (Optional)")
-            gr.Markdown(
-                """
-                When background images are uploaded, they will be displayed in a looping sequence.
-                - The display duration for each image is calculated by dividing the total video length by the number of images, with a maximum duration of **3 minutes** per image.
-                - The sequence loops until the video ends.
-                """
-            )
-            image_uploads = gr.File(
-                label="Upload Background Images",
-                file_count="multiple", # Allow multiple files
-                # Replace the generic "image" category with a specific list of extensions.
-                # Note that the dot (.) before each extension is required.
-                file_types=[".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp", ".avif"]
-
-            )
-
+            # --- Grouped Image Section ---
+            with gr.Accordion("Grouped Image Backgrounds (Advanced)", open=False):
+                gr.Markdown("Define groups of tracks and assign specific images to them. Tracks are numbered globally starting from 1 across all uploaded files.")
+
+                MAX_GROUPS = 10
+                group_track_inputs = []
+                group_image_inputs = []
+                group_accordions = []
+
+                # --- Create a centralized update function ---
+                def update_group_visibility(target_count: int):
+                    """Updates the visibility of all group accordions and the state of the control buttons."""
+                    # Clamp the target count to be within bounds
+                    target_count = max(1, min(target_count, MAX_GROUPS))
+
+                    updates = {visible_groups_state: target_count}
+                    # Update visibility for each accordion
+                    for i in range(MAX_GROUPS):
+                        updates[group_accordions[i]] = gr.update(visible=(i < target_count))
+
+                    # Update button states
+                    updates[add_group_btn] = gr.update(visible=(target_count < MAX_GROUPS))
+                    updates[remove_group_btn] = gr.update(interactive=(target_count > 1))
+
+                    return updates
+
+                # --- Create simple wrapper functions for adding and removing ---
+                def add_group(current_count: int):
+                    return update_group_visibility(current_count + 1)
+
+                def remove_group(current_count: int):
+                    return update_group_visibility(current_count - 1)
+
+                # Pre-build all group components
+                for i in range(MAX_GROUPS):
+                    with gr.Accordion(f"Image Group {i+1}", visible=(i==0)) as acc:
+                        track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
+                        image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
+                    group_track_inputs.append(track_input)
+                    group_image_inputs.append(image_input)
+                    group_accordions.append(acc)
+
+                visible_groups_state = gr.State(1)
+                # --- Add a remove button and put both in a row ---
+                with gr.Row():
+                    remove_group_btn = gr.Button("- Remove Last Group", variant="secondary", interactive=False)
+                    add_group_btn = gr.Button("+ Add Image Group", variant="secondary")
+
+            with gr.Accordion("Fallback / Default Images", open=True):
+                gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
+                fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
+
             with gr.Accordion("Visualizer Options", open=True):
                 with gr.Row():
                     width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
@@ -611,7 +691,7 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                 # --- Checkbox for number formatting ---
                 format_double_digits_checkbox = gr.Checkbox(label="Format track numbers as double digits (e.g., 01, 05-09)", value=True)
                 gr.Markdown("If the CUE sheet or filenames contain non-English characters, please select a compatible font.")
-
+
                # Define a priority list for default fonts, starting with common Japanese ones.
                 # This list can include multiple names for the same font to improve matching.
                 preferred_fonts = [
@@ -634,15 +714,15 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                     default_font = FONT_DISPLAY_NAMES[0]
 
                 font_name_dd = gr.Dropdown(choices=FONT_DISPLAY_NAMES, value=default_font, label="Font Family")
-
+
                 with gr.Row():
                     font_size_slider = gr.Slider(minimum=12, maximum=256, value=80, step=1, label="Font Size")
                     font_color_picker = gr.ColorPicker(value="#FFFFFF", label="Font Color")
-
+
                 with gr.Row():
                     font_bg_color_picker = gr.ColorPicker(value="#000000", label="Text BG Color")
                     font_bg_alpha_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Text BG Opacity")
-
+
                 gr.Markdown("Text Position")
                 with gr.Row():
                     pos_h_radio = gr.Radio(["left", "center", "right"], value="center", label="Horizontal Align")
@@ -652,20 +732,39 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
 
        with gr.Column(scale=2):
             video_output = gr.Video(label="Generated Video")
-
-    # --- Update inputs for the click event ---
+
+    # --- Define the full list of outputs for the update functions ---
+    group_update_outputs = [visible_groups_state, add_group_btn, remove_group_btn] + group_accordions
+
+    # Connect the "Add Group" button to its update function
+    add_group_btn.click(
+        fn=add_group,
+        inputs=visible_groups_state,
+        outputs=group_update_outputs
+    )
+
+    remove_group_btn.click(
+        fn=remove_group,
+        inputs=visible_groups_state,
+        outputs=group_update_outputs
+    )
+
+    # --- Define the master list of all inputs for the main button ---
+    all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
+        fallback_image_input,
+        format_double_digits_checkbox,
+        width_input, height_input,
+        fg_color, bg_color,
+        font_name_dd, font_size_slider, font_color_picker,
+        font_bg_color_picker, font_bg_alpha_slider,
+        pos_h_radio, pos_v_radio
+    ]
+
    submit_btn.click(
         fn=process_audio_to_video,
-        inputs=[
-            audio_inputs, image_uploads,
-            format_double_digits_checkbox,
-            width_input, height_input,
-            fg_color, bg_color,
-            font_name_dd, font_size_slider, font_color_picker,
-            font_bg_color_picker, font_bg_alpha_slider,
-            pos_h_radio, pos_v_radio
-        ],
-        outputs=video_output
+        inputs=all_inputs,
+        outputs=video_output,
+        show_progress="full"
    )
 
 if __name__ == "__main__":