avans06 committed
Commit 2c95cb8 · 1 Parent(s): 7248b37

feat(core): Implement multi-audio file processing and complex numbering


This major update refactors the application to support processing multiple audio files into a single, continuous spectrogram video. The previous single-file workflow has been replaced with a robust batch processing pipeline.

This makes the workflow considerably more flexible, allowing long-form videos to be created from multiple sources such as album tracks or podcast segments.

Key Changes:

Multi-File Upload:
The Gradio UI has been updated from `gr.Audio` to `gr.Files`, enabling users to upload and process multiple audio files in a single job.

Audio Processing Pipeline:
The backend now concatenates all uploaded audio files into a single temporary WAV file.
If the source files differ in sample rate, they are resampled to the rate of the first file; mono sources are duplicated to stereo so all arrays can be concatenated.
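A condensed sketch of this stage, simplified from the new processing loop in `app.py` (the helper name `combine_audio_files` is illustrative, not part of the diff):

```python
import librosa
import numpy as np
import soundfile as sf

def combine_audio_files(audio_files, output_path):
    """Concatenate uploaded audio files into one stereo WAV (illustrative sketch)."""
    y_accumulator, current_sr = [], None
    for audio_path in audio_files:
        y, sr = librosa.load(audio_path, sr=None, mono=False)  # keep original channels
        if y.ndim == 1:                      # mono -> 2 channels so arrays can be concatenated
            y = np.stack([y, y])
        if current_sr is None:
            current_sr = sr                  # the first file sets the target sample rate
        elif sr != current_sr:
            y = librosa.resample(y, orig_sr=sr, target_sr=current_sr)
        y_accumulator.append(y)
    y_combined = np.concatenate(y_accumulator, axis=1)  # join along the time axis
    sf.write(output_path, y_combined.T, current_sr)     # soundfile expects (frames, channels)
    return y_combined, current_sr
```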

Complex Title Numbering Logic:
Implemented a context-aware system for numbering tracks and files based on what was uploaded (a condensed sketch follows this list):
1. **Single File with CUE:** Tracks are numbered sequentially with a trailing dot (e.g., `01.`, `02.`).
2. **Single File without CUE:** The filename is used as the title, with no number.
3. **Multiple Files without CUE:** Files are numbered sequentially by upload position (e.g., `01`, `02`).
4. **Multiple Files (Mixed):** Files with CUE sheets have their tracks numbered with a composite scheme (e.g., `File 1, Track 1` -> `01-01`). Files without CUE sheets are numbered by their position in the upload list.
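
A minimal sketch of these rules; the helper `build_number_str` is a hypothetical condensation (in the diff, the logic is inlined in the processing loop):

```python
def build_number_str(file_idx, track_idx, multi_file, has_cue, double_digits):
    """Return the number prefix for a title, or None for a single file without a CUE sheet."""
    fmt = (lambda n: f"{n:02d}") if double_digits else str
    file_num = file_idx + 1
    track_num = track_idx + 1 if track_idx is not None else None
    if multi_file:
        if has_cue:
            return f"{fmt(file_num)}-{fmt(track_num)}"   # e.g. "01-01"
        return fmt(file_num)                             # e.g. "01"
    if has_cue:
        return f"{fmt(track_num)}."                      # e.g. "01."
    return None                                          # filename only, no number
```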

UI Enhancements:
Added a new checkbox in the UI ("Format track numbers as double digits") to allow users to toggle zero-padding for all generated numbers.

Image Distribution:
The logic for background images has been adapted to distribute them evenly across the *total combined duration* of all uploaded audio files.
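In other words, each image's display window is simply the combined duration divided by the image count; a tiny illustrative sketch:

```python
def image_schedule(total_duration, n_images):
    """Return (start, duration) pairs for evenly distributed background images (sketch)."""
    img_duration = total_duration / n_images
    return [(i * img_duration, img_duration) for i in range(n_images)]

# image_schedule(3600.0, 4) -> [(0.0, 900.0), (900.0, 900.0), (1800.0, 900.0), (2700.0, 900.0)]
```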

Files changed (2):
1. app.py +281 -155
2. requirements.txt +1 -0
app.py CHANGED
@@ -6,7 +6,9 @@ import os
import time
import struct
import subprocess
+ import soundfile as sf
import matplotlib.font_manager as fm
+ from PIL import ImageFont
from typing import Tuple, List, Dict
from mutagen.flac import FLAC
from moviepy import CompositeVideoClip, TextClip, VideoClip, AudioFileClip, ImageClip
@@ -178,12 +180,12 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
'ffmpeg',
'-y', # Overwrite output file if exists
'-i', input_path,
- '-map', '0', # Map all streams (video, audio, subtitles)
- '-vf', 'fps=24', # Use fps filter to convert framerate to 24
- '-c:v', 'libx264', # Re-encode video with H.264 codec
- '-preset', 'fast', # Encoding speed/quality tradeoff
- '-crf', '18', # Quality (lower is better)
- '-c:a', 'copy', # Copy audio without re-encoding
+ '-map', '0', # Map all streams (video, audio, subtitles)
+ '-vf', f'fps={target_fps}', # Use fps filter to convert framerate to 24
+ '-c:v', 'libx264', # Re-encode video with H.264 codec
+ '-preset', 'fast', # Encoding speed/quality tradeoff
+ '-crf', '18', # Quality (lower is better)
+ '-c:a', 'copy', # Copy audio without re-encoding
output_path
]
@@ -203,19 +205,26 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int

# --- Main Processing Function ---
def process_audio_to_video(
- audio_path: str, image_paths: List[str],
+ audio_files: List[str], image_paths: List[str],
+ format_double_digits: bool,
video_width: int, video_height: int,
spec_fg_color: str, spec_bg_color: str,
font_name: str, font_size: int, font_color: str,
font_bg_color: str, font_bg_alpha: float,
- pos_h: str, pos_v: str
+ pos_h: str, pos_v: str,
+ progress=gr.Progress(track_tqdm=True)
) -> str:
- if not audio_path: raise gr.Error("Please upload an audio file first.")
- if not font_name: raise gr.Error("Please select a font from the list.")
+ if not audio_files:
+ raise gr.Error("Please upload at least one audio file.")
+ if not font_name:
+ raise gr.Error("Please select a font from the list.")

+ progress(0, desc="Initializing...")
+
# Define paths for temporary and final files
timestamp = int(time.time())
temp_fps1_path = f"temp_{timestamp}_fps1.mp4"
+ temp_audio_path = f"temp_combined_audio_{timestamp}.wav"
final_output_path = f"final_video_{timestamp}_fps24.mp4"

WIDTH, HEIGHT = int(video_width), int(video_height)
@@ -252,90 +261,214 @@ def process_audio_to_video(

# Wrap the entire process in a try...finally block to ensure cleanup
try:
- y, sr = librosa.load(audio_path, sr=None, mono=True)
- duration = librosa.get_duration(y=y, sr=sr)
+ # --- Define total steps for the progress bar ---
+ TOTAL_STEPS = 5

- # --- Image Processing Logic ---
- image_clips = []
- # Check if any images were uploaded.
- if image_paths and len(image_paths) > 0:
- print(f"Found {len(image_paths)} images to process.")
+ # --- 1. Audio Processing & Track Info Aggregation ---
+ all_tracks_info = []
+ total_duration = 0.0
+ y_accumulator = []
+ current_sr = None
+
+ # --- Use `progress.tqdm` to create a progress bar for this loop ---
+ for file_idx, audio_path in enumerate(progress.tqdm(audio_files, desc=f"Stage 1/{TOTAL_STEPS}: Analyzing Audio Files")):
+ # --- Load audio as stereo (or its original channel count) ---
+ y, sr = librosa.load(audio_path, sr=None, mono=False)
+ # If loaded audio is mono (1D array), convert it to a 2D stereo array
+ # by duplicating the channel. This ensures all arrays can be concatenated.
+ if y.ndim == 1:
+ print(f" - Converting mono file to stereo: {os.path.basename(audio_path)}")
+ y = np.stack([y, y])
+
+ if current_sr is None:
+ current_sr = sr
+ if current_sr != sr:
+ print(f"Warning: Sample rate mismatch for {os.path.basename(audio_path)}. Expected {current_sr}Hz, found {sr}Hz.")
+ print(f"Resampling from {sr}Hz to {current_sr}Hz...")
+ y = librosa.resample(y, orig_sr=sr, target_sr=current_sr)
+
+ y_accumulator.append(y)
+ # Use the first channel (y[0]) for duration calculation, which is standard practice
+ file_duration = librosa.get_duration(y=y[0], sr=current_sr)

# First, try to parse the CUE sheet from the audio file.
- tracks = []
+ cue_tracks = []
if audio_path.lower().endswith('.flac'):
try:
audio_meta = FLAC(audio_path)
if 'cuesheet' in audio_meta.tags:
- tracks = parse_cue_sheet_manually(audio_meta.tags['cuesheet'][0])
- print(f"Successfully parsed {len(tracks)} tracks from CUE sheet.")
- except Exception as e: print(f"Warning: Could not read or parse CUE sheet: {e}")
-
- # --- HELPER FUNCTION FOR ROBUST IMAGE CLIPS ---
- def create_image_layer(img_path, start, dur):
- """
- Creates an image layer that fits entirely within the video frame.
- It scales the image down to fit and centers it on a transparent background.
- """
- # This function implements a "cover" scaling mode to ensure the image
- # fills the entire video frame without leaving black bars.
- try:
- img_clip_raw = ImageClip(img_path)
-
- # 1. Calculate scaling factor to "contain" the image (fit inside).
- # We use min() to find the ratio that requires the most shrinkage,
- # ensuring the whole image fits without being cropped.
- scale_factor = min(WIDTH / img_clip_raw.w, HEIGHT / img_clip_raw.h)
-
- # 2. Resize the image so it fits perfectly within the video dimensions.
- resized_image_clip = img_clip_raw.resized(scale_factor)
-
- # 3. Create a composite clip to position the resized image on a
- # correctly-sized transparent canvas. This is the key to preventing overflow.
- final_layer = CompositeVideoClip(
- [resized_image_clip.with_position("center")],
- size=(WIDTH, HEIGHT)
- )
-
- # 4. Set the timing on the final composite layer.
- return final_layer.with_duration(dur).with_start(start)
+ cue_tracks = parse_cue_sheet_manually(audio_meta.tags['cuesheet'][0])
+
+ print(f"Successfully parsed {len(cue_tracks)} tracks from CUE sheet.")
except Exception as e:
- print(f"Warning: Failed to process image '{img_path}'. Skipping. Error: {e}")
- return None
- # --- END OF HELPER FUNCTION ---
+ print(f"Warning: Could not read or parse CUE sheet for {os.path.basename(audio_path)}: {e}")
+
+ # --- Apply New Numbering Logic ---
+ file_num = file_idx + 1 # File numbering starts from 1
+ if len(audio_files) > 1:
+ if cue_tracks: # Scenario 3: Multiple files, this one has CUE
+ for track_idx, track in enumerate(cue_tracks):
+ track_num = track_idx + 1 # Track numbering starts from 1
+ number_str = f"{file_num:02d}-{track_num:02d}" if format_double_digits else f"{file_num}-{track_num}"
+ all_tracks_info.append({
+ "title": track.get('title', 'Unknown Track'),
+ "start_time": total_duration + track.get('start_time', 0),
+ "end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
+ "number_str": number_str
+ })
+ else: # Scenario 2: Multiple files, this one has NO CUE
+ number_str = f"{file_num:02d}" if format_double_digits else str(file_num)
+ all_tracks_info.append({
+ "title": os.path.splitext(os.path.basename(audio_path))[0],
+ "start_time": total_duration, "end_time": total_duration + file_duration,
+ "number_str": number_str
+ })
+ else: # Scenario 1: Single file upload
+ if cue_tracks: # With CUE
+ for track_idx, track in enumerate(cue_tracks):
+ track_num = track_idx + 1
+ number_str = f"{track_num:02d}" if format_double_digits else str(track_num)
+ all_tracks_info.append({
+ "title": track.get('title', 'Unknown Track'),
+ "start_time": total_duration + track.get('start_time', 0),
+ "end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
+ "number_str": f"{number_str}." # Add a dot for single file CUE tracks
+ })
+ else: # No CUE
+ all_tracks_info.append({
+ "title": os.path.splitext(os.path.basename(audio_path))[0],
+ "start_time": total_duration, "end_time": total_duration + file_duration,
+ "number_str": None # Signal to not show any number
+ })
+
+ total_duration += file_duration
+
+ # --- Concatenate along the time axis (axis=1) for stereo arrays ---
+ y_combined = np.concatenate(y_accumulator, axis=1)
+ duration = total_duration
+
+ # --- Transpose the array for soundfile to write stereo correctly ---
+ sf.write(temp_audio_path, y_combined.T, current_sr)
+ print(f"Combined all audio files into one. Total duration: {duration:.2f}s")
+
+ # --- Update progress to the next stage, use fractional progress (current/total) ---
+ progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}: Generating Text Overlays")
+
+ # --- 2. Text Overlay Logic using the aggregated track info
+ text_clips = []
+ if all_tracks_info:
+ font_path = SYSTEM_FONTS_MAP.get(font_name)
+ if not font_path: raise gr.Error(f"Font path for '{font_name}' not found!")
+
+ # Use the robust parser for text colors as well
+ font_bg_rgb = parse_color_to_rgb(font_bg_color)
+
+ position = (pos_h.lower(), pos_v.lower())
+
+ print(f"Using font: {font_name}, Size: {font_size}, Position: {position}")
+
+ # Create the RGBA tuple for the background color.
+ # The alpha value is converted from a 0.0-1.0 float to a 0-255 integer.
+ bg_color_tuple = (font_bg_rgb[0], font_bg_rgb[1], font_bg_rgb[2], int(font_bg_alpha * 255))
+
+ # 1. Define a maximum width for the caption. 90% of the video width is a good choice.
+ caption_width = int(WIDTH * 0.9)
+
+ # --- Get font metrics to calculate dynamic padding ---
+ try:
+ # Load the font with Pillow to access its metrics
+ pil_font = ImageFont.truetype(font_path, size=font_size)
+ _, descent = pil_font.getmetrics()
+ # Calculate a bottom margin to compensate for the font's descent.
+ # A small constant is added as a safety buffer.
+ # This prevents clipping on fonts with large descenders (like 'g', 'p').
+ bottom_margin = int(descent * 0.5) + 2
+ print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
+ except Exception as e:
+ # Fallback in case of any font loading error
+ print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
+ bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
+
+ for track in all_tracks_info:
+ text_duration = track['end_time'] - track['start_time']
+ if text_duration <= 0:
+ continue
+
+ # Construct display text based on pre-formatted number string
+ display_text = f"{track['number_str']} {track['title']}" if track['number_str'] else track['title']
+
+ # 1. Create the TextClip first without positioning to get its size
+ txt_clip = TextClip(
+ text=display_text.strip(),
+ font_size=font_size,
+ color=font_color,
+ font=font_path,
+ bg_color=bg_color_tuple,
+ method='caption', # <-- Set method to caption
+ size=(caption_width, None), # <-- Provide size for wrapping
+ margin=(0, 0, 0, bottom_margin)
+ ).with_position(position).with_duration(text_duration).with_start(track['start_time'])
+
+ text_clips.append(txt_clip)
+
+ # --- Update progress to the next stage, use fractional progress (current/total) ---
+ progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Visual Layers")
+
+ # --- 3. Image and Spectrogram Logic ---
+ image_clips = []
+ if image_paths and len(image_paths) > 0:
+ print(f"Found {len(image_paths)} images to process.")

- # Mode 1: If CUE tracks match the number of images, align them.
- if tracks and len(tracks) == len(image_paths):
- print("Image count matches track count. Aligning images with tracks.")
- for i, (track, img_path) in enumerate(zip(tracks, image_paths)):
- start_time = track.get('start_time', 0)
- # The end time of a track is the start time of the next, or the total duration for the last track.
- end_time = tracks[i+1].get('start_time', duration) if i + 1 < len(tracks) else duration
- img_duration = end_time - start_time
- if img_duration <= 0: continue
+ # Simplified logic: calculate time per image, max 3 mins, and loop.
+ img_duration = duration / len(image_paths)
+ for i, img_path in enumerate(image_paths):
+
+ # --- HELPER FUNCTION FOR ROBUST IMAGE CLIPS ---
+ def create_image_layer(img_path, start, dur):
+ """
+ Creates an image layer that fits entirely within the video frame.
+ It scales the image down to fit and centers it on a transparent background.
+ """
+ # This function implements a "cover" scaling mode to ensure the image
+ # fills the entire video frame without leaving black bars.
+ try:
+ img_clip_raw = ImageClip(img_path)
+
+ # 1. Calculate scaling factor to "contain" the image (fit inside).
+ # We use min() to find the ratio that requires the most shrinkage,
+ # ensuring the whole image fits without being cropped.
+ scale_factor = min(WIDTH / img_clip_raw.w, HEIGHT / img_clip_raw.h)
+
+ # 2. Resize the image so it fits perfectly within the video dimensions.
+ resized_clip = img_clip_raw.resized(scale_factor)
+
+ # 3. Create a composite clip to position the resized image on a
+ # correctly-sized transparent canvas. This is the key to preventing overflow.
+ final_layer = CompositeVideoClip(
+ [resized_clip.with_position("center")],
+ size=(WIDTH, HEIGHT)
+ )
+
+ # 4. Set the timing on the final composite layer.
+ return final_layer.with_duration(dur).with_start(start)
+ except Exception as e:
+ print(f"Warning: Failed to process image '{img_path}'. Skipping. Error: {e}")
+ return None

- # Create an ImageClip for the duration of the track.
- clip = create_image_layer(img_path, start_time, img_duration)
- if clip:
- image_clips.append(clip)
-
- # Mode 2: If no CUE or mismatch, distribute images evenly across the audio duration.
- else:
- if tracks: print("Image count does not match track count. Distributing images evenly.")
- else: print("No CUE sheet found. Distributing images evenly.")
-
- img_duration = duration / len(image_paths)
- for i, img_path in enumerate(image_paths):
- start_time = i * img_duration
- # Create an ImageClip for a calculated segment of time.
- clip = create_image_layer(img_path, start_time, img_duration)
- if clip:
- image_clips.append(clip)
-
- # Spectrogram calculation
+ # Create an ImageClip for the duration of the track.
+ clip = create_image_layer(img_path, i * img_duration, img_duration)
+ if clip:
+ image_clips.append(clip)
+
N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
MIN_DB, MAX_DB = -80.0, 0.0
- S_mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_BANDS, fmax=sr/2)
+
+ # Spectrogram calculation on combined audio
+ # --- Create a mono version of audio specifically for the spectrogram ---
+ # This resolves the TypeError while keeping the final audio in stereo.
+ y_mono_for_spec = librosa.to_mono(y_combined)
+ S_mel = librosa.feature.melspectrogram(y=y_mono_for_spec, sr=current_sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_BANDS, fmax=current_sr/2)
S_mel_db = librosa.power_to_db(S_mel, ref=np.max)

# Frame generation logic for the spectrogram
@@ -351,13 +484,18 @@ def process_audio_to_video(
for i in range(1, 9):
y_pos = int(i * (HEIGHT / 9)); frame[y_pos-1:y_pos, :] = grid_rgb

- time_idx = int((t / duration) * (S_mel_db.shape[1] - 1))
+ time_idx = min(int((t / duration) * S_mel_db.shape[1]), S_mel_db.shape[1] - 1)
bar_width = WIDTH / N_BANDS
for i in range(N_BANDS):
energy_db = S_mel_db[i, time_idx]
+
+ # The denominator should be the range of DB values (MAX_DB - MIN_DB).
+ # Since MAX_DB is 0, this simplifies to -MIN_DB, which is a positive 80.0.
+ # This prevents the division by zero warning.
norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
- bar_height = int(norm_height * HEIGHT)
- if bar_height < 1: continue
+ bar_height = int(np.nan_to_num(norm_height) * HEIGHT)
+ if bar_height < 1:
+ continue
x_start, x_end = int(i * bar_width), int((i + 1) * bar_width - 2)
y_start = HEIGHT - bar_height
for k in range(bar_height):
@@ -367,62 +505,19 @@ def process_audio_to_video(
return frame

video_clip = VideoClip(frame_function=frame_generator, duration=duration)

# --- NEW: Set Spectrogram Opacity ---
# If image clips were created, make the spectrogram layer 50% transparent.
if image_clips:
print("Applying 50% opacity to spectrogram layer.")
video_clip = video_clip.with_opacity(0.5)

- audio_clip = AudioFileClip(audio_path)
+ # --- Use fractional progress (current/total) ---
+ progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video (this may take time)")

- # CUE Sheet title overlay logic
- text_clips = []
- tracks = []
- if audio_path.lower().endswith('.flac'):
- try:
- audio_meta = FLAC(audio_path)
- if 'cuesheet' in audio_meta.tags:
- tracks = parse_cue_sheet_manually(audio_meta.tags['cuesheet'][0])
- except Exception:
- pass # Already handled above
-
- if tracks:
- font_path = SYSTEM_FONTS_MAP.get(font_name)
- if not font_path: raise gr.Error(f"Font path for '{font_name}' not found!")
-
- # Use the robust parser for text colors as well
- font_bg_rgb = parse_color_to_rgb(font_bg_color)
-
- position = (pos_h.lower(), pos_v.lower())
-
- print(f"Using font: {font_name}, Size: {font_size}, Position: {position}")
-
- # Create the RGBA tuple for the background color.
- # The alpha value is converted from a 0.0-1.0 float to a 0-255 integer.
- bg_color_tuple = (font_bg_rgb[0], font_bg_rgb[1], font_bg_rgb[2], int(font_bg_alpha * 255))
-
- # 1. Define a maximum width for the caption. 90% of the video width is a good choice.
- caption_width = int(WIDTH * 0.9)
-
- for i, track in enumerate(tracks):
- start_time, title = track.get('start_time', 0), track.get('title', 'Unknown Track')
- end_time = tracks[i+1].get('start_time', duration) if i + 1 < len(tracks) else duration
- text_duration = end_time - start_time
- if text_duration <= 0: continue
-
- txt_clip = (TextClip(text=f"{i+1}. {title}",
- font_size=font_size,
- color=font_color,
- font=font_path,
- bg_color=bg_color_tuple,
- method='caption', # <-- Set method to caption
- size=(caption_width, None)) # <-- Provide size for wrapping
- .with_position(position)
- .with_duration(text_duration)
- .with_start(start_time))
- text_clips.append(txt_clip)
-
+ # --- 4. Composition and Rendering ---
+ audio_clip = AudioFileClip(temp_audio_path)

# --- Clip Composition ---
# The final composition order is important: images at the bottom, then spectrogram, then text.
# The base layer is now the list of image clips.
@@ -449,9 +544,14 @@ def process_audio_to_video(
print("High-quality AAC audio encoding complete.")

final_clip.close()

# Step 2: Use FFmpeg to quickly increase the framerate to 24 FPS
print(f"\nStep 2/2: Remuxing video to {PLAYBACK_FPS} FPS...")
+
+ # --- Use fractional progress (current/total) ---
+ progress(4 / TOTAL_STEPS, desc=f"Stage 5/{TOTAL_STEPS}: Finalizing Video")
+
+ # --- 5. Finalizing ---
increase_video_framerate(temp_fps1_path, final_output_path, target_fps=PLAYBACK_FPS)

return final_output_path
@@ -461,37 +561,39 @@ def process_audio_to_video(
raise e
finally:
# Step 3: Clean up the temporary file regardless of success or failure
- if os.path.exists(temp_fps1_path):
- print(f"Cleaning up temporary file: {temp_fps1_path}")
- os.remove(temp_fps1_path)
+ for f in [temp_fps1_path, temp_audio_path]:
+ if os.path.exists(f):
+ print(f"Cleaning up temporary file: {f}")
+ os.remove(f)

# --- Gradio UI ---
with gr.Blocks(title="Spectrogram Video Generator") as iface:
gr.Markdown("# Spectrogram Video Generator")
with gr.Row():
with gr.Column(scale=1):
- audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+ # --- Changed to gr.Files for multi-upload ---
+ audio_inputs = gr.Files(
+ label="Upload Audio File(s)",
+ file_count="multiple",
+ file_types=["audio"]
+ )

# --- Image Upload Component ---
+ gr.Markdown("### Background Image Options (Optional)")
gr.Markdown(
"""
- ### Background Image Options (Optional)
-
- Upload one or more images to create a dynamic background for the video. The display behavior changes based on your audio file and the number of images provided.
-
- * **Mode 1: CUE Sheet Synchronization**
- If your audio file contains an embedded CUE sheet AND the number of images you upload **exactly matches** the number of tracks, the images will be synchronized with the tracks. The first image will appear during the first track, the second during the second, and so on.
-
- * **Mode 2: Even Time Distribution**
- In all other cases (e.g., the audio has no CUE sheet, or the number of images and tracks do not match), the images will be displayed sequentially. The total duration of the video will be divided equally among all uploaded images.
-
- **Note:** When any image is used as a background, the spectrogram visualizer will automatically become **semi-transparent** to ensure the background is clearly visible.
+ When background images are uploaded, they will be displayed in a looping sequence.
+ - The display duration for each image is calculated by dividing the total video length by the number of images, with a maximum duration of **3 minutes** per image.
+ - The sequence loops until the video ends.
"""
)
image_uploads = gr.File(
label="Upload Background Images",
file_count="multiple", # Allow multiple files
- file_types=["image"] # Accept only image formats
+ # Replace the generic "image" category with a specific list of extensions.
+ # Note that the dot (.) before each extension is required.
+ file_types=[".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp", ".avif"]
)

with gr.Accordion("Visualizer Options", open=True):
@@ -503,11 +605,34 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:

with gr.Accordion("Text Overlay Options", open=True):
gr.Markdown(
- "**Note:** These options only take effect if the input audio file has an embedded CUE sheet."
+ "**Note:** The title overlay feature automatically detects if a file has an embedded CUE sheet. If not, the filename will be used as the title."
)
gr.Markdown("---")
- gr.Markdown("If your CUE sheet contains non-English characters, please select a compatible font.")
- default_font = "Microsoft JhengHei" if "Microsoft JhengHei" in FONT_DISPLAY_NAMES else ("Arial" if "Arial" in FONT_DISPLAY_NAMES else (FONT_DISPLAY_NAMES[0] if FONT_DISPLAY_NAMES else None))
+ # --- Checkbox for number formatting ---
+ format_double_digits_checkbox = gr.Checkbox(label="Format track numbers as double digits (e.g., 01, 05-09)", value=True)
+ gr.Markdown("If the CUE sheet or filenames contain non-English characters, please select a compatible font.")
+
+ # Define a priority list for default fonts, starting with common Japanese ones.
+ # This list can include multiple names for the same font to improve matching.
+ preferred_fonts = [
+ "Meiryo", "メイリオ",
+ "Yu Gothic", "游ゴシック",
+ "MS Gothic", "MS ゴシック",
+ "Hiragino Kaku Gothic ProN", # Common on macOS
+ "Microsoft JhengHei", # Fallback to Traditional Chinese
+ "Arial" # Generic fallback
+ ]
+ default_font = None
+ # Find the first available font from the preferred list
+ for font in preferred_fonts:
+ if font in FONT_DISPLAY_NAMES:
+ default_font = font
+ break
+
+ # If none of the preferred fonts are found, use the first available font as a last resort
+ if not default_font and FONT_DISPLAY_NAMES:
+ default_font = FONT_DISPLAY_NAMES[0]
+
font_name_dd = gr.Dropdown(choices=FONT_DISPLAY_NAMES, value=default_font, label="Font Family")

with gr.Row():
@@ -528,11 +653,12 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
with gr.Column(scale=2):
video_output = gr.Video(label="Generated Video")

- # --- Add image_uploads to the inputs list ---
+ # --- Update inputs for the click event ---
submit_btn.click(
fn=process_audio_to_video,
inputs=[
- audio_input, image_uploads,
+ audio_inputs, image_uploads,
+ format_double_digits_checkbox,
width_input, height_input,
fg_color, bg_color,
font_name_dd, font_size_slider, font_color_picker,
requirements.txt CHANGED
@@ -3,3 +3,4 @@ moviepy
mutagen
librosa
matplotlib
+ pillow-avif-plugin