feat(visualizer): Add advanced spectrogram style customization
Browse files
This commit introduces a suite of new options for customizing the appearance of the spectrogram, controlled through a dedicated "Spectrogram Bar Style" section in the UI.
New UI Controls: Added a new accordion with sliders and radio buttons to control:
- Number of frequency bars (Bar Count)
- Spacing between bars/blocks
- Bar style ('Solid Bars' or 'Stacked Blocks')
- Symmetry/Mirror mode ('Off', 'Horizontal (Top/Bottom)', or 'Vertical (Left/Right)')
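Vertical Mirror Mode: Implemented a new rendering mode where each frequency band is drawn as a horizontal bar that grows outward, left and right, from the vertical centerline of the frame (x = WIDTH // 2), with low frequencies at the bottom and high frequencies at the top.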
Stacked Block Style: Added an alternative bar style where each bar is rendered as a series of discrete, stacked blocks, with a configurable block count.
app.py
CHANGED
@@ -270,13 +270,23 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
|
|
270 |
video_height = args[arg_offset + 3]
|
271 |
spec_fg_color = args[arg_offset + 4]
|
272 |
spec_bg_color = args[arg_offset + 5]
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
281 |
if not audio_files:
|
282 |
raise gr.Error("Please upload at least one audio file.")
|
@@ -505,16 +515,31 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
|
|
505 |
|
506 |
text_clips.append(txt_clip)
|
507 |
|
508 |
-
N_FFT, HOP_LENGTH
|
509 |
MIN_DB, MAX_DB = -80.0, 0.0
|
510 |
|
511 |
# Spectrogram calculation on combined audio
|
512 |
# --- Create a mono version of audio specifically for the spectrogram ---
|
513 |
# This resolves the TypeError while keeping the final audio in stereo.
|
514 |
y_mono_for_spec = librosa.to_mono(y_combined)
|
515 |
-
S_mel = librosa.feature.melspectrogram(y=y_mono_for_spec, sr=current_sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=
|
516 |
S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
|
517 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
518 |
# Frame generation logic for the spectrogram
|
519 |
def frame_generator(t):
|
520 |
# If images are used as background, the spectrogram's own background should be transparent.
|
@@ -527,7 +552,7 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
|
|
527 |
if not image_clips:
|
528 |
for i in range(1, 9):
|
529 |
y_pos = int(i * (HEIGHT / 9)); frame[y_pos-1:y_pos, :] = grid_rgb
|
530 |
-
|
531 |
# 1. Safety Check: If the spectrogram has no time frames (e.g., from an extremely short audio file),
|
532 |
# return a blank frame immediately to prevent an IndexError.
|
533 |
if S_mel_db.shape[1] == 0:
|
@@ -543,23 +568,108 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
|
|
543 |
# maximum valid index, preventing any edge-case errors.
|
544 |
time_idx = min(time_idx, S_mel_db.shape[1] - 1)
|
545 |
|
546 |
-
|
547 |
-
|
548 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
549 |
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
561 |
-
|
562 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
563 |
return frame
|
564 |
|
565 |
video_clip = VideoClip(frame_function=frame_generator, duration=duration)
|
@@ -671,7 +781,7 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
671 |
|
672 |
# Pre-build all group components
|
673 |
for i in range(MAX_GROUPS):
|
674 |
-
with gr.Accordion(f"Image Group {i+1}", visible=(i==0)) as acc:
|
675 |
track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
|
676 |
image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
677 |
group_track_inputs.append(track_input)
|
@@ -687,13 +797,48 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
687 |
with gr.Accordion("Fallback / Default Images", open=True):
|
688 |
gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
|
689 |
fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
690 |
-
|
691 |
-
|
|
|
692 |
with gr.Row():
|
693 |
width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
|
694 |
height_input = gr.Number(value=1080, label="Video Height (px)", precision=0)
|
695 |
-
fg_color = gr.ColorPicker(value="#71808c", label="Spectrogram Bar
|
696 |
bg_color = gr.ColorPicker(value="#2C3E50", label="Background Color (if no images)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
697 |
|
698 |
with gr.Accordion("Text Overlay Options", open=True):
|
699 |
gr.Markdown(
|
@@ -707,9 +852,9 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
707 |
# Define a priority list for default fonts, starting with common Japanese ones.
|
708 |
# This list can include multiple names for the same font to improve matching.
|
709 |
preferred_fonts = [
|
710 |
-
"Meiryo", "メイリオ",
|
711 |
"Yu Gothic", "游ゴシック",
|
712 |
"MS Gothic", "MS ゴシック",
|
|
|
713 |
"Hiragino Kaku Gothic ProN", # Common on macOS
|
714 |
"Microsoft JhengHei", # Fallback to Traditional Chinese
|
715 |
"Arial" # Generic fallback
|
@@ -717,8 +862,11 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
717 |
default_font = None
|
718 |
# Find the first available font from the preferred list
|
719 |
for font in preferred_fonts:
|
720 |
-
|
721 |
-
|
|
|
|
|
|
|
722 |
break
|
723 |
|
724 |
# If none of the preferred fonts are found, use the first available font as a last resort
|
@@ -760,13 +908,17 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
760 |
inputs=visible_groups_state,
|
761 |
outputs=group_update_outputs
|
762 |
)
|
763 |
-
|
764 |
# --- Define the master list of all inputs for the main button ---
|
765 |
all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
|
766 |
fallback_image_input,
|
767 |
format_double_digits_checkbox,
|
768 |
width_input, height_input,
|
769 |
fg_color, bg_color,
|
|
|
|
|
|
|
|
|
770 |
font_name_dd, font_size_slider, font_color_picker,
|
771 |
font_bg_color_picker, font_bg_alpha_slider,
|
772 |
pos_h_radio, pos_v_radio
|
|
|
270 |
video_height = args[arg_offset + 3]
|
271 |
spec_fg_color = args[arg_offset + 4]
|
272 |
spec_bg_color = args[arg_offset + 5]
|
273 |
+
|
274 |
+
# --- NEW: Unpack spectrogram style arguments ---
|
275 |
+
n_bands = int(args[arg_offset + 6])
|
276 |
+
bar_spacing = int(args[arg_offset + 7])
|
277 |
+
mirror_mode = args[arg_offset + 8] # This is now a string
|
278 |
+
bar_style = args[arg_offset + 9]
|
279 |
+
num_blocks = int(args[arg_offset + 10])
|
280 |
+
|
281 |
+
# --- Unpack font and text arguments (indices are shifted) ---
|
282 |
+
font_name = args[arg_offset + 11]
|
283 |
+
font_size = args[arg_offset + 12]
|
284 |
+
font_color = args[arg_offset + 13]
|
285 |
+
font_bg_color = args[arg_offset + 14]
|
286 |
+
font_bg_alpha = args[arg_offset + 15]
|
287 |
+
pos_h = args[arg_offset + 16]
|
288 |
+
pos_v = args[arg_offset + 17]
|
289 |
+
|
290 |
|
291 |
if not audio_files:
|
292 |
raise gr.Error("Please upload at least one audio file.")
|
|
|
515 |
|
516 |
text_clips.append(txt_clip)
|
517 |
|
518 |
+
N_FFT, HOP_LENGTH = 2048, 512
|
519 |
MIN_DB, MAX_DB = -80.0, 0.0
|
520 |
|
521 |
# Spectrogram calculation on combined audio
|
522 |
# --- Create a mono version of audio specifically for the spectrogram ---
|
523 |
# This resolves the TypeError while keeping the final audio in stereo.
|
524 |
y_mono_for_spec = librosa.to_mono(y_combined)
|
525 |
+
S_mel = librosa.feature.melspectrogram(y=y_mono_for_spec, sr=current_sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=n_bands, fmax=current_sr/2)
|
526 |
S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
|
527 |
|
528 |
+
# --- Pre-calculate drawing parameters for stacked block style ---
|
529 |
+
BLOCK_SPACING = 2 # The pixel gap between stacked blocks
|
530 |
+
if bar_style == 'Stacked Blocks':
|
531 |
+
# Calculate the total vertical space available for the blocks themselves
|
532 |
+
# In mirrored modes this is half of the relevant dimension: half the width for vertical mirror, half the height for horizontal mirror
|
533 |
+
if mirror_mode == 'Vertical (Left/Right)':
|
534 |
+
drawable_size = WIDTH // 2
|
535 |
+
elif mirror_mode == 'Horizontal (Top/Bottom)':
|
536 |
+
drawable_size = HEIGHT // 2
|
537 |
+
else: # Off
|
538 |
+
drawable_size = HEIGHT
|
539 |
+
total_block_pixel_size = drawable_size - ((num_blocks - 1) * BLOCK_SPACING)
|
540 |
+
# Calculate the size of a single block
|
541 |
+
single_block_size = total_block_pixel_size / num_blocks
|
542 |
+
|
543 |
# Frame generation logic for the spectrogram
|
544 |
def frame_generator(t):
|
545 |
# If images are used as background, the spectrogram's own background should be transparent.
|
|
|
552 |
if not image_clips:
|
553 |
for i in range(1, 9):
|
554 |
y_pos = int(i * (HEIGHT / 9)); frame[y_pos-1:y_pos, :] = grid_rgb
|
555 |
+
|
556 |
# 1. Safety Check: If the spectrogram has no time frames (e.g., from an extremely short audio file),
|
557 |
# return a blank frame immediately to prevent an IndexError.
|
558 |
if S_mel_db.shape[1] == 0:
|
|
|
568 |
# maximum valid index, preventing any edge-case errors.
|
569 |
time_idx = min(time_idx, S_mel_db.shape[1] - 1)
|
570 |
|
571 |
+
# --- RENDER LOGIC FOR VERTICAL MIRROR ---
|
572 |
+
if mirror_mode == 'Vertical (Left/Right)':
|
573 |
+
center_x = WIDTH // 2
|
574 |
+
max_pixel_length = WIDTH // 2
|
575 |
+
bar_height = HEIGHT / n_bands
|
576 |
+
|
577 |
+
for i in range(n_bands):
|
578 |
+
energy_db = S_mel_db[i, time_idx]
|
579 |
+
norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
|
580 |
+
if norm_height == 0:
|
581 |
+
continue
|
582 |
+
|
583 |
+
# --- Calculate y-coords from bottom-to-top ---
|
584 |
+
# This makes low frequencies appear at the bottom and high frequencies at the top.
|
585 |
+
y_start = int(HEIGHT - (i + 1) * bar_height)
|
586 |
+
y_end = int(HEIGHT - i * bar_height)
|
587 |
+
|
588 |
+
# Apply spacing to create a gap above the current bar.
|
589 |
+
y_start_with_spacing = y_start + bar_spacing
|
590 |
+
|
591 |
+
# Ensure the bar still has visible height after spacing
|
592 |
+
if y_start_with_spacing >= y_end:
|
593 |
+
continue
|
594 |
+
|
595 |
+
if bar_style == 'Stacked Blocks':
|
596 |
+
blocks_to_draw = int(norm_height * num_blocks)
|
597 |
+
if blocks_to_draw == 0:
|
598 |
+
continue
|
599 |
+
|
600 |
+
for j in range(blocks_to_draw):
|
601 |
+
block_left_x = center_x + (j * (single_block_size + BLOCK_SPACING))
|
602 |
+
block_right_x = block_left_x + single_block_size
|
603 |
+
# Draw right side
|
604 |
+
frame[y_start_with_spacing:y_end, int(block_left_x):int(block_right_x)] = fg_rgb
|
605 |
+
# Draw mirrored left side
|
606 |
+
frame[y_start_with_spacing:y_end, int(center_x - (block_right_x - center_x)):int(center_x - (block_left_x - center_x))] = fg_rgb
|
607 |
+
else: # Solid Bars
|
608 |
+
bar_pixel_length = int(norm_height * max_pixel_length)
|
609 |
+
if bar_pixel_length < 1:
|
610 |
+
continue
|
611 |
+
|
612 |
+
# Draw right side
|
613 |
+
frame[y_start_with_spacing:y_end, center_x : center_x + bar_pixel_length] = fg_rgb
|
614 |
+
# Draw mirrored left side
|
615 |
+
frame[y_start_with_spacing:y_end, center_x - bar_pixel_length : center_x] = fg_rgb
|
616 |
+
|
617 |
+
# --- RENDER LOGIC FOR HORIZONTAL MIRROR AND OFF ---
|
618 |
+
else:
|
619 |
+
bar_width = WIDTH / n_bands
|
620 |
+
is_horizontal_mirror = (mirror_mode == 'Horizontal (Top/Bottom)')
|
621 |
+
|
622 |
+
# Determine rendering parameters based on whether the view is mirrored
|
623 |
+
if is_horizontal_mirror:
|
624 |
+
center_y = HEIGHT // 2
|
625 |
+
max_pixel_height = HEIGHT // 2
|
626 |
+
else: # Off
|
627 |
+
center_y = HEIGHT # The "center" is the bottom of the screen
|
628 |
+
max_pixel_height = HEIGHT
|
629 |
+
|
630 |
+
# Loop through each frequency band to draw its bar/blocks
|
631 |
+
for i in range(n_bands):
|
632 |
+
energy_db = S_mel_db[i, time_idx]
|
633 |
|
634 |
+
# The denominator should be the range of DB values (MAX_DB - MIN_DB).
|
635 |
+
# Since MAX_DB is 0, this simplifies to -MIN_DB, which is a positive 80.0.
|
636 |
+
# This prevents the division by zero warning.
|
637 |
+
norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
|
638 |
+
|
639 |
+
if norm_height == 0:
|
640 |
+
continue
|
641 |
+
|
642 |
+
# Calculate the horizontal position of the current bar
|
643 |
+
x_start = int(i * bar_width)
|
644 |
+
x_end = int((i + 1) * bar_width - bar_spacing)
|
645 |
+
|
646 |
+
# --- Main rendering logic: switches between styles ---
|
647 |
+
if bar_style == 'Stacked Blocks':
|
648 |
+
# Calculate how many blocks to draw based on energy
|
649 |
+
blocks_to_draw = int(norm_height * num_blocks)
|
650 |
+
if blocks_to_draw == 0:
|
651 |
+
continue
|
652 |
+
|
653 |
+
# Draw each block from the bottom up
|
654 |
+
for j in range(blocks_to_draw):
|
655 |
+
# Calculate the Y coordinates for this specific block
|
656 |
+
block_bottom_y = center_y - (j * (single_block_size + BLOCK_SPACING))
|
657 |
+
block_top_y = block_bottom_y - single_block_size
|
658 |
+
frame[int(block_top_y):int(block_bottom_y), x_start:x_end] = fg_rgb
|
659 |
+
|
660 |
+
if is_horizontal_mirror:
|
661 |
+
frame[int(center_y + (center_y - block_bottom_y)):int(center_y + (center_y - block_top_y)), x_start:x_end] = fg_rgb
|
662 |
+
else: # Solid Bars
|
663 |
+
# Calculate the total height of the solid bar
|
664 |
+
bar_pixel_height = int(norm_height * max_pixel_height)
|
665 |
+
|
666 |
+
if bar_pixel_height < 1:
|
667 |
+
continue
|
668 |
+
|
669 |
+
frame[center_y - bar_pixel_height : center_y, x_start:x_end] = fg_rgb
|
670 |
+
|
671 |
+
if is_horizontal_mirror:
|
672 |
+
frame[center_y : center_y + bar_pixel_height, x_start:x_end] = fg_rgb
|
673 |
return frame
|
674 |
|
675 |
video_clip = VideoClip(frame_function=frame_generator, duration=duration)
|
|
|
781 |
|
782 |
# Pre-build all group components
|
783 |
for i in range(MAX_GROUPS):
|
784 |
+
with gr.Accordion(f"Image Group {i+1}", open=False, visible=(i==0)) as acc:
|
785 |
track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
|
786 |
image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
787 |
group_track_inputs.append(track_input)
|
|
|
797 |
with gr.Accordion("Fallback / Default Images", open=True):
|
798 |
gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
|
799 |
fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
800 |
+
|
801 |
+
# --- Renamed for clarity ---
|
802 |
+
with gr.Accordion("General Visualizer Options", open=True):
|
803 |
with gr.Row():
|
804 |
width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
|
805 |
height_input = gr.Number(value=1080, label="Video Height (px)", precision=0)
|
806 |
+
fg_color = gr.ColorPicker(value="#71808c", label="Spectrogram Bar Color")
|
807 |
bg_color = gr.ColorPicker(value="#2C3E50", label="Background Color (if no images)")
|
808 |
+
|
809 |
+
# --- Dedicated Accordion for Spectrogram Bar Style ---
|
810 |
+
with gr.Accordion("Spectrogram Bar Style", open=True):
|
811 |
+
n_bands_slider = gr.Slider(minimum=8, maximum=256, value=64, step=1, label="Number of Spectrogram Bars")
|
812 |
+
bar_spacing_slider = gr.Slider(minimum=0, maximum=10, value=2, step=1, label="Bar/Block Spacing (px)")
|
813 |
+
|
814 |
+
# --- Replaced Checkbox with Radio for mirror modes ---
|
815 |
+
mirror_mode_radio = gr.Radio(
|
816 |
+
choices=["Off", "Horizontal (Top/Bottom)", "Vertical (Left/Right)"],
|
817 |
+
value="Off",
|
818 |
+
label="Symmetry / Mirror Mode"
|
819 |
+
)
|
820 |
+
|
821 |
+
with gr.Row():
|
822 |
+
bar_style_radio = gr.Radio(
|
823 |
+
choices=["Solid Bars", "Stacked Blocks"],
|
824 |
+
value="Solid Bars",
|
825 |
+
label="Bar Style"
|
826 |
+
)
|
827 |
+
num_blocks_slider = gr.Slider(
|
828 |
+
minimum=5, maximum=50, value=20, step=1,
|
829 |
+
label="Number of Blocks per Bar",
|
830 |
+
visible=False # Initially hidden
|
831 |
+
)
|
832 |
+
|
833 |
+
# --- Function to dynamically show/hide the block count slider ---
|
834 |
+
def update_block_slider_visibility(bar_style):
|
835 |
+
return gr.update(visible=(bar_style == "Stacked Blocks"))
|
836 |
+
|
837 |
+
bar_style_radio.change(
|
838 |
+
fn=update_block_slider_visibility,
|
839 |
+
inputs=bar_style_radio,
|
840 |
+
outputs=num_blocks_slider
|
841 |
+
)
|
842 |
|
843 |
with gr.Accordion("Text Overlay Options", open=True):
|
844 |
gr.Markdown(
|
|
|
852 |
# Define a priority list for default fonts, starting with common Japanese ones.
|
853 |
# This list can include multiple names for the same font to improve matching.
|
854 |
preferred_fonts = [
|
|
|
855 |
"Yu Gothic", "游ゴシック",
|
856 |
"MS Gothic", "MS ゴシック",
|
857 |
+
"Meiryo", "メイリオ",
|
858 |
"Hiragino Kaku Gothic ProN", # Common on macOS
|
859 |
"Microsoft JhengHei", # Fallback to Traditional Chinese
|
860 |
"Arial" # Generic fallback
|
|
|
862 |
default_font = None
|
863 |
# Find the first available font from the preferred list
|
864 |
for font in preferred_fonts:
|
865 |
+
for candidate in FONT_DISPLAY_NAMES:
|
866 |
+
if candidate.startswith(font) or font in candidate:
|
867 |
+
default_font = candidate
|
868 |
+
break
|
869 |
+
if default_font:
|
870 |
break
|
871 |
|
872 |
# If none of the preferred fonts are found, use the first available font as a last resort
|
|
|
908 |
inputs=visible_groups_state,
|
909 |
outputs=group_update_outputs
|
910 |
)
|
911 |
+
|
912 |
# --- Define the master list of all inputs for the main button ---
|
913 |
all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
|
914 |
fallback_image_input,
|
915 |
format_double_digits_checkbox,
|
916 |
width_input, height_input,
|
917 |
fg_color, bg_color,
|
918 |
+
# --- Add spectrogram style inputs in correct order ---
|
919 |
+
n_bands_slider, bar_spacing_slider, mirror_mode_radio,
|
920 |
+
bar_style_radio, num_blocks_slider,
|
921 |
+
# --- Text and font inputs ---
|
922 |
font_name_dd, font_size_slider, font_color_picker,
|
923 |
font_bg_color_picker, font_bg_alpha_slider,
|
924 |
pos_h_radio, pos_v_radio
|