Spaces:

avans06
/

Audio_Spectrogram_Video_Generator

Running

avans06 committed 4 days ago

Commit

4a7b514

1 Parent(s): 0c9293b

feat(visualizer): Add advanced spectrogram style customization

This commit introduces a suite of new options for customizing the appearance of the spectrogram, controlled through a dedicated "Spectrogram Bar Style" section in the UI.

New UI Controls: Added a new accordion with sliders and radio buttons to control:
- Number of frequency bars (Bar Count)
- Spacing between bars/blocks
- Bar style ('Solid Bars' or 'Stacked Blocks')
- Symmetry/Mirror mode ('Off', 'Horizontal', or 'Vertical')

Vertical Mirror Mode: Implemented a new rendering mode where the spectrogram emanates from the vertical center of the screen, expanding left and right.

Stacked Block Style: Added an alternative bar style where each bar is rendered as a series of discrete, stacked blocks, with a configurable block count.

Files changed (1) hide show

app.py +186 -34

app.py CHANGED Viewed

@@ -270,13 +270,23 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
     video_height = args[arg_offset + 3]
     spec_fg_color = args[arg_offset + 4]
     spec_bg_color = args[arg_offset + 5]
-    font_name = args[arg_offset + 6]
-    font_size = args[arg_offset + 7]
-    font_color = args[arg_offset + 8]
-    font_bg_color = args[arg_offset + 9]
-    font_bg_alpha = args[arg_offset + 10]
-    pos_h = args[arg_offset + 11]
-    pos_v = args[arg_offset + 12]
     if not audio_files:
         raise gr.Error("Please upload at least one audio file.")
@@ -505,16 +515,31 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
             text_clips.append(txt_clip)
-        N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
         MIN_DB, MAX_DB = -80.0, 0.0
         # Spectrogram calculation on combined audio
         # --- Create a mono version of audio specifically for the spectrogram ---
         # This resolves the TypeError while keeping the final audio in stereo.
         y_mono_for_spec = librosa.to_mono(y_combined)
-        S_mel = librosa.feature.melspectrogram(y=y_mono_for_spec, sr=current_sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_BANDS, fmax=current_sr/2)
         S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
         # Frame generation logic for the spectrogram
         def frame_generator(t):
             # If images are used as background, the spectrogram's own background should be transparent.
@@ -527,7 +552,7 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
             if not image_clips:
                 for i in range(1, 9):
                     y_pos = int(i * (HEIGHT / 9)); frame[y_pos-1:y_pos, :] = grid_rgb
             # 1. Safety Check: If the spectrogram has no time frames (e.g., from an extremely short audio file),
             #    return a blank frame immediately to prevent an IndexError.
             if S_mel_db.shape[1] == 0:
@@ -543,23 +568,108 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
             #    maximum valid index, preventing any edge-case errors.
             time_idx = min(time_idx, S_mel_db.shape[1] - 1)
-            bar_width = WIDTH / N_BANDS
-            for i in range(N_BANDS):
-                energy_db = S_mel_db[i, time_idx]
-                # The denominator should be the range of DB values (MAX_DB - MIN_DB).
-                # Since MAX_DB is 0, this simplifies to -MIN_DB, which is a positive 80.0.
-                # This prevents the division by zero warning.
-                norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
-                bar_height = int(np.nan_to_num(norm_height) * HEIGHT)
-                if bar_height < 1:
-                    continue
-                x_start, x_end = int(i * bar_width), int((i + 1) * bar_width - 2)
-                y_start = HEIGHT - bar_height
-                for k in range(bar_height):
-                    y_pos, ratio = y_start + k, k / bar_height
-                    r, g, b = (int(c1 * (1-ratio) + c2 * ratio) for c1, c2 in zip(fg_rgb, bg_rgb))
-                    frame[y_pos, x_start:x_end] = (r, g, b)
             return frame
         video_clip = VideoClip(frame_function=frame_generator, duration=duration)
@@ -671,7 +781,7 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                 # Pre-build all group components
                 for i in range(MAX_GROUPS):
-                    with gr.Accordion(f"Image Group {i+1}", visible=(i==0)) as acc:
                         track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
                         image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
                         group_track_inputs.append(track_input)
@@ -687,13 +797,48 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                 with gr.Accordion("Fallback / Default Images", open=True):
                     gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
                     fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
-            with gr.Accordion("Visualizer Options", open=True):
                 with gr.Row():
                     width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
                     height_input = gr.Number(value=1080, label="Video Height (px)", precision=0)
-                fg_color = gr.ColorPicker(value="#71808c", label="Spectrogram Bar Top Color")
                 bg_color = gr.ColorPicker(value="#2C3E50", label="Background Color (if no images)")
             with gr.Accordion("Text Overlay Options", open=True):
                 gr.Markdown(
@@ -707,9 +852,9 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                 # Define a priority list for default fonts, starting with common Japanese ones.
                 # This list can include multiple names for the same font to improve matching.
                 preferred_fonts = [
-                    "Meiryo", "メイリオ",
                     "Yu Gothic", "游ゴシック",
                     "MS Gothic", "ＭＳ ゴシック",
                     "Hiragino Kaku Gothic ProN", # Common on macOS
                     "Microsoft JhengHei", # Fallback to Traditional Chinese
                     "Arial" # Generic fallback
@@ -717,8 +862,11 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
                 default_font = None
                 # Find the first available font from the preferred list
                 for font in preferred_fonts:
-                    if font in FONT_DISPLAY_NAMES:
-                        default_font = font
                         break
                 # If none of the preferred fonts are found, use the first available font as a last resort
@@ -760,13 +908,17 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
         inputs=visible_groups_state,
         outputs=group_update_outputs
     )
     # --- Define the master list of all inputs for the main button ---
     all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
         fallback_image_input,
         format_double_digits_checkbox,
         width_input, height_input,
         fg_color, bg_color,
         font_name_dd, font_size_slider, font_color_picker,
         font_bg_color_picker, font_bg_alpha_slider,
         pos_h_radio, pos_v_radio

     video_height = args[arg_offset + 3]
     spec_fg_color = args[arg_offset + 4]
     spec_bg_color = args[arg_offset + 5]
+    # --- NEW: Unpack spectrogram style arguments ---
+    n_bands = int(args[arg_offset + 6])
+    bar_spacing = int(args[arg_offset + 7])
+    mirror_mode = args[arg_offset + 8] # This is now a string
+    bar_style = args[arg_offset + 9]
+    num_blocks = int(args[arg_offset + 10])
+    # --- Unpack font and text arguments (indices are shifted) ---
+    font_name = args[arg_offset + 11]
+    font_size = args[arg_offset + 12]
+    font_color = args[arg_offset + 13]
+    font_bg_color = args[arg_offset + 14]
+    font_bg_alpha = args[arg_offset + 15]
+    pos_h = args[arg_offset + 16]
+    pos_v = args[arg_offset + 17]
     if not audio_files:
         raise gr.Error("Please upload at least one audio file.")
             text_clips.append(txt_clip)
+        N_FFT, HOP_LENGTH = 2048, 512
         MIN_DB, MAX_DB = -80.0, 0.0
         # Spectrogram calculation on combined audio
         # --- Create a mono version of audio specifically for the spectrogram ---
         # This resolves the TypeError while keeping the final audio in stereo.
         y_mono_for_spec = librosa.to_mono(y_combined)
+        S_mel = librosa.feature.melspectrogram(y=y_mono_for_spec, sr=current_sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=n_bands, fmax=current_sr/2)
         S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
+        # --- Pre-calculate drawing parameters for stacked block style ---
+        BLOCK_SPACING = 2 # The pixel gap between stacked blocks
+        if bar_style == 'Stacked Blocks':
+            # Calculate the total vertical space available for the blocks themselves
+            # In mirrored mode, this is based on half the screen height
+            if mirror_mode == 'Vertical (Left/Right)':
+                drawable_size = WIDTH // 2
+            elif mirror_mode == 'Horizontal (Top/Bottom)':
+                drawable_size = HEIGHT // 2
+            else: # Off
+                drawable_size = HEIGHT
+            total_block_pixel_size = drawable_size - ((num_blocks - 1) * BLOCK_SPACING)
+            # Calculate the size of a single block
+            single_block_size = total_block_pixel_size / num_blocks
         # Frame generation logic for the spectrogram
         def frame_generator(t):
             # If images are used as background, the spectrogram's own background should be transparent.
             if not image_clips:
                 for i in range(1, 9):
                     y_pos = int(i * (HEIGHT / 9)); frame[y_pos-1:y_pos, :] = grid_rgb
             # 1. Safety Check: If the spectrogram has no time frames (e.g., from an extremely short audio file),
             #    return a blank frame immediately to prevent an IndexError.
             if S_mel_db.shape[1] == 0:
             #    maximum valid index, preventing any edge-case errors.
             time_idx = min(time_idx, S_mel_db.shape[1] - 1)
+            # --- RENDER LOGIC FOR VERTICAL MIRROR ---
+            if mirror_mode == 'Vertical (Left/Right)':
+                center_x = WIDTH // 2
+                max_pixel_length = WIDTH // 2
+                bar_height = HEIGHT / n_bands
+                for i in range(n_bands):
+                    energy_db = S_mel_db[i, time_idx]
+                    norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
+                    if norm_height == 0:
+                        continue
+                    # --- Calculate y-coords from bottom-to-top ---
+                    # This makes low frequencies appear at the bottom and high frequencies at the top.
+                    y_start = int(HEIGHT - (i + 1) * bar_height)
+                    y_end = int(HEIGHT - i * bar_height)
+                    # Apply spacing to create a gap above the current bar.
+                    y_start_with_spacing = y_start + bar_spacing
+                    # Ensure the bar still has visible height after spacing
+                    if y_start_with_spacing >= y_end:
+                        continue
+                    if bar_style == 'Stacked Blocks':
+                        blocks_to_draw = int(norm_height * num_blocks)
+                        if blocks_to_draw == 0:
+                            continue
+                        for j in range(blocks_to_draw):
+                            block_left_x = center_x + (j * (single_block_size + BLOCK_SPACING))
+                            block_right_x = block_left_x + single_block_size
+                            # Draw right side
+                            frame[y_start_with_spacing:y_end, int(block_left_x):int(block_right_x)] = fg_rgb
+                            # Draw mirrored left side
+                            frame[y_start_with_spacing:y_end, int(center_x - (block_right_x - center_x)):int(center_x - (block_left_x - center_x))] = fg_rgb
+                    else: # Solid Bars
+                        bar_pixel_length = int(norm_height * max_pixel_length)
+                        if bar_pixel_length < 1:
+                            continue
+                        # Draw right side
+                        frame[y_start_with_spacing:y_end, center_x : center_x + bar_pixel_length] = fg_rgb
+                        # Draw mirrored left side
+                        frame[y_start_with_spacing:y_end, center_x - bar_pixel_length : center_x] = fg_rgb
+            # --- RENDER LOGIC FOR HORIZONTAL MIRROR AND OFF ---
+            else:
+                bar_width = WIDTH / n_bands
+                is_horizontal_mirror = (mirror_mode == 'Horizontal (Top/Bottom)')
+                # Determine rendering parameters based on whether the view is mirrored
+                if is_horizontal_mirror:
+                    center_y = HEIGHT // 2
+                    max_pixel_height = HEIGHT // 2
+                else: # Off
+                    center_y = HEIGHT # The "center" is the bottom of the screen
+                    max_pixel_height = HEIGHT
+                # Loop through each frequency band to draw its bar/blocks
+                for i in range(n_bands):
+                    energy_db = S_mel_db[i, time_idx]
+                    # The denominator should be the range of DB values (MAX_DB - MIN_DB).
+                    # Since MAX_DB is 0, this simplifies to -MIN_DB, which is a positive 80.0.
+                    # This prevents the division by zero warning.
+                    norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
+                    if norm_height == 0:
+                        continue
+                    # Calculate the horizontal position of the current bar
+                    x_start = int(i * bar_width)
+                    x_end = int((i + 1) * bar_width - bar_spacing)
+                    # --- Main rendering logic: switches between styles ---
+                    if bar_style == 'Stacked Blocks':
+                        # Calculate how many blocks to draw based on energy
+                        blocks_to_draw = int(norm_height * num_blocks)
+                        if blocks_to_draw == 0:
+                            continue
+                        # Draw each block from the bottom up
+                        for j in range(blocks_to_draw):
+                            # Calculate the Y coordinates for this specific block
+                            block_bottom_y = center_y - (j * (single_block_size + BLOCK_SPACING))
+                            block_top_y = block_bottom_y - single_block_size
+                            frame[int(block_top_y):int(block_bottom_y), x_start:x_end] = fg_rgb
+                            if is_horizontal_mirror:
+                                frame[int(center_y + (center_y - block_bottom_y)):int(center_y + (center_y - block_top_y)), x_start:x_end] = fg_rgb
+                    else: # Solid Bars
+                        # Calculate the total height of the solid bar
+                        bar_pixel_height = int(norm_height * max_pixel_height)
+                        if bar_pixel_height < 1:
+                            continue
+                        frame[center_y - bar_pixel_height : center_y, x_start:x_end] = fg_rgb
+                        if is_horizontal_mirror:
+                            frame[center_y : center_y + bar_pixel_height, x_start:x_end] = fg_rgb
             return frame
         video_clip = VideoClip(frame_function=frame_generator, duration=duration)
                 # Pre-build all group components
                 for i in range(MAX_GROUPS):
+                    with gr.Accordion(f"Image Group {i+1}", open=False, visible=(i==0)) as acc:
                         track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
                         image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
                         group_track_inputs.append(track_input)
                 with gr.Accordion("Fallback / Default Images", open=True):
                     gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
                     fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
+            # --- Renamed for clarity ---
+            with gr.Accordion("General Visualizer Options", open=True):
                 with gr.Row():
                     width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
                     height_input = gr.Number(value=1080, label="Video Height (px)", precision=0)
+                fg_color = gr.ColorPicker(value="#71808c", label="Spectrogram Bar Color")
                 bg_color = gr.ColorPicker(value="#2C3E50", label="Background Color (if no images)")
+            # --- Dedicated Accordion for Spectrogram Bar Style ---
+            with gr.Accordion("Spectrogram Bar Style", open=True):
+                n_bands_slider = gr.Slider(minimum=8, maximum=256, value=64, step=1, label="Number of Spectrogram Bars")
+                bar_spacing_slider = gr.Slider(minimum=0, maximum=10, value=2, step=1, label="Bar/Block Spacing (px)")
+                # --- Replaced Checkbox with Radio for mirror modes ---
+                mirror_mode_radio = gr.Radio(
+                    choices=["Off", "Horizontal (Top/Bottom)", "Vertical (Left/Right)"],
+                    value="Off",
+                    label="Symmetry / Mirror Mode"
+                )
+                with gr.Row():
+                    bar_style_radio = gr.Radio(
+                        choices=["Solid Bars", "Stacked Blocks"],
+                        value="Solid Bars",
+                        label="Bar Style"
+                    )
+                    num_blocks_slider = gr.Slider(
+                        minimum=5, maximum=50, value=20, step=1,
+                        label="Number of Blocks per Bar",
+                        visible=False # Initially hidden
+                    )
+                # --- Function to dynamically show/hide the block count slider ---
+                def update_block_slider_visibility(bar_style):  # bar_style: current value of the "Bar Style" radio (wired via bar_style_radio.change)
+                    return gr.update(visible=(bar_style == "Stacked Blocks"))  # visible only for "Stacked Blocks"; the update is applied to num_blocks_slider
+                bar_style_radio.change(
+                    fn=update_block_slider_visibility,
+                    inputs=bar_style_radio,
+                    outputs=num_blocks_slider
+                )
             with gr.Accordion("Text Overlay Options", open=True):
                 gr.Markdown(
                 # Define a priority list for default fonts, starting with common Japanese ones.
                 # This list can include multiple names for the same font to improve matching.
                 preferred_fonts = [
                     "Yu Gothic", "游ゴシック",
                     "MS Gothic", "ＭＳ ゴシック",
+                    "Meiryo", "メイリオ",
                     "Hiragino Kaku Gothic ProN", # Common on macOS
                     "Microsoft JhengHei", # Fallback to Traditional Chinese
                     "Arial" # Generic fallback
                 default_font = None
                 # Find the first available font from the preferred list
                 for font in preferred_fonts:
+                    for candidate in FONT_DISPLAY_NAMES:
+                        if candidate.startswith(font) or font in candidate:
+                            default_font = candidate
+                            break
+                    if default_font:
                         break
                 # If none of the preferred fonts are found, use the first available font as a last resort
         inputs=visible_groups_state,
         outputs=group_update_outputs
     )
     # --- Define the master list of all inputs for the main button ---
     all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
         fallback_image_input,
         format_double_digits_checkbox,
         width_input, height_input,
         fg_color, bg_color,
+        # --- Add spectrogram style inputs in correct order ---
+        n_bands_slider, bar_spacing_slider, mirror_mode_radio,
+        bar_style_radio, num_blocks_slider,
+        # --- Text and font inputs ---
         font_name_dd, font_size_slider, font_color_picker,
         font_bg_color_picker, font_bg_alpha_slider,
         pos_h_radio, pos_v_radio