avans06 committed on
Commit
4a7b514
·
1 Parent(s): 0c9293b

feat(visualizer): Add advanced spectrogram style customization

Browse files

This commit introduces a suite of new options for customizing the appearance of the spectrogram, controlled through a dedicated "Spectrogram Bar Style" section in the UI.

New UI Controls: Added a new accordion with sliders and radio buttons to control:
- Number of frequency bars (Bar Count)
- Spacing between bars/blocks
- Bar style ('Solid Bars' or 'Stacked Blocks')
- Symmetry/Mirror mode ('Off', 'Horizontal (Top/Bottom)', or 'Vertical (Left/Right)')

Vertical Mirror Mode: Implemented a new rendering mode in which the bars grow outward to the left and right from the vertical centerline of the frame, with low frequencies at the bottom and high frequencies at the top.

Stacked Block Style: Added an alternative bar style where each bar is rendered as a series of discrete, stacked blocks, with a configurable block count.

Files changed (1) hide show
  1. app.py +186 -34
app.py CHANGED
@@ -270,13 +270,23 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
270
  video_height = args[arg_offset + 3]
271
  spec_fg_color = args[arg_offset + 4]
272
  spec_bg_color = args[arg_offset + 5]
273
- font_name = args[arg_offset + 6]
274
- font_size = args[arg_offset + 7]
275
- font_color = args[arg_offset + 8]
276
- font_bg_color = args[arg_offset + 9]
277
- font_bg_alpha = args[arg_offset + 10]
278
- pos_h = args[arg_offset + 11]
279
- pos_v = args[arg_offset + 12]
 
 
 
 
 
 
 
 
 
 
280
 
281
  if not audio_files:
282
  raise gr.Error("Please upload at least one audio file.")
@@ -505,16 +515,31 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
505
 
506
  text_clips.append(txt_clip)
507
 
508
- N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
509
  MIN_DB, MAX_DB = -80.0, 0.0
510
 
511
  # Spectrogram calculation on combined audio
512
  # --- Create a mono version of audio specifically for the spectrogram ---
513
  # This resolves the TypeError while keeping the final audio in stereo.
514
  y_mono_for_spec = librosa.to_mono(y_combined)
515
- S_mel = librosa.feature.melspectrogram(y=y_mono_for_spec, sr=current_sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_BANDS, fmax=current_sr/2)
516
  S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
517
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
  # Frame generation logic for the spectrogram
519
  def frame_generator(t):
520
  # If images are used as background, the spectrogram's own background should be transparent.
@@ -527,7 +552,7 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
527
  if not image_clips:
528
  for i in range(1, 9):
529
  y_pos = int(i * (HEIGHT / 9)); frame[y_pos-1:y_pos, :] = grid_rgb
530
-
531
  # 1. Safety Check: If the spectrogram has no time frames (e.g., from an extremely short audio file),
532
  # return a blank frame immediately to prevent an IndexError.
533
  if S_mel_db.shape[1] == 0:
@@ -543,23 +568,108 @@ def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
543
  # maximum valid index, preventing any edge-case errors.
544
  time_idx = min(time_idx, S_mel_db.shape[1] - 1)
545
 
546
- bar_width = WIDTH / N_BANDS
547
- for i in range(N_BANDS):
548
- energy_db = S_mel_db[i, time_idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
549
 
550
- # The denominator should be the range of DB values (MAX_DB - MIN_DB).
551
- # Since MAX_DB is 0, this simplifies to -MIN_DB, which is a positive 80.0.
552
- # This prevents the division by zero warning.
553
- norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
554
- bar_height = int(np.nan_to_num(norm_height) * HEIGHT)
555
- if bar_height < 1:
556
- continue
557
- x_start, x_end = int(i * bar_width), int((i + 1) * bar_width - 2)
558
- y_start = HEIGHT - bar_height
559
- for k in range(bar_height):
560
- y_pos, ratio = y_start + k, k / bar_height
561
- r, g, b = (int(c1 * (1-ratio) + c2 * ratio) for c1, c2 in zip(fg_rgb, bg_rgb))
562
- frame[y_pos, x_start:x_end] = (r, g, b)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  return frame
564
 
565
  video_clip = VideoClip(frame_function=frame_generator, duration=duration)
@@ -671,7 +781,7 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
671
 
672
  # Pre-build all group components
673
  for i in range(MAX_GROUPS):
674
- with gr.Accordion(f"Image Group {i+1}", visible=(i==0)) as acc:
675
  track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
676
  image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
677
  group_track_inputs.append(track_input)
@@ -687,13 +797,48 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
687
  with gr.Accordion("Fallback / Default Images", open=True):
688
  gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
689
  fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
690
-
691
- with gr.Accordion("Visualizer Options", open=True):
 
692
  with gr.Row():
693
  width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
694
  height_input = gr.Number(value=1080, label="Video Height (px)", precision=0)
695
- fg_color = gr.ColorPicker(value="#71808c", label="Spectrogram Bar Top Color")
696
  bg_color = gr.ColorPicker(value="#2C3E50", label="Background Color (if no images)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
697
 
698
  with gr.Accordion("Text Overlay Options", open=True):
699
  gr.Markdown(
@@ -707,9 +852,9 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
707
  # Define a priority list for default fonts, starting with common Japanese ones.
708
  # This list can include multiple names for the same font to improve matching.
709
  preferred_fonts = [
710
- "Meiryo", "メイリオ",
711
  "Yu Gothic", "游ゴシック",
712
  "MS Gothic", "MS ゴシック",
 
713
  "Hiragino Kaku Gothic ProN", # Common on macOS
714
  "Microsoft JhengHei", # Fallback to Traditional Chinese
715
  "Arial" # Generic fallback
@@ -717,8 +862,11 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
717
  default_font = None
718
  # Find the first available font from the preferred list
719
  for font in preferred_fonts:
720
- if font in FONT_DISPLAY_NAMES:
721
- default_font = font
 
 
 
722
  break
723
 
724
  # If none of the preferred fonts are found, use the first available font as a last resort
@@ -760,13 +908,17 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
760
  inputs=visible_groups_state,
761
  outputs=group_update_outputs
762
  )
763
-
764
  # --- Define the master list of all inputs for the main button ---
765
  all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
766
  fallback_image_input,
767
  format_double_digits_checkbox,
768
  width_input, height_input,
769
  fg_color, bg_color,
 
 
 
 
770
  font_name_dd, font_size_slider, font_color_picker,
771
  font_bg_color_picker, font_bg_alpha_slider,
772
  pos_h_radio, pos_v_radio
 
270
  video_height = args[arg_offset + 3]
271
  spec_fg_color = args[arg_offset + 4]
272
  spec_bg_color = args[arg_offset + 5]
273
+
274
+ # --- NEW: Unpack spectrogram style arguments ---
275
+ n_bands = int(args[arg_offset + 6])
276
+ bar_spacing = int(args[arg_offset + 7])
277
+ mirror_mode = args[arg_offset + 8] # This is now a string
278
+ bar_style = args[arg_offset + 9]
279
+ num_blocks = int(args[arg_offset + 10])
280
+
281
+ # --- Unpack font and text arguments (indices are shifted) ---
282
+ font_name = args[arg_offset + 11]
283
+ font_size = args[arg_offset + 12]
284
+ font_color = args[arg_offset + 13]
285
+ font_bg_color = args[arg_offset + 14]
286
+ font_bg_alpha = args[arg_offset + 15]
287
+ pos_h = args[arg_offset + 16]
288
+ pos_v = args[arg_offset + 17]
289
+
290
 
291
  if not audio_files:
292
  raise gr.Error("Please upload at least one audio file.")
 
515
 
516
  text_clips.append(txt_clip)
517
 
518
+ N_FFT, HOP_LENGTH = 2048, 512
519
  MIN_DB, MAX_DB = -80.0, 0.0
520
 
521
  # Spectrogram calculation on combined audio
522
  # --- Create a mono version of audio specifically for the spectrogram ---
523
  # This resolves the TypeError while keeping the final audio in stereo.
524
  y_mono_for_spec = librosa.to_mono(y_combined)
525
+ S_mel = librosa.feature.melspectrogram(y=y_mono_for_spec, sr=current_sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=n_bands, fmax=current_sr/2)
526
  S_mel_db = librosa.power_to_db(S_mel, ref=np.max)
527
 
528
+ # --- Pre-calculate drawing parameters for stacked block style ---
529
+ BLOCK_SPACING = 2 # The pixel gap between stacked blocks
530
+ if bar_style == 'Stacked Blocks':
531
+ # Calculate the total vertical space available for the blocks themselves
532
+ # In mirrored mode, this is based on half the screen height
533
+ if mirror_mode == 'Vertical (Left/Right)':
534
+ drawable_size = WIDTH // 2
535
+ elif mirror_mode == 'Horizontal (Top/Bottom)':
536
+ drawable_size = HEIGHT // 2
537
+ else: # Off
538
+ drawable_size = HEIGHT
539
+ total_block_pixel_size = drawable_size - ((num_blocks - 1) * BLOCK_SPACING)
540
+ # Calculate the size of a single block
541
+ single_block_size = total_block_pixel_size / num_blocks
542
+
543
  # Frame generation logic for the spectrogram
544
  def frame_generator(t):
545
  # If images are used as background, the spectrogram's own background should be transparent.
 
552
  if not image_clips:
553
  for i in range(1, 9):
554
  y_pos = int(i * (HEIGHT / 9)); frame[y_pos-1:y_pos, :] = grid_rgb
555
+
556
  # 1. Safety Check: If the spectrogram has no time frames (e.g., from an extremely short audio file),
557
  # return a blank frame immediately to prevent an IndexError.
558
  if S_mel_db.shape[1] == 0:
 
568
  # maximum valid index, preventing any edge-case errors.
569
  time_idx = min(time_idx, S_mel_db.shape[1] - 1)
570
 
571
+ # --- RENDER LOGIC FOR VERTICAL MIRROR ---
572
+ if mirror_mode == 'Vertical (Left/Right)':
573
+ center_x = WIDTH // 2
574
+ max_pixel_length = WIDTH // 2
575
+ bar_height = HEIGHT / n_bands
576
+
577
+ for i in range(n_bands):
578
+ energy_db = S_mel_db[i, time_idx]
579
+ norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
580
+ if norm_height == 0:
581
+ continue
582
+
583
+ # --- Calculate y-coords from bottom-to-top ---
584
+ # This makes low frequencies appear at the bottom and high frequencies at the top.
585
+ y_start = int(HEIGHT - (i + 1) * bar_height)
586
+ y_end = int(HEIGHT - i * bar_height)
587
+
588
+ # Apply spacing to create a gap above the current bar.
589
+ y_start_with_spacing = y_start + bar_spacing
590
+
591
+ # Ensure the bar still has visible height after spacing
592
+ if y_start_with_spacing >= y_end:
593
+ continue
594
+
595
+ if bar_style == 'Stacked Blocks':
596
+ blocks_to_draw = int(norm_height * num_blocks)
597
+ if blocks_to_draw == 0:
598
+ continue
599
+
600
+ for j in range(blocks_to_draw):
601
+ block_left_x = center_x + (j * (single_block_size + BLOCK_SPACING))
602
+ block_right_x = block_left_x + single_block_size
603
+ # Draw right side
604
+ frame[y_start_with_spacing:y_end, int(block_left_x):int(block_right_x)] = fg_rgb
605
+ # Draw mirrored left side
606
+ frame[y_start_with_spacing:y_end, int(center_x - (block_right_x - center_x)):int(center_x - (block_left_x - center_x))] = fg_rgb
607
+ else: # Solid Bars
608
+ bar_pixel_length = int(norm_height * max_pixel_length)
609
+ if bar_pixel_length < 1:
610
+ continue
611
+
612
+ # Draw right side
613
+ frame[y_start_with_spacing:y_end, center_x : center_x + bar_pixel_length] = fg_rgb
614
+ # Draw mirrored left side
615
+ frame[y_start_with_spacing:y_end, center_x - bar_pixel_length : center_x] = fg_rgb
616
+
617
+ # --- RENDER LOGIC FOR HORIZONTAL MIRROR AND OFF ---
618
+ else:
619
+ bar_width = WIDTH / n_bands
620
+ is_horizontal_mirror = (mirror_mode == 'Horizontal (Top/Bottom)')
621
+
622
+ # Determine rendering parameters based on whether the view is mirrored
623
+ if is_horizontal_mirror:
624
+ center_y = HEIGHT // 2
625
+ max_pixel_height = HEIGHT // 2
626
+ else: # Off
627
+ center_y = HEIGHT # The "center" is the bottom of the screen
628
+ max_pixel_height = HEIGHT
629
+
630
+ # Loop through each frequency band to draw its bar/blocks
631
+ for i in range(n_bands):
632
+ energy_db = S_mel_db[i, time_idx]
633
 
634
+ # The denominator should be the range of DB values (MAX_DB - MIN_DB).
635
+ # Since MAX_DB is 0, this simplifies to -MIN_DB, which is a positive 80.0.
636
+ # This prevents the division by zero warning.
637
+ norm_height = np.clip((energy_db - MIN_DB) / (MAX_DB - MIN_DB), 0, 1)
638
+
639
+ if norm_height == 0:
640
+ continue
641
+
642
+ # Calculate the horizontal position of the current bar
643
+ x_start = int(i * bar_width)
644
+ x_end = int((i + 1) * bar_width - bar_spacing)
645
+
646
+ # --- Main rendering logic: switches between styles ---
647
+ if bar_style == 'Stacked Blocks':
648
+ # Calculate how many blocks to draw based on energy
649
+ blocks_to_draw = int(norm_height * num_blocks)
650
+ if blocks_to_draw == 0:
651
+ continue
652
+
653
+ # Draw each block from the bottom up
654
+ for j in range(blocks_to_draw):
655
+ # Calculate the Y coordinates for this specific block
656
+ block_bottom_y = center_y - (j * (single_block_size + BLOCK_SPACING))
657
+ block_top_y = block_bottom_y - single_block_size
658
+ frame[int(block_top_y):int(block_bottom_y), x_start:x_end] = fg_rgb
659
+
660
+ if is_horizontal_mirror:
661
+ frame[int(center_y + (center_y - block_bottom_y)):int(center_y + (center_y - block_top_y)), x_start:x_end] = fg_rgb
662
+ else: # Solid Bars
663
+ # Calculate the total height of the solid bar
664
+ bar_pixel_height = int(norm_height * max_pixel_height)
665
+
666
+ if bar_pixel_height < 1:
667
+ continue
668
+
669
+ frame[center_y - bar_pixel_height : center_y, x_start:x_end] = fg_rgb
670
+
671
+ if is_horizontal_mirror:
672
+ frame[center_y : center_y + bar_pixel_height, x_start:x_end] = fg_rgb
673
  return frame
674
 
675
  video_clip = VideoClip(frame_function=frame_generator, duration=duration)
 
781
 
782
  # Pre-build all group components
783
  for i in range(MAX_GROUPS):
784
+ with gr.Accordion(f"Image Group {i+1}", open=False, visible=(i==0)) as acc:
785
  track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
786
  image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
787
  group_track_inputs.append(track_input)
 
797
  with gr.Accordion("Fallback / Default Images", open=True):
798
  gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
799
  fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
800
+
801
+ # --- Renamed for clarity ---
802
+ with gr.Accordion("General Visualizer Options", open=True):
803
  with gr.Row():
804
  width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
805
  height_input = gr.Number(value=1080, label="Video Height (px)", precision=0)
806
+ fg_color = gr.ColorPicker(value="#71808c", label="Spectrogram Bar Color")
807
  bg_color = gr.ColorPicker(value="#2C3E50", label="Background Color (if no images)")
808
+
809
+ # --- Dedicated Accordion for Spectrogram Bar Style ---
810
+ with gr.Accordion("Spectrogram Bar Style", open=True):
811
+ n_bands_slider = gr.Slider(minimum=8, maximum=256, value=64, step=1, label="Number of Spectrogram Bars")
812
+ bar_spacing_slider = gr.Slider(minimum=0, maximum=10, value=2, step=1, label="Bar/Block Spacing (px)")
813
+
814
+ # --- Replaced Checkbox with Radio for mirror modes ---
815
+ mirror_mode_radio = gr.Radio(
816
+ choices=["Off", "Horizontal (Top/Bottom)", "Vertical (Left/Right)"],
817
+ value="Off",
818
+ label="Symmetry / Mirror Mode"
819
+ )
820
+
821
+ with gr.Row():
822
+ bar_style_radio = gr.Radio(
823
+ choices=["Solid Bars", "Stacked Blocks"],
824
+ value="Solid Bars",
825
+ label="Bar Style"
826
+ )
827
+ num_blocks_slider = gr.Slider(
828
+ minimum=5, maximum=50, value=20, step=1,
829
+ label="Number of Blocks per Bar",
830
+ visible=False # Initially hidden
831
+ )
832
+
833
+ # --- Function to dynamically show/hide the block count slider ---
834
+ def update_block_slider_visibility(bar_style):
835
+ return gr.update(visible=(bar_style == "Stacked Blocks"))
836
+
837
+ bar_style_radio.change(
838
+ fn=update_block_slider_visibility,
839
+ inputs=bar_style_radio,
840
+ outputs=num_blocks_slider
841
+ )
842
 
843
  with gr.Accordion("Text Overlay Options", open=True):
844
  gr.Markdown(
 
852
  # Define a priority list for default fonts, starting with common Japanese ones.
853
  # This list can include multiple names for the same font to improve matching.
854
  preferred_fonts = [
 
855
  "Yu Gothic", "游ゴシック",
856
  "MS Gothic", "MS ゴシック",
857
+ "Meiryo", "メイリオ",
858
  "Hiragino Kaku Gothic ProN", # Common on macOS
859
  "Microsoft JhengHei", # Fallback to Traditional Chinese
860
  "Arial" # Generic fallback
 
862
  default_font = None
863
  # Find the first available font from the preferred list
864
  for font in preferred_fonts:
865
+ for candidate in FONT_DISPLAY_NAMES:
866
+ if candidate.startswith(font) or font in candidate:
867
+ default_font = candidate
868
+ break
869
+ if default_font:
870
  break
871
 
872
  # If none of the preferred fonts are found, use the first available font as a last resort
 
908
  inputs=visible_groups_state,
909
  outputs=group_update_outputs
910
  )
911
+
912
  # --- Define the master list of all inputs for the main button ---
913
  all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
914
  fallback_image_input,
915
  format_double_digits_checkbox,
916
  width_input, height_input,
917
  fg_color, bg_color,
918
+ # --- Add spectrogram style inputs in correct order ---
919
+ n_bands_slider, bar_spacing_slider, mirror_mode_radio,
920
+ bar_style_radio, num_blocks_slider,
921
+ # --- Text and font inputs ---
922
  font_name_dd, font_size_slider, font_color_picker,
923
  font_bg_color_picker, font_bg_alpha_slider,
924
  pos_h_radio, pos_v_radio