root committed on
Commit
651b0cd
·
1 Parent(s): bddf9c4
Files changed (3) hide show
  1. app.py +285 -30
  2. appp.py +68 -10
  3. requirements.txt +1 -0
app.py CHANGED
@@ -24,6 +24,17 @@ from utils import (
24
  )
25
  from emotionanalysis import MusicAnalyzer
26
  import librosa
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # Login to Hugging Face Hub if token is provided
29
  if "HF_TOKEN" in os.environ:
@@ -1180,12 +1191,12 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
1180
  # Sigmoid-like function with more scientific parameters
1181
  # Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions
1182
  if tempo < 40: # Very slow tempos
1183
- return 3.5 # Maximum syllables for extremely slow tempos
1184
  elif tempo > 200: # Very fast tempos
1185
- return 0.8 # Minimum syllables for extremely fast tempos
1186
  else:
1187
  # Scientific logistic function for middle range (40-200 BPM)
1188
- L = 3.5 # Upper limit
1189
  k = 0.04 # Steepness of curve
1190
  x0 = 120 # Midpoint (inflection point at normal tempo)
1191
  return L / (1 + np.exp(k * (tempo - x0)))
@@ -1235,6 +1246,32 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
1235
  # ----------------------------------------------------------------------
1236
  detailed_template = []
1237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1238
  for i, (stress_type, strength) in enumerate(stress_pattern):
1239
  # Get base syllable count from tempo with more nuanced mapping
1240
  base_syllables = tempo_to_syllable_base(tempo)
@@ -1281,6 +1318,60 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
1281
  strength_pct = round(strength * 100) / 100
1282
  detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")
1283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1284
  # Join beat templates for this phrase
1285
  phrase_template = "-".join(detailed_template)
1286
  syllable_templates.append(phrase_template)
@@ -1572,6 +1663,16 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
1572
  # Split lyrics into lines
1573
  lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
1574
 
 
 
 
 
 
 
 
 
 
 
1575
  # Initialize tracking variables
1576
  verification_notes = []
1577
  detailed_analysis = []
@@ -1747,11 +1848,14 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
1747
 
1748
  for beat in best_phrase_beats:
1749
  if beat.get("type") == "S":
 
1750
  strong_positions.append(current_pos)
1751
  current_pos += beat.get("count", 1)
1752
 
1753
  # Check if strong syllables align with strong beats
1754
  alignment_issues = []
 
 
1755
 
1756
  for pos in strong_positions:
1757
  # Find which word contains this position
@@ -1768,18 +1872,31 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
1768
  # Get stress pattern for this word
1769
  stress = word_info["stress_pattern"]
1770
 
1771
- # If we have stress information and this syllable isn't stressed
1772
- if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
1773
- misaligned_word = word_info["word"]
1774
- alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)")
1775
- stress_misalignments.append({
1776
- "line": i+1,
1777
- "word": word_info["word"],
1778
- "position": pos,
1779
- "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word)
1780
- })
 
 
 
 
 
1781
  break
1782
 
 
 
 
 
 
 
 
 
1783
  if alignment_issues:
1784
  verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}")
1785
 
@@ -2452,7 +2569,7 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyric
2452
 
2453
  # Create enhanced prompt with better rhythm alignment instructions
2454
  if use_second_level:
2455
- # Second-level approach with per-second alignment
2456
  content = f"""
2457
  You are a talented songwriter who specializes in {genre} music.
2458
  Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
@@ -2634,7 +2751,7 @@ Your lyrics:
2634
 
2635
  # Format as a chat message for the LLM
2636
  messages = [
2637
- {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. Be concise and direct."},
2638
  {"role": "user", "content": content}
2639
  ]
2640
 
@@ -2684,6 +2801,23 @@ Your lyrics:
2684
  lyrics = lyrics.split("</thinking>")[1].strip()
2685
 
2686
  # Check for alternative thinking indicators with improved detection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2687
  thinking_markers = [
2688
  "<think>", "</think>",
2689
  "[thinking]", "[/thinking]",
@@ -2769,6 +2903,24 @@ Your lyrics:
2769
  if not isinstance(second_level_verification, list):
2770
  second_level_verification = None
2771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2772
  # Verify syllable counts with enhanced verification - pass second-level templates if available
2773
  if templates_for_verification:
2774
  # Convert any NumPy values to native types before verification - directly handle conversions
@@ -2983,17 +3135,58 @@ Improved lyrics with fixed rhythm:
2983
  "prompt_template": "No prompt template available"
2984
  }
2985
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2986
  def process_audio(audio_file, lyrics_requirements=None):
2987
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
2988
  if audio_file is None:
2989
  return "Please upload an audio file.", None, None
2990
 
2991
  try:
2992
- print("Step 1/5: Extracting audio features...")
2993
  # Extract audio features
2994
  audio_data = extract_audio_features(audio_file)
2995
 
2996
- print("Step 2/5: Verifying audio contains music...")
2997
  # First check if it's music
2998
  try:
2999
  is_music, ast_results = detect_music(audio_data)
@@ -3004,7 +3197,11 @@ def process_audio(audio_file, lyrics_requirements=None):
3004
  if not is_music:
3005
  return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
3006
 
3007
- print("Step 3/5: Classifying music genre...")
 
 
 
 
3008
  # Classify genre
3009
  try:
3010
  top_genres = classify_genre(audio_data)
@@ -3029,7 +3226,7 @@ def process_audio(audio_file, lyrics_requirements=None):
3029
  "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
3030
  }
3031
 
3032
- print("Step 4/5: Analyzing music emotions, themes, and structure...")
3033
  # Analyze music emotions and themes
3034
  try:
3035
  emotion_results = music_analyzer.analyze_music(audio_file)
@@ -3046,12 +3243,15 @@ def process_audio(audio_file, lyrics_requirements=None):
3046
  beats_info = detect_beats(y, sr)
3047
  sections_info = detect_sections(y, sr)
3048
 
3049
- # Create structured segments for precise line-by-line matching
3050
  segments = []
3051
 
3052
- # Try to break audio into meaningful segments based on sections
3053
- # Each segment will correspond to one line of lyrics
3054
- if sections_info and len(sections_info) > 1:
 
 
 
3055
  min_segment_duration = 1.5 # Minimum 1.5 seconds per segment
3056
 
3057
  for section in sections_info:
@@ -3063,7 +3263,8 @@ def process_audio(audio_file, lyrics_requirements=None):
3063
  if section_duration < min_segment_duration * 1.5:
3064
  segments.append({
3065
  "start": section_start,
3066
- "end": section_end
 
3067
  })
3068
  else:
3069
  # Calculate ideal number of segments for this section
@@ -3078,7 +3279,8 @@ def process_audio(audio_file, lyrics_requirements=None):
3078
  segment_end = segment_start + segment_duration
3079
  segments.append({
3080
  "start": segment_start,
3081
- "end": segment_end
 
3082
  })
3083
  # If no good sections found, create segments based on beats
3084
  elif beats_info and len(beats_info["beat_times"]) > 4:
@@ -3136,6 +3338,15 @@ def process_audio(audio_file, lyrics_requirements=None):
3136
 
3137
  # Add syllable counts to each section
3138
  for section in sections_info:
 
 
 
 
 
 
 
 
 
3139
  # Create syllable templates for sections
3140
  section_beats_info = {
3141
  "beat_times": [beat for beat in beats_info["beat_times"]
@@ -3150,19 +3361,21 @@ def process_audio(audio_file, lyrics_requirements=None):
3150
  ]
3151
 
3152
  # Get a syllable count based on section duration and tempo
3153
- syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5)
 
3154
 
3155
  section_info = {
3156
  "type": section["type"],
3157
  "start": section["start"],
3158
  "end": section["end"],
3159
  "duration": section["duration"],
 
3160
  "syllable_count": syllable_count,
3161
  "beat_count": len(section_beats_info["beat_times"])
3162
  }
3163
 
3164
- # Try to create a more detailed syllable template
3165
- if len(section_beats_info["beat_times"]) >= 2:
3166
  # Ensure top_genres is a list with at least one element
3167
  if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
3168
  genre_name = top_genres[0][0]
@@ -3213,7 +3426,7 @@ def process_audio(audio_file, lyrics_requirements=None):
3213
  print(f"Error analyzing song structure: {str(e)}")
3214
  # Continue without song structure
3215
 
3216
- print("Step 5/5: Generating rhythmically aligned lyrics...")
3217
  # Generate lyrics based on top genre, emotion analysis, and song structure
3218
  try:
3219
  # Ensure top_genres is a list with at least one element before accessing
@@ -3306,7 +3519,8 @@ def process_audio(audio_file, lyrics_requirements=None):
3306
  "rhythm_analysis": rhythm_analysis,
3307
  "syllable_analysis": syllable_analysis,
3308
  "prompt_template": prompt_template,
3309
- "ast_results": ast_results
 
3310
  }
3311
 
3312
  return results
@@ -3328,6 +3542,13 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3328
  # Get beat information
3329
  beats_info = detect_beats(y, sr)
3330
 
 
 
 
 
 
 
 
3331
  # Helper function to convert numpy values to floats - FIXED
3332
  def ensure_float(value):
3333
  if isinstance(value, np.ndarray) or isinstance(value, np.number):
@@ -3347,6 +3568,14 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3347
  timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
3348
  timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
3349
  timeline += f"Total Beats: {beats_info['beat_count']}\n"
 
 
 
 
 
 
 
 
3350
 
3351
  # Add musicological context based on tempo classification
3352
  if tempo < 60:
@@ -3374,6 +3603,13 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3374
  time = ensure_float(time)
3375
  strength = ensure_float(strength)
3376
 
 
 
 
 
 
 
 
3377
  # More scientific determination of beat type based on both strength and metrical position
3378
  metrical_position = i % beats_info['time_signature']
3379
 
@@ -3395,6 +3631,10 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3395
  else:
3396
  beat_type = "WEAK"
3397
  syllable_value = 1.0
 
 
 
 
3398
 
3399
  # Determine pattern letter based on beat type for consistency
3400
  if beat_type == "STRONG":
@@ -3851,9 +4091,16 @@ def display_results(audio_file, lyrics_requirements=None):
3851
  genre_results = results.get("genre_results", "Genre classification failed")
3852
  lyrics = results.get("lyrics", "Lyrics generation failed")
3853
  ast_results = results.get("ast_results", [])
 
3854
  else:
3855
  # Old tuple format
3856
  genre_results, lyrics, ast_results = results
 
 
 
 
 
 
3857
 
3858
  # Get clean lyrics (without analysis notes)
3859
  clean_lyrics = lyrics
@@ -3885,6 +4132,14 @@ def display_results(audio_file, lyrics_requirements=None):
3885
  emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
3886
  emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
3887
 
 
 
 
 
 
 
 
 
3888
  except Exception as e:
3889
  print(f"Error in emotion analysis: {str(e)}")
3890
 
 
24
  )
25
  from emotionanalysis import MusicAnalyzer
26
  import librosa
27
+ from pyannote.audio import Pipeline
28
+ import tempfile
29
+ import os
30
+ import soundfile as sf
31
+ import warnings
32
+ import json
33
+ import math
34
+ from collections import defaultdict
35
+ import matplotlib.pyplot as plt
36
+ from gradio_client import Client
37
+ from transformers import pipeline as hf_pipeline
38
 
39
  # Login to Hugging Face Hub if token is provided
40
  if "HF_TOKEN" in os.environ:
 
1191
  # Sigmoid-like function with more scientific parameters
1192
  # Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions
1193
  if tempo < 40: # Very slow tempos
1194
+ return 1.8 # Further reduced maximum syllables for extremely slow tempos
1195
  elif tempo > 200: # Very fast tempos
1196
+ return 0.7 # Minimum syllables for extremely fast tempos
1197
  else:
1198
  # Scientific logistic function for middle range (40-200 BPM)
1199
+ L = 2.0 # Significantly reduced upper limit to prevent excessive syllables
1200
  k = 0.04 # Steepness of curve
1201
  x0 = 120 # Midpoint (inflection point at normal tempo)
1202
  return L / (1 + np.exp(k * (tempo - x0)))
 
1246
  # ----------------------------------------------------------------------
1247
  detailed_template = []
1248
 
1249
+ # Calculate phrase duration if beat times are available
1250
+ phrase_duration = 0
1251
+ if phrase and len(phrase) > 1 and len(beat_times) > 0:
1252
+ # Get first and last beat indices from the phrase
1253
+ first_idx = phrase[0]
1254
+ last_idx = phrase[-1]
1255
+
1256
+ # Check if indices are within bounds
1257
+ if first_idx < len(beat_times) and last_idx < len(beat_times):
1258
+ phrase_duration = beat_times[last_idx] - beat_times[first_idx]
1259
+
1260
+ # Calculate a maximum reasonable syllable count based on duration
1261
+ # Aim for 3-4 syllables per second maximum for singability (reduced from 5-6)
1262
+ max_reasonable_syllables = 100 # Default high value
1263
+ if phrase_duration > 0:
1264
+ # Use a more conservative syllable rate based on tempo
1265
+ if tempo < 80: # Slow tempo
1266
+ syllable_rate = 3.0 # Maximum 3 syllables per second for slow tempos
1267
+ elif tempo < 120: # Medium tempo
1268
+ syllable_rate = 3.5 # Maximum 3.5 syllables per second for medium tempos
1269
+ else: # Fast tempo
1270
+ syllable_rate = 4.0 # Maximum 4 syllables per second for fast tempos
1271
+
1272
+ # Calculate max syllables and ensure it's at least 2 for any phrase
1273
+ max_reasonable_syllables = max(2, int(phrase_duration * syllable_rate))
1274
+
1275
  for i, (stress_type, strength) in enumerate(stress_pattern):
1276
  # Get base syllable count from tempo with more nuanced mapping
1277
  base_syllables = tempo_to_syllable_base(tempo)
 
1318
  strength_pct = round(strength * 100) / 100
1319
  detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")
1320
 
1321
+ # Calculate total expected syllables for this phrase
1322
+ total_expected_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template])
1323
+
1324
+ # If total syllables exceed our reasonable limit, scale them down
1325
+ if total_expected_syllables > max_reasonable_syllables and max_reasonable_syllables > 0:
1326
+ scale_factor = max_reasonable_syllables / total_expected_syllables
1327
+ adjusted_template = []
1328
+
1329
+ # Stronger scaling for very short phrases (less than 0.8 seconds)
1330
+ if phrase_duration < 0.8 and phrase_duration > 0:
1331
+ # Further reduce for extremely short phrases
1332
+ scale_factor *= 0.8
1333
+
1334
+ for beat in detailed_template:
1335
+ if ':' in beat:
1336
+ beat_type_part = beat.split(':')[0]
1337
+ syllable_count = float(beat.split(':')[1])
1338
+ # Scale down and round to nearest 0.25
1339
+ new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4)
1340
+
1341
+ # Extra check for very short phrases - cap at 1.0 for S beats and 0.5 for others
1342
+ if phrase_duration < 0.6 and phrase_duration > 0:
1343
+ if beat_type_part.startswith("S"):
1344
+ new_count = min(new_count, 1.0)
1345
+ else:
1346
+ new_count = min(new_count, 0.5)
1347
+
1348
+ adjusted_template.append(f"{beat_type_part}:{new_count}")
1349
+ else:
1350
+ adjusted_template.append(beat)
1351
+
1352
+ detailed_template = adjusted_template
1353
+
1354
+ # Extra check to avoid having too many total syllables in a phrase
1355
+ if len(detailed_template) > 0:
1356
+ total_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template if ':' in beat])
1357
+ if phrase_duration > 0 and (total_syllables / phrase_duration) > 5.0:
1358
+ # If we have more than 5 syllables per second, apply additional scaling
1359
+ target_syllables = phrase_duration * 4.0 # Target 4 syllables per second max
1360
+ scale_factor = target_syllables / total_syllables
1361
+ adjusted_template = []
1362
+
1363
+ for beat in detailed_template:
1364
+ if ':' in beat:
1365
+ beat_type_part = beat.split(':')[0]
1366
+ syllable_count = float(beat.split(':')[1])
1367
+ # Scale down and round to nearest 0.25
1368
+ new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4)
1369
+ adjusted_template.append(f"{beat_type_part}:{new_count}")
1370
+ else:
1371
+ adjusted_template.append(beat)
1372
+
1373
+ detailed_template = adjusted_template
1374
+
1375
  # Join beat templates for this phrase
1376
  phrase_template = "-".join(detailed_template)
1377
  syllable_templates.append(phrase_template)
 
1663
  # Split lyrics into lines
1664
  lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
1665
 
1666
+ # Remove any lines that are clearly not lyrics, like explanations or meta-content
1667
+ filtered_lines = []
1668
+ for line in lines:
1669
+ # Skip explanatory content or meta-text
1670
+ if line.startswith('**') or line.startswith('[Note:') or 'alignment:' in line.lower():
1671
+ continue
1672
+ filtered_lines.append(line)
1673
+
1674
+ lines = filtered_lines
1675
+
1676
  # Initialize tracking variables
1677
  verification_notes = []
1678
  detailed_analysis = []
 
1848
 
1849
  for beat in best_phrase_beats:
1850
  if beat.get("type") == "S":
1851
+ # If the count is greater than 1, only the first syllable should be stressed
1852
  strong_positions.append(current_pos)
1853
  current_pos += beat.get("count", 1)
1854
 
1855
  # Check if strong syllables align with strong beats
1856
  alignment_issues = []
1857
+ aligned_stress_count = 0
1858
+ total_stress_positions = len(strong_positions)
1859
 
1860
  for pos in strong_positions:
1861
  # Find which word contains this position
 
1872
  # Get stress pattern for this word
1873
  stress = word_info["stress_pattern"]
1874
 
1875
+ # If we have stress information, check if the syllable is stressed
1876
+ if stress and syllable_in_word < len(stress):
1877
+ if stress[syllable_in_word] == '1':
1878
+ # Syllable is stressed and properly aligned
1879
+ aligned_stress_count += 1
1880
+ else:
1881
+ # Syllable is not stressed but should be
1882
+ misaligned_word = word_info["word"]
1883
+ alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)")
1884
+ stress_misalignments.append({
1885
+ "line": i+1,
1886
+ "word": word_info["word"],
1887
+ "position": pos,
1888
+ "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word)
1889
+ })
1890
  break
1891
 
1892
+ # Calculate alignment percentage
1893
+ alignment_percentage = 0
1894
+ if total_stress_positions > 0:
1895
+ alignment_percentage = (aligned_stress_count / total_stress_positions) * 100
1896
+
1897
+ # Add alignment percentage to notes
1898
+ verification_notes.append(f" → Stress alignment: {alignment_percentage:.1f}% ({aligned_stress_count}/{total_stress_positions})")
1899
+
1900
  if alignment_issues:
1901
  verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}")
1902
 
 
2569
 
2570
  # Create enhanced prompt with better rhythm alignment instructions
2571
  if use_second_level:
2572
+ # Second-level approach with per-second alignment - enhanced for better syllable distribution
2573
  content = f"""
2574
  You are a talented songwriter who specializes in {genre} music.
2575
  Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
 
2751
 
2752
  # Format as a chat message for the LLM
2753
  messages = [
2754
+ {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns EXACTLY. Be extremely concise - use only the EXACT number of syllables specified for each line. For short phrases (1 second or less), use just 2-3 MAXIMUM syllables. Include lyrics for EVERY musical section - do not leave any section empty. Use one-syllable words whenever possible for better singability. Avoid complex vocabulary. For all beat patterns, use fewer syllables than you think you need. Start with the lyrics immediately without any explanation or thinking."},
2755
  {"role": "user", "content": content}
2756
  ]
2757
 
 
2801
  lyrics = lyrics.split("</thinking>")[1].strip()
2802
 
2803
  # Check for alternative thinking indicators with improved detection
2804
+
2805
+ # Clean up lyrics: Remove meta-content and explanations
2806
+ if lyrics:
2807
+ # Remove any line that starts with **
2808
+ cleaned_lines = []
2809
+ for line in lyrics.split('\n'):
2810
+ if not line.strip().startswith('**') and not 'alignment:' in line.lower():
2811
+ cleaned_lines.append(line)
2812
+ lyrics = '\n'.join(cleaned_lines)
2813
+
2814
+ # Check for excessively long lines (likely explanations)
2815
+ max_reasonable_line_length = 80
2816
+ final_lines = []
2817
+ for line in lyrics.split('\n'):
2818
+ if len(line) <= max_reasonable_line_length or '[' in line or ']' in line:
2819
+ final_lines.append(line)
2820
+ lyrics = '\n'.join(final_lines)
2821
  thinking_markers = [
2822
  "<think>", "</think>",
2823
  "[thinking]", "[/thinking]",
 
2903
  if not isinstance(second_level_verification, list):
2904
  second_level_verification = None
2905
 
2906
+ # Ensure all second-level templates have lyrics
2907
+ if song_structure and "second_level" in song_structure and song_structure["second_level"]:
2908
+ if "templates" in song_structure["second_level"] and isinstance(song_structure["second_level"]["templates"], list):
2909
+ # Count how many seconds have lyrics
2910
+ if lyrics:
2911
+ lines = [line.strip() for line in lyrics.split('\n') if line.strip()]
2912
+
2913
+ # If we have fewer lines than seconds, try to distribute them better
2914
+ second_count = len(song_structure["second_level"]["templates"])
2915
+ if 0 < len(lines) < second_count:
2916
+ # Simple distribution - repeat existing lines to fill all seconds
2917
+ distributed_lines = []
2918
+ for i in range(second_count):
2919
+ distributed_lines.append(lines[i % len(lines)])
2920
+
2921
+ # Replace the lyrics with the distributed version
2922
+ lyrics = '\n'.join(distributed_lines)
2923
+
2924
  # Verify syllable counts with enhanced verification - pass second-level templates if available
2925
  if templates_for_verification:
2926
  # Convert any NumPy values to native types before verification - directly handle conversions
 
3135
  "prompt_template": "No prompt template available"
3136
  }
3137
 
3138
+ def detect_voice_activity(audio_file):
3139
+ """
3140
+ Detect segments with voice/singing in audio using pyannote/voice-activity-detection
3141
+
3142
+ Args:
3143
+ audio_file: Path to audio file
3144
+
3145
+ Returns:
3146
+ List of dictionaries with start and end times of voice segments
3147
+ """
3148
+ try:
3149
+ print("Detecting voice activity in audio...")
3150
+ # Get HF_TOKEN from environment or set your token here
3151
+ hf_token = os.environ.get("HF_TOKEN", None)
3152
+
3153
+ # Initialize the voice activity detection pipeline
3154
+ vad_pipeline = Pipeline.from_pretrained(
3155
+ "pyannote/voice-activity-detection",
3156
+ use_auth_token=hf_token
3157
+ )
3158
+
3159
+ # Process the audio file
3160
+ output = vad_pipeline(audio_file)
3161
+
3162
+ # Extract voice segments
3163
+ voice_segments = []
3164
+ for speech in output.get_timeline().support():
3165
+ voice_segments.append({
3166
+ "start": speech.start,
3167
+ "end": speech.end,
3168
+ "duration": speech.end - speech.start
3169
+ })
3170
+
3171
+ print(f"Detected {len(voice_segments)} voice segments")
3172
+ return voice_segments
3173
+
3174
+ except Exception as e:
3175
+ print(f"Error detecting voice activity: {str(e)}")
3176
+ # Return empty list if detection fails
3177
+ return []
3178
+
3179
  def process_audio(audio_file, lyrics_requirements=None):
3180
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
3181
  if audio_file is None:
3182
  return "Please upload an audio file.", None, None
3183
 
3184
  try:
3185
+ print("Step 1/6: Extracting audio features...")
3186
  # Extract audio features
3187
  audio_data = extract_audio_features(audio_file)
3188
 
3189
+ print("Step 2/6: Verifying audio contains music...")
3190
  # First check if it's music
3191
  try:
3192
  is_music, ast_results = detect_music(audio_data)
 
3197
  if not is_music:
3198
  return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
3199
 
3200
+ print("Step 3/6: Detecting voice activity segments...")
3201
+ # Detect voice activity segments
3202
+ voice_segments = detect_voice_activity(audio_file)
3203
+
3204
+ print("Step 4/6: Classifying music genre...")
3205
  # Classify genre
3206
  try:
3207
  top_genres = classify_genre(audio_data)
 
3226
  "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
3227
  }
3228
 
3229
+ print("Step 5/6: Analyzing music emotions, themes, and structure...")
3230
  # Analyze music emotions and themes
3231
  try:
3232
  emotion_results = music_analyzer.analyze_music(audio_file)
 
3243
  beats_info = detect_beats(y, sr)
3244
  sections_info = detect_sections(y, sr)
3245
 
3246
+ # Create structured segments based on voice activity detection
3247
  segments = []
3248
 
3249
+ # If we have voice segments, use them as our primary segments
3250
+ if voice_segments and len(voice_segments) > 0:
3251
+ segments = voice_segments
3252
+ print(f"Using {len(segments)} voice segments for lyrics generation")
3253
+ # If no voice segments detected or detection failed, fall back to previous methods
3254
+ elif sections_info and len(sections_info) > 1:
3255
  min_segment_duration = 1.5 # Minimum 1.5 seconds per segment
3256
 
3257
  for section in sections_info:
 
3263
  if section_duration < min_segment_duration * 1.5:
3264
  segments.append({
3265
  "start": section_start,
3266
+ "end": section_end,
3267
+ "duration": section_duration
3268
  })
3269
  else:
3270
  # Calculate ideal number of segments for this section
 
3279
  segment_end = segment_start + segment_duration
3280
  segments.append({
3281
  "start": segment_start,
3282
+ "end": segment_end,
3283
+ "duration": segment_duration
3284
  })
3285
  # If no good sections found, create segments based on beats
3286
  elif beats_info and len(beats_info["beat_times"]) > 4:
 
3338
 
3339
  # Add syllable counts to each section
3340
  for section in sections_info:
3341
+ # Check if this section overlaps with any voice segments
3342
+ section_has_voice = False
3343
+ for voice_segment in voice_segments:
3344
+ # Check for overlap between section and voice segment
3345
+ if (section["start"] <= voice_segment["end"] and
3346
+ section["end"] >= voice_segment["start"]):
3347
+ section_has_voice = True
3348
+ break
3349
+
3350
  # Create syllable templates for sections
3351
  section_beats_info = {
3352
  "beat_times": [beat for beat in beats_info["beat_times"]
 
3361
  ]
3362
 
3363
  # Get a syllable count based on section duration and tempo
3364
+ # If section has voice, use normal count, otherwise set to 0
3365
+ syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) if section_has_voice else 0
3366
 
3367
  section_info = {
3368
  "type": section["type"],
3369
  "start": section["start"],
3370
  "end": section["end"],
3371
  "duration": section["duration"],
3372
+ "has_voice": section_has_voice,
3373
  "syllable_count": syllable_count,
3374
  "beat_count": len(section_beats_info["beat_times"])
3375
  }
3376
 
3377
+ # Try to create a more detailed syllable template, but only for sections with voice
3378
+ if len(section_beats_info["beat_times"]) >= 2 and section_has_voice:
3379
  # Ensure top_genres is a list with at least one element
3380
  if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
3381
  genre_name = top_genres[0][0]
 
3426
  print(f"Error analyzing song structure: {str(e)}")
3427
  # Continue without song structure
3428
 
3429
+ print("Step 6/6: Generating rhythmically aligned lyrics...")
3430
  # Generate lyrics based on top genre, emotion analysis, and song structure
3431
  try:
3432
  # Ensure top_genres is a list with at least one element before accessing
 
3519
  "rhythm_analysis": rhythm_analysis,
3520
  "syllable_analysis": syllable_analysis,
3521
  "prompt_template": prompt_template,
3522
+ "ast_results": ast_results,
3523
+ "voice_segments": voice_segments
3524
  }
3525
 
3526
  return results
 
3542
  # Get beat information
3543
  beats_info = detect_beats(y, sr)
3544
 
3545
+ # Get voice activity segments
3546
+ try:
3547
+ voice_segments = detect_voice_activity(audio_file)
3548
+ except Exception as e:
3549
+ print(f"Error detecting voice segments: {str(e)}")
3550
+ voice_segments = []
3551
+
3552
  # Helper function to convert numpy values to floats - FIXED
3553
  def ensure_float(value):
3554
  if isinstance(value, np.ndarray) or isinstance(value, np.number):
 
3568
  timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
3569
  timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
3570
  timeline += f"Total Beats: {beats_info['beat_count']}\n"
3571
+
3572
+ # Add voice activity segments information
3573
+ if voice_segments:
3574
+ timeline += f"\nVoice Activity Segments: {len(voice_segments)}\n"
3575
+ for i, segment in enumerate(voice_segments[:5]): # Show first 5 segments
3576
+ timeline += f" Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n"
3577
+ if len(voice_segments) > 5:
3578
+ timeline += f" ... and {len(voice_segments) - 5} more segments\n"
3579
 
3580
  # Add musicological context based on tempo classification
3581
  if tempo < 60:
 
3603
  time = ensure_float(time)
3604
  strength = ensure_float(strength)
3605
 
3606
+ # Check if this beat is during voice activity
3607
+ in_voice_segment = False
3608
+ for segment in voice_segments:
3609
+ if segment['start'] <= time <= segment['end']:
3610
+ in_voice_segment = True
3611
+ break
3612
+
3613
  # More scientific determination of beat type based on both strength and metrical position
3614
  metrical_position = i % beats_info['time_signature']
3615
 
 
3631
  else:
3632
  beat_type = "WEAK"
3633
  syllable_value = 1.0
3634
+
3635
+ # Mark the beat type if it's in a voice segment
3636
+ if in_voice_segment:
3637
+ beat_type = f"{beat_type} (VOICE)"
3638
 
3639
  # Determine pattern letter based on beat type for consistency
3640
  if beat_type == "STRONG":
 
4091
  genre_results = results.get("genre_results", "Genre classification failed")
4092
  lyrics = results.get("lyrics", "Lyrics generation failed")
4093
  ast_results = results.get("ast_results", [])
4094
+ voice_segments = results.get("voice_segments", [])
4095
  else:
4096
  # Old tuple format
4097
  genre_results, lyrics, ast_results = results
4098
+ # Get voice segments
4099
+ try:
4100
+ voice_segments = detect_voice_activity(audio_file)
4101
+ except Exception as e:
4102
+ print(f"Error detecting voice segments: {str(e)}")
4103
+ voice_segments = []
4104
 
4105
  # Get clean lyrics (without analysis notes)
4106
  clean_lyrics = lyrics
 
4132
  emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
4133
  emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
4134
 
4135
+ # Add voice activity segments if available
4136
+ if voice_segments:
4137
+ emotion_text += f"\n\nVoice Activity Segments ({len(voice_segments)}):\n"
4138
+ for i, segment in enumerate(voice_segments[:10]): # Show up to 10 segments
4139
+ emotion_text += f"- Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n"
4140
+ if len(voice_segments) > 10:
4141
+ emotion_text += f"... and {len(voice_segments) - 10} more segments\n"
4142
+
4143
  except Exception as e:
4144
  print(f"Error in emotion analysis: {str(e)}")
4145
 
appp.py CHANGED
@@ -32,7 +32,7 @@ if "HF_TOKEN" in os.environ:
32
  # Constants
33
  GENRE_MODEL_NAME = "dima806/music_genres_classification"
34
  MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
35
- LLM_MODEL_NAME = "Qwen/Qwen3-14B"
36
  SAMPLE_RATE = 22050 # Standard sample rate for audio processing
37
 
38
  # Check CUDA availability (for informational purposes)
@@ -2063,7 +2063,7 @@ def get_stress_aligned_alternatives(word, position_to_stress):
2063
  # For other cases, just provide general guidance
2064
  return f"a word with stress on syllable {position_to_stress + 1}"
2065
 
2066
- def generate_lyrics(genre, duration, emotion_results, song_structure=None):
2067
  """
2068
  Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
2069
 
@@ -2075,6 +2075,7 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
2075
  duration: Duration of the audio in seconds
2076
  emotion_results: Dictionary containing emotional analysis results
2077
  song_structure: Optional dictionary containing song structure analysis
 
2078
 
2079
  Returns:
2080
  Generated lyrics aligned with the rhythm patterns of the music
@@ -2493,6 +2494,30 @@ even if there are no rhythm issues. Include the following in your analysis:
2493
  2. Where stressed syllables align with strong beats
2494
  3. Any potential misalignments or improvements
2495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2496
  Your lyrics:
2497
  """
2498
  elif use_sections:
@@ -2526,7 +2551,18 @@ The lyrics should:
2526
  - Follow the structure patterns provided above
2527
  - Be completely original
2528
  - Match the song duration of {duration:.1f} seconds
 
 
 
 
 
 
 
 
 
 
2529
 
 
2530
  IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2531
 
2532
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
@@ -2569,7 +2605,18 @@ The lyrics should:
2569
  - Be completely original
2570
  - Maintain a consistent theme throughout
2571
  - Match the audio segment duration of {duration:.1f} seconds
 
 
 
 
 
 
 
 
 
 
2572
 
 
2573
  Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
2574
  Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2575
 
@@ -2936,7 +2983,7 @@ Improved lyrics with fixed rhythm:
2936
  "prompt_template": "No prompt template available"
2937
  }
2938
 
2939
- def process_audio(audio_file):
2940
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
2941
  if audio_file is None:
2942
  return "Please upload an audio file.", None, None
@@ -3221,7 +3268,9 @@ def process_audio(audio_file):
3221
 
3222
  try:
3223
  print("Calling generate_lyrics function...")
3224
- lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, sanitized_song_structure)
 
 
3225
  print(f"Type of lyrics_result: {type(lyrics_result)}")
3226
 
3227
  # Handle both old and new return formats with robust type checking
@@ -3774,7 +3823,7 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3774
  print(f"Error generating complete beat timeline: {str(e)}")
3775
  return f"Error generating complete beat timeline: {str(e)}"
3776
 
3777
- def display_results(audio_file):
3778
  """Process audio file and return formatted results for display in the UI."""
3779
  # Default error response
3780
  error_response = ("Please upload an audio file.",
@@ -3787,8 +3836,8 @@ def display_results(audio_file):
3787
  return error_response
3788
 
3789
  try:
3790
- # Process audio and get results
3791
- results = process_audio(audio_file)
3792
 
3793
  # Check if we got an error message
3794
  if isinstance(results, str) and "Error" in results:
@@ -3862,6 +3911,14 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
3862
  with gr.Row():
3863
  with gr.Column(scale=1):
3864
  audio_input = gr.Audio(label="Upload Music", type="filepath")
 
 
 
 
 
 
 
 
3865
  submit_btn = gr.Button("Analyze & Generate", variant="primary")
3866
 
3867
  # Add genre info box
@@ -3897,10 +3954,10 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
3897
  with gr.TabItem("Beat & Syllable Timeline"):
3898
  beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
3899
 
3900
- # Connect the button to the display function with updated outputs
3901
  submit_btn.click(
3902
  fn=display_results,
3903
- inputs=[audio_input],
3904
  outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
3905
  )
3906
 
@@ -3929,11 +3986,12 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
3929
  - Genre-specific rhythmic qualities
3930
  - Half-beat and quarter-beat subdivisions
3931
 
3932
- 7. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that:
3933
  - Match the emotional quality of the music
3934
  - Follow the precise syllable templates for each second
3935
  - Align stressed syllables with strong beats
3936
  - Maintain genre-appropriate style and themes
 
3937
 
3938
  8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
3939
  - Syllable count accuracy
 
32
  # Constants
33
  GENRE_MODEL_NAME = "dima806/music_genres_classification"
34
  MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
35
+ LLM_MODEL_NAME = "Qwen/Qwen3-32B"
36
  SAMPLE_RATE = 22050 # Standard sample rate for audio processing
37
 
38
  # Check CUDA availability (for informational purposes)
 
2063
  # For other cases, just provide general guidance
2064
  return f"a word with stress on syllable {position_to_stress + 1}"
2065
 
2066
+ def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyrics_requirements=None):
2067
  """
2068
  Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
2069
 
 
2075
  duration: Duration of the audio in seconds
2076
  emotion_results: Dictionary containing emotional analysis results
2077
  song_structure: Optional dictionary containing song structure analysis
2078
+ lyrics_requirements: Optional user-provided requirements for the lyrics
2079
 
2080
  Returns:
2081
  Generated lyrics aligned with the rhythm patterns of the music
 
2494
  2. Where stressed syllables align with strong beats
2495
  3. Any potential misalignments or improvements
2496
 
2497
+ Your lyrics:
2498
+ """
2499
+
2500
+ # Add user requirements if provided
2501
+ if lyrics_requirements and lyrics_requirements.strip():
2502
+ content += f"""
2503
+ USER REQUIREMENTS:
2504
+ {lyrics_requirements.strip()}
2505
+
2506
+ The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
2507
+ """
2508
+
2509
+ content += """
2510
+ Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2511
+
2512
+ IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2513
+
2514
+ IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
2515
+ where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
2516
+ even if there are no rhythm issues. Include the following in your analysis:
2517
+ 1. Syllable counts for each line and how they match the rhythm pattern
2518
+ 2. Where stressed syllables align with strong beats
2519
+ 3. Any potential misalignments or improvements
2520
+
2521
  Your lyrics:
2522
  """
2523
  elif use_sections:
 
2551
  - Follow the structure patterns provided above
2552
  - Be completely original
2553
  - Match the song duration of {duration:.1f} seconds
2554
+ """
2555
+
2556
+ # Add user requirements if provided
2557
+ if lyrics_requirements and lyrics_requirements.strip():
2558
+ content += f"""
2559
+ USER REQUIREMENTS:
2560
+ {lyrics_requirements.strip()}
2561
+
2562
+ The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
2563
+ """
2564
 
2565
+ content += """
2566
  IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2567
 
2568
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
 
2605
  - Be completely original
2606
  - Maintain a consistent theme throughout
2607
  - Match the audio segment duration of {duration:.1f} seconds
2608
+ """
2609
+
2610
+ # Add user requirements if provided
2611
+ if lyrics_requirements and lyrics_requirements.strip():
2612
+ content += f"""
2613
+ USER REQUIREMENTS:
2614
+ {lyrics_requirements.strip()}
2615
+
2616
+ The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
2617
+ """
2618
 
2619
+ content += """
2620
  Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
2621
  Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2622
 
 
2983
  "prompt_template": "No prompt template available"
2984
  }
2985
 
2986
+ def process_audio(audio_file, lyrics_requirements=None):
2987
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
2988
  if audio_file is None:
2989
  return "Please upload an audio file.", None, None
 
3268
 
3269
  try:
3270
  print("Calling generate_lyrics function...")
3271
+ # Pass lyrics_requirements to generate_lyrics function
3272
+ lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results,
3273
+ sanitized_song_structure, lyrics_requirements)
3274
  print(f"Type of lyrics_result: {type(lyrics_result)}")
3275
 
3276
  # Handle both old and new return formats with robust type checking
 
3823
  print(f"Error generating complete beat timeline: {str(e)}")
3824
  return f"Error generating complete beat timeline: {str(e)}"
3825
 
3826
+ def display_results(audio_file, lyrics_requirements=None):
3827
  """Process audio file and return formatted results for display in the UI."""
3828
  # Default error response
3829
  error_response = ("Please upload an audio file.",
 
3836
  return error_response
3837
 
3838
  try:
3839
+ # Process audio and get results - pass user requirements
3840
+ results = process_audio(audio_file, lyrics_requirements)
3841
 
3842
  # Check if we got an error message
3843
  if isinstance(results, str) and "Error" in results:
 
3911
  with gr.Row():
3912
  with gr.Column(scale=1):
3913
  audio_input = gr.Audio(label="Upload Music", type="filepath")
3914
+
3915
+ # Add the new lyrics requirements input
3916
+ lyrics_requirements_input = gr.Textbox(
3917
+ label="Lyrics Requirements (optional)",
3918
+ placeholder="Enter specific themes, topics, words, or styles you want in the lyrics",
3919
+ lines=3
3920
+ )
3921
+
3922
  submit_btn = gr.Button("Analyze & Generate", variant="primary")
3923
 
3924
  # Add genre info box
 
3954
  with gr.TabItem("Beat & Syllable Timeline"):
3955
  beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
3956
 
3957
+ # Connect the button to the display function with updated inputs
3958
  submit_btn.click(
3959
  fn=display_results,
3960
+ inputs=[audio_input, lyrics_requirements_input],
3961
  outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
3962
  )
3963
 
 
3986
  - Genre-specific rhythmic qualities
3987
  - Half-beat and quarter-beat subdivisions
3988
 
3989
+ 7. **Lyrics Generation**: Using the detected genre, emotion, rhythm patterns, and your custom requirements, a large language model generates lyrics that:
3990
  - Match the emotional quality of the music
3991
  - Follow the precise syllable templates for each second
3992
  - Align stressed syllables with strong beats
3993
  - Maintain genre-appropriate style and themes
3994
+ - Incorporate your specific requirements and preferences
3995
 
3996
  8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
3997
  - Syllable count accuracy
requirements.txt CHANGED
@@ -13,3 +13,4 @@ scipy>=1.12.0
13
  soundfile>=0.12.1
14
  matplotlib>=3.7.0
15
  pronouncing>=0.2.0
 
 
13
  soundfile>=0.12.1
14
  matplotlib>=3.7.0
15
  pronouncing>=0.2.0
16
+ pyannote.audio>=2.1.1