root
committed on
Commit
·
651b0cd
1
Parent(s):
bddf9c4
ss
Browse files
- app.py +285 -30
- appp.py +68 -10
- requirements.txt +1 -0
app.py
CHANGED
@@ -24,6 +24,17 @@ from utils import (
|
|
24 |
)
|
25 |
from emotionanalysis import MusicAnalyzer
|
26 |
import librosa
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# Login to Hugging Face Hub if token is provided
|
29 |
if "HF_TOKEN" in os.environ:
|
@@ -1180,12 +1191,12 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
|
|
1180 |
# Sigmoid-like function with more scientific parameters
|
1181 |
# Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions
|
1182 |
if tempo < 40: # Very slow tempos
|
1183 |
-
return
|
1184 |
elif tempo > 200: # Very fast tempos
|
1185 |
-
return 0.
|
1186 |
else:
|
1187 |
# Scientific logistic function for middle range (40-200 BPM)
|
1188 |
-
L =
|
1189 |
k = 0.04 # Steepness of curve
|
1190 |
x0 = 120 # Midpoint (inflection point at normal tempo)
|
1191 |
return L / (1 + np.exp(k * (tempo - x0)))
|
@@ -1235,6 +1246,32 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
|
|
1235 |
# ----------------------------------------------------------------------
|
1236 |
detailed_template = []
|
1237 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1238 |
for i, (stress_type, strength) in enumerate(stress_pattern):
|
1239 |
# Get base syllable count from tempo with more nuanced mapping
|
1240 |
base_syllables = tempo_to_syllable_base(tempo)
|
@@ -1281,6 +1318,60 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
|
|
1281 |
strength_pct = round(strength * 100) / 100
|
1282 |
detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")
|
1283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1284 |
# Join beat templates for this phrase
|
1285 |
phrase_template = "-".join(detailed_template)
|
1286 |
syllable_templates.append(phrase_template)
|
@@ -1572,6 +1663,16 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
|
|
1572 |
# Split lyrics into lines
|
1573 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
1574 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1575 |
# Initialize tracking variables
|
1576 |
verification_notes = []
|
1577 |
detailed_analysis = []
|
@@ -1747,11 +1848,14 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
|
|
1747 |
|
1748 |
for beat in best_phrase_beats:
|
1749 |
if beat.get("type") == "S":
|
|
|
1750 |
strong_positions.append(current_pos)
|
1751 |
current_pos += beat.get("count", 1)
|
1752 |
|
1753 |
# Check if strong syllables align with strong beats
|
1754 |
alignment_issues = []
|
|
|
|
|
1755 |
|
1756 |
for pos in strong_positions:
|
1757 |
# Find which word contains this position
|
@@ -1768,18 +1872,31 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
|
|
1768 |
# Get stress pattern for this word
|
1769 |
stress = word_info["stress_pattern"]
|
1770 |
|
1771 |
-
# If we have stress information
|
1772 |
-
if stress and syllable_in_word < len(stress)
|
1773 |
-
|
1774 |
-
|
1775 |
-
|
1776 |
-
|
1777 |
-
|
1778 |
-
"
|
1779 |
-
"
|
1780 |
-
|
|
|
|
|
|
|
|
|
|
|
1781 |
break
|
1782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1783 |
if alignment_issues:
|
1784 |
verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}")
|
1785 |
|
@@ -2452,7 +2569,7 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyric
|
|
2452 |
|
2453 |
# Create enhanced prompt with better rhythm alignment instructions
|
2454 |
if use_second_level:
|
2455 |
-
# Second-level approach with per-second alignment
|
2456 |
content = f"""
|
2457 |
You are a talented songwriter who specializes in {genre} music.
|
2458 |
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
|
@@ -2634,7 +2751,7 @@ Your lyrics:
|
|
2634 |
|
2635 |
# Format as a chat message for the LLM
|
2636 |
messages = [
|
2637 |
-
{"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns
|
2638 |
{"role": "user", "content": content}
|
2639 |
]
|
2640 |
|
@@ -2684,6 +2801,23 @@ Your lyrics:
|
|
2684 |
lyrics = lyrics.split("</thinking>")[1].strip()
|
2685 |
|
2686 |
# Check for alternative thinking indicators with improved detection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2687 |
thinking_markers = [
|
2688 |
"<think>", "</think>",
|
2689 |
"[thinking]", "[/thinking]",
|
@@ -2769,6 +2903,24 @@ Your lyrics:
|
|
2769 |
if not isinstance(second_level_verification, list):
|
2770 |
second_level_verification = None
|
2771 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2772 |
# Verify syllable counts with enhanced verification - pass second-level templates if available
|
2773 |
if templates_for_verification:
|
2774 |
# Convert any NumPy values to native types before verification - directly handle conversions
|
@@ -2983,17 +3135,58 @@ Improved lyrics with fixed rhythm:
|
|
2983 |
"prompt_template": "No prompt template available"
|
2984 |
}
|
2985 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2986 |
def process_audio(audio_file, lyrics_requirements=None):
|
2987 |
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
|
2988 |
if audio_file is None:
|
2989 |
return "Please upload an audio file.", None, None
|
2990 |
|
2991 |
try:
|
2992 |
-
print("Step 1/
|
2993 |
# Extract audio features
|
2994 |
audio_data = extract_audio_features(audio_file)
|
2995 |
|
2996 |
-
print("Step 2/
|
2997 |
# First check if it's music
|
2998 |
try:
|
2999 |
is_music, ast_results = detect_music(audio_data)
|
@@ -3004,7 +3197,11 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3004 |
if not is_music:
|
3005 |
return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
|
3006 |
|
3007 |
-
print("Step 3/
|
|
|
|
|
|
|
|
|
3008 |
# Classify genre
|
3009 |
try:
|
3010 |
top_genres = classify_genre(audio_data)
|
@@ -3029,7 +3226,7 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3029 |
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
|
3030 |
}
|
3031 |
|
3032 |
-
print("Step
|
3033 |
# Analyze music emotions and themes
|
3034 |
try:
|
3035 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
@@ -3046,12 +3243,15 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3046 |
beats_info = detect_beats(y, sr)
|
3047 |
sections_info = detect_sections(y, sr)
|
3048 |
|
3049 |
-
# Create structured segments
|
3050 |
segments = []
|
3051 |
|
3052 |
-
#
|
3053 |
-
|
3054 |
-
|
|
|
|
|
|
|
3055 |
min_segment_duration = 1.5 # Minimum 1.5 seconds per segment
|
3056 |
|
3057 |
for section in sections_info:
|
@@ -3063,7 +3263,8 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3063 |
if section_duration < min_segment_duration * 1.5:
|
3064 |
segments.append({
|
3065 |
"start": section_start,
|
3066 |
-
"end": section_end
|
|
|
3067 |
})
|
3068 |
else:
|
3069 |
# Calculate ideal number of segments for this section
|
@@ -3078,7 +3279,8 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3078 |
segment_end = segment_start + segment_duration
|
3079 |
segments.append({
|
3080 |
"start": segment_start,
|
3081 |
-
"end": segment_end
|
|
|
3082 |
})
|
3083 |
# If no good sections found, create segments based on beats
|
3084 |
elif beats_info and len(beats_info["beat_times"]) > 4:
|
@@ -3136,6 +3338,15 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3136 |
|
3137 |
# Add syllable counts to each section
|
3138 |
for section in sections_info:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3139 |
# Create syllable templates for sections
|
3140 |
section_beats_info = {
|
3141 |
"beat_times": [beat for beat in beats_info["beat_times"]
|
@@ -3150,19 +3361,21 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3150 |
]
|
3151 |
|
3152 |
# Get a syllable count based on section duration and tempo
|
3153 |
-
|
|
|
3154 |
|
3155 |
section_info = {
|
3156 |
"type": section["type"],
|
3157 |
"start": section["start"],
|
3158 |
"end": section["end"],
|
3159 |
"duration": section["duration"],
|
|
|
3160 |
"syllable_count": syllable_count,
|
3161 |
"beat_count": len(section_beats_info["beat_times"])
|
3162 |
}
|
3163 |
|
3164 |
-
# Try to create a more detailed syllable template
|
3165 |
-
if len(section_beats_info["beat_times"]) >= 2:
|
3166 |
# Ensure top_genres is a list with at least one element
|
3167 |
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
|
3168 |
genre_name = top_genres[0][0]
|
@@ -3213,7 +3426,7 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3213 |
print(f"Error analyzing song structure: {str(e)}")
|
3214 |
# Continue without song structure
|
3215 |
|
3216 |
-
print("Step
|
3217 |
# Generate lyrics based on top genre, emotion analysis, and song structure
|
3218 |
try:
|
3219 |
# Ensure top_genres is a list with at least one element before accessing
|
@@ -3306,7 +3519,8 @@ def process_audio(audio_file, lyrics_requirements=None):
|
|
3306 |
"rhythm_analysis": rhythm_analysis,
|
3307 |
"syllable_analysis": syllable_analysis,
|
3308 |
"prompt_template": prompt_template,
|
3309 |
-
"ast_results": ast_results
|
|
|
3310 |
}
|
3311 |
|
3312 |
return results
|
@@ -3328,6 +3542,13 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
|
|
3328 |
# Get beat information
|
3329 |
beats_info = detect_beats(y, sr)
|
3330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3331 |
# Helper function to convert numpy values to floats - FIXED
|
3332 |
def ensure_float(value):
|
3333 |
if isinstance(value, np.ndarray) or isinstance(value, np.number):
|
@@ -3347,6 +3568,14 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
|
|
3347 |
timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
|
3348 |
timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
|
3349 |
timeline += f"Total Beats: {beats_info['beat_count']}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3350 |
|
3351 |
# Add musicological context based on tempo classification
|
3352 |
if tempo < 60:
|
@@ -3374,6 +3603,13 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
|
|
3374 |
time = ensure_float(time)
|
3375 |
strength = ensure_float(strength)
|
3376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3377 |
# More scientific determination of beat type based on both strength and metrical position
|
3378 |
metrical_position = i % beats_info['time_signature']
|
3379 |
|
@@ -3395,6 +3631,10 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
|
|
3395 |
else:
|
3396 |
beat_type = "WEAK"
|
3397 |
syllable_value = 1.0
|
|
|
|
|
|
|
|
|
3398 |
|
3399 |
# Determine pattern letter based on beat type for consistency
|
3400 |
if beat_type == "STRONG":
|
@@ -3851,9 +4091,16 @@ def display_results(audio_file, lyrics_requirements=None):
|
|
3851 |
genre_results = results.get("genre_results", "Genre classification failed")
|
3852 |
lyrics = results.get("lyrics", "Lyrics generation failed")
|
3853 |
ast_results = results.get("ast_results", [])
|
|
|
3854 |
else:
|
3855 |
# Old tuple format
|
3856 |
genre_results, lyrics, ast_results = results
|
|
|
|
|
|
|
|
|
|
|
|
|
3857 |
|
3858 |
# Get clean lyrics (without analysis notes)
|
3859 |
clean_lyrics = lyrics
|
@@ -3885,6 +4132,14 @@ def display_results(audio_file, lyrics_requirements=None):
|
|
3885 |
emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
|
3886 |
emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
|
3887 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3888 |
except Exception as e:
|
3889 |
print(f"Error in emotion analysis: {str(e)}")
|
3890 |
|
|
|
24 |
)
|
25 |
from emotionanalysis import MusicAnalyzer
|
26 |
import librosa
|
27 |
+
from pyannote.audio import Pipeline
|
28 |
+
import tempfile
|
29 |
+
import os
|
30 |
+
import soundfile as sf
|
31 |
+
import warnings
|
32 |
+
import json
|
33 |
+
import math
|
34 |
+
from collections import defaultdict
|
35 |
+
import matplotlib.pyplot as plt
|
36 |
+
from gradio_client import Client
|
37 |
+
from transformers import pipeline as hf_pipeline
|
38 |
|
39 |
# Login to Hugging Face Hub if token is provided
|
40 |
if "HF_TOKEN" in os.environ:
|
|
|
1191 |
# Sigmoid-like function with more scientific parameters
|
1192 |
# Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions
|
1193 |
if tempo < 40: # Very slow tempos
|
1194 |
+
return 1.8 # Further reduced maximum syllables for extremely slow tempos
|
1195 |
elif tempo > 200: # Very fast tempos
|
1196 |
+
return 0.7 # Minimum syllables for extremely fast tempos
|
1197 |
else:
|
1198 |
# Scientific logistic function for middle range (40-200 BPM)
|
1199 |
+
L = 2.0 # Significantly reduced upper limit to prevent excessive syllables
|
1200 |
k = 0.04 # Steepness of curve
|
1201 |
x0 = 120 # Midpoint (inflection point at normal tempo)
|
1202 |
return L / (1 + np.exp(k * (tempo - x0)))
|
|
|
1246 |
# ----------------------------------------------------------------------
|
1247 |
detailed_template = []
|
1248 |
|
1249 |
+
# Calculate phrase duration if beat times are available
|
1250 |
+
phrase_duration = 0
|
1251 |
+
if phrase and len(phrase) > 1 and len(beat_times) > 0:
|
1252 |
+
# Get first and last beat indices from the phrase
|
1253 |
+
first_idx = phrase[0]
|
1254 |
+
last_idx = phrase[-1]
|
1255 |
+
|
1256 |
+
# Check if indices are within bounds
|
1257 |
+
if first_idx < len(beat_times) and last_idx < len(beat_times):
|
1258 |
+
phrase_duration = beat_times[last_idx] - beat_times[first_idx]
|
1259 |
+
|
1260 |
+
# Calculate a maximum reasonable syllable count based on duration
|
1261 |
+
# Aim for 3-4 syllables per second maximum for singability (reduced from 5-6)
|
1262 |
+
max_reasonable_syllables = 100 # Default high value
|
1263 |
+
if phrase_duration > 0:
|
1264 |
+
# Use a more conservative syllable rate based on tempo
|
1265 |
+
if tempo < 80: # Slow tempo
|
1266 |
+
syllable_rate = 3.0 # Maximum 3 syllables per second for slow tempos
|
1267 |
+
elif tempo < 120: # Medium tempo
|
1268 |
+
syllable_rate = 3.5 # Maximum 3.5 syllables per second for medium tempos
|
1269 |
+
else: # Fast tempo
|
1270 |
+
syllable_rate = 4.0 # Maximum 4 syllables per second for fast tempos
|
1271 |
+
|
1272 |
+
# Calculate max syllables and ensure it's at least 2 for any phrase
|
1273 |
+
max_reasonable_syllables = max(2, int(phrase_duration * syllable_rate))
|
1274 |
+
|
1275 |
for i, (stress_type, strength) in enumerate(stress_pattern):
|
1276 |
# Get base syllable count from tempo with more nuanced mapping
|
1277 |
base_syllables = tempo_to_syllable_base(tempo)
|
|
|
1318 |
strength_pct = round(strength * 100) / 100
|
1319 |
detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")
|
1320 |
|
1321 |
+
# Calculate total expected syllables for this phrase
|
1322 |
+
total_expected_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template])
|
1323 |
+
|
1324 |
+
# If total syllables exceed our reasonable limit, scale them down
|
1325 |
+
if total_expected_syllables > max_reasonable_syllables and max_reasonable_syllables > 0:
|
1326 |
+
scale_factor = max_reasonable_syllables / total_expected_syllables
|
1327 |
+
adjusted_template = []
|
1328 |
+
|
1329 |
+
# Stronger scaling for very short phrases (less than 0.8 seconds)
|
1330 |
+
if phrase_duration < 0.8 and phrase_duration > 0:
|
1331 |
+
# Further reduce for extremely short phrases
|
1332 |
+
scale_factor *= 0.8
|
1333 |
+
|
1334 |
+
for beat in detailed_template:
|
1335 |
+
if ':' in beat:
|
1336 |
+
beat_type_part = beat.split(':')[0]
|
1337 |
+
syllable_count = float(beat.split(':')[1])
|
1338 |
+
# Scale down and round to nearest 0.25
|
1339 |
+
new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4)
|
1340 |
+
|
1341 |
+
# Extra check for very short phrases - cap at 1.0 for S beats and 0.5 for others
|
1342 |
+
if phrase_duration < 0.6 and phrase_duration > 0:
|
1343 |
+
if beat_type_part.startswith("S"):
|
1344 |
+
new_count = min(new_count, 1.0)
|
1345 |
+
else:
|
1346 |
+
new_count = min(new_count, 0.5)
|
1347 |
+
|
1348 |
+
adjusted_template.append(f"{beat_type_part}:{new_count}")
|
1349 |
+
else:
|
1350 |
+
adjusted_template.append(beat)
|
1351 |
+
|
1352 |
+
detailed_template = adjusted_template
|
1353 |
+
|
1354 |
+
# Extra check to avoid having too many total syllables in a phrase
|
1355 |
+
if len(detailed_template) > 0:
|
1356 |
+
total_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template if ':' in beat])
|
1357 |
+
if phrase_duration > 0 and (total_syllables / phrase_duration) > 5.0:
|
1358 |
+
# If we have more than 5 syllables per second, apply additional scaling
|
1359 |
+
target_syllables = phrase_duration * 4.0 # Target 4 syllables per second max
|
1360 |
+
scale_factor = target_syllables / total_syllables
|
1361 |
+
adjusted_template = []
|
1362 |
+
|
1363 |
+
for beat in detailed_template:
|
1364 |
+
if ':' in beat:
|
1365 |
+
beat_type_part = beat.split(':')[0]
|
1366 |
+
syllable_count = float(beat.split(':')[1])
|
1367 |
+
# Scale down and round to nearest 0.25
|
1368 |
+
new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4)
|
1369 |
+
adjusted_template.append(f"{beat_type_part}:{new_count}")
|
1370 |
+
else:
|
1371 |
+
adjusted_template.append(beat)
|
1372 |
+
|
1373 |
+
detailed_template = adjusted_template
|
1374 |
+
|
1375 |
# Join beat templates for this phrase
|
1376 |
phrase_template = "-".join(detailed_template)
|
1377 |
syllable_templates.append(phrase_template)
|
|
|
1663 |
# Split lyrics into lines
|
1664 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
1665 |
|
1666 |
+
# Remove any lines that are clearly not lyrics, like explanations or meta-content
|
1667 |
+
filtered_lines = []
|
1668 |
+
for line in lines:
|
1669 |
+
# Skip explanatory content or meta-text
|
1670 |
+
if line.startswith('**') or line.startswith('[Note:') or 'alignment:' in line.lower():
|
1671 |
+
continue
|
1672 |
+
filtered_lines.append(line)
|
1673 |
+
|
1674 |
+
lines = filtered_lines
|
1675 |
+
|
1676 |
# Initialize tracking variables
|
1677 |
verification_notes = []
|
1678 |
detailed_analysis = []
|
|
|
1848 |
|
1849 |
for beat in best_phrase_beats:
|
1850 |
if beat.get("type") == "S":
|
1851 |
+
# If the count is greater than 1, only the first syllable should be stressed
|
1852 |
strong_positions.append(current_pos)
|
1853 |
current_pos += beat.get("count", 1)
|
1854 |
|
1855 |
# Check if strong syllables align with strong beats
|
1856 |
alignment_issues = []
|
1857 |
+
aligned_stress_count = 0
|
1858 |
+
total_stress_positions = len(strong_positions)
|
1859 |
|
1860 |
for pos in strong_positions:
|
1861 |
# Find which word contains this position
|
|
|
1872 |
# Get stress pattern for this word
|
1873 |
stress = word_info["stress_pattern"]
|
1874 |
|
1875 |
+
# If we have stress information, check if the syllable is stressed
|
1876 |
+
if stress and syllable_in_word < len(stress):
|
1877 |
+
if stress[syllable_in_word] == '1':
|
1878 |
+
# Syllable is stressed and properly aligned
|
1879 |
+
aligned_stress_count += 1
|
1880 |
+
else:
|
1881 |
+
# Syllable is not stressed but should be
|
1882 |
+
misaligned_word = word_info["word"]
|
1883 |
+
alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)")
|
1884 |
+
stress_misalignments.append({
|
1885 |
+
"line": i+1,
|
1886 |
+
"word": word_info["word"],
|
1887 |
+
"position": pos,
|
1888 |
+
"suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word)
|
1889 |
+
})
|
1890 |
break
|
1891 |
|
1892 |
+
# Calculate alignment percentage
|
1893 |
+
alignment_percentage = 0
|
1894 |
+
if total_stress_positions > 0:
|
1895 |
+
alignment_percentage = (aligned_stress_count / total_stress_positions) * 100
|
1896 |
+
|
1897 |
+
# Add alignment percentage to notes
|
1898 |
+
verification_notes.append(f" → Stress alignment: {alignment_percentage:.1f}% ({aligned_stress_count}/{total_stress_positions})")
|
1899 |
+
|
1900 |
if alignment_issues:
|
1901 |
verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}")
|
1902 |
|
|
|
2569 |
|
2570 |
# Create enhanced prompt with better rhythm alignment instructions
|
2571 |
if use_second_level:
|
2572 |
+
# Second-level approach with per-second alignment - enhanced for better syllable distribution
|
2573 |
content = f"""
|
2574 |
You are a talented songwriter who specializes in {genre} music.
|
2575 |
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
|
|
|
2751 |
|
2752 |
# Format as a chat message for the LLM
|
2753 |
messages = [
|
2754 |
+
{"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns EXACTLY. Be extremely concise - use only the EXACT number of syllables specified for each line. For short phrases (1 second or less), use just 2-3 MAXIMUM syllables. Include lyrics for EVERY musical section - do not leave any section empty. Use one-syllable words whenever possible for better singability. Avoid complex vocabulary. For all beat patterns, use fewer syllables than you think you need. Start with the lyrics immediately without any explanation or thinking."},
|
2755 |
{"role": "user", "content": content}
|
2756 |
]
|
2757 |
|
|
|
2801 |
lyrics = lyrics.split("</thinking>")[1].strip()
|
2802 |
|
2803 |
# Check for alternative thinking indicators with improved detection
|
2804 |
+
|
2805 |
+
# Clean up lyrics: Remove meta-content and explanations
|
2806 |
+
if lyrics:
|
2807 |
+
# Remove any line that starts with **
|
2808 |
+
cleaned_lines = []
|
2809 |
+
for line in lyrics.split('\n'):
|
2810 |
+
if not line.strip().startswith('**') and not 'alignment:' in line.lower():
|
2811 |
+
cleaned_lines.append(line)
|
2812 |
+
lyrics = '\n'.join(cleaned_lines)
|
2813 |
+
|
2814 |
+
# Check for excessively long lines (likely explanations)
|
2815 |
+
max_reasonable_line_length = 80
|
2816 |
+
final_lines = []
|
2817 |
+
for line in lyrics.split('\n'):
|
2818 |
+
if len(line) <= max_reasonable_line_length or '[' in line or ']' in line:
|
2819 |
+
final_lines.append(line)
|
2820 |
+
lyrics = '\n'.join(final_lines)
|
2821 |
thinking_markers = [
|
2822 |
"<think>", "</think>",
|
2823 |
"[thinking]", "[/thinking]",
|
|
|
2903 |
if not isinstance(second_level_verification, list):
|
2904 |
second_level_verification = None
|
2905 |
|
2906 |
+
# Ensure all second-level templates have lyrics
|
2907 |
+
if song_structure and "second_level" in song_structure and song_structure["second_level"]:
|
2908 |
+
if "templates" in song_structure["second_level"] and isinstance(song_structure["second_level"]["templates"], list):
|
2909 |
+
# Count how many seconds have lyrics
|
2910 |
+
if lyrics:
|
2911 |
+
lines = [line.strip() for line in lyrics.split('\n') if line.strip()]
|
2912 |
+
|
2913 |
+
# If we have fewer lines than seconds, try to distribute them better
|
2914 |
+
second_count = len(song_structure["second_level"]["templates"])
|
2915 |
+
if 0 < len(lines) < second_count:
|
2916 |
+
# Simple distribution - repeat existing lines to fill all seconds
|
2917 |
+
distributed_lines = []
|
2918 |
+
for i in range(second_count):
|
2919 |
+
distributed_lines.append(lines[i % len(lines)])
|
2920 |
+
|
2921 |
+
# Replace the lyrics with the distributed version
|
2922 |
+
lyrics = '\n'.join(distributed_lines)
|
2923 |
+
|
2924 |
# Verify syllable counts with enhanced verification - pass second-level templates if available
|
2925 |
if templates_for_verification:
|
2926 |
# Convert any NumPy values to native types before verification - directly handle conversions
|
|
|
3135 |
"prompt_template": "No prompt template available"
|
3136 |
}
|
3137 |
|
3138 |
+
def detect_voice_activity(audio_file):
|
3139 |
+
"""
|
3140 |
+
Detect segments with voice/singing in audio using pyannote/voice-activity-detection
|
3141 |
+
|
3142 |
+
Args:
|
3143 |
+
audio_file: Path to audio file
|
3144 |
+
|
3145 |
+
Returns:
|
3146 |
+
List of dictionaries with start and end times of voice segments
|
3147 |
+
"""
|
3148 |
+
try:
|
3149 |
+
print("Detecting voice activity in audio...")
|
3150 |
+
# Get HF_TOKEN from environment or set your token here
|
3151 |
+
hf_token = os.environ.get("HF_TOKEN", None)
|
3152 |
+
|
3153 |
+
# Initialize the voice activity detection pipeline
|
3154 |
+
vad_pipeline = Pipeline.from_pretrained(
|
3155 |
+
"pyannote/voice-activity-detection",
|
3156 |
+
use_auth_token=hf_token
|
3157 |
+
)
|
3158 |
+
|
3159 |
+
# Process the audio file
|
3160 |
+
output = vad_pipeline(audio_file)
|
3161 |
+
|
3162 |
+
# Extract voice segments
|
3163 |
+
voice_segments = []
|
3164 |
+
for speech in output.get_timeline().support():
|
3165 |
+
voice_segments.append({
|
3166 |
+
"start": speech.start,
|
3167 |
+
"end": speech.end,
|
3168 |
+
"duration": speech.end - speech.start
|
3169 |
+
})
|
3170 |
+
|
3171 |
+
print(f"Detected {len(voice_segments)} voice segments")
|
3172 |
+
return voice_segments
|
3173 |
+
|
3174 |
+
except Exception as e:
|
3175 |
+
print(f"Error detecting voice activity: {str(e)}")
|
3176 |
+
# Return empty list if detection fails
|
3177 |
+
return []
|
3178 |
+
|
3179 |
def process_audio(audio_file, lyrics_requirements=None):
|
3180 |
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
|
3181 |
if audio_file is None:
|
3182 |
return "Please upload an audio file.", None, None
|
3183 |
|
3184 |
try:
|
3185 |
+
print("Step 1/6: Extracting audio features...")
|
3186 |
# Extract audio features
|
3187 |
audio_data = extract_audio_features(audio_file)
|
3188 |
|
3189 |
+
print("Step 2/6: Verifying audio contains music...")
|
3190 |
# First check if it's music
|
3191 |
try:
|
3192 |
is_music, ast_results = detect_music(audio_data)
|
|
|
3197 |
if not is_music:
|
3198 |
return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
|
3199 |
|
3200 |
+
print("Step 3/6: Detecting voice activity segments...")
|
3201 |
+
# Detect voice activity segments
|
3202 |
+
voice_segments = detect_voice_activity(audio_file)
|
3203 |
+
|
3204 |
+
print("Step 4/6: Classifying music genre...")
|
3205 |
# Classify genre
|
3206 |
try:
|
3207 |
top_genres = classify_genre(audio_data)
|
|
|
3226 |
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
|
3227 |
}
|
3228 |
|
3229 |
+
print("Step 5/6: Analyzing music emotions, themes, and structure...")
|
3230 |
# Analyze music emotions and themes
|
3231 |
try:
|
3232 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
|
|
3243 |
beats_info = detect_beats(y, sr)
|
3244 |
sections_info = detect_sections(y, sr)
|
3245 |
|
3246 |
+
# Create structured segments based on voice activity detection
|
3247 |
segments = []
|
3248 |
|
3249 |
+
# If we have voice segments, use them as our primary segments
|
3250 |
+
if voice_segments and len(voice_segments) > 0:
|
3251 |
+
segments = voice_segments
|
3252 |
+
print(f"Using {len(segments)} voice segments for lyrics generation")
|
3253 |
+
# If no voice segments detected or detection failed, fall back to previous methods
|
3254 |
+
elif sections_info and len(sections_info) > 1:
|
3255 |
min_segment_duration = 1.5 # Minimum 1.5 seconds per segment
|
3256 |
|
3257 |
for section in sections_info:
|
|
|
3263 |
if section_duration < min_segment_duration * 1.5:
|
3264 |
segments.append({
|
3265 |
"start": section_start,
|
3266 |
+
"end": section_end,
|
3267 |
+
"duration": section_duration
|
3268 |
})
|
3269 |
else:
|
3270 |
# Calculate ideal number of segments for this section
|
|
|
3279 |
segment_end = segment_start + segment_duration
|
3280 |
segments.append({
|
3281 |
"start": segment_start,
|
3282 |
+
"end": segment_end,
|
3283 |
+
"duration": segment_duration
|
3284 |
})
|
3285 |
# If no good sections found, create segments based on beats
|
3286 |
elif beats_info and len(beats_info["beat_times"]) > 4:
|
|
|
3338 |
|
3339 |
# Add syllable counts to each section
|
3340 |
for section in sections_info:
|
3341 |
+
# Check if this section overlaps with any voice segments
|
3342 |
+
section_has_voice = False
|
3343 |
+
for voice_segment in voice_segments:
|
3344 |
+
# Check for overlap between section and voice segment
|
3345 |
+
if (section["start"] <= voice_segment["end"] and
|
3346 |
+
section["end"] >= voice_segment["start"]):
|
3347 |
+
section_has_voice = True
|
3348 |
+
break
|
3349 |
+
|
3350 |
# Create syllable templates for sections
|
3351 |
section_beats_info = {
|
3352 |
"beat_times": [beat for beat in beats_info["beat_times"]
|
|
|
3361 |
]
|
3362 |
|
3363 |
# Get a syllable count based on section duration and tempo
|
3364 |
+
# If section has voice, use normal count, otherwise set to 0
|
3365 |
+
syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) if section_has_voice else 0
|
3366 |
|
3367 |
section_info = {
|
3368 |
"type": section["type"],
|
3369 |
"start": section["start"],
|
3370 |
"end": section["end"],
|
3371 |
"duration": section["duration"],
|
3372 |
+
"has_voice": section_has_voice,
|
3373 |
"syllable_count": syllable_count,
|
3374 |
"beat_count": len(section_beats_info["beat_times"])
|
3375 |
}
|
3376 |
|
3377 |
+
# Try to create a more detailed syllable template, but only for sections with voice
|
3378 |
+
if len(section_beats_info["beat_times"]) >= 2 and section_has_voice:
|
3379 |
# Ensure top_genres is a list with at least one element
|
3380 |
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
|
3381 |
genre_name = top_genres[0][0]
|
|
|
3426 |
print(f"Error analyzing song structure: {str(e)}")
|
3427 |
# Continue without song structure
|
3428 |
|
3429 |
+
print("Step 6/6: Generating rhythmically aligned lyrics...")
|
3430 |
# Generate lyrics based on top genre, emotion analysis, and song structure
|
3431 |
try:
|
3432 |
# Ensure top_genres is a list with at least one element before accessing
|
|
|
3519 |
"rhythm_analysis": rhythm_analysis,
|
3520 |
"syllable_analysis": syllable_analysis,
|
3521 |
"prompt_template": prompt_template,
|
3522 |
+
"ast_results": ast_results,
|
3523 |
+
"voice_segments": voice_segments
|
3524 |
}
|
3525 |
|
3526 |
return results
|
|
|
3542 |
# Get beat information
|
3543 |
beats_info = detect_beats(y, sr)
|
3544 |
|
3545 |
+
# Get voice activity segments
|
3546 |
+
try:
|
3547 |
+
voice_segments = detect_voice_activity(audio_file)
|
3548 |
+
except Exception as e:
|
3549 |
+
print(f"Error detecting voice segments: {str(e)}")
|
3550 |
+
voice_segments = []
|
3551 |
+
|
3552 |
# Helper function to convert numpy values to floats - FIXED
|
3553 |
def ensure_float(value):
|
3554 |
if isinstance(value, np.ndarray) or isinstance(value, np.number):
|
|
|
3568 |
timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
|
3569 |
timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
|
3570 |
timeline += f"Total Beats: {beats_info['beat_count']}\n"
|
3571 |
+
|
3572 |
+
# Add voice activity segments information
|
3573 |
+
if voice_segments:
|
3574 |
+
timeline += f"\nVoice Activity Segments: {len(voice_segments)}\n"
|
3575 |
+
for i, segment in enumerate(voice_segments[:5]): # Show first 5 segments
|
3576 |
+
timeline += f" Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n"
|
3577 |
+
if len(voice_segments) > 5:
|
3578 |
+
timeline += f" ... and {len(voice_segments) - 5} more segments\n"
|
3579 |
|
3580 |
# Add musicological context based on tempo classification
|
3581 |
if tempo < 60:
|
|
|
3603 |
time = ensure_float(time)
|
3604 |
strength = ensure_float(strength)
|
3605 |
|
3606 |
+
# Check if this beat is during voice activity
|
3607 |
+
in_voice_segment = False
|
3608 |
+
for segment in voice_segments:
|
3609 |
+
if segment['start'] <= time <= segment['end']:
|
3610 |
+
in_voice_segment = True
|
3611 |
+
break
|
3612 |
+
|
3613 |
# More scientific determination of beat type based on both strength and metrical position
|
3614 |
metrical_position = i % beats_info['time_signature']
|
3615 |
|
|
|
3631 |
else:
|
3632 |
beat_type = "WEAK"
|
3633 |
syllable_value = 1.0
|
3634 |
+
|
3635 |
+
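# Tag the beat type when the beat falls in a voice segment. NOTE(review): the " (VOICE)" suffix means the exact-match check below (beat_type == "STRONG") no longer matches voiced beats - confirm this is intended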
|
3636 |
+
if in_voice_segment:
|
3637 |
+
beat_type = f"{beat_type} (VOICE)"
|
3638 |
|
3639 |
# Determine pattern letter based on beat type for consistency
|
3640 |
if beat_type == "STRONG":
|
|
|
4091 |
genre_results = results.get("genre_results", "Genre classification failed")
|
4092 |
lyrics = results.get("lyrics", "Lyrics generation failed")
|
4093 |
ast_results = results.get("ast_results", [])
|
4094 |
+
voice_segments = results.get("voice_segments", [])
|
4095 |
else:
|
4096 |
# Old tuple format
|
4097 |
genre_results, lyrics, ast_results = results
|
4098 |
+
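# The old tuple format carries no voice segments, so detect them here (best effort: falls back to an empty list on error)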
|
4099 |
+
try:
|
4100 |
+
voice_segments = detect_voice_activity(audio_file)
|
4101 |
+
except Exception as e:
|
4102 |
+
print(f"Error detecting voice segments: {str(e)}")
|
4103 |
+
voice_segments = []
|
4104 |
|
4105 |
# Get clean lyrics (without analysis notes)
|
4106 |
clean_lyrics = lyrics
|
|
|
4132 |
emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
|
4133 |
emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
|
4134 |
|
4135 |
+
# Add voice activity segments if available
|
4136 |
+
if voice_segments:
|
4137 |
+
emotion_text += f"\n\nVoice Activity Segments ({len(voice_segments)}):\n"
|
4138 |
+
for i, segment in enumerate(voice_segments[:10]): # Show up to 10 segments
|
4139 |
+
emotion_text += f"- Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n"
|
4140 |
+
if len(voice_segments) > 10:
|
4141 |
+
emotion_text += f"... and {len(voice_segments) - 10} more segments\n"
|
4142 |
+
|
4143 |
except Exception as e:
|
4144 |
print(f"Error in emotion analysis: {str(e)}")
|
4145 |
|
appp.py
CHANGED
@@ -32,7 +32,7 @@ if "HF_TOKEN" in os.environ:
|
|
32 |
# Constants
|
33 |
GENRE_MODEL_NAME = "dima806/music_genres_classification"
|
34 |
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
|
35 |
-
LLM_MODEL_NAME = "Qwen/Qwen3-
|
36 |
SAMPLE_RATE = 22050 # Standard sample rate for audio processing
|
37 |
|
38 |
# Check CUDA availability (for informational purposes)
|
@@ -2063,7 +2063,7 @@ def get_stress_aligned_alternatives(word, position_to_stress):
|
|
2063 |
# For other cases, just provide general guidance
|
2064 |
return f"a word with stress on syllable {position_to_stress + 1}"
|
2065 |
|
2066 |
-
def generate_lyrics(genre, duration, emotion_results, song_structure=None):
|
2067 |
"""
|
2068 |
Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
|
2069 |
|
@@ -2075,6 +2075,7 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
|
|
2075 |
duration: Duration of the audio in seconds
|
2076 |
emotion_results: Dictionary containing emotional analysis results
|
2077 |
song_structure: Optional dictionary containing song structure analysis
|
|
|
2078 |
|
2079 |
Returns:
|
2080 |
Generated lyrics aligned with the rhythm patterns of the music
|
@@ -2493,6 +2494,30 @@ even if there are no rhythm issues. Include the following in your analysis:
|
|
2493 |
2. Where stressed syllables align with strong beats
|
2494 |
3. Any potential misalignments or improvements
|
2495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2496 |
Your lyrics:
|
2497 |
"""
|
2498 |
elif use_sections:
|
@@ -2526,7 +2551,18 @@ The lyrics should:
|
|
2526 |
- Follow the structure patterns provided above
|
2527 |
- Be completely original
|
2528 |
- Match the song duration of {duration:.1f} seconds
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2529 |
|
|
|
2530 |
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
|
2531 |
|
2532 |
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
@@ -2569,7 +2605,18 @@ The lyrics should:
|
|
2569 |
- Be completely original
|
2570 |
- Maintain a consistent theme throughout
|
2571 |
- Match the audio segment duration of {duration:.1f} seconds
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2572 |
|
|
|
2573 |
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
|
2574 |
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
|
2575 |
|
@@ -2936,7 +2983,7 @@ Improved lyrics with fixed rhythm:
|
|
2936 |
"prompt_template": "No prompt template available"
|
2937 |
}
|
2938 |
|
2939 |
-
def process_audio(audio_file):
|
2940 |
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
|
2941 |
if audio_file is None:
|
2942 |
return "Please upload an audio file.", None, None
|
@@ -3221,7 +3268,9 @@ def process_audio(audio_file):
|
|
3221 |
|
3222 |
try:
|
3223 |
print("Calling generate_lyrics function...")
|
3224 |
-
|
|
|
|
|
3225 |
print(f"Type of lyrics_result: {type(lyrics_result)}")
|
3226 |
|
3227 |
# Handle both old and new return formats with robust type checking
|
@@ -3774,7 +3823,7 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
|
|
3774 |
print(f"Error generating complete beat timeline: {str(e)}")
|
3775 |
return f"Error generating complete beat timeline: {str(e)}"
|
3776 |
|
3777 |
-
def display_results(audio_file):
|
3778 |
"""Process audio file and return formatted results for display in the UI."""
|
3779 |
# Default error response
|
3780 |
error_response = ("Please upload an audio file.",
|
@@ -3787,8 +3836,8 @@ def display_results(audio_file):
|
|
3787 |
return error_response
|
3788 |
|
3789 |
try:
|
3790 |
-
# Process audio and get results
|
3791 |
-
results = process_audio(audio_file)
|
3792 |
|
3793 |
# Check if we got an error message
|
3794 |
if isinstance(results, str) and "Error" in results:
|
@@ -3862,6 +3911,14 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
3862 |
with gr.Row():
|
3863 |
with gr.Column(scale=1):
|
3864 |
audio_input = gr.Audio(label="Upload Music", type="filepath")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3865 |
submit_btn = gr.Button("Analyze & Generate", variant="primary")
|
3866 |
|
3867 |
# Add genre info box
|
@@ -3897,10 +3954,10 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
3897 |
with gr.TabItem("Beat & Syllable Timeline"):
|
3898 |
beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
|
3899 |
|
3900 |
-
# Connect the button to the display function with updated
|
3901 |
submit_btn.click(
|
3902 |
fn=display_results,
|
3903 |
-
inputs=[audio_input],
|
3904 |
outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
|
3905 |
)
|
3906 |
|
@@ -3929,11 +3986,12 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
3929 |
- Genre-specific rhythmic qualities
|
3930 |
- Half-beat and quarter-beat subdivisions
|
3931 |
|
3932 |
-
7. **Lyrics Generation**: Using the detected genre, emotion,
|
3933 |
- Match the emotional quality of the music
|
3934 |
- Follow the precise syllable templates for each second
|
3935 |
- Align stressed syllables with strong beats
|
3936 |
- Maintain genre-appropriate style and themes
|
|
|
3937 |
|
3938 |
8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
|
3939 |
- Syllable count accuracy
|
|
|
32 |
# Constants
|
33 |
GENRE_MODEL_NAME = "dima806/music_genres_classification"
|
34 |
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
|
35 |
+
LLM_MODEL_NAME = "Qwen/Qwen3-32B"
|
36 |
SAMPLE_RATE = 22050 # Standard sample rate for audio processing
|
37 |
|
38 |
# Check CUDA availability (for informational purposes)
|
|
|
2063 |
# For other cases, just provide general guidance
|
2064 |
return f"a word with stress on syllable {position_to_stress + 1}"
|
2065 |
|
2066 |
+
def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyrics_requirements=None):
|
2067 |
"""
|
2068 |
Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
|
2069 |
|
|
|
2075 |
duration: Duration of the audio in seconds
|
2076 |
emotion_results: Dictionary containing emotional analysis results
|
2077 |
song_structure: Optional dictionary containing song structure analysis
|
2078 |
+
lyrics_requirements: Optional user-provided requirements for the lyrics
|
2079 |
|
2080 |
Returns:
|
2081 |
Generated lyrics aligned with the rhythm patterns of the music
|
|
|
2494 |
2. Where stressed syllables align with strong beats
|
2495 |
3. Any potential misalignments or improvements
|
2496 |
|
2497 |
+
Your lyrics:
|
2498 |
+
"""
|
2499 |
+
|
2500 |
+
# Add user requirements if provided
|
2501 |
+
if lyrics_requirements and lyrics_requirements.strip():
|
2502 |
+
content += f"""
|
2503 |
+
USER REQUIREMENTS:
|
2504 |
+
{lyrics_requirements.strip()}
|
2505 |
+
|
2506 |
+
The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
|
2507 |
+
"""
|
2508 |
+
|
2509 |
+
content += """
|
2510 |
+
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
|
2511 |
+
|
2512 |
+
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
|
2513 |
+
|
2514 |
+
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
2515 |
+
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
|
2516 |
+
even if there are no rhythm issues. Include the following in your analysis:
|
2517 |
+
1. Syllable counts for each line and how they match the rhythm pattern
|
2518 |
+
2. Where stressed syllables align with strong beats
|
2519 |
+
3. Any potential misalignments or improvements
|
2520 |
+
|
2521 |
Your lyrics:
|
2522 |
"""
|
2523 |
elif use_sections:
|
|
|
2551 |
- Follow the structure patterns provided above
|
2552 |
- Be completely original
|
2553 |
- Match the song duration of {duration:.1f} seconds
|
2554 |
+
"""
|
2555 |
+
|
2556 |
+
# Add user requirements if provided
|
2557 |
+
if lyrics_requirements and lyrics_requirements.strip():
|
2558 |
+
content += f"""
|
2559 |
+
USER REQUIREMENTS:
|
2560 |
+
{lyrics_requirements.strip()}
|
2561 |
+
|
2562 |
+
The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
|
2563 |
+
"""
|
2564 |
|
2565 |
+
content += """
|
2566 |
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
|
2567 |
|
2568 |
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
|
|
|
2605 |
- Be completely original
|
2606 |
- Maintain a consistent theme throughout
|
2607 |
- Match the audio segment duration of {duration:.1f} seconds
|
2608 |
+
"""
|
2609 |
+
|
2610 |
+
# Add user requirements if provided
|
2611 |
+
if lyrics_requirements and lyrics_requirements.strip():
|
2612 |
+
content += f"""
|
2613 |
+
USER REQUIREMENTS:
|
2614 |
+
{lyrics_requirements.strip()}
|
2615 |
+
|
2616 |
+
The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
|
2617 |
+
"""
|
2618 |
|
2619 |
+
content += """
|
2620 |
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
|
2621 |
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
|
2622 |
|
|
|
2983 |
"prompt_template": "No prompt template available"
|
2984 |
}
|
2985 |
|
2986 |
+
def process_audio(audio_file, lyrics_requirements=None):
|
2987 |
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
|
2988 |
if audio_file is None:
|
2989 |
return "Please upload an audio file.", None, None
|
|
|
3268 |
|
3269 |
try:
|
3270 |
print("Calling generate_lyrics function...")
|
3271 |
+
# Pass lyrics_requirements to generate_lyrics function
|
3272 |
+
lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results,
|
3273 |
+
sanitized_song_structure, lyrics_requirements)
|
3274 |
print(f"Type of lyrics_result: {type(lyrics_result)}")
|
3275 |
|
3276 |
# Handle both old and new return formats with robust type checking
|
|
|
3823 |
print(f"Error generating complete beat timeline: {str(e)}")
|
3824 |
return f"Error generating complete beat timeline: {str(e)}"
|
3825 |
|
3826 |
+
def display_results(audio_file, lyrics_requirements=None):
|
3827 |
"""Process audio file and return formatted results for display in the UI."""
|
3828 |
# Default error response
|
3829 |
error_response = ("Please upload an audio file.",
|
|
|
3836 |
return error_response
|
3837 |
|
3838 |
try:
|
3839 |
+
# Process audio and get results - pass user requirements
|
3840 |
+
results = process_audio(audio_file, lyrics_requirements)
|
3841 |
|
3842 |
# Check if we got an error message
|
3843 |
if isinstance(results, str) and "Error" in results:
|
|
|
3911 |
with gr.Row():
|
3912 |
with gr.Column(scale=1):
|
3913 |
audio_input = gr.Audio(label="Upload Music", type="filepath")
|
3914 |
+
|
3915 |
+
# Add the new lyrics requirements input
|
3916 |
+
lyrics_requirements_input = gr.Textbox(
|
3917 |
+
label="Lyrics Requirements (optional)",
|
3918 |
+
placeholder="Enter specific themes, topics, words, or styles you want in the lyrics",
|
3919 |
+
lines=3
|
3920 |
+
)
|
3921 |
+
|
3922 |
submit_btn = gr.Button("Analyze & Generate", variant="primary")
|
3923 |
|
3924 |
# Add genre info box
|
|
|
3954 |
with gr.TabItem("Beat & Syllable Timeline"):
|
3955 |
beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
|
3956 |
|
3957 |
+
# Connect the button to the display function with updated inputs
|
3958 |
submit_btn.click(
|
3959 |
fn=display_results,
|
3960 |
+
inputs=[audio_input, lyrics_requirements_input],
|
3961 |
outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
|
3962 |
)
|
3963 |
|
|
|
3986 |
- Genre-specific rhythmic qualities
|
3987 |
- Half-beat and quarter-beat subdivisions
|
3988 |
|
3989 |
+
7. **Lyrics Generation**: Using the detected genre, emotion, rhythm patterns, and your custom requirements, a large language model generates lyrics that:
|
3990 |
- Match the emotional quality of the music
|
3991 |
- Follow the precise syllable templates for each second
|
3992 |
- Align stressed syllables with strong beats
|
3993 |
- Maintain genre-appropriate style and themes
|
3994 |
+
- Incorporate your specific requirements and preferences
|
3995 |
|
3996 |
8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
|
3997 |
- Syllable count accuracy
|
requirements.txt
CHANGED
@@ -13,3 +13,4 @@ scipy>=1.12.0
|
|
13 |
soundfile>=0.12.1
|
14 |
matplotlib>=3.7.0
|
15 |
pronouncing>=0.2.0
|
|
|
|
13 |
soundfile>=0.12.1
|
14 |
matplotlib>=3.7.0
|
15 |
pronouncing>=0.2.0
|
16 |
+
pyannote.audio>=2.1.1
|