Commit 8515dc5 · root committed · Parent: e3108aa

ss

Files changed:
- 5.16appbeforesyllables.py +336 -0
- app.py +357 -48
- beat_analysis.py +392 -0
- emotionanalysis.py +15 -24
- requirements.txt +1 -0
5.16appbeforesyllables.py
ADDED
@@ -0,0 +1,336 @@
import os
import io
import gradio as gr
import torch
import numpy as np
import re
import pronouncing  # Add this to requirements.txt for syllable counting
import functools  # Add this for lru_cache functionality
from transformers import (
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    AutoTokenizer,
    pipeline,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from huggingface_hub import login
from utils import (
    load_audio,
    extract_audio_duration,
    extract_mfcc_features,
    format_genre_results,
    ensure_cuda_availability
)
from emotionanalysis import MusicAnalyzer
import librosa

# Login to Hugging Face Hub if token is provided
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Constants
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/Qwen3-32B"
SAMPLE_RATE = 22050  # Standard sample rate for audio processing

# Check CUDA availability (for informational purposes)
CUDA_AVAILABLE = ensure_cuda_availability()

# Load models at initialization time
print("Loading genre classification model...")
try:
    genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
    genre_model = AutoModelForAudioClassification.from_pretrained(
        GENRE_MODEL_NAME,
        device_map="auto" if CUDA_AVAILABLE else None
    )
    # Create a convenience wrapper function with the same interface as before
    def get_genre_model():
        return genre_model, genre_feature_extractor
except Exception as e:
    print(f"Error loading genre model: {str(e)}")
    genre_model = None
    genre_feature_extractor = None

# Load LLM and tokenizer at initialization time
print("Loading Qwen LLM model with 4-bit quantization...")
try:
    # Configure 4-bit quantization for better performance
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    llm_model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
        use_cache=True
    )
except Exception as e:
    print(f"Error loading LLM model: {str(e)}")
    llm_tokenizer = None
    llm_model = None

# Create music analyzer instance
music_analyzer = MusicAnalyzer()

# Process uploaded audio file
def process_audio(audio_file):
    if audio_file is None:
        return "No audio file provided", None, None, None, None, None, None

    try:
        # Load and analyze audio
        y, sr = load_audio(audio_file, sr=SAMPLE_RATE)

        # Basic audio information
        duration = extract_audio_duration(y, sr)

        # Analyze music with MusicAnalyzer
        music_analysis = music_analyzer.analyze_music(audio_file)

        # Extract key information
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]

        # Use genre classification directly instead of pipeline
        if genre_model is not None and genre_feature_extractor is not None:
            # Resample audio to 16000 Hz for the genre model
            y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)

            # Extract features
            inputs = genre_feature_extractor(
                y_16k,
                sampling_rate=16000,
                return_tensors="pt"
            ).to(genre_model.device)

            # Classify genre
            with torch.no_grad():
                outputs = genre_model(**inputs)
                logits = outputs.logits
                probs = torch.nn.functional.softmax(logits, dim=-1)

            # Get top genres
            values, indices = torch.topk(probs[0], k=5)
            top_genres = [(genre_model.config.id2label[idx.item()], val.item()) for val, idx in zip(values, indices)]
        else:
            # Fallback if model loading failed
            top_genres = [("Unknown", 1.0)]

        # Format genre results for display
        genre_results_text = format_genre_results(top_genres)
        primary_genre = top_genres[0][0]

        # Generate lyrics using LLM
        lyrics = generate_lyrics(music_analysis, primary_genre, duration)

        # Prepare analysis summary
        analysis_summary = f"""
### Music Analysis Results

**Duration:** {duration:.2f} seconds
**Tempo:** {tempo:.1f} BPM
**Time Signature:** {time_signature}
**Key:** {music_analysis["tonal_analysis"]["key"]} {music_analysis["tonal_analysis"]["mode"]}
**Primary Emotion:** {emotion}
**Primary Theme:** {theme}
**Top Genre:** {primary_genre}

{genre_results_text}
"""

        return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, None, None, None, None, None, None

def generate_lyrics(music_analysis, genre, duration):
    try:
        # Extract meaningful information for context
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        key = music_analysis["tonal_analysis"]["key"]
        mode = music_analysis["tonal_analysis"]["mode"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]

        # Verify LLM is loaded
        if llm_model is None or llm_tokenizer is None:
            return "Error: LLM model not properly loaded"

        # Construct prompt for the LLM with stronger instruction to avoid thinking
        prompt = f"""Write lyrics for a {genre} song with these specifications:
- Key: {key} {mode}
- Tempo: {tempo} BPM
- Emotion: {emotion}
- Theme: {theme}
- Duration: {duration:.1f} seconds
- Time signature: {music_analysis["rhythm_analysis"]["estimated_time_signature"]}

CRITICAL INSTRUCTIONS:
- The lyrics should be in English
- Write ONLY the raw lyrics with no structural labels
- DO NOT include any thinking, reasoning, or explanations
- DO NOT include <think> tags or thinking processes
- DO NOT include [verse], [chorus], [bridge], or any other section markers
- DO NOT number the verses or lines
- DO NOT use bullet points
- Format as simple line-by-line lyrics only
- Make sure the lyrics match the specified duration and tempo
- Keep lyrics concise enough to fit the duration when sung at the given tempo
"""

        # Generate lyrics using the LLM model directly
        messages = [
            {"role": "user", "content": prompt}
        ]

        # Apply chat template
        text = llm_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize and move to model device
        model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)

        # Generate with optimized parameters
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.6,  # Lower temperature for more focused responses
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=llm_tokenizer.eos_token_id
        )

        # Decode the output
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
        lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()

        # ENHANCED post-processing to remove ALL thinking elements

        # Remove thinking tags and their content
        lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL)
        lyrics = re.sub(r'\[thinking\].*?\[/thinking\]', '', lyrics, flags=re.DOTALL)

        # Remove any lines with section labels
        lyrics = re.sub(r'^\[.*?\].*$', '', lyrics, flags=re.MULTILINE)

        # Remove common prefixes and thinking text (expanded list)
        thinking_prefixes = [
            r'^(Here are|Here is|These are|This is|Let me|I will|I\'ll).*?:\s*',
            r'^Okay, let\'s.*$',
            r'^First, I need to.*$',
            r'^Let me brainstorm.*$',
            r'^I\'ll write.*$',
            r'^Let\'s create.*$',
            r'^For a.*song.*$',
            r'^Now I\'ll.*$',
            r'^Taking into account.*$',
            r'^Given the specifications.*$',
            r'^Based on the.*$',
            r'^Considering the.*$'
        ]

        for pattern in thinking_prefixes:
            lyrics = re.sub(pattern, '', lyrics, flags=re.MULTILINE|re.IGNORECASE)

        lyrics = re.sub(r'^Title:.*?$', '', lyrics, flags=re.MULTILINE).strip()

        # Remove all section markers in any format
        lyrics = re.sub(r'^\s*(Verse|Chorus|Bridge|Pre.?Chorus|Intro|Outro|Refrain|Hook|Breakdown)(\s*\d*|\s*[A-Z])?:?\s*$', '', lyrics, flags=re.MULTILINE|re.IGNORECASE)
        lyrics = re.sub(r'\[(Verse|Chorus|Bridge|Pre.?Chorus|Intro|Outro|Refrain|Hook|Breakdown)(\s*\d*|\s*[A-Z])?\]', '', lyrics, flags=re.IGNORECASE)

        # Remove lines with obvious thinking content
        lyrics = re.sub(r'^.*?(think|brainstorm|consider|syllable|count|rhyme|scheme|tempo|calculate|bpm).*$', '', lyrics, flags=re.MULTILINE|re.IGNORECASE)

        # Remove any empty lines at beginning, collapse multiple blank lines, and trim
        lyrics = re.sub(r'^\s*\n', '', lyrics)
        lyrics = re.sub(r'\n\s*\n\s*\n+', '\n\n', lyrics)
        lyrics = lyrics.strip()

        # One final check - if lyrics still starts with obvious thinking, try to find the actual lyrics
        if re.match(r'.*?(I need to|Let me|Okay|Hmm|I will|I\'ll|First|Let\'s|Now).*', lyrics[:100], re.IGNORECASE):
            # Look for a double line break which often separates thinking from lyrics
            parts = lyrics.split('\n\n')
            if len(parts) > 1:
                # Take everything after the first paragraph as the actual lyrics
                lyrics = '\n\n'.join(parts[1:])

        return lyrics

    except Exception as e:
        error_msg = f"Error generating lyrics: {str(e)}"
        print(error_msg)
        return error_msg

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Music Analysis & Lyrics Generator") as demo:
        gr.Markdown("# Music Analysis & Lyrics Generator")
        gr.Markdown("Upload a music file or record audio to analyze it and generate matching lyrics")

        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                analyze_btn = gr.Button("Analyze and Generate Lyrics", variant="primary")

            with gr.Column(scale=2):
                with gr.Tab("Analysis"):
                    analysis_output = gr.Textbox(label="Music Analysis Results", lines=10)

                    with gr.Row():
                        tempo_output = gr.Number(label="Tempo (BPM)")
                        time_sig_output = gr.Textbox(label="Time Signature")
                        emotion_output = gr.Textbox(label="Primary Emotion")
                        theme_output = gr.Textbox(label="Primary Theme")
                        genre_output = gr.Textbox(label="Primary Genre")

                with gr.Tab("Generated Lyrics"):
                    lyrics_output = gr.Textbox(label="Generated Lyrics", lines=20)

        # Set up event handlers
        analyze_btn.click(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[analysis_output, lyrics_output, tempo_output, time_sig_output,
                     emotion_output, theme_output, genre_output]
        )

        gr.Markdown("""
        ## How it works
        1. Upload or record a music file
        2. The system analyzes tempo, beats, time signature and other musical features
        3. It detects emotion, theme, and music genre
        4. Using this information, it generates lyrics that match the style and length of your music
        """)

    return demo

# Launch the app
demo = create_interface()

if __name__ == "__main__":
    demo.launch()
else:
    # For Hugging Face Spaces
    app = demo
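Note: this snapshot imports pronouncing for syllable counting (hence the requirements.txt reminder in its comment) but never calls it; the syllable logic ships separately in beat_analysis.py below, built on NLTK's cmudict. For reference, a minimal sketch of CMU-dictionary counting via pronouncing; the out-of-vocabulary fallback is an illustrative assumption, not code from this commit:

import re
import pronouncing

def count_syllables_cmu(word: str) -> int:
    """CMUdict-based count via pronouncing, with a rough fallback for unknown words."""
    phones = pronouncing.phones_for_word(word.lower())
    if phones:
        # Each vowel phone carries a stress digit (0/1/2), one per syllable
        return pronouncing.syllable_count(phones[0])
    # Assumed fallback (not from this commit): count vowel runs
    return max(1, len(re.findall(r"[aeiouy]+", word.lower())))

print(count_syllables_cmu("memories"))  # -> 3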
app.py
CHANGED
@@ -4,8 +4,8 @@ import gradio as gr
 import torch
 import numpy as np
 import re
-import pronouncing  # Add this to requirements.txt for syllable counting
-import functools  # Add this for lru_cache functionality
+import pronouncing
+import functools
 from transformers import (
     AutoModelForAudioClassification,
     AutoFeatureExtractor,
@@ -22,8 +22,12 @@ from utils import (
     format_genre_results,
     ensure_cuda_availability
 )
-from emotionanalysis import MusicAnalyzer
+from emotionanalysis import MusicAnalyzer
 import librosa
+from beat_analysis import BeatAnalyzer  # Import the BeatAnalyzer class
+
+# Initialize beat analyzer
+beat_analyzer = BeatAnalyzer()

 # Login to Hugging Face Hub if token is provided
 if "HF_TOKEN" in os.environ:
@@ -85,7 +89,7 @@ music_analyzer = MusicAnalyzer()
 # Process uploaded audio file
 def process_audio(audio_file):
     if audio_file is None:
-        return "No audio file provided", None, None, None, None, None, None
+        return "No audio file provided", None, None, None, None, None, None, None

     try:
         # Load and analyze audio
@@ -97,9 +101,24 @@ def process_audio(audio_file):
         # Analyze music with MusicAnalyzer
         music_analysis = music_analyzer.analyze_music(audio_file)

+        # Extract time signature from MusicAnalyzer result
+        time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
+
+        # Ensure time signature is one of the supported ones (4/4, 3/4, 2/4, 6/8)
+        if time_signature not in ["4/4", "3/4", "2/4", "6/8"]:
+            time_signature = "4/4"  # Default to 4/4 if unsupported
+            music_analysis["rhythm_analysis"]["estimated_time_signature"] = time_signature
+
+        # Analyze beat patterns and create lyrics template using MusicAnalyzer's time signature
+        beat_analysis = beat_analyzer.analyze_beat_pattern(audio_file, time_signature=time_signature)
+        lyric_templates = beat_analyzer.create_lyric_template(beat_analysis)
+
+        # Store these in the music_analysis dict for use in lyrics generation
+        music_analysis["beat_analysis"] = beat_analysis
+        music_analysis["lyric_templates"] = lyric_templates
+
         # Extract key information
         tempo = music_analysis["rhythm_analysis"]["tempo"]
-        time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
         emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]

@@ -135,6 +154,9 @@ def process_audio(audio_file):
         # Generate lyrics using LLM
         lyrics = generate_lyrics(music_analysis, primary_genre, duration)

+        # Create beat/stress/syllable matching analysis
+        beat_match_analysis = analyze_lyrics_rhythm_match(lyrics, lyric_templates, primary_genre)
+
         # Prepare analysis summary
         analysis_summary = f"""
 ### Music Analysis Results
@@ -148,14 +170,26 @@ def process_audio(audio_file):
 **Top Genre:** {primary_genre}

 {genre_results_text}
-"""
+"""
+
+        # Add beat analysis summary
+        if lyric_templates:
+            analysis_summary += f"""
+### Beat Analysis
+
+**Total Phrases:** {len(lyric_templates)}
+**Average Beats Per Phrase:** {np.mean([t['num_beats'] for t in lyric_templates]):.1f}
+**Beat Pattern Examples:**
+- Phrase 1: {lyric_templates[0]['stress_pattern'] if lyric_templates else 'N/A'}
+- Phrase 2: {lyric_templates[1]['stress_pattern'] if len(lyric_templates) > 1 else 'N/A'}
+"""

-        return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre
+        return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre, beat_match_analysis

     except Exception as e:
         error_msg = f"Error processing audio: {str(e)}"
         print(error_msg)
-        return error_msg, None, None, None, None, None, None
+        return error_msg, None, None, None, None, None, None, None

 def generate_lyrics(music_analysis, genre, duration):
     try:
@@ -166,33 +200,48 @@ def generate_lyrics(music_analysis, genre, duration):
         emotion = music_analysis["emotion_analysis"]["primary_emotion"]
         theme = music_analysis["theme_analysis"]["primary_theme"]

+        # Get beat analysis and templates
+        lyric_templates = music_analysis.get("lyric_templates", [])
+
         # Verify LLM is loaded
         if llm_model is None or llm_tokenizer is None:
             return "Error: LLM model not properly loaded"
-
-        # Construct prompt for the LLM
-        prompt = f"""Write lyrics for a {genre} song with these specifications:
-- Key: {key} {mode}
-- Tempo: {tempo} BPM
-- Emotion: {emotion}
-- Theme: {theme}
-- Duration: {duration:.1f} seconds
-- Time signature: {music_analysis["rhythm_analysis"]["estimated_time_signature"]}

-
-
-
-
-
-
-
-
-
-
+        # If no templates, fall back to original method
+        if not lyric_templates:
+            # Simplified prompt
+            prompt = f"""Write song lyrics for a {genre} song in {key} {mode} with tempo {tempo} BPM. The emotion is {emotion} and theme is {theme}.
+
+ONLY WRITE THE ACTUAL LYRICS. NO EXPLANATIONS OR META-TEXT.
+"""
+        else:
+            # Create phrase examples
+            num_phrases = len(lyric_templates)
+
+            # Create a more direct prompt with examples
+            prompt = f"""Write song lyrics for a {genre} song in {key} {mode} with tempo {tempo} BPM. The emotion is {emotion} and theme is {theme}.
+
+I need EXACTLY {num_phrases} lines of lyrics - one line for each musical phrase. Not one more, not one less.
+
+FORMAT:
+- Just write {num_phrases} plain text lines
+- Each line should be simple song lyrics (no annotations, no numbers, no labeling)
+- Don't include any explanations, thinking tags, or meta-commentary
+- Don't use any <think> or [thinking] tags
+- Don't include [Verse], [Chorus] or section markers
+- Don't include line numbers
+
+EXAMPLE OF WHAT I WANT (for a {num_phrases}-line song):
+Lost in the shadows of yesterday
+Dreams fade away like morning dew
+Time slips through fingers like desert sand
+Memories echo in empty rooms
+(... and so on for exactly {num_phrases} lines)
+
+JUST THE PLAIN LYRICS, EXACTLY {num_phrases} LINES.
 """

-        # Generate lyrics using the LLM model
-        # Format as chat message
+        # Generate lyrics using the LLM model
         messages = [
             {"role": "user", "content": prompt}
         ]
@@ -214,7 +263,7 @@ IMPORTANT INSTRUCTIONS:
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            repetition_penalty=1.
+            repetition_penalty=1.2,
             pad_token_id=llm_tokenizer.eos_token_id
         )

@@ -222,33 +271,289 @@ IMPORTANT INSTRUCTIONS:
         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
         lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()

-        #
-        #
-        lyrics = re.sub(r'^\[.*?\].*$', '', lyrics, flags=re.MULTILINE)
+        # ULTRA AGGRESSIVE CLEANING - COMPLETELY REVISED
+        # ------------------------------------------------

-        #
-
-
+        # 1. First, look for any standard dividers that might separate thinking from lyrics
+        divider_patterns = [
+            r'Here are the lyrics:',
+            r'Here is my song:',
+            r'The lyrics:',
+            r'My lyrics:',
+            r'Song lyrics:',
+            r'\*\*\*+',
+            r'===+',
+            r'---+',
+            r'```',
+            r'Lyrics:'
+        ]
+
+        for pattern in divider_patterns:
+            matches = re.finditer(pattern, lyrics, re.IGNORECASE)
+            for match in matches:
+                # Keep only content after the divider
+                lyrics = lyrics[match.end():].strip()

-        # Remove
-        lyrics = re.sub(r'
-        lyrics = re.sub(r'\[
+        # 2. Remove thinking tags completely before splitting into lines
+        lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[thinking\].*?\[/thinking\]', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'<think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'</think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[thinking\]', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[/thinking\]', '', lyrics, flags=re.DOTALL)

-        #
-
+        # 3. Split text into lines for aggressive line-by-line filtering
+        lines = lyrics.strip().split('\n')
+        clean_lines = []

-        #
-
-
-
+        # 4. Define comprehensive patterns for non-lyrical content
+        non_lyric_patterns = [
+            # Meta-commentary
+            r'^(note|thinking|thoughts|let me|i will|i am going|i would|i can|i need to|i have to|i should|let\'s|here|now)',
+            r'^(first|second|third|next|finally|importantly|remember|so|ok|okay|as requested|as asked|considering)',
+            # Explanations
+            r'syllable[s]?|phrase|rhythm|beats?|tempo|bpm|instruction|follow|alignment|match|corresponding',
+            r'verses?|chorus|bridge|section|stanza|part|template|format|pattern|example',
+            r'requirements?|guidelines?|song structure|stressed|unstressed',
+            # Technical language
+            r'generated|output|result|provide|create|write|draft|version',
+            # Annotations and numbering
+            r'^line \d+|^\d+[\.\):]|^\[\w+\]|^[\*\-\+] ',
+            # Questions or analytical statements
+            r'\?$|analysis|evaluate|review|check|ensure',
+            # Instruction-like statements
+            r'make sure|please note|important|notice|pay attention'
+        ]

-
+        # 5. Identify which lines are likely actual lyrics vs non-lyrics
+        for line in lines:
+            line = line.strip()
+
+            # Skip empty lines or lines with just spaces/tabs
+            if not line or line.isspace():
+                continue
+
+            # Skip lines that match any non-lyric pattern
+            should_skip = False
+            for pattern in non_lyric_patterns:
+                if re.search(pattern, line.lower()):
+                    should_skip = True
+                    break
+
+            if should_skip:
+                continue
+
+            # Skip section headers
+            if (line.startswith('[') and ']' in line) or (line.startswith('(') and ')' in line and len(line) < 20):
+                continue
+
+            # Skip lines that look like annotations (not prose-like)
+            if ':' in line and not any(word in line.lower() for word in ['like', 'when', 'where', 'how', 'why', 'what']):
+                if len(line.split(':')[0]) < 15:  # Short prefixes followed by colon are likely annotations
+                    continue
+
+            # Skip very short lines that aren't likely to be lyrics (unless it's just a few words which could be valid)
+            if len(line) < 3:
+                continue
+
+            # Skip lines that are numbered or bulleted
+            if re.match(r'^\d+\.|\(#\d+\)|\d+\)', line):
+                continue
+
+            # Skip markdown-style emphasis or headers
+            if re.match(r'^#{1,6} |^\*\*|^__', line):
+                continue
+
+            # Skip lines with think tags
+            if '<think>' in line.lower() or '</think>' in line.lower() or '[thinking]' in line.lower() or '[/thinking]' in line.lower():
+                continue
+
+            # Add this line as it passed all filters
+            clean_lines.append(line)
+
+        # 6. Additional block-level filters for common patterns
+        # Check beginning of lyrics for common prefixes
+        if clean_lines and any(clean_lines[0].lower().startswith(prefix) for prefix in
+                               ['here are', 'these are', 'below are', 'following are']):
+            clean_lines = clean_lines[1:]  # Skip the first line
+
+        # 7. Process blocks of lines to detect explanation blocks
+        if len(clean_lines) > 3:
+            # Check for explanation blocks at the beginning
+            first_three = ' '.join(clean_lines[:3]).lower()
+            if any(term in first_three for term in ['i will', 'i have created', 'i\'ll provide', 'i\'ll write']):
+                # This looks like an explanation, skip the first few lines
+                start_idx = 0
+                for i, line in enumerate(clean_lines):
+                    if i >= 3 and not any(term in line.lower() for term in ['i will', 'created', 'write', 'provide']):
+                        start_idx = i
+                        break
+                clean_lines = clean_lines[start_idx:]
+
+            # Check for explanation blocks at the end
+            last_three = ' '.join(clean_lines[-3:]).lower()
+            if any(term in last_three for term in ['hope this', 'these lyrics', 'as you can see', 'this song', 'i have']):
+                # This looks like an explanation at the end, truncate
+                end_idx = len(clean_lines)
+                for i in range(len(clean_lines) - 1, max(0, len(clean_lines) - 4), -1):
+                    if i < len(clean_lines) and not any(term in clean_lines[i].lower() for term in
+                                                        ['hope', 'these lyrics', 'as you can see', 'this song']):
+                        end_idx = i + 1
+                        break
+                clean_lines = clean_lines[:end_idx]
+
+        # 8. Cleanup - Remove remaining annotations or thinking
+        for i in range(len(clean_lines)):
+            # Remove trailing thoughts/annotations
+            clean_lines[i] = re.sub(r'\s+//.*$', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\s+\(.*?\)$', '', clean_lines[i])
+
+            # Remove thinking tags completely
+            clean_lines[i] = re.sub(r'<think>.*?</think>', '', clean_lines[i], flags=re.DOTALL)
+            clean_lines[i] = re.sub(r'\[thinking\].*?\[/thinking\]', '', clean_lines[i], flags=re.DOTALL)
+            clean_lines[i] = re.sub(r'<think>', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'</think>', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\[thinking\]', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\[/thinking\]', '', clean_lines[i])
+
+        # 9. Filter out any remaining empty lines after tag removal
+        clean_lines = [line for line in clean_lines if line.strip() and not line.isspace()]
+
+        # 10. If we have lyric templates, ensure we have the correct number of lines
+        if lyric_templates:
+            num_required = len(lyric_templates)
+
+            # If we have too many lines, keep just the best ones
+            if len(clean_lines) > num_required:
+                # Keep the first num_required lines
+                clean_lines = clean_lines[:num_required]
+
+            # If we don't have enough lines, generate placeholders
+            while len(clean_lines) < num_required:
+                placeholder = f"Echoes of {emotion} fill the {genre} night"
+                if len(clean_lines) > 0:
+                    # Try to make the placeholder somewhat related to previous lines
+                    last_words = [word for line in clean_lines[-1:] for word in line.split() if len(word) > 3]
+                    if last_words:
+                        import random
+                        word = random.choice(last_words)
+                        placeholder = f"{word.capitalize()} whispers through the {emotion} silence"
+
+                clean_lines.append(placeholder)
+
+        # Assemble final lyrics
+        final_lyrics = '\n'.join(clean_lines)
+
+        # 11. Final sanity check - if we have nothing or garbage, return an error
+        if not final_lyrics or len(final_lyrics) < 10:
+            return "The model generated only thinking content but no actual lyrics. Please try again."
+
+        return final_lyrics

     except Exception as e:
         error_msg = f"Error generating lyrics: {str(e)}"
         print(error_msg)
         return error_msg

+def analyze_lyrics_rhythm_match(lyrics, lyric_templates, genre="pop"):
+    """Analyze how well the generated lyrics match the beat patterns and syllable requirements"""
+    if not lyric_templates or not lyrics:
+        return "No beat templates or lyrics available for analysis."
+
+    # Split lyrics into lines
+    lines = lyrics.strip().split('\n')
+    lines = [line for line in lines if line.strip()]  # Remove empty lines
+
+    # Prepare analysis result
+    result = "### Beat & Syllable Match Analysis\n\n"
+    result += "| Line | Syllables | Target Range | Match | Stress Pattern |\n"
+    result += "| ---- | --------- | ------------ | ----- | -------------- |\n"
+
+    # Maximum number of lines to analyze (either all lines or all templates)
+    line_count = min(len(lines), len(lyric_templates))
+
+    # Track overall match statistics
+    total_matches = 0
+    total_range_matches = 0
+    total_stress_matches = 0
+    total_stress_percentage = 0
+    total_ideal_matches = 0
+
+    for i in range(line_count):
+        line = lines[i]
+        template = lyric_templates[i]
+
+        # Check match between line and template with genre awareness
+        check_result = beat_analyzer.check_syllable_stress_match(line, template, genre)
+
+        # Get match symbols
+        syllable_match = "✓" if check_result["matches_beat_count"] else ("✓*" if check_result["within_range"] else "✗")
+        stress_match = "✓" if check_result["stress_matches"] else f"{int(check_result['stress_match_percentage']*100)}%"
+
+        # Update stats
+        if check_result["matches_beat_count"]:
+            total_matches += 1
+        if check_result["within_range"]:
+            total_range_matches += 1
+        if check_result["stress_matches"]:
+            total_stress_matches += 1
+        total_stress_percentage += check_result["stress_match_percentage"]
+
+        # Track how close we are to ideal count for this genre
+        if abs(check_result["syllable_count"] - check_result["ideal_syllable_count"]) <= 1:
+            total_ideal_matches += 1
+
+        # Create visual representation of the stress pattern
+        stress_visual = ""
+        for char in template['stress_pattern']:
+            if char == "S":
+                stress_visual += "X"  # Strong
+            elif char == "M":
+                stress_visual += "x"  # Medium
+            else:
+                stress_visual += "."  # Weak
+
+        # Add line to results table
+        result += f"| {i+1} | {check_result['syllable_count']} | {check_result['min_expected']}-{check_result['max_expected']} | {syllable_match} | {stress_visual} |\n"
+
+    # Add summary statistics
+    if line_count > 0:
+        exact_match_rate = (total_matches / line_count) * 100
+        range_match_rate = (total_range_matches / line_count) * 100
+        ideal_match_rate = (total_ideal_matches / line_count) * 100
+        stress_match_rate = (total_stress_matches / line_count) * 100
+        avg_stress_percentage = (total_stress_percentage / line_count) * 100
+
+        result += f"\n**Summary:**\n"
+        result += f"- Exact syllable match rate: {exact_match_rate:.1f}%\n"
+        result += f"- Genre-appropriate syllable range match rate: {range_match_rate:.1f}%\n"
+        result += f"- Ideal genre syllable count match rate: {ideal_match_rate:.1f}%\n"
+        result += f"- Perfect stress pattern match rate: {stress_match_rate:.1f}%\n"
+        result += f"- Average stress pattern accuracy: {avg_stress_percentage:.1f}%\n"
+        result += f"- Overall rhythmic accuracy: {((range_match_rate + avg_stress_percentage) / 2):.1f}%\n"
+
+        # Add genre-specific notes
+        result += f"\n**Genre Notes ({genre}):**\n"
+
+        # Add appropriate genre notes based on genre
+        if genre.lower() == "pop":
+            result += "- Pop music typically allows 1-3 syllables per beat using melisma and syncopation\n"
+            result += "- Strong downbeats often align with stressed syllables of important words\n"
+        elif genre.lower() == "rock":
+            result += "- Rock music often uses 1-2 syllables per beat with some variation\n"
+            result += "- Emphasis on strong beats for impact and rhythmic drive\n"
+        elif genre.lower() in ["hiphop", "rap"]:
+            result += "- Hip-hop/rap often features 2-5 syllables per beat through rapid delivery\n"
+            result += "- Complex rhyme patterns and fast delivery create higher syllable density\n"
+        elif genre.lower() in ["folk", "country"]:
+            result += "- Folk/country music often stays closer to 1:1 syllable-to-beat ratio\n"
+            result += "- Narrative focus leads to clearer enunciation of syllables\n"
+        else:
+            result += "- This genre typically allows for flexible syllable-to-beat relationships\n"
+            result += "- Syllable count can vary based on vocal style and song section\n"
+
+    return result
+
 # Create Gradio interface
 def create_interface():
     with gr.Blocks(title="Music Analysis & Lyrics Generator") as demo:
@@ -277,13 +582,16 @@ def create_interface():

             with gr.Tab("Generated Lyrics"):
                 lyrics_output = gr.Textbox(label="Generated Lyrics", lines=20)
+
+            with gr.Tab("Beat Matching"):
+                beat_match_output = gr.Markdown(label="Beat & Syllable Matching Analysis")

         # Set up event handlers
         analyze_btn.click(
             fn=process_audio,
             inputs=[audio_input],
             outputs=[analysis_output, lyrics_output, tempo_output, time_sig_output,
-                     emotion_output, theme_output, genre_output]
+                     emotion_output, theme_output, genre_output, beat_match_output]
         )

         gr.Markdown("""
@@ -291,7 +599,8 @@ def create_interface():
         1. Upload or record a music file
         2. The system analyzes tempo, beats, time signature and other musical features
         3. It detects emotion, theme, and music genre
-        4. Using this information, it generates lyrics that match the style and length of your music
+        4. Using beat patterns and syllable stress analysis, it generates perfectly aligned lyrics
+        5. Each line of the lyrics is matched to the beat pattern of the corresponding musical phrase
         """)

     return demo
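Taken together, the new wiring in process_audio reduces to a short pipeline: validate the estimated time signature, build one beat template per bar, ask the LLM for exactly one lyric line per template, then score the match. A minimal sketch of the template step in isolation (the audio path is a placeholder; the guide format follows generate_phrase_guide in beat_analysis.py below):

from beat_analysis import BeatAnalyzer

analyzer = BeatAnalyzer()
analysis = analyzer.analyze_beat_pattern("example.wav", time_signature="4/4")  # placeholder path
templates = analyzer.create_lyric_template(analysis)

# One lyric line is requested from the LLM per phrase (one phrase = one bar)
print(len(templates), "phrases")
print(templates[0]["syllable_guide"])  # e.g. "~2 words, ~4-12 syllables | Pattern: STRONG weak medium weak"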
beat_analysis.py
ADDED
@@ -0,0 +1,392 @@
1 |
+
import librosa
|
2 |
+
import numpy as np
|
3 |
+
import pronouncing
|
4 |
+
import re
|
5 |
+
from functools import lru_cache
|
6 |
+
import string
|
7 |
+
from nltk.corpus import cmudict
|
8 |
+
import nltk
|
9 |
+
|
10 |
+
try:
|
11 |
+
nltk.data.find('corpora/cmudict')
|
12 |
+
except LookupError:
|
13 |
+
nltk.download('cmudict')
|
14 |
+
|
15 |
+
class BeatAnalyzer:
|
16 |
+
def __init__(self):
|
17 |
+
# Mapping for standard stress patterns by time signature
|
18 |
+
# Simplified to only include 4/4, 3/4, 2/4, and 6/8
|
19 |
+
self.stress_patterns = {
|
20 |
+
# Format: Strong (1.0), Medium (0.5), Weak (0.0)
|
21 |
+
"4/4": [1.0, 0.0, 0.5, 0.0], # Strong, weak, medium, weak
|
22 |
+
"3/4": [1.0, 0.0, 0.0], # Strong, weak, weak
|
23 |
+
"2/4": [1.0, 0.0], # Strong, weak
|
24 |
+
"6/8": [1.0, 0.0, 0.0, 0.5, 0.0, 0.0] # Strong, weak, weak, medium, weak, weak
|
25 |
+
}
|
26 |
+
|
27 |
+
self.cmudict = None
|
28 |
+
try:
|
29 |
+
self.cmudict = cmudict.dict()
|
30 |
+
except:
|
31 |
+
pass # Fall back to rule-based counting if cmudict is not available
|
32 |
+
|
33 |
+
# Genre-specific syllable-to-beat ratio guidelines
|
34 |
+
self.genre_syllable_ratios = {
|
35 |
+
# Genre: (min_ratio, typical_ratio, max_ratio)
|
36 |
+
'pop': (0.7, 1.5, 3.0), # Pop tends to have more syllables per beat
|
37 |
+
'rock': (0.7, 1.2, 2.5), # Rock can vary widely
|
38 |
+
'hiphop': (1.5, 3.0, 5.0), # Hip hop often has many syllables per beat
|
39 |
+
'rap': (2.0, 4.0, 7.0), # Rap often has very high syllable counts
|
40 |
+
'folk': (0.8, 1.0, 1.5), # Folk often has close to 1:1 ratio
|
41 |
+
'country': (0.7, 1.2, 2.0), # Country tends to be moderate
|
42 |
+
'jazz': (0.5, 1.0, 3.0), # Jazz can be very flexible
|
43 |
+
'reggae': (0.6, 1.0, 1.5), # Reggae often emphasizes specific beats
|
44 |
+
'soul': (0.7, 1.2, 2.0), # Soul music tends to be expressive
|
45 |
+
'r&b': (0.8, 1.5, 2.5), # R&B can have melisma
|
46 |
+
'electronic': (0.5, 1.0, 2.0), # Electronic music varies widely
|
47 |
+
'disco': (1.0, 1.5, 2.5), # Disco tends to have more syllables
|
48 |
+
'classical': (0.5, 1.0, 2.0), # Classical can vary by subgenre
|
49 |
+
'metal': (0.8, 1.5, 3.0), # Metal often has more syllables on strong beats
|
50 |
+
'blues': (0.5, 0.8, 1.5), # Blues often extends syllables
|
51 |
+
'default': (0.7, 1.5, 3.0) # Default for unknown genres
|
52 |
+
}
|
53 |
+
|
54 |
+
@lru_cache(maxsize=128)
|
55 |
+
def count_syllables(self, word):
|
56 |
+
"""Count syllables in a word using CMU dictionary if available, otherwise use rule-based method."""
|
57 |
+
word = word.lower().strip()
|
58 |
+
word = re.sub(r'[^a-z]', '', word) # Remove non-alphabetic characters
|
59 |
+
|
60 |
+
if not word:
|
61 |
+
return 0
|
62 |
+
|
63 |
+
# Try using CMUDict first if available
|
64 |
+
if self.cmudict and word in self.cmudict:
|
65 |
+
return max([len(list(y for y in x if y[-1].isdigit())) for x in self.cmudict[word]])
|
66 |
+
|
67 |
+
# Rule-based syllable counting as fallback
|
68 |
+
# Modified version from NLTK's implementation
|
69 |
+
vowels = "aeiouy"
|
70 |
+
double_vowels = ['aa', 'ae', 'ai', 'ao', 'au', 'ay', 'ea', 'ee', 'ei', 'eo', 'eu', 'ey', 'ia', 'ie', 'ii', 'io', 'iu', 'oa', 'oe', 'oi', 'oo', 'ou', 'oy', 'ua', 'ue', 'ui', 'uo', 'uy']
|
71 |
+
prev_was_vowel = False
|
72 |
+
count = 0
|
73 |
+
final_e = False
|
74 |
+
|
75 |
+
if word.endswith('e') and not word.endswith('le'):
|
76 |
+
final_e = True
|
77 |
+
|
78 |
+
for i, char in enumerate(word):
|
79 |
+
if char in vowels:
|
80 |
+
# Check if current char and previous char form a dipthong
|
81 |
+
if prev_was_vowel and i > 0 and (word[i-1:i+1] in double_vowels):
|
82 |
+
prev_was_vowel = True
|
83 |
+
continue
|
84 |
+
|
85 |
+
if not prev_was_vowel:
|
86 |
+
count += 1
|
87 |
+
prev_was_vowel = True
|
88 |
+
else:
|
89 |
+
prev_was_vowel = False
|
90 |
+
|
91 |
+
# Handle edge cases
|
92 |
+
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
|
93 |
+
count += 1
|
94 |
+
elif final_e:
|
95 |
+
count = max(count-1, 1) # Remove last 'e', but ensure at least 1 syllable
|
96 |
+
elif word.endswith('y') and not prev_was_vowel:
|
97 |
+
count += 1
|
98 |
+
|
99 |
+
# Ensure at least one syllable
|
100 |
+
return max(count, 1)
|
101 |
+
|
102 |
+
def analyze_beat_pattern(self, audio_path, sr=22050, time_signature="4/4"):
|
103 |
+
"""Analyze beat patterns and stresses in music using the provided time signature."""
|
104 |
+
# Load audio
|
105 |
+
y, sr = librosa.load(audio_path, sr=sr)
|
106 |
+
|
107 |
+
# Get tempo and beat frames
|
108 |
+
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
|
109 |
+
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
|
110 |
+
|
111 |
+
# Get beat strengths using onset envelope
|
112 |
+
onset_env = librosa.onset.onset_strength(y=y, sr=sr)
|
113 |
+
beat_strengths = onset_env[beat_frames]
|
114 |
+
|
115 |
+
# Normalize beat strengths
|
116 |
+
if len(beat_strengths) > 0 and np.max(beat_strengths) > np.min(beat_strengths):
|
117 |
+
beat_strengths = (beat_strengths - np.min(beat_strengths)) / (np.max(beat_strengths) - np.min(beat_strengths))
|
118 |
+
|
119 |
+
# Parse time signature
|
120 |
+
if '/' in time_signature:
|
121 |
+
num, denom = map(int, time_signature.split('/'))
|
122 |
+
else:
|
123 |
+
num, denom = 4, 4 # Default to 4/4
|
124 |
+
|
125 |
+
# Group beats into bars (each bar is one phrase based on time signature)
|
126 |
+
bars = []
|
127 |
+
current_bar = []
|
128 |
+
|
129 |
+
for i, (time, strength) in enumerate(zip(beat_times, beat_strengths)):
|
130 |
+
# Determine metrical position and stress
|
131 |
+
metrical_position = i % num
|
132 |
+
|
133 |
+
# Define stress pattern according to time signature
|
134 |
+
if time_signature == "4/4":
|
135 |
+
if metrical_position == 0: # First beat (strongest)
|
136 |
+
stress = "S" # Strong
|
137 |
+
elif metrical_position == 2: # Third beat (medium)
|
138 |
+
stress = "M" # Medium
|
139 |
+
else: # Second and fourth beats (weak)
|
140 |
+
stress = "W" # Weak
|
141 |
+
elif time_signature == "3/4":
|
142 |
+
if metrical_position == 0: # First beat (strongest)
|
143 |
+
stress = "S" # Strong
|
144 |
+
else: # Other beats (weak)
|
145 |
+
stress = "W" # Weak
|
146 |
+
elif time_signature == "6/8":
|
147 |
+
if metrical_position == 0: # First beat (strongest)
|
148 |
+
stress = "S" # Strong
|
149 |
+
elif metrical_position == 3: # Fourth beat (medium)
|
150 |
+
stress = "M" # Medium
|
151 |
+
else: # Other beats (weak)
|
152 |
+
stress = "W" # Weak
|
153 |
+
elif time_signature == "2/4":
|
154 |
+
if metrical_position == 0: # First beat (strongest)
|
155 |
+
stress = "S" # Strong
|
156 |
+
else: # Second beat (weak)
|
157 |
+
stress = "W" # Weak
|
158 |
+
else:
|
159 |
+
# Default pattern for other time signatures
|
160 |
+
if metrical_position == 0:
|
161 |
+
stress = "S"
|
162 |
+
else:
|
163 |
+
stress = "W"
|
164 |
+
|
165 |
+
# Add beat to current bar
|
166 |
+
current_bar.append({
|
167 |
+
'time': time,
|
168 |
+
'strength': strength,
|
169 |
+
'stress': stress,
|
170 |
+
'metrical_position': metrical_position
|
171 |
+
})
|
172 |
+
|
173 |
+
# When we complete a bar, add it to our bars list
|
174 |
+
if metrical_position == num - 1 or i == len(beat_times) - 1:
|
175 |
+
if current_bar:
|
176 |
+
bars.append(current_bar)
|
177 |
+
current_bar = []
|
178 |
+
|
179 |
+
# If there's any remaining beats, add them as a partial bar
|
180 |
+
if current_bar:
|
181 |
+
bars.append(current_bar)
|
182 |
+
|
183 |
+
# Organize beats into phrases (one phrase = one bar)
|
184 |
+
phrases = []
|
185 |
+
|
186 |
+
for i, bar in enumerate(bars):
|
187 |
+
phrase_beats = bar
|
188 |
+
|
189 |
+
if not phrase_beats:
|
190 |
+
continue
|
191 |
+
|
192 |
+
# Calculate the phrase information
|
193 |
+
phrase = {
|
194 |
+
'id': i,
|
195 |
+
'num_beats': len(phrase_beats),
|
196 |
+
'beats': phrase_beats,
|
197 |
+
'stress_pattern': ''.join(beat['stress'] for beat in phrase_beats),
|
198 |
+
'start_time': phrase_beats[0]['time'],
|
199 |
+
'end_time': phrase_beats[-1]['time'] + (phrase_beats[-1]['time'] - phrase_beats[-2]['time'] if len(phrase_beats) > 1 else 0.5),
|
200 |
+
}
|
201 |
+
|
202 |
+
phrases.append(phrase)
|
203 |
+
|
204 |
+
return {
|
205 |
+
'tempo': tempo,
|
206 |
+
'time_signature': time_signature,
|
207 |
+
'num_beats': len(beat_times),
|
208 |
+
'beat_times': beat_times.tolist(),
|
209 |
+
'beat_strengths': beat_strengths.tolist(),
|
210 |
+
'phrases': phrases
|
211 |
+
}
|
212 |
+
|
213 |
+
def create_lyric_template(self, beat_analysis):
|
214 |
+
"""Create templates for lyrics based on beat phrases."""
|
215 |
+
templates = []
|
216 |
+
|
217 |
+
if not beat_analysis or 'phrases' not in beat_analysis:
|
218 |
+
return templates
|
219 |
+
|
220 |
+
phrases = beat_analysis['phrases']
|
221 |
+
|
222 |
+
for i, phrase in enumerate(phrases):
|
223 |
+
duration = phrase['end_time'] - phrase['start_time']
|
224 |
+
|
225 |
+
template = {
|
226 |
+
'id': phrase['id'],
|
227 |
+
'start_time': phrase['start_time'],
|
228 |
+
'end_time': phrase['end_time'],
|
229 |
+
'duration': duration,
|
230 |
+
'num_beats': phrase['num_beats'],
|
231 |
+
'stress_pattern': phrase['stress_pattern'],
|
232 |
+
'syllable_guide': self.generate_phrase_guide(phrase)
|
233 |
+
}
|
234 |
+
|
235 |
+
templates.append(template)
|
236 |
+
|
237 |
+
return templates
|
238 |
+
|
239 |
+
def generate_phrase_guide(self, template, words_per_beat=0.5):
|
240 |
+
"""Generate a guide for each phrase to help the LLM."""
|
241 |
+
num_beats = template['num_beats']
|
242 |
+
stress_pattern = template['stress_pattern']
|
243 |
+
|
244 |
+
# Create a visual representation of the stress pattern
|
245 |
+
# S = Strong stress, M = Medium stress, W = Weak stress
|
246 |
+
visual_pattern = ""
|
247 |
+
for i, stress in enumerate(stress_pattern):
|
248 |
+
if stress == "S":
|
249 |
+
visual_pattern += "STRONG "
|
250 |
+
elif stress == "M":
|
251 |
+
visual_pattern += "medium "
|
252 |
+
else:
|
253 |
+
visual_pattern += "weak "
|
254 |
+
|
255 |
+
# Estimate number of words based on beats (very rough estimate)
|
256 |
+
est_words = max(1, int(num_beats * words_per_beat))
|
257 |
+
|
258 |
+
# Estimate syllables - more flexible now, allowing for reasonable ranges
|
259 |
+
# Typical song might have 1-3 syllables per beat
|
260 |
+
min_syllables = num_beats
|
261 |
+
max_syllables = num_beats * 3
|
262 |
+
|
263 |
+
guide = f"~{est_words} words, ~{min_syllables}-{max_syllables} syllables | Pattern: {visual_pattern}"
|
264 |
+
return guide
|
265 |
+
|
+    def check_syllable_stress_match(self, text, template, genre="pop"):
+        """Check if lyrics match the syllable and stress pattern with genre-specific flexibility."""
+        # Split text into words and count syllables
+        words = text.split()
+        syllable_count = sum(self.count_syllables(word) for word in words)
+
+        # Get expected syllable count based on number of beats
+        expected_count = template['num_beats']
+
+        # Get syllable-to-beat ratios based on genre
+        genre_lower = genre.lower()
+        if genre_lower in self.genre_syllable_ratios:
+            min_ratio, typical_ratio, max_ratio = self.genre_syllable_ratios[genre_lower]
+        else:
+            min_ratio, typical_ratio, max_ratio = self.genre_syllable_ratios['default']
+
+        # Calculate flexible min and max syllable expectations based on genre
+        min_expected = max(1, int(expected_count * min_ratio))
+        max_expected = int(expected_count * max_ratio)
+
+        # Check if syllable count falls within genre-appropriate range
+        within_range = min_expected <= syllable_count <= max_expected
+
+        # Consider typical ratio - how close are we to the ideal for this genre?
+        ideal_count = int(expected_count * typical_ratio)
+        closeness_to_ideal = 1.0 - min(abs(syllable_count - ideal_count) / (max_expected - min_expected + 1), 1.0)
+
+        # Get detailed syllable breakdown for stress analysis
+        word_syllables = []
+        for word in words:
+            count = self.count_syllables(word)
+            word_syllables.append(count)
+
+        # Analyze stress pattern match using a more flexible approach
+        stress_pattern = template['stress_pattern']
+
+        # Simple stress matching algorithm (can be improved in future versions)
+        # We need to map syllables to beats in a more flexible way
+        syllable_to_beat_mapping = self._map_syllables_to_beats(word_syllables, stress_pattern)
+
+        # Calculate stress match score based on alignment of stressed syllables with strong beats
+        stress_match_percentage = self._calculate_stress_match(words, word_syllables, syllable_to_beat_mapping, stress_pattern)
+
+        # Consider a stress match if the percentage is high enough
+        stress_matches = stress_match_percentage >= 0.7
+
+        return {
+            'syllable_count': syllable_count,
+            'expected_count': expected_count,
+            'min_expected': min_expected,
+            'max_expected': max_expected,
+            'within_range': within_range,
+            'matches_beat_count': syllable_count == expected_count,  # Exact match (strict)
+            'close_match': within_range,  # Flexible match (based on genre)
+            'stress_matches': stress_matches,
+            'stress_match_percentage': stress_match_percentage,
+            'closeness_to_ideal': closeness_to_ideal,
+            'word_syllables': word_syllables,
+            'ideal_syllable_count': ideal_count
+        }
+
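To make the range arithmetic concrete, a hand-worked example (the ratio triple is assumed purely for illustration; the real values live in self.genre_syllable_ratios, defined earlier in this file):

# Suppose 'pop' maps to (min_ratio, typical_ratio, max_ratio) = (0.5, 1.0, 2.0)
# and the template has 8 beats:
#   min_expected = max(1, int(8 * 0.5)) = 4
#   max_expected = int(8 * 2.0) = 16
#   ideal_count  = int(8 * 1.0) = 8
# A 10-syllable line is then within_range, with
#   closeness_to_ideal = 1.0 - min(abs(10 - 8) / (16 - 4 + 1), 1.0) ≈ 0.85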
+    def _map_syllables_to_beats(self, word_syllables, stress_pattern):
+        """Map syllables to beats in a flexible way."""
+        total_syllables = sum(word_syllables)
+        total_beats = len(stress_pattern)
+
+        # Simple mapping for now - this could be improved with more sophisticated algorithms
+        if total_syllables <= total_beats:
+            # Fewer syllables than beats - some beats have no syllables (prolongation)
+            mapping = []
+            syllable_index = 0
+            for beat_index in range(total_beats):
+                if syllable_index < total_syllables:
+                    mapping.append((syllable_index, beat_index))
+                    syllable_index += 1
+            return mapping
+        else:
+            # More syllables than beats - some beats have multiple syllables (melisma/syncopation)
+            mapping = []
+            syllables_per_beat = total_syllables / total_beats
+            for beat_index in range(total_beats):
+                start_syllable = int(beat_index * syllables_per_beat)
+                end_syllable = int((beat_index + 1) * syllables_per_beat)
+                for syllable_index in range(start_syllable, end_syllable):
+                    if syllable_index < total_syllables:
+                        mapping.append((syllable_index, beat_index))
+            return mapping
+
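A quick trace of the dense branch, using invented inputs: five syllables against a four-beat pattern give syllables_per_beat = 1.25, so the final beat absorbs the extra syllable:

# _map_syllables_to_beats([2, 2, 1], ['S', 'W', 'M', 'W'])
#   total_syllables = 5, total_beats = 4, syllables_per_beat = 1.25
#   -> [(0, 0), (1, 1), (2, 2), (3, 3), (4, 3)]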
+    def _calculate_stress_match(self, words, word_syllables, syllable_to_beat_mapping, stress_pattern):
+        """Calculate how well syllable stresses match beat stresses."""
+        # This is a simplified version - real stress analysis would be more complex
+        # For now, we'll assume the first syllable of each word is stressed
+
+        # First, create a flat list of all syllables with their stress (1 = stressed, 0 = unstressed)
+        syllable_stresses = []
+        for word, syllable_count in zip(words, word_syllables):
+            # Simple assumption: first syllable is stressed, rest are unstressed
+            for i in range(syllable_count):
+                if i == 0:  # First syllable of word
+                    syllable_stresses.append(1)  # Stressed
+                else:
+                    syllable_stresses.append(0)  # Unstressed
+
+        # Count matches between syllable stress and beat stress
+        matches = 0
+        total_mapped = 0
+
+        for syllable_index, beat_index in syllable_to_beat_mapping:
+            if syllable_index < len(syllable_stresses):
+                syllable_stress = syllable_stresses[syllable_index]
+                beat_stress = 1 if stress_pattern[beat_index] == 'S' else (0.5 if stress_pattern[beat_index] == 'M' else 0)
+
+                # Consider it a match if:
+                # - Stressed syllable on Strong beat
+                # - Unstressed syllable on Weak beat
+                # - Some partial credit for other combinations
+                if (syllable_stress == 1 and beat_stress > 0.5) or (syllable_stress == 0 and beat_stress < 0.5):
+                    matches += 1
+                elif syllable_stress == 1 and beat_stress == 0.5:  # Stressed syllable on Medium beat
+                    matches += 0.7
+
+                total_mapped += 1
+
+        if total_mapped == 0:
+            return 0
+
+        return matches / total_mapped
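Closing the loop, a hand-checked trace of the scorer (inputs invented; this assumes count_syllables returns 2 for 'hello' and 1 for 'world'):

# words = ['hello', 'world'], word_syllables = [2, 1]
#   -> syllable_stresses = [1, 0, 1]  (first syllable of each word stressed)
# stress_pattern = ['S', 'W', 'M']; equal counts give mapping [(0, 0), (1, 1), (2, 2)]
#   syllable 0 (stressed)   on 'S' -> +1.0
#   syllable 1 (unstressed) on 'W' -> +1.0
#   syllable 2 (stressed)   on 'M' -> +0.7
# score = 2.7 / 3 = 0.9, clearing the 0.7 threshold used in check_syllable_stress_match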
emotionanalysis.py
CHANGED
@@ -36,15 +36,12 @@ class MusicAnalyzer:
         self.key_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
 
         # Common time signatures and their beat patterns with weights for prior probability
+        # Simplified to only include 4/4, 3/4, 2/4, and 6/8
         self.common_time_signatures = {
-            "4/4": {"beats_per_bar": 4, "beat_pattern": [1.0, 0.2, 0.5, 0.2], "weight": 0.
+            "4/4": {"beats_per_bar": 4, "beat_pattern": [1.0, 0.2, 0.5, 0.2], "weight": 0.45},
             "3/4": {"beats_per_bar": 3, "beat_pattern": [1.0, 0.2, 0.3], "weight": 0.25},
             "2/4": {"beats_per_bar": 2, "beat_pattern": [1.0, 0.3], "weight": 0.15},
-            "6/8": {"beats_per_bar": 6, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3], "weight": 0.
-            "5/4": {"beats_per_bar": 5, "beat_pattern": [1.0, 0.2, 0.4, 0.7, 0.2], "weight": 0.10},
-            "7/8": {"beats_per_bar": 7, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.2, 0.3], "weight": 0.10},
-            "9/8": {"beats_per_bar": 9, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3, 0.7, 0.2, 0.3], "weight": 0.10},
-            "12/8": {"beats_per_bar": 12, "beat_pattern": [1.0, 0.2, 0.3, 0.6, 0.2, 0.3, 0.8, 0.2, 0.3, 0.6, 0.2, 0.3], "weight": 0.15}
+            "6/8": {"beats_per_bar": 6, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3], "weight": 0.15}
         }
 
         # Add common accent patterns for different time signatures
@@ -52,11 +49,7 @@ class MusicAnalyzer:
             "4/4": [[1, 0, 0, 0], [1, 0, 2, 0], [1, 0, 2, 0, 3, 0, 2, 0]],
             "3/4": [[1, 0, 0], [1, 0, 2]],
             "2/4": [[1, 0], [1, 2]],
-            "6/8": [[1, 0, 0, 2, 0, 0], [1, 0, 0, 2, 0, 3]],
-            "5/4": [[1, 0, 0, 2, 0], [1, 0, 2, 0, 0]],
-            "7/8": [[1, 0, 0, 2, 0, 0, 0], [1, 0, 0, 2, 0, 3, 0]],
-            "9/8": [[1, 0, 0, 2, 0, 0, 3, 0, 0]],
-            "12/8": [[1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0]]
+            "6/8": [[1, 0, 0, 2, 0, 0], [1, 0, 0, 2, 0, 3]]
         }
 
         # Expected rhythm density (relative note density per beat) for different time signatures
@@ -64,9 +57,7 @@ class MusicAnalyzer:
             "4/4": [1.0, 0.7, 0.8, 0.6],
             "3/4": [1.0, 0.6, 0.7],
             "6/8": [1.0, 0.5, 0.4, 0.8, 0.5, 0.4],
-            "2/4": [1.0, 0.6],
-            "5/4": [1.0, 0.6, 0.8, 0.7, 0.6],
-            "7/8": [1.0, 0.5, 0.4, 0.8, 0.5, 0.4, 0.5]
+            "2/4": [1.0, 0.6]
         }
 
     def load_audio(self, file_path, sr=22050, duration=None):
@@ -320,7 +311,7 @@ class MusicAnalyzer:
 
         # Find peaks in spectrum
         peaks = signal.find_peaks(S_tempo, height=np.max(S_tempo)*0.1, distance=5)[0]
-        
+
         if len(peaks) == 0:
             return {"time_signature": "4/4", "confidence": 0.4}
 
@@ -448,18 +439,18 @@ class MusicAnalyzer:
 
     def _estimate_from_tempo(self, tempo):
         """Use tempo to help estimate likely time signature"""
-        # Statistical tendencies: slower tempos often in compound meters (6/8
-        #
+        # Statistical tendencies: slower tempos often in compound meters (6/8)
+        # Fast tempos often favor simple meters (2/4)
 
         scores = {}
 
         if tempo < 70:
             # Slow tempos favor compound meters
             scores = {
-                "4/4": 0.
-                "3/4": 0.
-                "
-                "
+                "4/4": 0.5,
+                "3/4": 0.4,
+                "2/4": 0.3,
+                "6/8": 0.7
             }
         elif 70 <= tempo <= 120:
             # Medium tempos favor 4/4, 3/4
@@ -467,15 +458,15 @@ class MusicAnalyzer:
                 "4/4": 0.7,
                 "3/4": 0.6,
                 "2/4": 0.4,
-                "6/8": 0.
+                "6/8": 0.3
             }
         else:
             # Fast tempos favor simpler meters
             scores = {
                 "4/4": 0.6,
                 "2/4": 0.7,
-                "
-                "
+                "3/4": 0.4,
+                "6/8": 0.2
             }
 
         # Find best match
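The last hunk cuts off at the '# Find best match' comment, so the selection step itself is unchanged by this commit and not shown; presumably it reduces to picking the highest-scoring meter, along the lines of this sketch (not the committed code):

best_signature = max(scores, key=scores.get)  # e.g. '6/8' when tempo < 70
best_score = scores[best_signature]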
requirements.txt
CHANGED
@@ -13,3 +13,4 @@ scipy>=1.12.0
 soundfile>=0.12.1
 matplotlib>=3.7.0
 pronouncing>=0.2.0
+nltk>=3.8.1
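nltk joins pronouncing here, presumably as a fallback for syllable counting; if that is the intent (the diff itself does not show the call site), CMUdict lookups through nltk need a one-time corpus download:

import nltk
nltk.download('cmudict')  # one-time fetch; required before nltk.corpus.cmudict.dict()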