|
import os |
|
import io |
|
import gradio as gr |
|
import torch |
|
import numpy as np |
|
import re |
|
import pronouncing |
|
import functools |
|
from transformers import ( |
|
AutoModelForAudioClassification, |
|
AutoFeatureExtractor, |
|
AutoTokenizer, |
|
pipeline, |
|
AutoModelForCausalLM, |
|
BitsAndBytesConfig |
|
) |
|
from huggingface_hub import login |
|
from utils import ( |
|
load_audio, |
|
extract_audio_duration, |
|
extract_mfcc_features, |
|
format_genre_results, |
|
ensure_cuda_availability |
|
) |
|
from emotionanalysis import MusicAnalyzer |
|
import librosa |
|
from beat_analysis import BeatAnalyzer |
|
|
|
|
|
# Shared analyzer used for beat/phrase detection and syllable-stress checks.
beat_analyzer = BeatAnalyzer()

# Log in to the Hugging Face Hub only when a token is supplied via the
# environment; without one the login step is skipped.
if os.environ.get("HF_TOKEN") is not None:
    login(token=os.environ["HF_TOKEN"])

# Model identifiers and audio settings.
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/Qwen3-32B"
SAMPLE_RATE = 22050  # sample rate requested from load_audio()

# Result of the project helper's CUDA probe; used when placing the genre model.
CUDA_AVAILABLE = ensure_cuda_availability()
|
|
|
|
|
# Load the genre classifier once at import time. Any failure is reported and
# leaves both globals as None so callers can fall back to an "Unknown" genre.
print("Loading genre classification model...")
try:
    genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
    genre_model = AutoModelForAudioClassification.from_pretrained(
        GENRE_MODEL_NAME,
        device_map="auto" if CUDA_AVAILABLE else None,
    )
except Exception as e:
    print(f"Error loading genre model: {str(e)}")
    genre_model = None
    genre_feature_extractor = None


def get_genre_model():
    """Return ``(genre_model, genre_feature_extractor)``.

    Defined outside the ``try`` above so the accessor always exists; both
    elements are None when the model failed to load.
    """
    return genre_model, genre_feature_extractor
|
|
|
|
|
# Load the lyric-writing LLM with 4-bit NF4 quantization. On any failure the
# tokenizer and model are both set to None and generate_lyrics() reports it.
print("Loading Qwen LLM model with 4-bit quantization...")
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    llm_model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
        use_cache=True,
    )
except Exception as exc:
    print(f"Error loading LLM model: {exc}")
    llm_tokenizer = llm_model = None

# Project analyzer that supplies the rhythm/tonal/emotion/theme analysis
# read by process_audio().
music_analyzer = MusicAnalyzer()
|
|
|
|
|
# Keyword lists used to score how strongly lyrics express an emotion.
# Keys are lower-case emotion names; values are lower-case cue words.
emotion_lexicons = {
    "happy": [
        "joy", "happy", "smile", "laugh", "light", "bright",
        "sun", "dance", "celebrate", "glow", "warm",
    ],
    "sad": [
        "cry", "tear", "pain", "loss", "grief", "dark",
        "alone", "miss", "gone", "sorrow", "heart", "break",
    ],
    "calm": [
        "peace", "quiet", "still", "gentle", "soft", "slow",
        "breath", "serene", "tranquil", "relax", "flow",
    ],
    "energetic": [
        "run", "fast", "beat", "pulse", "jump", "fire",
        "alive", "spark", "rush", "wild", "free",
    ],
    "tense": [
        "fear", "worry", "wait", "edge", "grip", "tight",
        "storm", "break", "shadow", "threat", "doubt",
    ],
    "nostalgic": [
        "memory", "remember", "past", "time", "ago", "once",
        "childhood", "return", "old", "familiar", "home",
    ],
    "reflective": [
        "think", "ponder", "wonder", "question", "search", "mind",
        "deep", "self", "mirror", "path", "journey",
    ],
    "triumphant": [
        "win", "rise", "stand", "overcome", "above",
        "victory", "summit", "conquer", "champion", "succeed",
    ],
    "yearning": [
        "want", "need", "desire", "reach", "seek", "dream",
        "hope", "wish", "long", "hunger", "thirst",
    ],
    "peaceful": [
        "calm", "rest", "still", "quiet", "harmony", "balance",
        "ease", "gentle", "soft", "float", "drift",
    ],
}

# Keyword lists used to score how strongly lyrics develop a theme.
# Keys are lower-case theme names; values are lower-case cue words.
theme_lexicons = {
    "love": [
        "love", "heart", "touch", "together", "hold", "kiss",
        "embrace", "feel", "close", "intimate", "passion",
    ],
    "loss": [
        "gone", "away", "empty", "missing", "leave", "without",
        "never", "forever", "lost", "memory", "shadow",
    ],
    "freedom": [
        "free", "fly", "open", "release", "escape", "chain",
        "break", "boundless", "space", "breathe", "wings",
    ],
    "triumph": [
        "victory", "overcome", "win", "rise", "mountain",
        "climb", "top", "struggle", "strength", "succeed",
    ],
    "reflection": [
        "mirror", "water", "see", "self", "face", "look",
        "inside", "truth", "reality", "soul", "mind",
    ],
    "journey": [
        "road", "path", "walk", "step", "travel", "distance",
        "far", "way", "wander", "search", "find",
    ],
    "time": [
        "clock", "moment", "second", "hour", "pass", "wait",
        "forever", "instant", "eternity", "memory", "future",
    ],
    "conflict": [
        "fight", "battle", "against", "oppose", "between",
        "war", "struggle", "clash", "resist", "enemy",
    ],
    "nature": [
        "earth", "wind", "fire", "water", "sky", "tree",
        "flower", "mountain", "river", "ocean", "stars",
    ],
    "change": [
        "transform", "become", "different", "shift", "turn", "evolve",
        "grow", "new", "begin", "end", "cycle",
    ],
}
|
|
|
|
|
def process_audio(audio_file):
    """Analyze an audio file and, for supported genres, write matching lyrics.

    Steps, in order: load the audio, run the music analyzer (tempo, emotion,
    theme), classify the genre, normalise the time signature, analyse the
    beat pattern, then generate lyrics and score them against the beat
    templates.

    Args:
        audio_file: Path to the audio file (the UI passes a filepath), or
            None when nothing was provided.

    Returns:
        An 8-tuple ``(analysis_summary, lyrics, tempo, time_signature,
        emotion, theme, primary_genre, beat_match_analysis)``. If no file is
        given or any step raises, the first element is a message and the
        other seven are None.
    """
    if audio_file is None:
        return ("No audio file provided",) + (None,) * 7

    try:
        waveform, sample_rate = load_audio(audio_file, sr=SAMPLE_RATE)
        duration = extract_audio_duration(waveform, sample_rate)

        music_analysis = music_analyzer.analyze_music(audio_file)
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]

        # Genre classification (falls back to "Unknown" without a model).
        if genre_model is None or genre_feature_extractor is None:
            top_genres = [("Unknown", 1.0)]
        else:
            # The extractor is fed 16 kHz audio.
            resampled = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
            features = genre_feature_extractor(
                resampled,
                sampling_rate=16000,
                return_tensors="pt",
            ).to(genre_model.device)

            with torch.no_grad():
                outputs = genre_model(**features)
                logits = outputs.logits
                probabilities = torch.nn.functional.softmax(logits, dim=-1)

            scores, label_ids = torch.topk(probabilities[0], k=5)
            top_genres = [
                (genre_model.config.id2label[label_id.item()], score.item())
                for score, label_id in zip(scores, label_ids)
            ]

        genre_results_text = format_genre_results(top_genres)
        primary_genre = top_genres[0][0]
        genre_key = primary_genre.lower()

        # Pop and disco are forced to 4/4; anything else keeps the estimate
        # unless it is outside the supported set, in which case 4/4 is used.
        if any(name in genre_key for name in ("pop", "disco")):
            time_signature = "4/4"
            music_analysis["rhythm_analysis"]["estimated_time_signature"] = "4/4"
        else:
            time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
            if time_signature not in ("4/4", "3/4", "6/8"):
                time_signature = "4/4"
                music_analysis["rhythm_analysis"]["estimated_time_signature"] = time_signature

        beat_analysis = beat_analyzer.analyze_beat_pattern(audio_file, time_signature=time_signature)
        lyric_templates = beat_analyzer.create_lyric_template(beat_analysis)

        # Stored on the analysis dict so generate_lyrics() can read them.
        music_analysis["beat_analysis"] = beat_analysis
        music_analysis["lyric_templates"] = lyric_templates

        tonal = music_analysis["tonal_analysis"]
        analysis_summary = (
            "\n"
            "### Music Analysis Results\n"
            "\n"
            f"**Duration:** {duration:.2f} seconds\n"
            f"**Tempo:** {tempo:.1f} BPM\n"
            f"**Time Signature:** {time_signature}\n"
            f"**Key:** {tonal['key']} {tonal['mode']}\n"
            f"**Primary Emotion:** {emotion}\n"
            f"**Primary Theme:** {theme}\n"
            f"**Top Genre:** {primary_genre}\n"
            "\n"
            f"{genre_results_text}\n"
        )

        if lyric_templates:
            mean_beats = np.mean([t['num_beats'] for t in lyric_templates])
            first_pattern = lyric_templates[0]['stress_pattern']
            second_pattern = lyric_templates[1]['stress_pattern'] if len(lyric_templates) > 1 else 'N/A'
            analysis_summary += (
                "\n"
                "### Beat Analysis\n"
                "\n"
                f"**Total Phrases:** {len(lyric_templates)}\n"
                f"**Average Beats Per Phrase:** {mean_beats:.1f}\n"
                "**Beat Pattern Examples:**\n"
                f"- Phrase 1: {first_pattern}\n"
                f"- Phrase 2: {second_pattern}\n"
            )

        # Lyrics are only generated for genres the beat analyzer supports.
        genre_supported = any(
            name.lower() in genre_key for name in beat_analyzer.supported_genres
        )

        if not genre_supported:
            supported_genres_str = ", ".join(
                name.capitalize() for name in beat_analyzer.supported_genres
            )
            lyrics = f"Lyrics generation is only supported for the following genres: {supported_genres_str}.\n\nDetected genre '{primary_genre}' doesn't have strong syllable-to-beat patterns required for our lyric generation algorithm."
            beat_match_analysis = "Lyrics generation not available for this genre."
        else:
            lyrics = generate_lyrics(music_analysis, primary_genre, duration)
            beat_match_analysis = analyze_lyrics_rhythm_match(
                lyrics, lyric_templates, primary_genre, emotion, theme
            )

        return (
            analysis_summary,
            lyrics,
            tempo,
            time_signature,
            emotion,
            theme,
            primary_genre,
            beat_match_analysis,
        )

    except Exception as exc:
        error_msg = f"Error processing audio: {exc}"
        print(error_msg)
        return (error_msg,) + (None,) * 7
|
|
|
# Reasoning markup removed from model output: whole blocks first, then any
# unpaired tags that are left over.
_THINKING_MARKUP = tuple(
    re.compile(pattern, re.DOTALL)
    for pattern in (
        r'<think>.*?</think>',
        r'\[thinking\].*?\[/thinking\]',
        r'<think>',
        r'</think>',
        r'\[thinking\]',
        r'\[/thinking\]',
    )
)
_THINKING_TAGS = ('<think>', '</think>', '[thinking]', '[/thinking]')

# Phrases/rules that separate introductory chatter from the lyrics proper.
_LYRIC_DIVIDERS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'Here are the lyrics:',
        r'Here is my song:',
        r'The lyrics:',
        r'My lyrics:',
        r'Song lyrics:',
        r'\*\*\*+',
        r'===+',
        r'---+',
        r'```',
        r'Lyrics:',
    )
)

# Patterns (matched against the lower-cased line) that mark a line as
# commentary rather than lyrics. The two prefix lists end in \b so that
# e.g. "so" matches "So, here..." but not "Sometimes" or "Sorrow".
_NON_LYRIC_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"^(note|thinking|thoughts|let me|i will|i am going|i would|i can|i need to|i have to|i should|let's|here|now)\b",
        r'^(first|second|third|next|finally|importantly|remember|so|ok|okay|as requested|as asked|considering)\b',
        r'syllable[s]?|phrase|rhythm|beats?|tempo|bpm|instruction|follow|alignment|match|corresponding',
        r'verses?|chorus|bridge|section|stanza|part|template|format|pattern|example',
        r'requirements?|guidelines?|song structure|stressed|unstressed',
        r'generated|output|result|provide|create|write|draft|version',
        r'^line \d+|^\d+[\.\):]|^\[\w+\]|^[\*\-\+] ',
        r'\?$|analysis|evaluate|review|check|ensure',
        r'make sure|please note|important|notice|pay attention',
    )
)
_NUMBERED_LINE = re.compile(r'^\d+\.|\(#\d+\)|\d+\)')
_MARKDOWN_LINE = re.compile(r'^#{1,6} |^\*\*|^__')
_COLON_ALLOWED_WORDS = ('like', 'when', 'where', 'how', 'why', 'what')

_TRAILING_SLASH_COMMENT = re.compile(r'\s+//.*$')
_TRAILING_PARENTHETICAL = re.compile(r'\s+\(.*?\)$')
_SYLLABLE_COUNT_NOTE = re.compile(r'\s*\(\d+\s*syllables?\)')

# Stock lines, grouped by approximate syllable count, used to pad the output
# when the model returns fewer lines than there are musical phrases.
_PLACEHOLDER_LINES = {
    2: [
        "Night falls",
        "Time stops",
        "Hearts beat",
        "Rain falls",
        "Stars shine",
    ],
    3: [
        "Empty chair",
        "Shadows dance",
        "Whispers fade",
        "Memories",
        "Silent room",
    ],
    4: [
        "Moonlight shimmers",
        "Echoes of time",
        "Footsteps fading",
        "Memories drift",
        "Silence speaks loud",
    ],
    5: [
        "Walking in the rain",
        "Whispers in the dark",
        "Echoes of your voice",
        "Traces left behind",
        "Time moves ever on",
    ],
    6: [
        "Dancing in the moonlight",
        "Shadows play on the wall",
        "Memories fade to silence",
        "Moments lost in the wind",
        "Whispers of a better time",
    ],
}


def _build_lyrics_prompt(genre, key, mode, tempo, emotion, theme, lyric_templates):
    """Return ``(prompt, max_syllables)``; max_syllables is None without templates."""
    if not lyric_templates:
        prompt = f"""Write song lyrics for a {genre} song in {key} {mode} with tempo {tempo} BPM. The emotion is {emotion} and theme is {theme}.

ONLY WRITE THE ACTUAL LYRICS. NO EXPLANATIONS OR META-TEXT.
"""
        return prompt, None

    num_phrases = len(lyric_templates)
    first_template = lyric_templates[0]
    # Template bounds are only trusted when the first template carries them;
    # otherwise the fixed defaults 2 and 7 are used.
    if first_template.get('max_expected'):
        max_syllables = max(t.get('max_expected', 7) for t in lyric_templates)
    else:
        max_syllables = 7
    if first_template.get('min_expected'):
        min_syllables = min(t.get('min_expected', 2) for t in lyric_templates)
    else:
        min_syllables = 2
    avg_syllables = (min_syllables + max_syllables) // 2

    prompt = f"""Write song lyrics for a {genre} song that truly captures the emotional essence of "{emotion}" and explores the theme of "{theme}". The song is in {key} {mode} with tempo {tempo} BPM.

I need EXACTLY {num_phrases} lines of lyrics - one line for each musical phrase.

YOUR TOP PRIORITIES (in order):
1. EXPRESS THE EMOTION: "{emotion}" should be felt through your word choices
2. DEVELOP THE THEME: "{theme}" should be clearly represented
3. CONNECT YOUR LINES: spread complete thoughts across 2-3 consecutive lines
4. KEEP LINES SHORT: {min_syllables}-{max_syllables} syllables per line (aim for {avg_syllables})

ADDITIONAL REQUIREMENTS:
- Create original lyrics that reflect this specific emotion and theme
- Let sentence clauses flow naturally across line breaks
- Use vivid imagery and sensory details related to the emotion
- Each line should contribute to the overall theme
- Don't copy the example structures - be creative and unique
- Use simple, concise words that evoke strong emotions

AVOID:
- Generic phrases that could apply to any song
- Copying patterns from the examples below
- Complete, independent thoughts on each line
- Abstract concepts without concrete imagery

FORMAT:
- Plain text, one line per musical phrase
- No annotations, explanations, or labels

Here's a simplified structural example of connecting thoughts across lines:

Line 1 (introduces an image)
Line 2 (extends the image)
Line 3 (completes the thought)

Line 4 (starts a new thought)
Line 5 (continues it)
And so on...

Remember: YOUR LYRICS SHOULD DEEPLY EXPRESS "{emotion}" AND EXPLORE "{theme}" - make every word count toward these goals.
"""
    return prompt, max_syllables


def _strip_thinking(text):
    """Remove reasoning blocks and any stray reasoning tags from ``text``."""
    for pattern in _THINKING_MARKUP:
        text = pattern.sub('', text)
    return text


def _strip_preamble(text):
    """Drop everything up to and including the last occurrence of each divider."""
    for divider in _LYRIC_DIVIDERS:
        last_match = None
        for last_match in divider.finditer(text):
            pass
        # Slice once, using an offset that belongs to the string being sliced.
        if last_match is not None:
            text = text[last_match.end():].strip()
    return text


def _is_lyric_line(line):
    """Return True when a stripped output line looks like an actual lyric."""
    if not line or line.isspace():
        return False
    lowered = line.lower()
    if any(pattern.search(lowered) for pattern in _NON_LYRIC_PATTERNS):
        return False
    # Section tags such as "[Chorus]" or short stage directions "(softly)".
    if line.startswith('[') and ']' in line:
        return False
    if line.startswith('(') and ')' in line and len(line) < 20:
        return False
    # Short "Label: text" lines, unless they contain a simile/question word.
    if ':' in line and not any(word in lowered for word in _COLON_ALLOWED_WORDS):
        if len(line.split(':')[0]) < 15:
            return False
    if len(line) < 3:
        return False
    if _NUMBERED_LINE.match(line) or _MARKDOWN_LINE.match(line):
        return False
    if any(tag in lowered for tag in _THINKING_TAGS):
        return False
    return True


def _trim_meta_commentary(lines):
    """Cut assistant-style commentary from the start and end of ``lines``."""
    if len(lines) <= 3:
        return lines

    opening = ' '.join(lines[:3]).lower()
    if any(term in opening for term in ('i will', 'i have created', "i'll provide", "i'll write")):
        start_idx = 0
        for i, line in enumerate(lines):
            if i >= 3 and not any(term in line.lower() for term in ('i will', 'created', 'write', 'provide')):
                start_idx = i
                break
        lines = lines[start_idx:]

    closing = ' '.join(lines[-3:]).lower()
    if any(term in closing for term in ('hope this', 'these lyrics', 'as you can see', 'this song', 'i have')):
        end_idx = len(lines)
        # Scan the last three lines backwards for the last non-commentary line.
        for i in range(len(lines) - 1, max(0, len(lines) - 4), -1):
            if not any(term in lines[i].lower() for term in ('hope', 'these lyrics', 'as you can see', 'this song')):
                end_idx = i + 1
                break
        lines = lines[:end_idx]
    return lines


def _strip_inline_annotations(line):
    """Remove trailing comments, parentheticals and reasoning tags from a line."""
    line = _TRAILING_SLASH_COMMENT.sub('', line)
    line = _TRAILING_PARENTHETICAL.sub('', line)
    line = _strip_thinking(line)
    return _SYLLABLE_COUNT_NOTE.sub('', line)


def _fit_to_templates(lines, lyric_templates, max_syllables, emotion):
    """Truncate or pad ``lines`` so there is exactly one line per template."""
    num_required = len(lyric_templates)
    lines = lines[:num_required]

    while len(lines) < num_required:
        i = len(lines)
        template = lyric_templates[i]
        target_syllables = min(
            max_syllables,
            (template.get('min_expected', 2) + template.get('max_expected', 7)) // 2,
        )
        closest_group = min(_PLACEHOLDER_LINES, key=lambda k: abs(k - target_syllables))
        unused = [p for p in _PLACEHOLDER_LINES[closest_group] if p not in lines]

        if unused:
            placeholder = unused[i % len(unused)]
        elif emotion.lower() in ("sad", "nostalgic", "calm"):
            placeholder = f"Memories of {emotion}"
        elif emotion.lower() in ("happy", "energetic"):
            placeholder = f"Dancing through {emotion}"
        else:
            placeholder = f"Feeling {emotion} now"
        lines.append(placeholder)
    return lines


def generate_lyrics(music_analysis, genre, duration):
    """Generate lyrics for the analysed track with the module-level LLM.

    Builds a prompt from the analysis, samples a completion, strips reasoning
    markup and commentary from it, and, when beat templates are present,
    truncates or pads the result to one line per musical phrase.

    Args:
        music_analysis: Analysis dict; must contain ``rhythm_analysis.tempo``,
            ``tonal_analysis.key``/``mode``, ``emotion_analysis.primary_emotion``
            and ``theme_analysis.primary_theme``. ``lyric_templates`` is optional.
        genre: Genre name inserted into the prompt.
        duration: Unused; kept for call compatibility.

    Returns:
        The lyrics as newline-separated text, or a human-readable message when
        the LLM is unavailable, produced no usable lines, or an error occurred.
        Exceptions are caught, printed and returned as
        ``"Error generating lyrics: ..."`` rather than raised.
    """
    try:
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        key = music_analysis["tonal_analysis"]["key"]
        mode = music_analysis["tonal_analysis"]["mode"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]
        lyric_templates = music_analysis.get("lyric_templates", [])

        if llm_model is None or llm_tokenizer is None:
            return "Error: LLM model not properly loaded"

        prompt, max_syllables = _build_lyrics_prompt(
            genre, key, mode, tempo, emotion, theme, lyric_templates
        )

        text = llm_tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            tokenize=False,
            add_generation_prompt=True,
        )
        model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=llm_tokenizer.eos_token_id,
        )
        # Keep only the newly generated tokens, not the echoed prompt.
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
        lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()

        # Reasoning is removed BEFORE looking for dividers so that a phrase
        # like "lyrics:" inside a reasoning block cannot split the block and
        # leak its remainder into the result.
        lyrics = _strip_thinking(lyrics)
        lyrics = _strip_preamble(lyrics)

        stripped_lines = (line.strip() for line in lyrics.strip().split('\n'))
        clean_lines = [line for line in stripped_lines if _is_lyric_line(line)]

        if clean_lines and clean_lines[0].lower().startswith(
            ('here are', 'these are', 'below are', 'following are')
        ):
            clean_lines = clean_lines[1:]

        clean_lines = _trim_meta_commentary(clean_lines)
        clean_lines = [_strip_inline_annotations(line) for line in clean_lines]
        clean_lines = [line for line in clean_lines if line.strip()]

        if lyric_templates:
            clean_lines = _fit_to_templates(clean_lines, lyric_templates, max_syllables, emotion)

        final_lyrics = '\n'.join(clean_lines)
        if not final_lyrics or len(final_lyrics) < 10:
            return "The model generated only thinking content but no actual lyrics. Please try again."
        return final_lyrics

    except Exception as e:
        error_msg = f"Error generating lyrics: {str(e)}"
        print(error_msg)
        return error_msg
|
|
|
# Glyphs for the stress column: strong beat, medium beat; anything else is ".".
_STRESS_GLYPHS = {"S": "X", "M": "x"}

# Genre-specific closing notes, keyed by the exact lower-cased genre name.
_GENRE_FLOW_NOTES = {
    "pop": (
        "- Pop lyrics work well with thoughts spanning 2-3 musical phrases\n",
        "- Create flow by connecting lines with transitions like 'as', 'when', 'through'\n",
    ),
    "rock": (
        "- Rock lyrics benefit from short phrases that build into complete thoughts\n",
        "- Use line breaks strategically to emphasize key words\n",
    ),
    "country": (
        "- Country lyrics tell stories that flow naturally across multiple lines\n",
        "- Connect narrative elements across phrases for authentic storytelling\n",
    ),
    "disco": (
        "- Disco lyrics work well with phrases that create rhythmic momentum\n",
        "- Use line transitions that maintain energy and flow\n",
    ),
    "metal": (
        "- Metal lyrics can create intensity by breaking phrases at dramatic points\n",
        "- Connect lines to build tension and release across measures\n",
    ),
}
_DEFAULT_FLOW_NOTES = (
    "- This genre works well with connected thoughts across multiple lines\n",
    "- Aim for natural speech flow rather than complete thoughts per line\n",
)


def analyze_lyrics_rhythm_match(lyrics, lyric_templates, genre="pop", emotion="reflective", theme="journey"):
    """Analyze how well the generated lyrics match the beat patterns and syllable requirements.

    Args:
        lyrics: Newline-separated lyric text.
        lyric_templates: One template per musical phrase; each needs a
            ``stress_pattern`` and may carry ``min_expected``/``max_expected``.
        genre: Genre passed to the stress checker and used for the notes.
        emotion: Target emotion for the expression analysis.
        theme: Target theme for the expression analysis.

    Returns:
        A Markdown report: a per-line table, then summary rates, sentence-flow
        and theme/emotion analysis, recommendations and genre notes. When
        there are no templates or lyrics a short message is returned; when no
        non-blank lyric line remains only the table header is returned.
    """
    if not lyric_templates or not lyrics:
        return "No beat templates or lyrics available for analysis."

    lines = [line for line in lyrics.strip().split('\n') if line.strip()]

    report = [
        "### Beat & Syllable Match Analysis\n\n",
        "| Line | Syllables | Target Range | Match | Stress Pattern |\n",
        "| ---- | --------- | ------------ | ----- | -------------- |\n",
    ]

    # Only lines that have a corresponding template are scored.
    line_count = min(len(lines), len(lyric_templates))
    if line_count == 0:
        # Nothing to score: the rates below would be undefined.
        return "".join(report)

    ideal_matches = 0
    range_matches = 0
    stress_matches = 0
    stress_fraction_total = 0

    for i in range(line_count):
        template = lyric_templates[i]
        check = beat_analyzer.check_syllable_stress_match(lines[i], template, genre)

        if check["close_to_ideal"]:
            syllable_match = "✓"
            ideal_matches += 1
        elif check["within_range"]:
            syllable_match = "✓*"
            range_matches += 1
        else:
            syllable_match = "✗"

        if check["stress_matches"]:
            stress_matches += 1
        stress_fraction_total += check["stress_match_percentage"]

        stress_visual = "".join(_STRESS_GLYPHS.get(char, ".") for char in template['stress_pattern'])
        report.append(
            f"| {i+1} | {check['syllable_count']} | {check['min_expected']}-{check['max_expected']} | {syllable_match} | {stress_visual} |\n"
        )

    exact_match_rate = (ideal_matches / line_count) * 100
    range_match_rate = ((ideal_matches + range_matches) / line_count) * 100
    stress_match_rate = (stress_matches / line_count) * 100
    avg_stress_percentage = (stress_fraction_total / line_count) * 100

    report.append("\n**Summary:**\n")
    report.append(f"- Ideal or near-ideal syllable match rate: {exact_match_rate:.1f}%\n")
    report.append(f"- Genre-appropriate syllable range match rate: {range_match_rate:.1f}%\n")
    report.append(f"- Perfect stress pattern match rate: {stress_match_rate:.1f}%\n")
    report.append(f"- Average stress pattern accuracy: {avg_stress_percentage:.1f}%\n")
    report.append(f"- Overall rhythmic accuracy: {((range_match_rate + avg_stress_percentage) / 2):.1f}%\n")

    sentence_flow_analysis = analyze_sentence_flow(lines)
    report.append("\n**Sentence Flow Analysis:**\n")
    report.append(f"- Connected thought groups: {sentence_flow_analysis['connected_groups']} detected\n")
    report.append(f"- Average lines per thought: {sentence_flow_analysis['avg_lines_per_group']:.1f}\n")
    report.append(f"- Flow quality: {sentence_flow_analysis['flow_quality']}\n")

    content_analysis = analyze_theme_emotion_expression(lyrics, theme, emotion)
    report.append("\n**Theme & Emotion Expression:**\n")
    report.append(f"- Emotion ({emotion}) expression: {content_analysis['emotion_score']:.1f}% ({content_analysis['emotion_words_found']} words)\n")
    report.append(f"- Theme ({theme}) development: {content_analysis['theme_score']:.1f}% ({content_analysis['theme_words_found']} words)\n")
    report.append(f"- Overall expression quality: {content_analysis['expression_quality']}\n")

    report.append("\n**Improvement Recommendations:**\n")

    if range_match_rate < 70:
        lowest = min(t.get('min_expected', 3) for t in lyric_templates)
        highest = max(t.get('max_expected', 7) for t in lyric_templates)
        report.append(f"- **Syllable count:** Aim for {lowest}-{highest} syllables per line\n")

    if sentence_flow_analysis['connected_groups'] < len(lines) / 5:
        report.append("- **Line connections:** Break complete thoughts across 2-3 lines using conjunctions and prepositions\n")
        report.append("- **Flow techniques:** Start lines with connecting words like 'as', 'when', 'while', 'through'\n")

    if content_analysis['emotion_score'] < 20:
        examples = ', '.join(emotion_lexicons.get(emotion.lower(), ['expressive words'])[:3])
        report.append(f"- **Emotion expression:** Use more words that evoke '{emotion}' feelings (e.g., {examples})\n")

    if content_analysis['theme_score'] < 20:
        examples = ', '.join(theme_lexicons.get(theme.lower(), ['thematic words'])[:3])
        report.append(f"- **Theme development:** Incorporate more '{theme}' imagery and concepts (e.g., {examples})\n")

    report.append("- **Originality:** Avoid generic phrases and create specific, vivid imagery\n")
    report.append("- **Sensory details:** Include concrete details that can be seen, heard, or felt\n")

    report.append(f"\n**Genre Notes ({genre}):**\n")
    report.extend(_GENRE_FLOW_NOTES.get(genre.lower(), _DEFAULT_FLOW_NOTES))

    return "".join(report)
|
|
|
# Conjunctions, prepositions and relative words that signal a line is
# continuing the thought of the previous line.
_CONTINUATION_STARTERS = frozenset({
    'and', 'but', 'or', 'nor', 'for', 'yet', 'so',
    'as', 'when', 'while', 'before', 'after', 'since', 'until', 'because', 'although', 'though',
    'with', 'without', 'through', 'throughout', 'beyond', 'beneath', 'under', 'over', 'into', 'onto',
    'to', 'from', 'by', 'at', 'in', 'on', 'of',
    'where', 'how', 'who', 'whom', 'whose', 'which', 'that',
    'if', 'then',
})


def analyze_sentence_flow(lines):
    """Analyze how well the lyrics create sentence flow across multiple lines.

    A line joins the current group of connected lines when it starts with a
    continuation word, starts with a non-capitalised letter, or is short
    (three words or fewer) and is followed by a line that starts with a
    continuation word. Any other line closes the current group.

    Args:
        lines: Lyric lines in order.

    Returns:
        A dict with ``connected_groups`` (number of groups of two or more
        lines), ``avg_lines_per_group`` and a ``flow_quality`` description.
    """
    if not lines or len(lines) < 2:
        return {
            "connected_groups": 0,
            "avg_lines_per_group": 0,
            "flow_quality": "Insufficient lines to analyze"
        }

    potential_groups = []
    current_group = [0]

    for i in range(1, len(lines)):
        raw_words = lines[i].split()

        continues = False
        if raw_words:
            # Case is inspected on the ORIGINAL text; only the lookup in the
            # starter set is case-insensitive.
            first_raw = raw_words[0].strip(',.!?;:')
            initial = first_raw[:1]  # empty for punctuation-only words
            if first_raw.lower() in _CONTINUATION_STARTERS:
                continues = True
            elif initial.isalpha() and not initial.isupper():
                continues = True
            elif len(raw_words) <= 3 and i < len(lines) - 1:
                # A short line may lead into a continuation on the next line.
                next_words = lines[i + 1].lower().split()
                continues = bool(next_words) and next_words[0] in _CONTINUATION_STARTERS

        if continues:
            current_group.append(i)
        else:
            if len(current_group) > 1:
                potential_groups.append(current_group)
            current_group = [i]

    if len(current_group) > 1:
        potential_groups.append(current_group)

    connected_groups = len(potential_groups)

    if connected_groups > 0:
        avg_lines_per_group = sum(len(group) for group in potential_groups) / connected_groups
        if connected_groups >= len(lines) / 3 and avg_lines_per_group >= 2.5:
            flow_quality = "Excellent - multiple connected thoughts across lines"
        elif connected_groups >= len(lines) / 4 and avg_lines_per_group >= 2:
            flow_quality = "Good - some connected thoughts across lines"
        else:
            flow_quality = "Fair - limited connection between lines"
    else:
        avg_lines_per_group = 0
        flow_quality = "Poor - no connected thoughts detected"

    return {
        "connected_groups": connected_groups,
        "avg_lines_per_group": avg_lines_per_group,
        "flow_quality": flow_quality
    }
|
|
|
def analyze_theme_emotion_expression(lyrics, theme, emotion):
    """Analyze how well the lyrics express the target theme and emotion.

    Each lexicon word that occurs anywhere in the lower-cased lyrics (plain
    substring test) counts once; the score is the percentage of the lexicon
    that was found.

    Args:
        lyrics: Lyric text.
        theme: Target theme; unknown names fall back to the first lexicon key
            that contains or is contained in it, else "journey".
        emotion: Target emotion; resolved the same way, defaulting to
            "reflective".

    Returns:
        A dict with ``emotion_score``, ``theme_score`` (0-100),
        ``expression_quality`` and the raw ``emotion_words_found`` /
        ``theme_words_found`` counts.
    """
    lyrics_text = lyrics.lower()
    emotion = emotion.lower()
    theme = theme.lower()

    def resolve(name, lexicons, fallback):
        # Exact key first, then the first key overlapping by substring.
        if name in lexicons:
            return name
        for candidate in lexicons:
            if name in candidate or candidate in name:
                return candidate
        return fallback

    emotion_words = emotion_lexicons[resolve(emotion, emotion_lexicons, "reflective")]
    theme_words = theme_lexicons[resolve(theme, theme_lexicons, "journey")]

    emotion_matches = sum(1 for word in emotion_words if word in lyrics_text)
    theme_matches = sum(1 for word in theme_words if word in lyrics_text)

    emotion_score = min(100, (emotion_matches / len(emotion_words)) * 100)
    theme_score = min(100, (theme_matches / len(theme_words)) * 100)

    # The rating is driven by the weaker of the two scores.
    weaker_score = min(emotion_score, theme_score)
    if weaker_score >= 30:
        expression_quality = "Strong"
    elif weaker_score >= 20:
        expression_quality = "Good"
    elif weaker_score >= 10:
        expression_quality = "Fair"
    else:
        expression_quality = "Weak"

    return {
        "emotion_score": emotion_score,
        "theme_score": theme_score,
        "expression_quality": expression_quality,
        "emotion_words_found": emotion_matches,
        "theme_words_found": theme_matches
    }
|
|
|
|
|
def create_interface():
    """Build the Gradio Blocks UI and return it without launching.

    Layout: an audio input and a button on the left; on the right three tabs
    (analysis summary with key metrics, generated lyrics, beat matching).
    The button runs ``process_audio`` and fans its eight results out to the
    output components.
    """
    with gr.Blocks(title="Music Analysis & Lyrics Generator") as demo:
        gr.Markdown("# Music Analysis & Lyrics Generator")
        gr.Markdown("Upload a music file or record audio to analyze it and generate matching lyrics")

        with gr.Row():
            with gr.Column(scale=1):
                audio_in = gr.Audio(
                    label="Upload or Record Audio",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                run_button = gr.Button("Analyze and Generate Lyrics", variant="primary")

            with gr.Column(scale=2):
                with gr.Tab("Analysis"):
                    summary_box = gr.Textbox(label="Music Analysis Results", lines=10)

                    with gr.Row():
                        tempo_box = gr.Number(label="Tempo (BPM)")
                        signature_box = gr.Textbox(label="Time Signature")
                        emotion_box = gr.Textbox(label="Primary Emotion")
                        theme_box = gr.Textbox(label="Primary Theme")
                        genre_box = gr.Textbox(label="Primary Genre")

                with gr.Tab("Generated Lyrics"):
                    lyrics_box = gr.Textbox(label="Generated Lyrics", lines=20)

                with gr.Tab("Beat Matching"):
                    beat_match_view = gr.Markdown(label="Beat & Syllable Matching Analysis")

        # Order must match the tuple returned by process_audio().
        result_components = [
            summary_box,
            lyrics_box,
            tempo_box,
            signature_box,
            emotion_box,
            theme_box,
            genre_box,
            beat_match_view,
        ]
        run_button.click(
            fn=process_audio,
            inputs=[audio_in],
            outputs=result_components,
        )

        supported_genres_md = "\n".join(
            f"- {name.capitalize()}" for name in beat_analyzer.supported_genres
        )

        help_text = (
            "\n"
            "## How it works\n"
            "1. Upload or record a music file\n"
            "2. The system analyzes tempo, beats, time signature and other musical features\n"
            "3. It detects emotion, theme, and music genre\n"
            "4. Using beat patterns and syllable stress analysis, it generates perfectly aligned lyrics\n"
            "5. Each line of the lyrics is matched to the beat pattern of the corresponding musical phrase\n"
            "\n"
            "## Supported Genres\n"
            "**Note:** Lyrics generation is currently only supported for the following genres:\n"
            f"{supported_genres_md}\n"
            "\n"
            "These genres have consistent syllable-to-beat patterns that work well with our algorithm.\n"
            "For other genres, only music analysis will be provided.\n"
        )
        gr.Markdown(help_text)

    return demo
|
|
|
|
|
# Build the UI at import time so it is available both to direct execution
# and to a host that imports this module.
demo = create_interface()

if __name__ != "__main__":
    # Imported rather than executed: expose the interface under the name `app`.
    app = demo
else:
    demo.launch()