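"""Gradio app: analyzes an uploaded music clip (tempo, time signature, key, emotion,
theme and genre) and uses a 4-bit quantized Qwen QwQ-32B model to generate lyrics
whose line count and syllable counts follow the detected beat structure."""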
import os
import io
import gradio as gr
import torch
import numpy as np
import re
import pronouncing
import functools
from transformers import (
AutoModelForAudioClassification,
AutoFeatureExtractor,
AutoTokenizer,
pipeline,
AutoModelForCausalLM,
BitsAndBytesConfig
)
from huggingface_hub import login
from utils import (
load_audio,
extract_audio_duration,
extract_mfcc_features,
format_genre_results,
ensure_cuda_availability
)
from emotionanalysis import MusicAnalyzer
import librosa
from beat_analysis import BeatAnalyzer  # time-signature detection, beat patterns, syllable counting
# Initialize beat analyzer
beat_analyzer = BeatAnalyzer()
# Login to Hugging Face Hub if token is provided
if "HF_TOKEN" in os.environ:
login(token=os.environ["HF_TOKEN"])
# Constants
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/QwQ-32B"
SAMPLE_RATE = 22050 # Standard sample rate for audio processing
# Check CUDA availability (for informational purposes)
CUDA_AVAILABLE = ensure_cuda_availability()
# Load models at initialization time
print("Loading genre classification model...")
try:
genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
genre_model = AutoModelForAudioClassification.from_pretrained(
GENRE_MODEL_NAME,
device_map="auto" if CUDA_AVAILABLE else None
)
    # Convenience accessor for the loaded genre model and feature extractor
def get_genre_model():
return genre_model, genre_feature_extractor
except Exception as e:
print(f"Error loading genre model: {str(e)}")
genre_model = None
genre_feature_extractor = None
# Load LLM and tokenizer at initialization time
print("Loading Qwen QwQ-32B model with 4-bit quantization...")
try:
# Configure 4-bit quantization for better performance
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True
)
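    # Rough memory math: 32B parameters at ~4 bits/weight is about 16 GB of weights
    # (plus KV-cache/activation overhead), versus roughly 64 GB in float16, which is
    # why 4-bit NF4 quantization with double quantization is used here.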
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
LLM_MODEL_NAME,
quantization_config=quantization_config,
device_map="auto",
trust_remote_code=True,
torch_dtype=torch.float16,
use_cache=True
)
except Exception as e:
print(f"Error loading LLM model: {str(e)}")
llm_tokenizer = None
llm_model = None
# Create music analyzer instance
music_analyzer = MusicAnalyzer()
# Process uploaded audio file
def process_audio(audio_file, custom_prompt=""):
if audio_file is None:
return "No audio file provided", None, None, None, None, None, None, None, None, None
try:
# Load and analyze audio
y, sr = load_audio(audio_file, sr=SAMPLE_RATE)
# Basic audio information
duration = extract_audio_duration(y, sr)
# Detect time signature using BeatAnalyzer
time_sig_result = beat_analyzer.detect_time_signature(audio_file)
time_signature = time_sig_result["time_signature"]
# Analyze music with MusicAnalyzer for emotion and theme analysis
music_analysis = music_analyzer.analyze_music(audio_file)
# Extract key information
tempo = music_analysis["rhythm_analysis"]["tempo"]
# Get top two emotions
emotion_scores = music_analysis["emotion_analysis"]["emotion_scores"]
sorted_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True)
primary_emotion = sorted_emotions[0][0]
secondary_emotion = sorted_emotions[1][0] if len(sorted_emotions) > 1 else None
# Get top two themes
theme_scores = music_analysis["theme_analysis"]["theme_scores"]
sorted_themes = sorted(theme_scores.items(), key=lambda x: x[1], reverse=True)
primary_theme = sorted_themes[0][0]
secondary_theme = sorted_themes[1][0] if len(sorted_themes) > 1 else None
# Use genre classification directly instead of pipeline
if genre_model is not None and genre_feature_extractor is not None:
# Resample audio to 16000 Hz for the genre model
y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
# Extract features
inputs = genre_feature_extractor(
y_16k,
sampling_rate=16000,
return_tensors="pt"
).to(genre_model.device)
# Classify genre
with torch.no_grad():
outputs = genre_model(**inputs)
logits = outputs.logits
probs = torch.nn.functional.softmax(logits, dim=-1)
# Get top genres
values, indices = torch.topk(probs[0], k=5)
top_genres = [(genre_model.config.id2label[idx.item()], val.item()) for val, idx in zip(values, indices)]
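            # Illustrative shape of top_genres (values not real): [("pop", 0.42), ("rock", 0.21), ...]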
else:
# Fallback if model loading failed
top_genres = [("Unknown", 1.0)]
# Format genre results for display
genre_results_text = format_genre_results(top_genres)
primary_genre = top_genres[0][0]
# Ensure time signature is one of the supported ones (4/4, 3/4, 6/8)
if time_signature not in ["4/4", "3/4", "6/8"]:
time_signature = "4/4" # Default to 4/4 if unsupported
# Analyze beat patterns and create lyrics template using the time signature
beat_analysis = beat_analyzer.analyze_beat_pattern(audio_file, time_signature=time_signature, auto_detect=False)
lyric_templates = beat_analyzer.create_lyric_template(beat_analysis)
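        # Each template is expected to describe one musical phrase. The keys relied
        # on later in this file are 'num_beats', 'stress_pattern', 'min_expected' and
        # 'max_expected', e.g. {'num_beats': 4, 'stress_pattern': 'SWMW',
        # 'min_expected': 3, 'max_expected': 7} (illustrative values).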
# Store these in the music_analysis dict for use in lyrics generation
music_analysis["beat_analysis"] = beat_analysis
music_analysis["lyric_templates"] = lyric_templates
# Prepare analysis summary
analysis_summary = f"""
### Music Analysis Results
**Duration:** {duration:.2f} seconds
**Tempo:** {tempo:.1f} BPM
**Time Signature:** {time_signature} (Confidence: {time_sig_result["confidence"]:.1%})
**Key:** {music_analysis["tonal_analysis"]["key"]} {music_analysis["tonal_analysis"]["mode"]}
**Emotions:**
- Primary: {primary_emotion} (Confidence: {emotion_scores[primary_emotion]:.1%})
- Secondary: {secondary_emotion} (Confidence: {emotion_scores[secondary_emotion]:.1%})
**Themes:**
- Primary: {primary_theme} (Confidence: {theme_scores[primary_theme]:.1%})
- Secondary: {secondary_theme} (Confidence: {theme_scores[secondary_theme]:.1%})
**Top Genre:** {primary_genre}
{genre_results_text}
"""
# Add beat analysis summary
if lyric_templates:
analysis_summary += f"""
### Beat Analysis
**Total Phrases:** {len(lyric_templates)}
**Average Beats Per Phrase:** {np.mean([t['num_beats'] for t in lyric_templates]):.1f}
**Beat Pattern Examples:**
- Phrase 1: {lyric_templates[0]['stress_pattern'] if lyric_templates else 'N/A'}
- Phrase 2: {lyric_templates[1]['stress_pattern'] if len(lyric_templates) > 1 else 'N/A'}
"""
# Check if genre is supported for lyrics generation
genre_supported = any(genre.lower() in primary_genre.lower() for genre in beat_analyzer.supported_genres)
# Generate lyrics only for supported genres
if genre_supported:
lyrics = generate_lyrics(music_analysis, primary_genre, duration, custom_prompt)
beat_match_analysis = analyze_lyrics_rhythm_match(lyrics, lyric_templates, primary_genre)
else:
supported_genres_str = ", ".join([genre.capitalize() for genre in beat_analyzer.supported_genres])
lyrics = f"Lyrics generation is only supported for the following genres: {supported_genres_str}.\n\nDetected genre '{primary_genre}' doesn't have strong syllable-to-beat patterns required for our lyric generation algorithm."
beat_match_analysis = "Lyrics generation not available for this genre."
return analysis_summary, lyrics, tempo, time_signature, primary_emotion, secondary_emotion, primary_theme, secondary_theme, primary_genre, beat_match_analysis
except Exception as e:
error_msg = f"Error processing audio: {str(e)}"
print(error_msg)
return error_msg, None, None, None, None, None, None, None, None, None
def generate_lyrics(music_analysis, genre, duration, custom_prompt=""):
try:
# Extract meaningful information for context
tempo = music_analysis["rhythm_analysis"]["tempo"]
key = music_analysis["tonal_analysis"]["key"]
mode = music_analysis["tonal_analysis"]["mode"]
# Get both primary and secondary emotions and themes
emotion_scores = music_analysis["emotion_analysis"]["emotion_scores"]
sorted_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True)
primary_emotion = sorted_emotions[0][0]
secondary_emotion = sorted_emotions[1][0] if len(sorted_emotions) > 1 else None
theme_scores = music_analysis["theme_analysis"]["theme_scores"]
sorted_themes = sorted(theme_scores.items(), key=lambda x: x[1], reverse=True)
primary_theme = sorted_themes[0][0]
secondary_theme = sorted_themes[1][0] if len(sorted_themes) > 1 else None
# Get beat analysis and templates
lyric_templates = music_analysis.get("lyric_templates", [])
        # Derive the number of phrases and the syllable limits quoted in the prompt.
        # Fall back to sensible defaults when no beat templates are available.
        if not lyric_templates:
            num_phrases_for_prompt = 4  # Default if no templates
            min_syl_for_prompt = 2
            max_syl_for_prompt = 7
        else:
            num_phrases_for_prompt = len(lyric_templates)
            max_syl_for_prompt = max([t.get('max_expected', 7) for t in lyric_templates]) if lyric_templates[0].get('max_expected') else 7
            min_syl_for_prompt = min([t.get('min_expected', 2) for t in lyric_templates]) if lyric_templates[0].get('min_expected') else 2

        # Build the base prompt
        base_prompt = f'''You are a professional songwriter. Write song lyrics for a {genre} song.

SONG DETAILS:
- Key: {key} {mode}
- Tempo: {tempo} BPM
- Primary emotion: {primary_emotion}
- Secondary emotion: {secondary_emotion}
- Primary theme: {primary_theme}
- Secondary theme: {secondary_theme}'''

        # Add custom requirements if provided
        custom_requirements = ""
        if custom_prompt and custom_prompt.strip():
            custom_requirements = f'''

SPECIAL REQUIREMENTS FROM USER:
{custom_prompt.strip()}

Please incorporate these requirements while still following all the technical constraints below.'''

        prompt = base_prompt + custom_requirements + f'''

CRITICAL REQUIREMENTS (MOST IMPORTANT):
- You MUST write EXACTLY {num_phrases_for_prompt} lines of lyrics.
- Number each lyric line starting from 1 up to {num_phrases_for_prompt}. For example:
1. First lyric line.
2. Second lyric line.
...
{num_phrases_for_prompt}. The final lyric line.
- Each numbered line (after removing the number and period) MUST be {min_syl_for_prompt}-{max_syl_for_prompt} syllables MAXIMUM.
- NO line's content (after removing the number) can exceed {max_syl_for_prompt} syllables. This is EXTREMELY IMPORTANT.
- Count syllables carefully for the content of each numbered line.
- Use SHORT WORDS and SHORT PHRASES for the content of each numbered line.
- Break long thoughts into multiple numbered lines.

CREATIVITY GUIDELINES:
- Create original, vivid imagery that captures the emotions.
- Use concrete, sensory details (what you see, hear, feel, touch).
- Avoid clichés and common phrases.
- Draw inspiration from the specific themes and emotions listed above.
- Think about unique moments, specific objects, or personal details.
- Use unexpected word combinations.
- Focus on the particular mood created by {primary_emotion} and {secondary_emotion}.

STYLE FOR SHORT LINES (for the content of each numbered line):
- Use brief, impactful phrases.
- Focus on single images or moments per line.
- Choose simple, everyday words.
- Let each line paint one clear picture.

ABSOLUTELY NO placeholders like [line], [moment], [breath], [phrase], [word], etc.

OUTPUT FORMAT:
Under the "LYRICS:" heading, provide exactly {num_phrases_for_prompt} numbered lyric lines.

LYRICS:
(Your {num_phrases_for_prompt} numbered lyric lines go here, each starting with its number, a period, and a space)

Remember: Output EXACTLY {num_phrases_for_prompt} numbered lyric lines. Each line's content (after removing the number) must be {min_syl_for_prompt}-{max_syl_for_prompt} syllables.'''
        # Wrap the prompt as a single-turn chat message
messages = [
{"role": "user", "content": prompt}
]
# Apply chat template
text = llm_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
# Tokenize and move to model device
model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
# Generate with optimized parameters for QwQ model
generated_ids = llm_model.generate(
**model_inputs,
max_new_tokens=2048, # Increased from 1024 to give QwQ more room
do_sample=True,
temperature=0.6, # QwQ recommended setting
top_p=0.95, # QwQ recommended setting
top_k=30, # QwQ recommended range 20-40
repetition_penalty=1.1, # Reduced to allow some repetition if needed
pad_token_id=llm_tokenizer.eos_token_id
)
# Decode the output
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
        # Clean the raw QwQ output: the model often emits its reasoning before
        # the final lyrics, so the steps below isolate just the lyric lines.
# 1. First, remove any thinking tags completely (QwQ specific)
lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL | re.IGNORECASE)
lyrics = re.sub(r'<think>', '', lyrics, flags=re.IGNORECASE)
lyrics = re.sub(r'</think>', '', lyrics, flags=re.IGNORECASE)
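        # Typical raw output at this point still mixes reasoning and lyrics, e.g.
        # (illustrative): "Okay, I need 8 short lines... LYRICS:\n1. Night folds in slow\n2. ..."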
# 2. Look for the LYRICS: section specifically
lyrics_section_match = re.search(r'LYRICS:\s*\n(.*?)(?:\n\n|\Z)', lyrics, re.DOTALL | re.IGNORECASE)
if lyrics_section_match:
lyrics = lyrics_section_match.group(1).strip()
else:
# Fallback: look for other common transitions that indicate the start of actual lyrics
lyric_start_patterns = [
r'(?:here (?:are )?(?:the )?lyrics?:?|lyrics?:?|my lyrics?:?|song lyrics?:?)\s*',
r'(?:here (?:is )?(?:a )?song:?|here (?:is )?my song:?)\s*',
r'(?:\*{3,}|\={3,}|\-{3,})\s*',
r'(?:final lyrics?:?|the lyrics?:?)\s*',
r'```\s*'
]
# Try to find where actual lyrics start
lyrics_start_pos = 0
for pattern in lyric_start_patterns:
match = re.search(pattern, lyrics, re.IGNORECASE)
if match:
lyrics_start_pos = max(lyrics_start_pos, match.end())
# Keep content from the identified start position
if lyrics_start_pos > 0:
lyrics = lyrics[lyrics_start_pos:].strip()
# 3. Split into lines and apply basic filtering
lines = lyrics.strip().split('\n')
clean_lines = []
# 4. Simple filtering - keep only actual lyric lines
for line in lines:
line = line.strip()
if not line or line.isspace():
continue
# Strip leading numbers like "1. ", "2. ", etc.
line = re.sub(r'^\d+\.\s*', '', line)
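            # e.g. "3. City lights hum low" -> "City lights hum low"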
line_lower = line.lower()
# Remove placeholder lines - more comprehensive pattern
if re.match(r'^\[ *(line|moment|breath|phrase|word|sound) *\]$', line_lower):
continue
# Skip lines that are clearly not lyrics (simplified filtering)
if any(phrase in line_lower for phrase in [
'line 1', 'line 2', 'line 3',
'thinking', 'lyrics:', 'format:', 'etc...', 'commentary',
'syllables', 'requirements', 'output', 'provide'
]):
continue
# Skip numbered annotations
if re.match(r'^\d+[\.\):]|^\[.*\]$', line):
continue
# Keep lines that look like actual lyrics (not too long, not too technical)
words = line.split()
if 1 <= len(words) <= 8 and not any(tech_word in line_lower for tech_word in [
'syllable', 'beat', 'tempo', 'analysis', 'format', 'section'
]):
clean_lines.append(line)
# 5. Additional cleanup for QwQ-specific issues
# Remove any remaining thinking fragments
final_clean_lines = []
for line in clean_lines:
# Remove trailing thoughts/annotations
line = re.sub(r'\s+//.*$', '', line)
line = re.sub(r'\s+\(.*?\)$', '', line)
# Remove syllable count annotations
line = re.sub(r'\s*\(\d+\s*syllables?\)', '', line, flags=re.IGNORECASE)
# Skip if the line became empty after cleaning
if line.strip():
final_clean_lines.append(line.strip())
clean_lines = final_clean_lines
# AGGRESSIVE SYLLABLE ENFORCEMENT - This is critical for beat matching
if lyric_templates:
max_allowed_syllables = max([t.get('max_expected', 6) for t in lyric_templates])
min_allowed_syllables = min([t.get('min_expected', 2) for t in lyric_templates])
else:
max_allowed_syllables = 6
min_allowed_syllables = 2
# Enforce syllable limits on every line
syllable_enforced_lines = []
for line in clean_lines:
words = line.split()
current_syllables = sum(beat_analyzer.count_syllables(word) for word in words)
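            # count_syllables is assumed to return an integer count per word,
            # e.g. "river" -> 2 (illustrative; actual values come from BeatAnalyzer)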
# If line is within limits, keep it
if min_allowed_syllables <= current_syllables <= max_allowed_syllables:
syllable_enforced_lines.append(line)
# If line is too long, we need to split it intelligently
elif current_syllables > max_allowed_syllables:
# Try to split into multiple shorter lines
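                # Worked example with max_allowed_syllables=6 (assumed counts:
                # the=1, morning=2, light=1, spills=1, over=2, hills=1):
                #   "the morning light spills over the hills" (9 syllables)
                #   -> "the morning light spills" (5) + "over the hills" (4)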
current_line = []
current_count = 0
for word in words:
word_syllables = beat_analyzer.count_syllables(word)
# If adding this word would exceed limit, start new line
if current_count + word_syllables > max_allowed_syllables and current_line:
syllable_enforced_lines.append(" ".join(current_line))
current_line = [word]
current_count = word_syllables
else:
# Add the word to the current line
current_line.append(word)
current_count += word_syllables
# Add the remaining words as final line
if current_line and current_count >= min_allowed_syllables:
syllable_enforced_lines.append(" ".join(current_line))
# Skip lines that are too short
clean_lines = syllable_enforced_lines
# Get required number of lines
if lyric_templates:
num_required = len(lyric_templates)
else:
num_required = 4
# IMPORTANT: Adjust line count to match requirement
if len(clean_lines) > num_required:
# Too many lines - try to merge adjacent short lines first
merged_lines = []
i = 0
while i < len(clean_lines) and len(merged_lines) < num_required:
if i + 1 < len(clean_lines) and len(merged_lines) < num_required - 1:
# Check if we can merge current and next line
line1 = clean_lines[i]
line2 = clean_lines[i + 1]
words1 = line1.split()
words2 = line2.split()
syllables1 = sum(beat_analyzer.count_syllables(word) for word in words1)
syllables2 = sum(beat_analyzer.count_syllables(word) for word in words2)
# If merging would stay within limits, merge them
if syllables1 + syllables2 <= max_allowed_syllables:
merged_lines.append(line1 + " " + line2)
i += 2
else:
merged_lines.append(line1)
i += 1
else:
merged_lines.append(clean_lines[i])
i += 1
# If still too many, truncate to required number
clean_lines = merged_lines[:num_required]
elif len(clean_lines) < num_required:
            # Too few lines is a generation failure; rather than padding with
            # filler content, ask the user to regenerate.
return f"Error: The model generated {len(clean_lines)} lines but {num_required} were required. Please try again."
# Final check - ensure we have exactly the required number
if len(clean_lines) != num_required:
# If we still don't have the right number, truncate or pad
if len(clean_lines) > num_required:
clean_lines = clean_lines[:num_required]
else:
# This shouldn't happen with the above logic, but just in case
return f"Error: Could not generate exactly {num_required} lines. Please try again."
# Assemble final lyrics
final_lyrics = '\n'.join(clean_lines)
# Final sanity check - if we have nothing or very little, return an error
if not final_lyrics or len(final_lyrics.strip()) < 15:
return "The model output appears to be mostly thinking content. Please try regenerating for cleaner lyrics."
return final_lyrics
except Exception as e:
error_msg = f"Error generating lyrics: {str(e)}"
print(error_msg)
return error_msg
def analyze_lyrics_rhythm_match(lyrics, lyric_templates, genre="pop"):
"""Analyze how well the generated lyrics match the beat patterns and syllable requirements"""
if not lyric_templates or not lyrics:
return "No beat templates or lyrics available for analysis."
# Split lyrics into lines
lines = lyrics.strip().split('\n')
lines = [line for line in lines if line.strip()] # Remove empty lines
# Prepare analysis result
result = "### Beat & Syllable Match Analysis\n\n"
result += "| Line | Syllables | Target Range | Match | Stress Pattern |\n"
result += "| ---- | --------- | ------------ | ----- | -------------- |\n"
# Maximum number of lines to analyze (either all lines or all templates)
line_count = min(len(lines), len(lyric_templates))
# Track overall match statistics
total_matches = 0
total_range_matches = 0
total_stress_matches = 0
total_stress_percentage = 0
total_ideal_matches = 0
for i in range(line_count):
line = lines[i]
template = lyric_templates[i]
# Check match between line and template with genre awareness
check_result = beat_analyzer.check_syllable_stress_match(line, template, genre)
# Get match symbols
if check_result["close_to_ideal"]:
syllable_match = "βœ“" # Ideal or very close
elif check_result["within_range"]:
syllable_match = "βœ“*" # Within range but not ideal
else:
syllable_match = "βœ—" # Outside range
stress_match = "βœ“" if check_result["stress_matches"] else f"{int(check_result['stress_match_percentage']*100)}%"
# Update stats
if check_result["close_to_ideal"]:
total_matches += 1
total_ideal_matches += 1
elif check_result["within_range"]:
total_range_matches += 1
if check_result["stress_matches"]:
total_stress_matches += 1
total_stress_percentage += check_result["stress_match_percentage"]
# Create visual representation of the stress pattern
stress_visual = ""
for char in template['stress_pattern']:
if char == "S":
stress_visual += "X" # Strong
elif char == "M":
stress_visual += "x" # Medium
else:
stress_visual += "." # Weak
# Add line to results table
result += f"| {i+1} | {check_result['syllable_count']} | {check_result['min_expected']}-{check_result['max_expected']} | {syllable_match} | {stress_visual} |\n"
# Add summary statistics
if line_count > 0:
exact_match_rate = (total_matches / line_count) * 100
range_match_rate = ((total_matches + total_range_matches) / line_count) * 100
ideal_match_rate = (total_ideal_matches / line_count) * 100
stress_match_rate = (total_stress_matches / line_count) * 100
avg_stress_percentage = (total_stress_percentage / line_count) * 100
result += f"\n**Summary:**\n"
result += f"- Ideal or near-ideal syllable match rate: {exact_match_rate:.1f}%\n"
result += f"- Genre-appropriate syllable range match rate: {range_match_rate:.1f}%\n"
result += f"- Perfect stress pattern match rate: {stress_match_rate:.1f}%\n"
result += f"- Average stress pattern accuracy: {avg_stress_percentage:.1f}%\n"
result += f"- Overall rhythmic accuracy: {((range_match_rate + avg_stress_percentage) / 2):.1f}%\n"
# Analyze sentence flow across lines
sentence_flow_analysis = analyze_sentence_flow(lines)
result += f"\n**Sentence Flow Analysis:**\n"
result += f"- Connected thought groups: {sentence_flow_analysis['connected_groups']} detected\n"
result += f"- Average lines per thought: {sentence_flow_analysis['avg_lines_per_group']:.1f}\n"
result += f"- Flow quality: {sentence_flow_analysis['flow_quality']}\n"
# Add guidance on ideal distribution for syllables and sentence flow
result += f"\n**Syllable & Flow Guidance:**\n"
result += f"- Aim for {min([t.get('min_expected', 3) for t in lyric_templates])}-{max([t.get('max_expected', 7) for t in lyric_templates])} syllables per line\n"
result += f"- Break complete thoughts across 2-3 lines for natural flow\n"
result += f"- Connect your lyrics with sentence fragments that flow across lines\n"
result += f"- Use conjunctions, prepositions, and dependent clauses to connect lines\n"
# Add genre-specific notes
result += f"\n**Genre Notes ({genre}):**\n"
# Add appropriate genre notes based on genre
if genre.lower() == "pop":
result += "- Pop lyrics work well with thoughts spanning 2-3 musical phrases\n"
result += "- Create flow by connecting lines with transitions like 'as', 'when', 'through'\n"
elif genre.lower() == "rock":
result += "- Rock lyrics benefit from short phrases that build into complete thoughts\n"
result += "- Use line breaks strategically to emphasize key words\n"
elif genre.lower() == "country":
result += "- Country lyrics tell stories that flow naturally across multiple lines\n"
result += "- Connect narrative elements across phrases for authentic storytelling\n"
elif genre.lower() == "disco":
result += "- Disco lyrics work well with phrases that create rhythmic momentum\n"
result += "- Use line transitions that maintain energy and flow\n"
elif genre.lower() == "metal":
result += "- Metal lyrics can create intensity by breaking phrases at dramatic points\n"
result += "- Connect lines to build tension and release across measures\n"
else:
result += "- This genre works well with connected thoughts across multiple lines\n"
result += "- Aim for natural speech flow rather than complete thoughts per line\n"
return result
def analyze_sentence_flow(lines):
"""Analyze how well the lyrics create sentence flow across multiple lines"""
if not lines or len(lines) < 2:
return {
"connected_groups": 0,
"avg_lines_per_group": 0,
"flow_quality": "Insufficient lines to analyze"
}
# Simplified analysis looking for grammatical clues of sentence continuation
continuation_starters = [
'and', 'but', 'or', 'nor', 'for', 'yet', 'so', # Coordinating conjunctions
'as', 'when', 'while', 'before', 'after', 'since', 'until', 'because', 'although', 'though', # Subordinating conjunctions
'with', 'without', 'through', 'throughout', 'beyond', 'beneath', 'under', 'over', 'into', 'onto', # Prepositions
'to', 'from', 'by', 'at', 'in', 'on', 'of', # Common prepositions
'where', 'how', 'who', 'whom', 'whose', 'which', 'that', # Relative pronouns
'if', 'then', # Conditional connectors
]
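    # Illustrative example: "Night folds around me" followed by "as the city hums"
    # is treated as one connected group, because the second line starts with the
    # continuation word "as".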
# Check for lines that likely continue a thought from previous line
connected_lines = []
potential_groups = []
current_group = [0] # Start with first line
for i in range(1, len(lines)):
# Check if line starts with a continuation word
words = lines[i].lower().split()
# Empty line or no words
if not words:
if len(current_group) > 1: # Only consider groups of 2+ lines
potential_groups.append(current_group.copy())
current_group = [i]
continue
        # Check the first word for continuation clues
        first_word = words[0].strip(',.!?;:')
        first_char = lines[i].strip()[0]  # original case, for the capitalization check
        if first_word in continuation_starters:
            connected_lines.append(i)
            current_group.append(i)
        # Absence of capitalization is another continuation clue. Note that `words`
        # was lowercased above, so the original-case first character is checked here
        # (otherwise .isupper() could never be True).
        elif first_char.isalpha() and not first_char.isupper():
            connected_lines.append(i)
            current_group.append(i)
# Check if current line is very short (likely part of a continued thought)
elif len(words) <= 3 and i < len(lines) - 1:
# Look ahead to see if next line could be a continuation
if i+1 < len(lines):
next_words = lines[i+1].lower().split()
if next_words and next_words[0] in continuation_starters:
connected_lines.append(i)
current_group.append(i)
else:
# This might end a group
if len(current_group) > 1: # Only consider groups of 2+ lines
potential_groups.append(current_group.copy())
current_group = [i]
else:
# This likely starts a new thought
if len(current_group) > 1: # Only consider groups of 2+ lines
potential_groups.append(current_group.copy())
current_group = [i]
# Add the last group if it has multiple lines
if len(current_group) > 1:
potential_groups.append(current_group)
# Calculate metrics
connected_groups = len(potential_groups)
if connected_groups > 0:
avg_lines_per_group = sum(len(group) for group in potential_groups) / connected_groups
# Determine flow quality
if connected_groups >= len(lines) / 3 and avg_lines_per_group >= 2.5:
flow_quality = "Excellent - multiple connected thoughts across lines"
elif connected_groups >= len(lines) / 4 and avg_lines_per_group >= 2:
flow_quality = "Good - some connected thoughts across lines"
elif connected_groups > 0:
flow_quality = "Fair - limited connection between lines"
else:
flow_quality = "Poor - mostly independent lines"
else:
avg_lines_per_group = 0
flow_quality = "Poor - no connected thoughts detected"
return {
"connected_groups": connected_groups,
"avg_lines_per_group": avg_lines_per_group,
"flow_quality": flow_quality
}
def enforce_syllable_limits(lines, max_syllables=6):
"""
Enforce syllable limits by splitting or truncating lines that are too long.
Returns a modified list of lines where no line exceeds max_syllables.
"""
if not lines:
return []
result_lines = []
for line in lines:
words = line.split()
if not words:
continue
# Count syllables in the line
syllable_count = sum(beat_analyzer.count_syllables(word) for word in words)
# If within limits, keep the line as is
if syllable_count <= max_syllables:
result_lines.append(line)
continue
# Line is too long - we need to split or truncate it
current_line = []
current_syllables = 0
for word in words:
word_syllables = beat_analyzer.count_syllables(word)
# If adding this word would exceed the limit, start a new line
if current_syllables + word_syllables > max_syllables and current_line:
result_lines.append(" ".join(current_line))
current_line = [word]
current_syllables = word_syllables
else:
# Add the word to the current line
current_line.append(word)
current_syllables += word_syllables
# Don't forget the last line if there are words left
if current_line:
result_lines.append(" ".join(current_line))
return result_lines
# Create Gradio interface
def create_interface():
with gr.Blocks(title="Advanced Music Analysis & Beat-Matched Lyrics Generator") as demo:
gr.Markdown("# 🎡 Advanced Music Analysis & Beat-Matched Lyrics Generator")
gr.Markdown("**Upload music to get comprehensive analysis and generate perfectly synchronized lyrics that match the rhythm, emotion, and structure of your audio**")
with gr.Row():
with gr.Column(scale=1):
audio_input = gr.Audio(
label="🎧 Upload or Record Audio",
type="filepath",
sources=["upload", "microphone"]
)
# Add custom prompt input
custom_prompt_input = gr.Textbox(
label="🎨 Custom Lyrics Requirements (Optional)",
placeholder="e.g., 'Write about a rainy day in the city' or 'Include metaphors about flying' or 'Make it about overcoming challenges'",
lines=3,
info="Add any specific requirements, themes, or creative directions for the lyrics. This will be merged with the music analysis to create personalized lyrics."
)
                analyze_btn = gr.Button("🚀 Analyze Music & Generate Lyrics", variant="primary", size="lg")
with gr.Column(scale=2):
                with gr.Tab("📊 Music Analysis"):
analysis_output = gr.Textbox(label="Comprehensive Music Analysis Results", lines=10)
with gr.Row():
                        tempo_output = gr.Number(label="🥁 Tempo (BPM)")
time_sig_output = gr.Textbox(label="⏱️ Time Signature")
with gr.Row():
primary_emotion_output = gr.Textbox(label="😊 Primary Emotion")
secondary_emotion_output = gr.Textbox(label="😌 Secondary Emotion")
with gr.Row():
primary_theme_output = gr.Textbox(label="🎭 Primary Theme")
                        secondary_theme_output = gr.Textbox(label="🎪 Secondary Theme")
genre_output = gr.Textbox(label="🎼 Primary Genre")
                with gr.Tab("🎤 Generated Lyrics"):
lyrics_output = gr.Textbox(label="Beat-Synchronized Lyrics", lines=20)
with gr.Tab("🎯 Beat Matching Analysis"):
beat_match_output = gr.Markdown(label="Rhythm & Syllable Synchronization Analysis")
# Set up event handlers
analyze_btn.click(
fn=process_audio,
inputs=[audio_input, custom_prompt_input],
outputs=[
analysis_output, lyrics_output, tempo_output, time_sig_output,
primary_emotion_output, secondary_emotion_output,
primary_theme_output, secondary_theme_output,
genre_output, beat_match_output
]
)
# Format supported genres for display
supported_genres_md = "\n".join([f"- **{genre.capitalize()}**: Optimized for {genre} music patterns" for genre in beat_analyzer.supported_genres])
gr.Markdown(f"""
## 🚀 How It Works

1. **🎧 Upload Audio**: Support for various formats (MP3, WAV, etc.) or record directly in your browser
2. **🎨 Add Custom Requirements** (Optional): Specify your creative vision, themes, or style preferences
3. **🔍 Advanced Analysis**: Multi-layered analysis including:
   - **Tempo & Time Signature**: Advanced detection using multiple algorithms
   - **Emotional Profiling**: 8-dimensional emotion mapping (happy, sad, excited, calm, etc.)
   - **Thematic Analysis**: Musical themes (love, triumph, adventure, reflection, etc.)
   - **Beat Pattern Extraction**: Precise rhythm and stress pattern identification
   - **Genre Classification**: AI-powered genre detection with confidence scores
4. **🎤 Lyrics Generation**: AI creates perfectly synchronized lyrics that:
   - **Match Beat Patterns**: Each line aligns with musical phrases and rhythm
   - **Follow Syllable Constraints**: Precise syllable-to-beat mapping for natural flow
   - **Incorporate Emotions & Themes**: Blend detected musical characteristics
   - **Include Your Requirements**: Merge your creative directions seamlessly
5. **📊 Quality Analysis**: Comprehensive metrics showing beat matching accuracy and flow quality

## 🎨 Custom Requirements Examples

**🌟 Themes**: "Write about nature and freedom", "Focus on urban nightlife", "Tell a story about friendship"

**🖼️ Imagery**: "Use ocean metaphors", "Include references to stars and sky", "Focus on light and shadow"

**👁️ Perspective**: "From a child's viewpoint", "Make it nostalgic", "Focus on hope and resilience"

**✍️ Style**: "Use simple everyday language", "Include some rhyming", "Make it conversational"

**📝 Content**: "Avoid sad themes", "Include words 'journey' and 'home'", "Focus on personal growth"

The system intelligently blends your requirements with detected musical characteristics to create personalized, rhythm-perfect lyrics.

## 🎵 Supported Genres for Full Lyrics Generation

**✅ Full Support** (Complete Analysis + Beat-Matched Lyrics):

{supported_genres_md}

These genres have consistent syllable-to-beat patterns that work optimally with our advanced rhythm-matching algorithm.

**📊 Analysis Only**: All other genres receive comprehensive musical analysis (tempo, emotion, themes, etc.) without lyrics generation.

## 🛠️ Advanced Features

- **🎯 Beat Synchronization**: Syllable-perfect alignment with musical phrases
- **🧠 Emotion Integration**: Lyrics reflect detected emotional characteristics
- **🎭 Theme Incorporation**: Musical themes guide lyrical content
- **📏 Quality Metrics**: Detailed analysis of rhythm matching accuracy
- **🔄 Flow Optimization**: Natural sentence continuation across lines
- **⚙️ Genre Optimization**: Tailored patterns for different musical styles
""")
return demo
# Launch the app
demo = create_interface()
if __name__ == "__main__":
demo.launch()
else:
# For Hugging Face Spaces
app = demo