import os
import io
import gradio as gr
import torch
import numpy as np
import re
import pronouncing
import functools
from transformers import (
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    AutoTokenizer,
    pipeline,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from huggingface_hub import login
from utils import (
    load_audio,
    extract_audio_duration,
    extract_mfcc_features,
    format_genre_results,
    ensure_cuda_availability
)
from emotionanalysis import MusicAnalyzer
import librosa
from beat_analysis import BeatAnalyzer  # Import the BeatAnalyzer class

# Initialize beat analyzer
beat_analyzer = BeatAnalyzer()

# Login to Hugging Face Hub if token is provided
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Constants
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/QwQ-32B"
SAMPLE_RATE = 22050  # Standard sample rate for audio processing

# Check CUDA availability (for informational purposes)
CUDA_AVAILABLE = ensure_cuda_availability()

# Load models at initialization time
print("Loading genre classification model...")
try:
    genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
    genre_model = AutoModelForAudioClassification.from_pretrained(
        GENRE_MODEL_NAME,
        device_map="auto" if CUDA_AVAILABLE else None
    )

    # Create a convenience wrapper function with the same interface as before
    def get_genre_model():
        return genre_model, genre_feature_extractor
except Exception as e:
    print(f"Error loading genre model: {str(e)}")
    genre_model = None
    genre_feature_extractor = None

# Load LLM and tokenizer at initialization time
print("Loading Qwen QwQ-32B model with 4-bit quantization...")
try:
    # Configure 4-bit quantization for better performance
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    llm_model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
        use_cache=True
    )
except Exception as e:
    print(f"Error loading LLM model: {str(e)}")
    llm_tokenizer = None
    llm_model = None

# Create music analyzer instance
music_analyzer = MusicAnalyzer()

# Process uploaded audio file
def process_audio(audio_file, custom_prompt=""):
    if audio_file is None:
        return "No audio file provided", None, None, None, None, None, None, None, None, None

    try:
        # Load and analyze audio
        y, sr = load_audio(audio_file, sr=SAMPLE_RATE)

        # Basic audio information
        duration = extract_audio_duration(y, sr)

        # Detect time signature using BeatAnalyzer
        time_sig_result = beat_analyzer.detect_time_signature(audio_file)
        time_signature = time_sig_result["time_signature"]

        # Analyze music with MusicAnalyzer for emotion and theme analysis
        music_analysis = music_analyzer.analyze_music(audio_file)

        # Extract key information
        tempo = music_analysis["rhythm_analysis"]["tempo"]

        # Get top two emotions
        emotion_scores = music_analysis["emotion_analysis"]["emotion_scores"]
        sorted_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True)
        primary_emotion = sorted_emotions[0][0]
        secondary_emotion = sorted_emotions[1][0] if len(sorted_emotions) > 1 else None

        # Get top two themes
        theme_scores = music_analysis["theme_analysis"]["theme_scores"]
        sorted_themes = sorted(theme_scores.items(), key=lambda x: x[1], reverse=True)
        primary_theme = sorted_themes[0][0]
        secondary_theme = sorted_themes[1][0] if len(sorted_themes) > 1 else None

        # Use genre classification directly instead of pipeline
        if genre_model is not None and genre_feature_extractor is not None:
            # Resample audio to 16000 Hz for the genre model
            y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)

            # Extract features
            inputs = genre_feature_extractor(
                y_16k,
                sampling_rate=16000,
                return_tensors="pt"
            ).to(genre_model.device)

            # Classify genre
            with torch.no_grad():
                outputs = genre_model(**inputs)
                logits = outputs.logits
                probs = torch.nn.functional.softmax(logits, dim=-1)

            # Get top genres
            values, indices = torch.topk(probs[0], k=5)
            top_genres = [
                (genre_model.config.id2label[idx.item()], val.item())
                for val, idx in zip(values, indices)
            ]
        else:
            # Fallback if model loading failed
            top_genres = [("Unknown", 1.0)]

        # Format genre results for display
        genre_results_text = format_genre_results(top_genres)
        primary_genre = top_genres[0][0]

        # Ensure time signature is one of the supported ones (4/4, 3/4, 6/8)
        if time_signature not in ["4/4", "3/4", "6/8"]:
            time_signature = "4/4"  # Default to 4/4 if unsupported

        # Analyze beat patterns and create lyrics template using the time signature
        beat_analysis = beat_analyzer.analyze_beat_pattern(audio_file, time_signature=time_signature, auto_detect=False)
        lyric_templates = beat_analyzer.create_lyric_template(beat_analysis)

        # Store these in the music_analysis dict for use in lyrics generation
        music_analysis["beat_analysis"] = beat_analysis
        music_analysis["lyric_templates"] = lyric_templates

        # Prepare analysis summary
        analysis_summary = f"""
### Music Analysis Results

**Duration:** {duration:.2f} seconds
**Tempo:** {tempo:.1f} BPM
**Time Signature:** {time_signature} (Confidence: {time_sig_result["confidence"]:.1%})
**Key:** {music_analysis["tonal_analysis"]["key"]} {music_analysis["tonal_analysis"]["mode"]}

**Emotions:**
- Primary: {primary_emotion} (Confidence: {emotion_scores[primary_emotion]:.1%})
- Secondary: {secondary_emotion} (Confidence: {emotion_scores[secondary_emotion]:.1%})

**Themes:**
- Primary: {primary_theme} (Confidence: {theme_scores[primary_theme]:.1%})
- Secondary: {secondary_theme} (Confidence: {theme_scores[secondary_theme]:.1%})

**Top Genre:** {primary_genre}

{genre_results_text}
"""

        # Add beat analysis summary
        if lyric_templates:
            analysis_summary += f"""
### Beat Analysis

**Total Phrases:** {len(lyric_templates)}
**Average Beats Per Phrase:** {np.mean([t['num_beats'] for t in lyric_templates]):.1f}

**Beat Pattern Examples:**
- Phrase 1: {lyric_templates[0]['stress_pattern'] if lyric_templates else 'N/A'}
- Phrase 2: {lyric_templates[1]['stress_pattern'] if len(lyric_templates) > 1 else 'N/A'}
"""

        # Check if genre is supported for lyrics generation
        genre_supported = any(genre.lower() in primary_genre.lower() for genre in beat_analyzer.supported_genres)

        # Generate lyrics only for supported genres
        if genre_supported:
            lyrics = generate_lyrics(music_analysis, primary_genre, duration, custom_prompt)
            beat_match_analysis = analyze_lyrics_rhythm_match(lyrics, lyric_templates, primary_genre)
        else:
            supported_genres_str = ", ".join([genre.capitalize() for genre in beat_analyzer.supported_genres])
            lyrics = f"Lyrics generation is only supported for the following genres: {supported_genres_str}.\n\nDetected genre '{primary_genre}' doesn't have strong syllable-to-beat patterns required for our lyric generation algorithm."
            beat_match_analysis = "Lyrics generation not available for this genre."
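
        # NOTE: the return order below must match the outputs list wired up in create_interface()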

        return analysis_summary, lyrics, tempo, time_signature, primary_emotion, secondary_emotion, primary_theme, secondary_theme, primary_genre, beat_match_analysis

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, None, None, None, None, None, None, None, None, None

def generate_lyrics(music_analysis, genre, duration, custom_prompt=""):
    try:
        # Extract meaningful information for context
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        key = music_analysis["tonal_analysis"]["key"]
        mode = music_analysis["tonal_analysis"]["mode"]

        # Get both primary and secondary emotions and themes
        emotion_scores = music_analysis["emotion_analysis"]["emotion_scores"]
        sorted_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True)
        primary_emotion = sorted_emotions[0][0]
        secondary_emotion = sorted_emotions[1][0] if len(sorted_emotions) > 1 else None

        theme_scores = music_analysis["theme_analysis"]["theme_scores"]
        sorted_themes = sorted(theme_scores.items(), key=lambda x: x[1], reverse=True)
        primary_theme = sorted_themes[0][0]
        secondary_theme = sorted_themes[1][0] if len(sorted_themes) > 1 else None

        # Get beat analysis and templates
        lyric_templates = music_analysis.get("lyric_templates", [])

        # Define num_phrases here to ensure it's available in all code paths
        # Also define syllable limits for the prompt
        if not lyric_templates:
            num_phrases_for_prompt = 4  # Default if no templates
            min_syl_for_prompt = 2
            max_syl_for_prompt = 7

            # Build the base prompt
            base_prompt = f'''You are a professional songwriter. Write song lyrics for a {genre} song.

SONG DETAILS:
- Key: {key} {mode}
- Tempo: {tempo} BPM
- Primary emotion: {primary_emotion}
- Secondary emotion: {secondary_emotion}
- Primary theme: {primary_theme}
- Secondary theme: {secondary_theme}'''

            # Add custom requirements if provided
            custom_requirements = ""
            if custom_prompt and custom_prompt.strip():
                custom_requirements = f'''

SPECIAL REQUIREMENTS FROM USER:
{custom_prompt.strip()}

Please incorporate these requirements while still following all the technical constraints below.'''

            prompt = base_prompt + custom_requirements + f'''

CRITICAL REQUIREMENTS (MOST IMPORTANT):
- You MUST write EXACTLY {num_phrases_for_prompt} lines of lyrics.
- Number each lyric line starting from 1 up to {num_phrases_for_prompt}. For example:
1. First lyric line.
2. Second lyric line.
...
{num_phrases_for_prompt}. The final lyric line.
- Each numbered line (after removing the number and period) MUST be {min_syl_for_prompt}-{max_syl_for_prompt} syllables MAXIMUM.
- NO line's content (after removing the number) can exceed {max_syl_for_prompt} syllables. This is EXTREMELY IMPORTANT.
- Count syllables carefully for the content of each numbered line.
- Use SHORT WORDS and SHORT PHRASES for the content of each numbered line.
- Break long thoughts into multiple numbered lines.

CREATIVITY GUIDELINES:
- Create original, vivid imagery that captures the emotions.
- Use concrete, sensory details (what you see, hear, feel, touch).
- Avoid clichés and common phrases.
- Draw inspiration from the specific themes and emotions listed above.
- Think about unique moments, specific objects, or personal details.
- Use unexpected word combinations.
- Focus on the particular mood created by {primary_emotion} and {secondary_emotion}.

STYLE FOR SHORT LINES (for the content of each numbered line):
- Use brief, impactful phrases.
- Focus on single images or moments per line.
- Choose simple, everyday words.
- Let each line paint one clear picture.

ABSOLUTELY NO placeholders like [line], [moment], [breath], [phrase], [word], etc.

OUTPUT FORMAT:
Under the "LYRICS:" heading, provide exactly {num_phrases_for_prompt} numbered lyric lines.

LYRICS:
(Your {num_phrases_for_prompt} numbered lyric lines go here, each starting with its number, a period, and a space)

Remember: Output EXACTLY {num_phrases_for_prompt} numbered lyric lines.
Each line's content (after removing the number) must be {min_syl_for_prompt}-{max_syl_for_prompt} syllables.'''
        else:
            # Calculate the typical syllable range for this genre
            num_phrases_for_prompt = len(lyric_templates)
            max_syl_for_prompt = max([t.get('max_expected', 7) for t in lyric_templates]) if lyric_templates and lyric_templates[0].get('max_expected') else 7
            min_syl_for_prompt = min([t.get('min_expected', 2) for t in lyric_templates]) if lyric_templates and lyric_templates[0].get('min_expected') else 2

            # Build the base prompt
            base_prompt = f'''You are a professional songwriter. Write song lyrics for a {genre} song.

SONG DETAILS:
- Key: {key} {mode}
- Tempo: {tempo} BPM
- Primary emotion: {primary_emotion}
- Secondary emotion: {secondary_emotion}
- Primary theme: {primary_theme}
- Secondary theme: {secondary_theme}'''

            # Add custom requirements if provided
            custom_requirements = ""
            if custom_prompt and custom_prompt.strip():
                custom_requirements = f'''

SPECIAL REQUIREMENTS FROM USER:
{custom_prompt.strip()}

Please incorporate these requirements while still following all the technical constraints below.'''

            prompt = base_prompt + custom_requirements + f'''

CRITICAL REQUIREMENTS (MOST IMPORTANT):
- You MUST write EXACTLY {num_phrases_for_prompt} lines of lyrics.
- Number each lyric line starting from 1 up to {num_phrases_for_prompt}. For example:
1. First lyric line.
2. Second lyric line.
...
{num_phrases_for_prompt}. The final lyric line.
- Each numbered line (after removing the number and period) MUST be {min_syl_for_prompt}-{max_syl_for_prompt} syllables MAXIMUM.
- NO line's content (after removing the number) can exceed {max_syl_for_prompt} syllables. This is EXTREMELY IMPORTANT.
- Count syllables carefully for the content of each numbered line.
- Use SHORT WORDS and SHORT PHRASES for the content of each numbered line.
- Break long thoughts into multiple numbered lines.

CREATIVITY GUIDELINES:
- Create original, vivid imagery that captures the emotions.
- Use concrete, sensory details (what you see, hear, feel, touch).
- Avoid clichés and common phrases.
- Draw inspiration from the specific themes and emotions listed above.
- Think about unique moments, specific objects, or personal details.
- Use unexpected word combinations.
- Focus on the particular mood created by {primary_emotion} and {secondary_emotion}.

STYLE FOR SHORT LINES (for the content of each numbered line):
- Use brief, impactful phrases.
- Focus on single images or moments per line.
- Choose simple, everyday words.
- Let each line paint one clear picture.

ABSOLUTELY NO placeholders like [line], [moment], [breath], [phrase], [word], etc.

OUTPUT FORMAT:
Under the "LYRICS:" heading, provide exactly {num_phrases_for_prompt} numbered lyric lines.

LYRICS:
(Your {num_phrases_for_prompt} numbered lyric lines go here, each starting with its number, a period, and a space)

Remember: Output EXACTLY {num_phrases_for_prompt} numbered lyric lines.
Each line's content (after removing the number) must be {min_syl_for_prompt}-{max_syl_for_prompt} syllables.'''

        # Generate with optimized parameters for QwQ model
        messages = [
            {"role": "user", "content": prompt}
        ]

        # Apply chat template
        text = llm_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Tokenize and move to model device
        model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)

        # Generate with optimized parameters for QwQ model
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=2048,  # Increased from 1024 to give QwQ more room
            do_sample=True,
            temperature=0.6,  # QwQ recommended setting
            top_p=0.95,  # QwQ recommended setting
            top_k=30,  # QwQ recommended range 20-40
            repetition_penalty=1.1,  # Reduced to allow some repetition if needed
            pad_token_id=llm_tokenizer.eos_token_id
        )

        # Decode the output
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
        lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()

        # ENHANCED CLEANING FOR QWQ MODEL - IMPROVED APPROACH
        # ---------------------------------------------------
        # QwQ often includes thinking process - we need to extract only the final lyrics

        # 1. First, remove any thinking tags completely (QwQ specific)
        lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL | re.IGNORECASE)
        lyrics = re.sub(r'<think>', '', lyrics, flags=re.IGNORECASE)
        lyrics = re.sub(r'</think>', '', lyrics, flags=re.IGNORECASE)

        # 2. Look for the LYRICS: section specifically
        lyrics_section_match = re.search(r'LYRICS:\s*\n(.*?)(?:\n\n|\Z)', lyrics, re.DOTALL | re.IGNORECASE)
        if lyrics_section_match:
            lyrics = lyrics_section_match.group(1).strip()
        else:
            # Fallback: look for other common transitions that indicate the start of actual lyrics
            lyric_start_patterns = [
                r'(?:here (?:are )?(?:the )?lyrics?:?|lyrics?:?|my lyrics?:?|song lyrics?:?)\s*',
                r'(?:here (?:is )?(?:a )?song:?|here (?:is )?my song:?)\s*',
                r'(?:\*{3,}|\={3,}|\-{3,})\s*',
                r'(?:final lyrics?:?|the lyrics?:?)\s*',
                r'```\s*'
            ]

            # Try to find where actual lyrics start
            lyrics_start_pos = 0
            for pattern in lyric_start_patterns:
                match = re.search(pattern, lyrics, re.IGNORECASE)
                if match:
                    lyrics_start_pos = max(lyrics_start_pos, match.end())

            # Keep content from the identified start position
            if lyrics_start_pos > 0:
                lyrics = lyrics[lyrics_start_pos:].strip()

        # 3. Split into lines and apply basic filtering
        lines = lyrics.strip().split('\n')
        clean_lines = []

        # 4. Simple filtering - keep only actual lyric lines
        for line in lines:
            line = line.strip()
            if not line or line.isspace():
                continue

            # Strip leading numbers like "1. ", "2. ", etc.
            line = re.sub(r'^\d+\.\s*', '', line)
            line_lower = line.lower()

            # Remove placeholder lines - more comprehensive pattern
            if re.match(r'^\[ *(line|moment|breath|phrase|word|sound) *\]$', line_lower):
                continue

            # Skip lines that are clearly not lyrics (simplified filtering)
            if any(phrase in line_lower for phrase in [
                'line 1', 'line 2', 'line 3', 'thinking', 'lyrics:', 'format:',
                'etc...', 'commentary', 'syllables', 'requirements', 'output', 'provide'
            ]):
                continue

            # Skip numbered annotations
            if re.match(r'^\d+[\.\):]|^\[.*\]$', line):
                continue

            # Keep lines that look like actual lyrics (not too long, not too technical)
            words = line.split()
            if 1 <= len(words) <= 8 and not any(tech_word in line_lower for tech_word in [
                'syllable', 'beat', 'tempo', 'analysis', 'format', 'section'
            ]):
                clean_lines.append(line)
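
        # At this point clean_lines holds candidate lyric lines with numbering and obvious non-lyric text stripped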

        # 5. Additional cleanup for QwQ-specific issues
        # Remove any remaining thinking fragments
        final_clean_lines = []
        for line in clean_lines:
            # Remove trailing thoughts/annotations
            line = re.sub(r'\s+//.*$', '', line)
            line = re.sub(r'\s+\(.*?\)$', '', line)

            # Remove syllable count annotations
            line = re.sub(r'\s*\(\d+\s*syllables?\)', '', line, flags=re.IGNORECASE)

            # Skip if the line became empty after cleaning
            if line.strip():
                final_clean_lines.append(line.strip())

        clean_lines = final_clean_lines

        # AGGRESSIVE SYLLABLE ENFORCEMENT - This is critical for beat matching
        if lyric_templates:
            max_allowed_syllables = max([t.get('max_expected', 6) for t in lyric_templates])
            min_allowed_syllables = min([t.get('min_expected', 2) for t in lyric_templates])
        else:
            max_allowed_syllables = 6
            min_allowed_syllables = 2

        # Enforce syllable limits on every line
        syllable_enforced_lines = []
        for line in clean_lines:
            words = line.split()
            current_syllables = sum(beat_analyzer.count_syllables(word) for word in words)

            # If line is within limits, keep it
            if min_allowed_syllables <= current_syllables <= max_allowed_syllables:
                syllable_enforced_lines.append(line)
            # If line is too long, we need to split it intelligently
            elif current_syllables > max_allowed_syllables:
                # Try to split into multiple shorter lines
                current_line = []
                current_count = 0

                for word in words:
                    word_syllables = beat_analyzer.count_syllables(word)

                    # If adding this word would exceed limit, start new line
                    if current_count + word_syllables > max_allowed_syllables and current_line:
                        syllable_enforced_lines.append(" ".join(current_line))
                        current_line = [word]
                        current_count = word_syllables
                    else:
                        # Add the word to the current line
                        current_line.append(word)
                        current_count += word_syllables

                # Add the remaining words as final line
                if current_line and current_count >= min_allowed_syllables:
                    syllable_enforced_lines.append(" ".join(current_line))
            # Skip lines that are too short

        clean_lines = syllable_enforced_lines

        # Get required number of lines
        if lyric_templates:
            num_required = len(lyric_templates)
        else:
            num_required = 4

        # IMPORTANT: Adjust line count to match requirement
        if len(clean_lines) > num_required:
            # Too many lines - try to merge adjacent short lines first
            merged_lines = []
            i = 0
            while i < len(clean_lines) and len(merged_lines) < num_required:
                if i + 1 < len(clean_lines) and len(merged_lines) < num_required - 1:
                    # Check if we can merge current and next line
                    line1 = clean_lines[i]
                    line2 = clean_lines[i + 1]
                    words1 = line1.split()
                    words2 = line2.split()
                    syllables1 = sum(beat_analyzer.count_syllables(word) for word in words1)
                    syllables2 = sum(beat_analyzer.count_syllables(word) for word in words2)

                    # If merging would stay within limits, merge them
                    if syllables1 + syllables2 <= max_allowed_syllables:
                        merged_lines.append(line1 + " " + line2)
                        i += 2
                    else:
                        merged_lines.append(line1)
                        i += 1
                else:
                    merged_lines.append(clean_lines[i])
                    i += 1

            # If still too many, truncate to required number
            clean_lines = merged_lines[:num_required]
        elif len(clean_lines) < num_required:
            # Too few lines - this is a generation failure
            # Instead of error, try to pad with empty lines or regenerate
            # For now, let's return an error message
            return f"Error: The model generated {len(clean_lines)} lines but {num_required} were required. Please try again."
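
        # Merging adjacent lines above can still undershoot the target count, hence the final check below.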

        # Final check - ensure we have exactly the required number
        if len(clean_lines) != num_required:
            # If we still don't have the right number, truncate or pad
            if len(clean_lines) > num_required:
                clean_lines = clean_lines[:num_required]
            else:
                # This shouldn't happen with the above logic, but just in case
                return f"Error: Could not generate exactly {num_required} lines. Please try again."

        # Assemble final lyrics
        final_lyrics = '\n'.join(clean_lines)

        # Final sanity check - if we have nothing or very little, return an error
        if not final_lyrics or len(final_lyrics.strip()) < 15:
            return "The model output appears to be mostly thinking content. Please try regenerating for cleaner lyrics."

        return final_lyrics

    except Exception as e:
        error_msg = f"Error generating lyrics: {str(e)}"
        print(error_msg)
        return error_msg

def analyze_lyrics_rhythm_match(lyrics, lyric_templates, genre="pop"):
    """Analyze how well the generated lyrics match the beat patterns and syllable requirements"""
    if not lyric_templates or not lyrics:
        return "No beat templates or lyrics available for analysis."

    # Split lyrics into lines
    lines = lyrics.strip().split('\n')
    lines = [line for line in lines if line.strip()]  # Remove empty lines

    # Prepare analysis result
    result = "### Beat & Syllable Match Analysis\n\n"
    result += "| Line | Syllables | Target Range | Match | Stress Pattern |\n"
    result += "| ---- | --------- | ------------ | ----- | -------------- |\n"

    # Maximum number of lines to analyze (either all lines or all templates)
    line_count = min(len(lines), len(lyric_templates))

    # Track overall match statistics
    total_matches = 0
    total_range_matches = 0
    total_stress_matches = 0
    total_stress_percentage = 0
    total_ideal_matches = 0

    for i in range(line_count):
        line = lines[i]
        template = lyric_templates[i]

        # Check match between line and template with genre awareness
        check_result = beat_analyzer.check_syllable_stress_match(line, template, genre)

        # Get match symbols
        if check_result["close_to_ideal"]:
            syllable_match = "✓"  # Ideal or very close
        elif check_result["within_range"]:
            syllable_match = "✓*"  # Within range but not ideal
        else:
            syllable_match = "✗"  # Outside range

        stress_match = "✓" if check_result["stress_matches"] else f"{int(check_result['stress_match_percentage']*100)}%"

        # Update stats
        if check_result["close_to_ideal"]:
            total_matches += 1
            total_ideal_matches += 1
        elif check_result["within_range"]:
            total_range_matches += 1

        if check_result["stress_matches"]:
            total_stress_matches += 1

        total_stress_percentage += check_result["stress_match_percentage"]

        # Create visual representation of the stress pattern
        stress_visual = ""
        for char in template['stress_pattern']:
            if char == "S":
                stress_visual += "X"  # Strong
            elif char == "M":
                stress_visual += "x"  # Medium
            else:
                stress_visual += "."  # Weak

        # Add line to results table
        result += f"| {i+1} | {check_result['syllable_count']} | {check_result['min_expected']}-{check_result['max_expected']} | {syllable_match} | {stress_visual} |\n"

    # Add summary statistics
    if line_count > 0:
        exact_match_rate = (total_matches / line_count) * 100
        range_match_rate = ((total_matches + total_range_matches) / line_count) * 100
        ideal_match_rate = (total_ideal_matches / line_count) * 100
        stress_match_rate = (total_stress_matches / line_count) * 100
        avg_stress_percentage = (total_stress_percentage / line_count) * 100

        result += f"\n**Summary:**\n"
        result += f"- Ideal or near-ideal syllable match rate: {exact_match_rate:.1f}%\n"
        result += f"- Genre-appropriate syllable range match rate: {range_match_rate:.1f}%\n"
        result += f"- Perfect stress pattern match rate: {stress_match_rate:.1f}%\n"
        result += f"- Average stress pattern accuracy: {avg_stress_percentage:.1f}%\n"
        result += f"- Overall rhythmic accuracy: {((range_match_rate + avg_stress_percentage) / 2):.1f}%\n"

        # Analyze sentence flow across lines
        sentence_flow_analysis = analyze_sentence_flow(lines)
        result += f"\n**Sentence Flow Analysis:**\n"
        result += f"- Connected thought groups: {sentence_flow_analysis['connected_groups']} detected\n"
        result += f"- Average lines per thought: {sentence_flow_analysis['avg_lines_per_group']:.1f}\n"
        result += f"- Flow quality: {sentence_flow_analysis['flow_quality']}\n"

        # Add guidance on ideal distribution for syllables and sentence flow
        result += f"\n**Syllable & Flow Guidance:**\n"
        result += f"- Aim for {min([t.get('min_expected', 3) for t in lyric_templates])}-{max([t.get('max_expected', 7) for t in lyric_templates])} syllables per line\n"
        result += f"- Break complete thoughts across 2-3 lines for natural flow\n"
        result += f"- Connect your lyrics with sentence fragments that flow across lines\n"
        result += f"- Use conjunctions, prepositions, and dependent clauses to connect lines\n"

        # Add genre-specific notes
        result += f"\n**Genre Notes ({genre}):**\n"

        # Add appropriate genre notes based on genre
        if genre.lower() == "pop":
            result += "- Pop lyrics work well with thoughts spanning 2-3 musical phrases\n"
            result += "- Create flow by connecting lines with transitions like 'as', 'when', 'through'\n"
        elif genre.lower() == "rock":
            result += "- Rock lyrics benefit from short phrases that build into complete thoughts\n"
            result += "- Use line breaks strategically to emphasize key words\n"
        elif genre.lower() == "country":
            result += "- Country lyrics tell stories that flow naturally across multiple lines\n"
            result += "- Connect narrative elements across phrases for authentic storytelling\n"
        elif genre.lower() == "disco":
            result += "- Disco lyrics work well with phrases that create rhythmic momentum\n"
            result += "- Use line transitions that maintain energy and flow\n"
        elif genre.lower() == "metal":
            result += "- Metal lyrics can create intensity by breaking phrases at dramatic points\n"
            result += "- Connect lines to build tension and release across measures\n"
        else:
            result += "- This genre works well with connected thoughts across multiple lines\n"
            result += "- Aim for natural speech flow rather than complete thoughts per line\n"

    return result

def analyze_sentence_flow(lines):
    """Analyze how well the lyrics create sentence flow across multiple lines"""
    if not lines or len(lines) < 2:
        return {
            "connected_groups": 0,
            "avg_lines_per_group": 0,
            "flow_quality": "Insufficient lines to analyze"
        }

    # Simplified analysis looking for grammatical clues of sentence continuation
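    # Lines are grouped greedily: a new group begins whenever a line looks like the start of a fresh thought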
    continuation_starters = [
        'and', 'but', 'or', 'nor', 'for', 'yet', 'so',  # Coordinating conjunctions
        'as', 'when', 'while', 'before', 'after', 'since', 'until', 'because', 'although', 'though',  # Subordinating conjunctions
        'with', 'without', 'through', 'throughout', 'beyond', 'beneath', 'under', 'over', 'into', 'onto',  # Prepositions
        'to', 'from', 'by', 'at', 'in', 'on', 'of',  # Common prepositions
        'where', 'how', 'who', 'whom', 'whose', 'which', 'that',  # Relative pronouns
        'if', 'then',  # Conditional connectors
    ]

    # Check for lines that likely continue a thought from previous line
    connected_lines = []
    potential_groups = []
    current_group = [0]  # Start with first line

    for i in range(1, len(lines)):
        # Check if line starts with a continuation word
        words = lines[i].lower().split()

        # Empty line or no words
        if not words:
            if len(current_group) > 1:  # Only consider groups of 2+ lines
                potential_groups.append(current_group.copy())
            current_group = [i]
            continue

        # Check first word for continuation clues
        first_word = words[0].strip(',.!?;:')
        if first_word in continuation_starters:
            connected_lines.append(i)
            current_group.append(i)
        # Check for absence of capitalization as continuation clue
        elif not first_word[0].isupper() and first_word[0].isalpha():
            connected_lines.append(i)
            current_group.append(i)
        # Check if current line is very short (likely part of a continued thought)
        elif len(words) <= 3 and i < len(lines) - 1:
            # Look ahead to see if next line could be a continuation
            if i+1 < len(lines):
                next_words = lines[i+1].lower().split()
                if next_words and next_words[0] in continuation_starters:
                    connected_lines.append(i)
                    current_group.append(i)
                else:
                    # This might end a group
                    if len(current_group) > 1:  # Only consider groups of 2+ lines
                        potential_groups.append(current_group.copy())
                    current_group = [i]
        else:
            # This likely starts a new thought
            if len(current_group) > 1:  # Only consider groups of 2+ lines
                potential_groups.append(current_group.copy())
            current_group = [i]

    # Add the last group if it has multiple lines
    if len(current_group) > 1:
        potential_groups.append(current_group)

    # Calculate metrics
    connected_groups = len(potential_groups)
    if connected_groups > 0:
        avg_lines_per_group = sum(len(group) for group in potential_groups) / connected_groups

        # Determine flow quality
        if connected_groups >= len(lines) / 3 and avg_lines_per_group >= 2.5:
            flow_quality = "Excellent - multiple connected thoughts across lines"
        elif connected_groups >= len(lines) / 4 and avg_lines_per_group >= 2:
            flow_quality = "Good - some connected thoughts across lines"
        elif connected_groups > 0:
            flow_quality = "Fair - limited connection between lines"
        else:
            flow_quality = "Poor - mostly independent lines"
    else:
        avg_lines_per_group = 0
        flow_quality = "Poor - no connected thoughts detected"

    return {
        "connected_groups": connected_groups,
        "avg_lines_per_group": avg_lines_per_group,
        "flow_quality": flow_quality
    }

def enforce_syllable_limits(lines, max_syllables=6):
    """
    Enforce syllable limits by splitting or truncating lines that are too long.
    Returns a modified list of lines where no line exceeds max_syllables.
""" if not lines: return [] result_lines = [] for line in lines: words = line.split() if not words: continue # Count syllables in the line syllable_count = sum(beat_analyzer.count_syllables(word) for word in words) # If within limits, keep the line as is if syllable_count <= max_syllables: result_lines.append(line) continue # Line is too long - we need to split or truncate it current_line = [] current_syllables = 0 for word in words: word_syllables = beat_analyzer.count_syllables(word) # If adding this word would exceed the limit, start a new line if current_syllables + word_syllables > max_syllables and current_line: result_lines.append(" ".join(current_line)) current_line = [word] current_syllables = word_syllables else: # Add the word to the current line current_line.append(word) current_syllables += word_syllables # Don't forget the last line if there are words left if current_line: result_lines.append(" ".join(current_line)) return result_lines # Create Gradio interface def create_interface(): with gr.Blocks(title="Advanced Music Analysis & Beat-Matched Lyrics Generator") as demo: gr.Markdown("# 🎵 Advanced Music Analysis & Beat-Matched Lyrics Generator") gr.Markdown("**Upload music to get comprehensive analysis and generate perfectly synchronized lyrics that match the rhythm, emotion, and structure of your audio**") with gr.Row(): with gr.Column(scale=1): audio_input = gr.Audio( label="🎧 Upload or Record Audio", type="filepath", sources=["upload", "microphone"] ) # Add custom prompt input custom_prompt_input = gr.Textbox( label="🎨 Custom Lyrics Requirements (Optional)", placeholder="e.g., 'Write about a rainy day in the city' or 'Include metaphors about flying' or 'Make it about overcoming challenges'", lines=3, info="Add any specific requirements, themes, or creative directions for the lyrics. This will be merged with the music analysis to create personalized lyrics." ) analyze_btn = gr.Button("🚀 Analyze Music & Generate Lyrics", variant="primary", size="lg") with gr.Column(scale=2): with gr.Tab("📊 Music Analysis"): analysis_output = gr.Textbox(label="Comprehensive Music Analysis Results", lines=10) with gr.Row(): tempo_output = gr.Number(label="🥁 Tempo (BPM)") time_sig_output = gr.Textbox(label="⏱️ Time Signature") with gr.Row(): primary_emotion_output = gr.Textbox(label="😊 Primary Emotion") secondary_emotion_output = gr.Textbox(label="😌 Secondary Emotion") with gr.Row(): primary_theme_output = gr.Textbox(label="🎭 Primary Theme") secondary_theme_output = gr.Textbox(label="🎪 Secondary Theme") genre_output = gr.Textbox(label="🎼 Primary Genre") with gr.Tab("🎤 Generated Lyrics"): lyrics_output = gr.Textbox(label="Beat-Synchronized Lyrics", lines=20) with gr.Tab("🎯 Beat Matching Analysis"): beat_match_output = gr.Markdown(label="Rhythm & Syllable Synchronization Analysis") # Set up event handlers analyze_btn.click( fn=process_audio, inputs=[audio_input, custom_prompt_input], outputs=[ analysis_output, lyrics_output, tempo_output, time_sig_output, primary_emotion_output, secondary_emotion_output, primary_theme_output, secondary_theme_output, genre_output, beat_match_output ] ) # Format supported genres for display supported_genres_md = "\n".join([f"- **{genre.capitalize()}**: Optimized for {genre} music patterns" for genre in beat_analyzer.supported_genres]) gr.Markdown(f""" ## 🚀 How It Works 1. **🎧 Upload Audio**: Support for various formats (MP3, WAV, etc.) or record directly in your browser 2. 
2. **🎨 Add Custom Requirements** (Optional): Specify your creative vision, themes, or style preferences
3. **🔍 Advanced Analysis**: Multi-layered analysis including:
   - **Tempo & Time Signature**: Advanced detection using multiple algorithms
   - **Emotional Profiling**: 8-dimensional emotion mapping (happy, sad, excited, calm, etc.)
   - **Thematic Analysis**: Musical themes (love, triumph, adventure, reflection, etc.)
   - **Beat Pattern Extraction**: Precise rhythm and stress pattern identification
   - **Genre Classification**: AI-powered genre detection with confidence scores
4. **🎤 Lyrics Generation**: AI creates perfectly synchronized lyrics that:
   - **Match Beat Patterns**: Each line aligns with musical phrases and rhythm
   - **Follow Syllable Constraints**: Precise syllable-to-beat mapping for natural flow
   - **Incorporate Emotions & Themes**: Blend detected musical characteristics
   - **Include Your Requirements**: Merge your creative directions seamlessly
5. **📊 Quality Analysis**: Comprehensive metrics showing beat matching accuracy and flow quality

## 🎨 Custom Requirements Examples

**🌟 Themes**: "Write about nature and freedom", "Focus on urban nightlife", "Tell a story about friendship"

**🖼️ Imagery**: "Use ocean metaphors", "Include references to stars and sky", "Focus on light and shadow"

**👁️ Perspective**: "From a child's viewpoint", "Make it nostalgic", "Focus on hope and resilience"

**✍️ Style**: "Use simple everyday language", "Include some rhyming", "Make it conversational"

**📝 Content**: "Avoid sad themes", "Include words 'journey' and 'home'", "Focus on personal growth"

The system intelligently blends your requirements with detected musical characteristics to create personalized, rhythm-perfect lyrics.

## 🎵 Supported Genres for Full Lyrics Generation

**✅ Full Support** (Complete Analysis + Beat-Matched Lyrics):
{supported_genres_md}

These genres have consistent syllable-to-beat patterns that work optimally with our advanced rhythm-matching algorithm.

**📊 Analysis Only**: All other genres receive comprehensive musical analysis (tempo, emotion, themes, etc.) without lyrics generation.

## 🛠️ Advanced Features

- **🎯 Beat Synchronization**: Syllable-perfect alignment with musical phrases
- **🧠 Emotion Integration**: Lyrics reflect detected emotional characteristics
- **🎭 Theme Incorporation**: Musical themes guide lyrical content
- **📏 Quality Metrics**: Detailed analysis of rhythm matching accuracy
- **🔄 Flow Optimization**: Natural sentence continuation across lines
- **⚙️ Genre Optimization**: Tailored patterns for different musical styles
        """)

    return demo

# Launch the app
demo = create_interface()

if __name__ == "__main__":
    demo.launch()
else:
    # For Hugging Face Spaces
    app = demo
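
# Local usage sketch (hypothetical file path and prompt; assumes the genre and LLM models loaded successfully):
#   summary, lyrics, tempo, time_sig, *rest = process_audio("example_song.mp3", custom_prompt="hopeful, about the sea")
#   print(summary)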