diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -4,8 +4,8 @@ import gradio as gr import torch import numpy as np import re -import pronouncing # Add this to requirements.txt for syllable counting -import functools # Add this for lru_cache functionality +import pronouncing +import functools from transformers import ( AutoModelForAudioClassification, AutoFeatureExtractor, @@ -22,8 +22,12 @@ from utils import ( format_genre_results, ensure_cuda_availability ) -from emotionanalysis import MusicAnalyzer +from emotionanalysis import MusicAnalyzer import librosa +from beat_analysis import BeatAnalyzer # Import the BeatAnalyzer class + +# Initialize beat analyzer +beat_analyzer = BeatAnalyzer() # Login to Hugging Face Hub if token is provided if "HF_TOKEN" in os.environ: @@ -38,3971 +42,945 @@ SAMPLE_RATE = 22050 # Standard sample rate for audio processing # Check CUDA availability (for informational purposes) CUDA_AVAILABLE = ensure_cuda_availability() -# Create music detection pipeline -print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}") +# Load models at initialization time +print("Loading genre classification model...") try: - music_detector = pipeline( - "audio-classification", - model=MUSIC_DETECTION_MODEL, - device=0 if CUDA_AVAILABLE else -1 + genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME) + genre_model = AutoModelForAudioClassification.from_pretrained( + GENRE_MODEL_NAME, + device_map="auto" if CUDA_AVAILABLE else None ) - print("Successfully loaded music detection pipeline") + # Create a convenience wrapper function with the same interface as before + def get_genre_model(): + return genre_model, genre_feature_extractor except Exception as e: - print(f"Error creating music detection pipeline: {str(e)}") - # Fallback to manual loading - try: - music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL) - music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL) - print("Successfully loaded music detection model and feature extractor") - except Exception as e2: - print(f"Error loading music detection model components: {str(e2)}") - raise RuntimeError(f"Could not load music detection model: {str(e2)}") + print(f"Error loading genre model: {str(e)}") + genre_model = None + genre_feature_extractor = None -# Create genre classification pipeline -print(f"Loading audio classification model: {GENRE_MODEL_NAME}") +# Load LLM and tokenizer at initialization time +print("Loading Qwen LLM model with 4-bit quantization...") try: - genre_classifier = pipeline( - "audio-classification", - model=GENRE_MODEL_NAME, - device=0 if CUDA_AVAILABLE else -1 + # Configure 4-bit quantization for better performance + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True + ) + + llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME) + llm_model = AutoModelForCausalLM.from_pretrained( + LLM_MODEL_NAME, + quantization_config=quantization_config, + device_map="auto", + trust_remote_code=True, + torch_dtype=torch.float16, + use_cache=True ) - print("Successfully loaded audio classification pipeline") except Exception as e: - print(f"Error creating pipeline: {str(e)}") - # Fallback to manual loading - try: - genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME) - genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME) - print("Successfully loaded audio 
classification model and feature extractor") - except Exception as e2: - print(f"Error loading model components: {str(e2)}") - raise RuntimeError(f"Could not load genre classification model: {str(e2)}") - -# Load LLM with appropriate quantization for T4 GPU -bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.float16, -) - -llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME) -llm_model = AutoModelForCausalLM.from_pretrained( - LLM_MODEL_NAME, - device_map="auto", - quantization_config=bnb_config, - torch_dtype=torch.float16, -) - -# Create LLM pipeline -llm_pipeline = pipeline( - "text-generation", - model=llm_model, - tokenizer=llm_tokenizer, - max_new_tokens=512, -) + print(f"Error loading LLM model: {str(e)}") + llm_tokenizer = None + llm_model = None -# Initialize music emotion analyzer +# Create music analyzer instance music_analyzer = MusicAnalyzer() -# New global function moved outside of verify_flexible_syllable_counts -@functools.lru_cache(maxsize=512) -def cached_phones_for_word(word): - """Get word pronunciations with caching for better performance.""" - return pronouncing.phones_for_word(word) - -@functools.lru_cache(maxsize=512) -def count_syllables_for_word(word): - """Count syllables in a single word with caching for performance.""" - # Try using pronouncing library first - pronunciations = cached_phones_for_word(word.lower()) - if pronunciations: - return pronouncing.syllable_count(pronunciations[0]) - - # Fallback method for words not in the pronouncing dictionary - vowels = "aeiouy" - word = word.lower() - count = 0 - prev_is_vowel = False - - for char in word: - is_vowel = char in vowels - if is_vowel and not prev_is_vowel: - count += 1 - prev_is_vowel = is_vowel - - # Handle special cases - if word.endswith('e') and not word.endswith('le'): - count -= 1 - if word.endswith('le') and len(word) > 2 and word[-3] not in vowels: - count += 1 - if count == 0: - count = 1 - - return count - -@functools.lru_cache(maxsize=512) -def get_word_stress(word): - """Get the stress pattern for a word with improved fallback handling.""" - pronunciations = cached_phones_for_word(word.lower()) - if pronunciations: - return pronouncing.stresses(pronunciations[0]) - - # Enhanced fallback for words not in the dictionary - syllables = count_syllables_for_word(word) - - # Common English stress patterns by word length - if syllables == 1: - return "1" # Single syllable words are stressed - elif syllables == 2: - # Most 2-syllable nouns and adjectives stress first syllable - # Common endings that indicate second-syllable stress - second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"] - if any(word.endswith(ending) for ending in second_syllable_stress): - return "01" - else: - return "10" # Default for 2-syllable words - elif syllables == 3: - # Common endings for specific stress patterns in 3-syllable words - if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]): - return "100" # First syllable stress - elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]): - return "010" # Middle syllable stress - else: - return "100" # Default for 3-syllable words - else: - # For longer words, use common English patterns - return "1" + "0" * (syllables - 1) - -# New function: Count syllables in text -def count_syllables(text): - """Count syllables in a given text using the pronouncing library.""" - words = re.findall(r'\b[a-zA-Z]+\b', text.lower()) - 
syllable_count = 0 - - for word in words: - syllable_count += count_syllables_for_word(word) +# Process uploaded audio file +def process_audio(audio_file): + if audio_file is None: + return "No audio file provided", None, None, None, None, None, None, None - return syllable_count - -def extract_audio_features(audio_file): - """Extract audio features from an audio file.""" try: - # Load the audio file using utility function - y, sr = load_audio(audio_file, SAMPLE_RATE) + # Load and analyze audio + y, sr = load_audio(audio_file, sr=SAMPLE_RATE) - if y is None or sr is None: - raise ValueError("Failed to load audio data") - - # Get audio duration in seconds + # Basic audio information duration = extract_audio_duration(y, sr) - # Extract MFCCs for genre classification (may not be needed with the pipeline) - mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20) + # Detect time signature using the advanced method in BeatAnalyzer + time_sig_result = beat_analyzer.detect_time_signature(audio_file) + time_signature = time_sig_result["time_signature"] - return { - "features": mfccs_mean, - "duration": duration, - "waveform": y, - "sample_rate": sr, - "path": audio_file # Keep path for the pipeline - } - except Exception as e: - print(f"Error extracting audio features: {str(e)}") - raise ValueError(f"Failed to extract audio features: {str(e)}") - -def classify_genre(audio_data): - """Classify the genre of the audio using the loaded model.""" - try: - # First attempt: Try using the pipeline if available - if 'genre_classifier' in globals(): - results = genre_classifier(audio_data["path"]) - # Transform pipeline results to our expected format - top_genres = [(result["label"], result["score"]) for result in results[:3]] - return top_genres + # Analyze music with MusicAnalyzer for emotion and theme analysis + music_analysis = music_analyzer.analyze_music(audio_file) - # Second attempt: Use manually loaded model components - elif 'genre_processor' in globals() and 'genre_model' in globals(): - # Process audio input with feature extractor - inputs = genre_processor( - audio_data["waveform"], - sampling_rate=audio_data["sample_rate"], + # Override MusicAnalyzer's time signature with the one detected by BeatAnalyzer + music_analysis["rhythm_analysis"]["estimated_time_signature"] = time_signature + + # Extract key information + tempo = music_analysis["rhythm_analysis"]["tempo"] + emotion = music_analysis["emotion_analysis"]["primary_emotion"] + theme = music_analysis["theme_analysis"]["primary_theme"] + + # Use genre classification directly instead of pipeline + if genre_model is not None and genre_feature_extractor is not None: + # Resample audio to 16000 Hz for the genre model + y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000) + + # Extract features + inputs = genre_feature_extractor( + y_16k, + sampling_rate=16000, return_tensors="pt" - ) + ).to(genre_model.device) + # Classify genre with torch.no_grad(): outputs = genre_model(**inputs) - predictions = outputs.logits.softmax(dim=-1) - - # Get the top 3 genres - values, indices = torch.topk(predictions, 3) - - # Map indices to genre labels - genre_labels = genre_model.config.id2label - - top_genres = [] - for i, (value, index) in enumerate(zip(values[0], indices[0])): - genre = genre_labels[index.item()] - confidence = value.item() - top_genres.append((genre, confidence)) - - return top_genres + logits = outputs.logits + probs = torch.nn.functional.softmax(logits, dim=-1) + + # Get top genres + values, indices = torch.topk(probs[0], k=5) + top_genres 
= [(genre_model.config.id2label[idx.item()], val.item()) for val, idx in zip(values, indices)] + else: + # Fallback if model loading failed + top_genres = [("Unknown", 1.0)] + + # Format genre results for display + genre_results_text = format_genre_results(top_genres) + primary_genre = top_genres[0][0] + # Override time signature for pop and disco genres to always be 4/4 + if any(genre.lower() in primary_genre.lower() for genre in ['pop', 'disco']): + music_analysis["rhythm_analysis"]["estimated_time_signature"] = "4/4" + time_signature = "4/4" else: - raise ValueError("No genre classification model available") - - except Exception as e: - print(f"Error in genre classification: {str(e)}") - # Fallback: return a default genre if everything fails - return [("rock", 1.0)] + # Ensure time signature is one of the supported ones (4/4, 3/4, 6/8) + if time_signature not in ["4/4", "3/4", "6/8"]: + time_signature = "4/4" # Default to 4/4 if unsupported + music_analysis["rhythm_analysis"]["estimated_time_signature"] = time_signature + + # Analyze beat patterns and create lyrics template using the time signature + beat_analysis = beat_analyzer.analyze_beat_pattern(audio_file, time_signature=time_signature, auto_detect=False) + lyric_templates = beat_analyzer.create_lyric_template(beat_analysis) + + # Store these in the music_analysis dict for use in lyrics generation + music_analysis["beat_analysis"] = beat_analysis + music_analysis["lyric_templates"] = lyric_templates + + # Prepare analysis summary + analysis_summary = f""" +### Music Analysis Results + +**Duration:** {duration:.2f} seconds +**Tempo:** {tempo:.1f} BPM +**Time Signature:** {time_signature} (Confidence: {time_sig_result["confidence"]:.1%}) +**Key:** {music_analysis["tonal_analysis"]["key"]} {music_analysis["tonal_analysis"]["mode"]} +**Primary Emotion:** {emotion} +**Primary Theme:** {theme} +**Top Genre:** {primary_genre} + +{genre_results_text} +""" -def detect_music(audio_data): - """Detect if the audio is music using the MIT AST model.""" - try: - # First attempt: Try using the pipeline if available - if 'music_detector' in globals(): - results = music_detector(audio_data["path"]) - # Look for music-related classes in the results - music_confidence = 0.0 - for result in results: - label = result["label"].lower() - if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): - music_confidence = max(music_confidence, result["score"]) - return music_confidence >= 0.2, results + # Add beat analysis summary + if lyric_templates: + analysis_summary += f""" +### Beat Analysis + +**Total Phrases:** {len(lyric_templates)} +**Average Beats Per Phrase:** {np.mean([t['num_beats'] for t in lyric_templates]):.1f} +**Beat Pattern Examples:** +- Phrase 1: {lyric_templates[0]['stress_pattern'] if lyric_templates else 'N/A'} +- Phrase 2: {lyric_templates[1]['stress_pattern'] if len(lyric_templates) > 1 else 'N/A'} +""" - # Second attempt: Use manually loaded model components - elif 'music_processor' in globals() and 'music_model' in globals(): - # Process audio input with feature extractor - inputs = music_processor( - audio_data["waveform"], - sampling_rate=audio_data["sample_rate"], - return_tensors="pt" - ) - - with torch.no_grad(): - outputs = music_model(**inputs) - predictions = outputs.logits.softmax(dim=-1) - - # Get the top predictions - values, indices = torch.topk(predictions, 5) - - # Map indices to labels - labels = music_model.config.id2label - - # Check for music-related classes - music_confidence = 0.0 
- results = [] - - for i, (value, index) in enumerate(zip(values[0], indices[0])): - label = labels[index.item()].lower() - score = value.item() - results.append({"label": label, "score": score}) - - if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): - music_confidence = max(music_confidence, score) - - return music_confidence >= 0.2, results - + # Check if genre is supported for lyrics generation + # Use the supported_genres list from BeatAnalyzer + genre_supported = any(genre.lower() in primary_genre.lower() for genre in beat_analyzer.supported_genres) + + # Generate lyrics only for supported genres + if genre_supported: + lyrics = generate_lyrics(music_analysis, primary_genre, duration) + beat_match_analysis = analyze_lyrics_rhythm_match(lyrics, lyric_templates, primary_genre) else: - raise ValueError("No music detection model available") - + supported_genres_str = ", ".join([genre.capitalize() for genre in beat_analyzer.supported_genres]) + lyrics = f"Lyrics generation is only supported for the following genres: {supported_genres_str}.\n\nDetected genre '{primary_genre}' doesn't have strong syllable-to-beat patterns required for our lyric generation algorithm." + beat_match_analysis = "Lyrics generation not available for this genre." + + return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre, beat_match_analysis + except Exception as e: - print(f"Error in music detection: {str(e)}") - return False, [] + error_msg = f"Error processing audio: {str(e)}" + print(error_msg) + return error_msg, None, None, None, None, None, None, None -def detect_beats(y, sr): - """Enhanced beat detection with adaptive threshold analysis, improved time signature detection and scientific confidence metrics.""" - # STEP 1: Improved pre-processing with robustness for quiet sections - # Apply a small floor to avoid division-by-zero issues - y = np.clip(y, 1e-10, None) # Prevent extreme quiet sections from causing NaN - - # Separate harmonic and percussive components - y_harmonic, y_percussive = librosa.effects.hpss(y) - - # Generate multiple onset envelopes with smoothing for stability - onset_env_full = librosa.onset.onset_strength(y=y, sr=sr) - onset_env_perc = librosa.onset.onset_strength(y=y_percussive, sr=sr) - - # Apply small smoothing to handle quiet sections - onset_env_full = np.maximum(onset_env_full, 1e-6) # Minimum threshold to avoid NaN - onset_env_perc = np.maximum(onset_env_perc, 1e-6) - - # Create weighted combination - combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7 - - # STEP 2: Multi-strategy tempo and beat detection with confidence tracking - tempo_candidates = [] - beat_candidates = [] - consistency_metrics = [] - - # Strategy 1: Standard detection - tempo1, beats1 = librosa.beat.beat_track( - onset_envelope=combined_onset, - sr=sr, - tightness=100 # More sensitive tracking - ) - tempo_candidates.append(tempo1) - beat_candidates.append(beats1) - - # Calculate autocorrelation-based confidence for this tempo - ac = librosa.autocorrelate(combined_onset) - estimated_period = int(sr * 60.0 / (tempo1 * librosa.get_duration(y=y, sr=sr) / len(combined_onset))) - if estimated_period < len(ac) and estimated_period > 0: - # Measure peak height relative to surroundings - local_ac = ac[max(0, estimated_period-5):min(len(ac), estimated_period+6)] - if np.max(local_ac) > 0: - tempo1_confidence = ac[estimated_period] / np.max(local_ac) - else: - tempo1_confidence = 0.5 - else: - tempo1_confidence = 0.5 - 
consistency_metrics.append(tempo1_confidence) - - # Strategy 2: Try with different tempo range for complex signatures - tempo2, beats2 = librosa.beat.beat_track( - onset_envelope=combined_onset, - sr=sr, - tightness=100, - start_bpm=60 # Lower starting BPM helps find different time signatures - ) - tempo_candidates.append(tempo2) - beat_candidates.append(beats2) - - # Calculate confidence for the second tempo estimate - estimated_period2 = int(sr * 60.0 / (tempo2 * librosa.get_duration(y=y, sr=sr) / len(combined_onset))) - if estimated_period2 < len(ac) and estimated_period2 > 0: - local_ac2 = ac[max(0, estimated_period2-5):min(len(ac), estimated_period2+6)] - if np.max(local_ac2) > 0: - tempo2_confidence = ac[estimated_period2] / np.max(local_ac2) - else: - tempo2_confidence = 0.5 - else: - tempo2_confidence = 0.5 - consistency_metrics.append(tempo2_confidence) - - # Strategy 3: Use dynamic programming for beat tracking +def generate_lyrics(music_analysis, genre, duration): try: - tempo3, beats3 = librosa.beat.beat_track( - onset_envelope=combined_onset, - sr=sr, - tightness=300, # Higher tightness for more structured detection - trim=False - ) - tempo_candidates.append(tempo3) - beat_candidates.append(beats3) + # Extract meaningful information for context + tempo = music_analysis["rhythm_analysis"]["tempo"] + key = music_analysis["tonal_analysis"]["key"] + mode = music_analysis["tonal_analysis"]["mode"] + emotion = music_analysis["emotion_analysis"]["primary_emotion"] + theme = music_analysis["theme_analysis"]["primary_theme"] + + # Get beat analysis and templates + lyric_templates = music_analysis.get("lyric_templates", []) - # Calculate DP-based confidence - if len(beats3) > 1: - beat_times3 = librosa.frames_to_time(beats3, sr=sr) - intervals3 = np.diff(beat_times3) - tempo3_consistency = 1.0 / (1.0 + np.std(intervals3)/np.mean(intervals3)) if np.mean(intervals3) > 0 else 0.5 + # Define num_phrases here to ensure it's available in all code paths + num_phrases = len(lyric_templates) if lyric_templates else 4 + + # Verify LLM is loaded + if llm_model is None or llm_tokenizer is None: + return "Error: LLM model not properly loaded" + + # If no templates, fall back to original method + if not lyric_templates: + # Simplified prompt + prompt = f"""Write song lyrics for a {genre} song in {key} {mode} with tempo {tempo} BPM. The emotion is {emotion} and theme is {theme}. + +ONLY WRITE THE ACTUAL LYRICS. NO EXPLANATIONS OR META-TEXT. 
+""" else: - tempo3_consistency = 0.5 - consistency_metrics.append(tempo3_consistency) - except Exception: - # Skip if this approach fails - pass - - # Select the best strategy based on improved consistency measurement - beat_consistency = [] - for i, beats in enumerate(beat_candidates): - if len(beats) <= 1: - beat_consistency.append(0) - continue + # Calculate the typical syllable range for this genre + if num_phrases > 0: + # Get max syllables per line from templates + max_syllables = max([t.get('max_expected', 7) for t in lyric_templates]) if lyric_templates[0].get('max_expected') else 7 + min_syllables = min([t.get('min_expected', 2) for t in lyric_templates]) if lyric_templates[0].get('min_expected') else 2 + avg_syllables = (min_syllables + max_syllables) // 2 + else: + min_syllables = 2 + max_syllables = 7 + avg_syllables = 4 + + # Create random examples based on the song's theme and emotion + # to avoid the LLM copying our examples directly + example_themes = [ + {"emotion": "love", "fragments": ["I see your face", "across the room", "my heart beats fast", "can't look away"]}, + {"emotion": "sadness", "fragments": ["tears fall like rain", "on empty streets", "memories fade", "into the dark"]}, + {"emotion": "nostalgia", "fragments": ["old photographs", "dusty and worn", "remind me of when", "we were young"]}, + {"emotion": "hope", "fragments": ["dawn breaks through clouds", "new day begins", "darkness recedes", "light fills my soul"]}, + {"emotion": "longing", "fragments": ["miles apart now", "under same stars", "thinking of you", "across the distance"]} + ] + + # Select a theme that doesn't match the song's emotion to avoid copying + selected_themes = [t for t in example_themes if t["emotion"].lower() != emotion.lower()] + if not selected_themes: + selected_themes = example_themes + + import random + example_theme = random.choice(selected_themes) + example_fragments = example_theme["fragments"] + random.shuffle(example_fragments) # Randomize order + + # Create example 1 - grammatical connection with conjunction + ex1_line1 = example_fragments[0] if len(example_fragments) > 0 else "The morning sun" + ex1_line2 = example_fragments[1] if len(example_fragments) > 1 else "breaks through clouds" + ex1_line3 = example_fragments[2] if len(example_fragments) > 2 else "as birds begin" + ex1_line4 = example_fragments[3] if len(example_fragments) > 3 else "their dawn chorus" + + # Create example 2 - prepositional connection + ex2_fragments = [ + "She walks alone", + "through crowded streets", + "with memories", + "of better days" + ] + random.shuffle(ex2_fragments) - times = librosa.frames_to_time(beats, sr=sr) - intervals = np.diff(times) + # Create a more direct prompt with examples and specific syllable count guidance + prompt = f"""Write song lyrics for a {genre} song in {key} {mode} with tempo {tempo} BPM. + +PRIMARY THEME: {theme} +EMOTION: {emotion} + +I need EXACTLY {num_phrases} lines of lyrics with these STRICT requirements: + +CRITICAL INSTRUCTIONS: +1. EXTREMELY SHORT LINES: Each line MUST be between {min_syllables}-{max_syllables} syllables MAXIMUM +2. ENFORCE BREVITY: NO exceptions to the syllable limit - not a single line should exceed {max_syllables} syllables +3. FRAGMENT STYLE: Use sentence fragments and short phrases instead of complete sentences +4. CONNECTED THOUGHTS: Use prepositions and conjunctions at the start of lines to connect ideas +5. SIMPLE WORDS: Choose one or two-syllable words whenever possible +6. 
CONCRETE IMAGERY: Use specific, tangible details rather than abstract concepts
+7. NO CLICHÉS: Avoid common phrases like "time slips away" or "memories fade"
+8. ONE THOUGHT PER LINE: Express just one simple idea in each line
+
+FORMAT:
+- Write exactly {num_phrases} short text lines
+- No annotations, explanations, or line numbers
+- Do not count syllables in the output
+
+IMPORTANT: If you can't express an idea in {max_syllables} or fewer syllables, break it across two lines or choose a simpler way to express it.
+
+===== EXAMPLES OF CORRECT LENGTH =====
+
+Example 1 (short fragments connected by flow):
+Cold tea cup (3 syllables)
+on windowsill (4 syllables)
+cat watches rain (4 syllables)
+through foggy glass (4 syllables)
+
+Example 2 (prepositional connections):
+Keys dropped here (3 syllables)
+by the front door (4 syllables)
+where shoes pile up (4 syllables)
+since you moved in (4 syllables)
+
+DO NOT copy my examples. Create ENTIRELY NEW lyrics about {theme} with {emotion} feeling.
+
+REMEMBER: NO LINE SHOULD EXCEED {max_syllables} SYLLABLES - this is the most important rule!
+"""
+
+        # Generate lyrics using the LLM model
+        messages = [
+            {"role": "user", "content": prompt}
+        ]
+
+        # Apply chat template
+        text = llm_tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
+        # Tokenize and move to model device
+        model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
+
+        # Generate with optimized parameters
+        generated_ids = llm_model.generate(
+            **model_inputs,
+            max_new_tokens=1024,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.2,
+            pad_token_id=llm_tokenizer.eos_token_id
+        )
+
+        # Decode the output
+        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+        lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
+
+        # ULTRA AGGRESSIVE CLEANING - COMPLETELY REVISED
+        # ------------------------------------------------
+
+        # 1. First, look for any standard dividers that might separate thinking from lyrics
+        divider_patterns = [
+            r'Here are the lyrics:',
+            r'Here is my song:',
+            r'The lyrics:',
+            r'My lyrics:',
+            r'Song lyrics:',
+            r'\*\*\*+',
+            r'===+',
+            r'---+',
+            r'```',
+            r'Lyrics:'
+        ]
+
+        for pattern in divider_patterns:
+            matches = re.finditer(pattern, lyrics, re.IGNORECASE)
+            for match in matches:
+                # Keep only content after the divider
+                lyrics = lyrics[match.end():].strip()
+
+        # 2. Remove thinking tags completely before splitting into lines
+        lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[thinking\].*?\[/thinking\]', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'<think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'</think>', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[thinking\]', '', lyrics, flags=re.DOTALL)
+        lyrics = re.sub(r'\[/thinking\]', '', lyrics, flags=re.DOTALL)
+
+        # 3. Split text into lines for aggressive line-by-line filtering
+        lines = lyrics.strip().split('\n')
+        clean_lines = []
+
+        # 4. 
Define comprehensive patterns for non-lyrical content + non_lyric_patterns = [ + # Meta-commentary + r'^(note|thinking|thoughts|let me|i will|i am going|i would|i can|i need to|i have to|i should|let\'s|here|now)', + r'^(first|second|third|next|finally|importantly|remember|so|ok|okay|as requested|as asked|considering)', + # Explanations + r'syllable[s]?|phrase|rhythm|beats?|tempo|bpm|instruction|follow|alignment|match|corresponding', + r'verses?|chorus|bridge|section|stanza|part|template|format|pattern|example', + r'requirements?|guidelines?|song structure|stressed|unstressed', + # Technical language + r'generated|output|result|provide|create|write|draft|version', + # Annotations and numbering + r'^line \d+|^\d+[\.\):]|^\[\w+\]|^[\*\-\+] ', + # Questions or analytical statements + r'\?$|analysis|evaluate|review|check|ensure', + # Instruction-like statements + r'make sure|please note|important|notice|pay attention' + ] - # Comprehensive consistency metrics with better statistical justification - if np.mean(intervals) > 0: - # Combine coefficient of variation with autocorrelation confidence - cv = np.std(intervals)/np.mean(intervals) # Lower is better + # 5. Identify which lines are likely actual lyrics vs non-lyrics + for line in lines: + line = line.strip() - # Add adjustments for beat count reasonability - duration = librosa.get_duration(y=y, sr=sr) - expected_beats = duration * tempo_candidates[i] / 60 - beats_ratio = min(len(beats) / expected_beats, expected_beats / len(beats)) if expected_beats > 0 else 0.5 + # Skip empty lines or lines with just spaces/tabs + if not line or line.isspace(): + continue - # Combine metrics with scientific weighting - consistency = (0.7 * (1.0 / (1.0 + cv))) + (0.3 * consistency_metrics[i]) + (0.2 * beats_ratio) - beat_consistency.append(consistency) - else: - beat_consistency.append(0) - - # Select best model with scientific confidence calculation - if beat_consistency: - best_idx = np.argmax(beat_consistency) - best_confidence = beat_consistency[best_idx] * 100 # Convert to percentage - else: - best_idx = 0 - best_confidence = 50.0 # Default 50% confidence if no good metrics - - tempo = tempo_candidates[best_idx] - beat_frames = beat_candidates[best_idx] - - # Calculate beat entropy - scientific measure of beat pattern predictability - beat_entropy = 0.0 - if len(beat_frames) > 2: - times = librosa.frames_to_time(beat_frames, sr=sr) - intervals = np.diff(times) - - # Quantize intervals to detect patterns - if len(intervals) > 0 and np.std(intervals) > 0: - quantized = np.round(intervals / np.min(intervals)) - # Count frequencies of each interval type - unique, counts = np.unique(quantized, return_counts=True) - probs = counts / np.sum(counts) - # Calculate Shannon entropy - beat_entropy = -np.sum(probs * np.log2(probs)) - - # STEP 3: Improved beat strength extraction - beat_times = librosa.frames_to_time(beat_frames, sr=sr) - - # Vectorized extraction of beat strengths with improved error handling - beat_strengths = [] - if len(beat_frames) > 0: - # Filter out beat frames that exceed the onset envelope length - valid_frames = [frame for frame in beat_frames if frame < len(combined_onset)] - if valid_frames: - # Vectorized extraction with normalization for consistency - raw_strengths = combined_onset[valid_frames] + # Skip lines that match any non-lyric pattern + should_skip = False + for pattern in non_lyric_patterns: + if re.search(pattern, line.lower()): + should_skip = True + break - # Normalize strengths to [0,1] for scientific consistency - if 
np.max(raw_strengths) > 0:
-                normalized_strengths = raw_strengths / np.max(raw_strengths)
-            else:
-                normalized_strengths = np.ones_like(raw_strengths)
+            if should_skip:
+                continue
-            beat_strengths = normalized_strengths.tolist()
+            # Skip section headers
+            if (line.startswith('[') and ']' in line) or (line.startswith('(') and ')' in line and len(line) < 20):
+                continue
-            # Handle remaining beats with interpolation instead of constant values
-            if len(beat_times) > len(beat_strengths):
-                missing_count = len(beat_times) - len(beat_strengths)
-                # Use linear interpolation for more scientific approach
-                if beat_strengths:
-                    last_strength = beat_strengths[-1]
-                    decay_factor = 0.9  # Gradual decay for trailing beats
-                    beat_strengths.extend([last_strength * (decay_factor ** (i+1))
-                                           for i in range(missing_count)])
-                else:
-                    beat_strengths = [1.0] * len(beat_times)
-        else:
-            beat_strengths = [1.0] * len(beat_times)
-    else:
-        beat_strengths = [1.0] * len(beat_times)
-
-    # STEP 4: Calculate intervals between beats
-    intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else []
-
-    # STEP 5: Improved time signature detection with scientific confidence
-    # Start with default assumption
-    time_signature = 4
-    time_sig_confidence = 70.0  # Default confidence
-
-    if len(beat_strengths) > 8:
-        # Use autocorrelation to find periodicity in beat strengths
-        if len(beat_strengths) > 4:
-            # Normalize beat strengths for better pattern detection
-            norm_strengths = np.array(beat_strengths)
-            if np.max(norm_strengths) > 0:
-                norm_strengths = norm_strengths / np.max(norm_strengths)
+            # Skip lines that look like annotations (not prose-like)
+            if ':' in line and not any(word in line.lower() for word in ['like', 'when', 'where', 'how', 'why', 'what']):
+                if len(line.split(':')[0]) < 15:  # Short prefixes followed by colon are likely annotations
+                    continue
+
+            # Skip very short lines that aren't likely to be lyrics (unless it's just a few words which could be valid)
+            if len(line) < 3:
+                continue
-
-            # Compute autocorrelation to find periodic patterns (N)
-            ac = librosa.autocorrelate(norm_strengths, max_size=len(norm_strengths)//2)
+            # Skip lines that are numbered or bulleted
+            if re.match(r'^\d+\.|\(#\d+\)|\d+\)', line):
+                continue
+
+            # Skip markdown-style emphasis or headers
+            if re.match(r'^#{1,6} |^\*\*|^__', line):
+                continue
-
-            # Find peaks in autocorrelation (indicates periodicity)
-            if len(ac) > 3:  # Need enough data for peak picking
-                # Find peaks after lag 0
-                peaks = librosa.util.peak_pick(ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1)
-                peaks = peaks + 1  # Adjust for the removed lag 0
+            # Skip lines with think tags
+            if '<think>' in line.lower() or '</think>' in line.lower() or '[thinking]' in line.lower() or '[/thinking]' in line.lower():
+                continue
-
-                if len(peaks) > 0:
-                    # Get the first significant peak position (cycle length N)
-                    peak_idx = peaks[0]
-                    N = peak_idx
+            # Add this line as it passed all filters
+            clean_lines.append(line)
+
+        # 6. Additional block-level filters for common patterns
+        # Check beginning of lyrics for common prefixes
+        if clean_lines and any(clean_lines[0].lower().startswith(prefix) for prefix in
+                               ['here are', 'these are', 'below are', 'following are']):
+            clean_lines = clean_lines[1:]  # Skip the first line
+
+        # 7. 
Process blocks of lines to detect explanation blocks
+        if len(clean_lines) > 3:
+            # Check for explanation blocks at the beginning
+            first_three = ' '.join(clean_lines[:3]).lower()
+            if any(term in first_three for term in ['i will', 'i have created', 'i\'ll provide', 'i\'ll write']):
+                # This looks like an explanation, skip the first few lines
+                start_idx = 0
+                for i, line in enumerate(clean_lines):
+                    if i >= 3 and not any(term in line.lower() for term in ['i will', 'created', 'write', 'provide']):
+                        start_idx = i
+                        break
+                clean_lines = clean_lines[start_idx:]
+
+            # Check for explanation blocks at the end
+            last_three = ' '.join(clean_lines[-3:]).lower()
+            if any(term in last_three for term in ['hope this', 'these lyrics', 'as you can see', 'this song', 'i have']):
+                # This looks like an explanation at the end, truncate
+                end_idx = len(clean_lines)
+                for i in range(len(clean_lines) - 1, max(0, len(clean_lines) - 4), -1):
+                    if i < len(clean_lines) and not any(term in clean_lines[i].lower() for term in
+                                                        ['hope', 'these lyrics', 'as you can see', 'this song']):
+                        end_idx = i + 1
+                        break
+                clean_lines = clean_lines[:end_idx]
+
+        # 8. Cleanup - Remove remaining annotations or thinking
+        for i in range(len(clean_lines)):
+            # Remove trailing thoughts/annotations
+            clean_lines[i] = re.sub(r'\s+//.*$', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\s+\(.*?\)$', '', clean_lines[i])
+
+            # Remove thinking tags completely
+            clean_lines[i] = re.sub(r'<think>.*?</think>', '', clean_lines[i], flags=re.DOTALL)
+            clean_lines[i] = re.sub(r'\[thinking\].*?\[/thinking\]', '', clean_lines[i], flags=re.DOTALL)
+            clean_lines[i] = re.sub(r'<think>', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'</think>', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\[thinking\]', '', clean_lines[i])
+            clean_lines[i] = re.sub(r'\[/thinking\]', '', clean_lines[i])
+
+            # Remove syllable count annotations
+            clean_lines[i] = re.sub(r'\s*\(\d+\s*syllables?\)', '', clean_lines[i])
+
+        # 9. Filter out any remaining empty lines after tag removal
+        clean_lines = [line for line in clean_lines if line.strip() and not line.isspace()]
+
+        # 10. NEW: Apply strict syllable enforcement - split or truncate lines that are too long
+        # This is a critical step to ensure no line exceeds our max syllable count
+        if lyric_templates:
+            max_allowed_syllables = min(7, max([t.get('max_expected', 6) for t in lyric_templates]))
+        else:
+            max_allowed_syllables = 6
+
+        clean_lines = enforce_syllable_limits(clean_lines, max_allowed_syllables)
+
+        # 11. NEW: Check for template copying or clichéd phrases
+        cliched_patterns = [
+            r'moonlight (shimmers?|falls?|dances?)',
+            r'shadows? (dance|play|fall|stretch)',
+            r'time slips? away',
+            r'whispers? (fade|in the)',
+            r'silence speaks',
+            r'stars? shine',
+            r'hearts? beat',
+            r'footsteps (fade|echo)',
+            r'gentle wind',
+            r'(old|empty) (roads?|chair)',
+            r'night (holds?|falls?)',
+            r'memories fade',
+            r'dreams (linger|drift)'
+        ]
+
+        cliche_count = 0
+        for line in clean_lines:
+            for pattern in cliched_patterns:
+                if re.search(pattern, line.lower()):
+                    cliche_count += 1
+                    break
+
+        # Calculate percentage of clichéd lines
+        if clean_lines:
+            cliche_percentage = (cliche_count / len(clean_lines)) * 100
+        else:
+            cliche_percentage = 0
+
+        # 12. 
If we have lyric templates, ensure we have the correct number of lines + if lyric_templates: + num_required = len(lyric_templates) + + # If we have too many lines, keep just the best ones + if len(clean_lines) > num_required: + # Keep the first num_required lines + clean_lines = clean_lines[:num_required] + + # If we don't have enough lines, generate placeholders that fit the syllable count + while len(clean_lines) < num_required: + i = len(clean_lines) + if i < len(lyric_templates): + template = lyric_templates[i] + target_syllables = min(max_allowed_syllables - 1, (template.get('min_expected', 2) + template.get('max_expected', 6)) // 2) + + # Generate more creative, contextual placeholders with specificity + # Avoid clichés like "moonlight shimmers" or "time slips away" + specific_placeholders = { + # 2-3 syllables - specific, concrete phrases + 2: [ + "Phone rings twice", + "Dogs bark loud", + "Keys dropped here", + "Train rolls by", + "Birds take flight" + ], + # 3-4 syllables - specific contexts + 3: [ + "Coffee gets cold", + "Fan blades spin", + "Pages turn slow", + "Neighbors talk", + "Radio hums soft" + ], + # 4-5 syllables - specific details + 4: [ + "Fingers tap table", + "Taxi waits in rain", + "Laptop screen blinks", + "Ring left on sink", + "Church bells ring loud" + ], + # 5-6 syllables - context rich + 5: [ + "Letters with no stamps", + "Watch shows wrong time", + "Jeans with torn knees", + "Dog barks next door", + "Smoke alarm beeps" + ] + } - # Calculate confidence based on peak prominence - if peak_idx < len(ac): - peak_height = ac[peak_idx] - local_prominence = peak_height / np.mean(ac[max(0, peak_idx-2):min(len(ac), peak_idx+3)]) - time_sig_confidence = min(95, 60 + 35 * local_prominence) # Scale between 60-95% + # Make theme and emotion specific placeholders to add to the list + theme_specific = [] + if theme.lower() in ["love", "relationship", "romance"]: + theme_specific = ["Lipstick on glass", "Text left on read", "Scent on your coat"] + elif theme.lower() in ["loss", "grief", "sadness"]: + theme_specific = ["Chair sits empty", "Photos face down", "Clothes in closet"] + elif theme.lower() in ["hope", "inspiration", "triumph"]: + theme_specific = ["Seeds start to grow", "Finish line waits", "New day breaks through"] - # Map common cycle lengths to time signatures with improved musical theory - if N == 2: - time_signature = 2 # Clear binary meter (2/4, 2/2, etc.) - time_sig_confidence += 5 # Boost for simple meter - elif N == 3: - time_signature = 3 # Clear triple meter (3/4, 3/8, etc.) 
- time_sig_confidence += 5 # Boost for simple meter - elif 4 <= N <= 5: - time_signature = N # Direct mapping for common cases (4/4 or 5/4) - elif N == 6: - # Could be 6/8 (compound duple) or 3/4 with subdivisions - # Further analyze to distinguish - group_3_count = 0 - for i in range(0, len(beat_strengths) - 6, 3): - if i + 2 < len(beat_strengths): - if beat_strengths[i] > beat_strengths[i+1] and beat_strengths[i] > beat_strengths[i+2]: - group_3_count += 1 + # Get the closest matching syllable group + closest_group = min(specific_placeholders.keys(), key=lambda k: abs(k - target_syllables)) + + # Create pool of available placeholders from both specific and theme specific options + all_placeholders = specific_placeholders[closest_group] + theme_specific + + # Choose a placeholder that hasn't been used yet + available_placeholders = [p for p in all_placeholders if p not in clean_lines] + + if available_placeholders: + # Use modulo for more variation + idx = (i * 17 + len(clean_lines) * 13) % len(available_placeholders) + placeholder = available_placeholders[idx] + else: + # If we've used all placeholders, create something random and specific + subjects = ["Car", "Dog", "Kid", "Clock", "Phone", "Tree", "Book", "Door", "Light"] + verbs = ["waits", "moves", "stops", "falls", "breaks", "turns", "sleeps"] - group_2_count = 0 - for i in range(0, len(beat_strengths) - 4, 2): - if i + 1 < len(beat_strengths): - if beat_strengths[i] > beat_strengths[i+1]: - group_2_count += 1 - - # Determine if it's grouped in 2s or 3s - time_signature = 3 if group_3_count > group_2_count else 6 - elif N == 8: - time_signature = 4 # 4/4 with embellishments - elif N == 5 or N == 7: - time_signature = N # Odd time signatures like 5/4 or 7/8 - - # STEP 6: Enhanced phrase detection with adaptive thresholds and scientific justification - phrases = [] - current_phrase = [] - - if len(beat_times) > 0: - # Calculate adaptive thresholds using percentiles instead of fixed ratios - if len(beat_strengths) > 4: - # Define thresholds based on distribution rather than fixed values - strong_threshold = np.percentile(beat_strengths, 75) # Top 25% are "strong" beats - # For gaps, calculate significant deviation using z-scores if we have intervals - if intervals: - mean_interval = np.mean(intervals) - std_interval = np.std(intervals) - # A significant gap is > 1.5 standard deviations above mean (95th percentile) - significant_gap = mean_interval + (1.5 * std_interval) if std_interval > 0 else mean_interval * 1.3 - else: - significant_gap = 0 - else: - # Fallback for limited data - strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0 - significant_gap = 0 - - # Identify phrase boundaries with improved musical heuristics - for i in range(len(beat_times)): - current_phrase.append(i) - - # Check for phrase boundary conditions - if i < len(beat_times) - 1: - # Strong beat coming up (using adaptive threshold) - is_stronger_next = False - if i < len(beat_strengths) - 1: - is_stronger_next = beat_strengths[i+1] > strong_threshold and beat_strengths[i+1] > beat_strengths[i] * 1.1 - - # Significant gap (using adaptive threshold) - is_longer_gap = False - if i < len(beat_times) - 1 and intervals and i < len(intervals): - is_longer_gap = intervals[i] > significant_gap - - # Measure boundary based on time signature - is_measure_boundary = (i + 1) % time_signature == 0 and i > 0 - - # Check for significant dip in onset strength (phrase boundary often has reduced energy) - is_energy_dip = False - if i < len(beat_strengths) - 
1: - onset_ratio = beat_strengths[i+1] / max(beat_strengths[i], 0.001) - is_energy_dip = onset_ratio < 0.6 - - # Combined decision for phrase boundary with scientific weighting - phrase_boundary_score = ( - (1.5 if is_stronger_next else 0) + - (2.0 if is_longer_gap else 0) + - (1.0 if is_measure_boundary else 0) + - (0.5 if is_energy_dip else 0) - ) - - if (phrase_boundary_score >= 1.5 and len(current_phrase) >= 2) or \ - (is_measure_boundary and len(current_phrase) >= time_signature): - phrases.append(current_phrase) - current_phrase = [] - - # Add the last phrase if not empty - if current_phrase and len(current_phrase) >= 2: - phrases.append(current_phrase) - - # Ensure we have at least one phrase - if not phrases and len(beat_times) >= 2: - # Default to grouping by measures based on detected time signature - for i in range(0, len(beat_times), time_signature): - end = min(i + time_signature, len(beat_times)) - if end - i >= 2: # Ensure at least 2 beats per phrase - phrases.append(list(range(i, end))) - - # Calculate beat periodicity (average time between beats) - beat_periodicity = np.mean(intervals) if intervals else (60 / tempo) - - # Return enhanced results with scientific confidence metrics - return { - "tempo": tempo, - "tempo_confidence": best_confidence, # New scientific confidence metric - "time_signature": time_signature, - "time_sig_confidence": time_sig_confidence, # New scientific confidence metric - "beat_frames": beat_frames, - "beat_times": beat_times, - "beat_count": len(beat_times), - "beat_strengths": beat_strengths, - "intervals": intervals, - "phrases": phrases, - "beat_periodicity": beat_periodicity, - "beat_entropy": beat_entropy # New scientific measure of rhythm complexity - } - -def detect_beats_and_subbeats(y, sr, subdivision=4): - """ - Detect main beats and interpolate subbeats between consecutive beats. - - Parameters: - y: Audio time series - sr: Sample rate - subdivision: Number of subdivisions between beats (default: 4 for quarter beats) + # Ensure randomness with seed that changes with each call + import random + random.seed(len(clean_lines) * 27 + i * 31) + + subj = random.choice(subjects) + verb = random.choice(verbs) + + placeholder = f"{subj} {verb}" + else: + placeholder = "Page turns slow" + + clean_lines.append(placeholder) - Returns: - Dictionary containing beat times, subbeat times, and tempo information - """ - # Detect main beats using librosa - try: - tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr) - beat_times = librosa.frames_to_time(beat_frames, sr=sr) + # Assemble final lyrics + final_lyrics = '\n'.join(clean_lines) - # Convert numpy values to native Python types - if isinstance(tempo, np.ndarray) or isinstance(tempo, np.number): - tempo = float(tempo) + # Add a warning if we detected too many clichés + if cliche_percentage >= 40: + final_lyrics = f"""WARNING: These lyrics contain several overused phrases and clichés. +Try regenerating for more original content. + +{final_lyrics}""" - # Convert beat_times to a list of floats - if isinstance(beat_times, np.ndarray): - beat_times = [float(t) for t in beat_times] + # 13. Final sanity check - if we have nothing or garbage, return an error + if not final_lyrics or len(final_lyrics) < 10: + return "The model generated only thinking content but no actual lyrics. Please try again." 
+ + return final_lyrics + except Exception as e: - print(f"Error in beat detection: {e}") - # Default fallbacks - tempo = 120.0 - beat_times = [] + error_msg = f"Error generating lyrics: {str(e)}" + print(error_msg) + return error_msg + +def analyze_lyrics_rhythm_match(lyrics, lyric_templates, genre="pop"): + """Analyze how well the generated lyrics match the beat patterns and syllable requirements""" + if not lyric_templates or not lyrics: + return "No beat templates or lyrics available for analysis." - # Create subbeats by interpolating between main beats - subbeat_times = [] + # Split lyrics into lines + lines = lyrics.strip().split('\n') + lines = [line for line in lines if line.strip()] # Remove empty lines + + # Prepare analysis result + result = "### Beat & Syllable Match Analysis\n\n" + result += "| Line | Syllables | Target Range | Match | Stress Pattern |\n" + result += "| ---- | --------- | ------------ | ----- | -------------- |\n" + + # Maximum number of lines to analyze (either all lines or all templates) + line_count = min(len(lines), len(lyric_templates)) + + # Track overall match statistics + total_matches = 0 + total_range_matches = 0 + total_stress_matches = 0 + total_stress_percentage = 0 + total_ideal_matches = 0 + + for i in range(line_count): + line = lines[i] + template = lyric_templates[i] + + # Check match between line and template with genre awareness + check_result = beat_analyzer.check_syllable_stress_match(line, template, genre) + + # Get match symbols + if check_result["close_to_ideal"]: + syllable_match = "✓" # Ideal or very close + elif check_result["within_range"]: + syllable_match = "✓*" # Within range but not ideal + else: + syllable_match = "✗" # Outside range + + stress_match = "✓" if check_result["stress_matches"] else f"{int(check_result['stress_match_percentage']*100)}%" + + # Update stats + if check_result["close_to_ideal"]: + total_matches += 1 + total_ideal_matches += 1 + elif check_result["within_range"]: + total_range_matches += 1 + + if check_result["stress_matches"]: + total_stress_matches += 1 + total_stress_percentage += check_result["stress_match_percentage"] + + # Create visual representation of the stress pattern + stress_visual = "" + for char in template['stress_pattern']: + if char == "S": + stress_visual += "X" # Strong + elif char == "M": + stress_visual += "x" # Medium + else: + stress_visual += "." 
# Weak + + # Add line to results table + result += f"| {i+1} | {check_result['syllable_count']} | {check_result['min_expected']}-{check_result['max_expected']} | {syllable_match} | {stress_visual} |\n" + + # Add summary statistics + if line_count > 0: + exact_match_rate = (total_matches / line_count) * 100 + range_match_rate = ((total_matches + total_range_matches) / line_count) * 100 + ideal_match_rate = (total_ideal_matches / line_count) * 100 + stress_match_rate = (total_stress_matches / line_count) * 100 + avg_stress_percentage = (total_stress_percentage / line_count) * 100 + + result += f"\n**Summary:**\n" + result += f"- Ideal or near-ideal syllable match rate: {exact_match_rate:.1f}%\n" + result += f"- Genre-appropriate syllable range match rate: {range_match_rate:.1f}%\n" + result += f"- Perfect stress pattern match rate: {stress_match_rate:.1f}%\n" + result += f"- Average stress pattern accuracy: {avg_stress_percentage:.1f}%\n" + result += f"- Overall rhythmic accuracy: {((range_match_rate + avg_stress_percentage) / 2):.1f}%\n" + + # Analyze sentence flow across lines + sentence_flow_analysis = analyze_sentence_flow(lines) + result += f"\n**Sentence Flow Analysis:**\n" + result += f"- Connected thought groups: {sentence_flow_analysis['connected_groups']} detected\n" + result += f"- Average lines per thought: {sentence_flow_analysis['avg_lines_per_group']:.1f}\n" + result += f"- Flow quality: {sentence_flow_analysis['flow_quality']}\n" + + # Add guidance on ideal distribution for syllables and sentence flow + result += f"\n**Syllable & Flow Guidance:**\n" + result += f"- Aim for {min([t.get('min_expected', 3) for t in lyric_templates])}-{max([t.get('max_expected', 7) for t in lyric_templates])} syllables per line\n" + result += f"- Break complete thoughts across 2-3 lines for natural flow\n" + result += f"- Connect your lyrics with sentence fragments that flow across lines\n" + result += f"- Use conjunctions, prepositions, and dependent clauses to connect lines\n" + + # Add genre-specific notes + result += f"\n**Genre Notes ({genre}):**\n" + + # Add appropriate genre notes based on genre + if genre.lower() == "pop": + result += "- Pop lyrics work well with thoughts spanning 2-3 musical phrases\n" + result += "- Create flow by connecting lines with transitions like 'as', 'when', 'through'\n" + elif genre.lower() == "rock": + result += "- Rock lyrics benefit from short phrases that build into complete thoughts\n" + result += "- Use line breaks strategically to emphasize key words\n" + elif genre.lower() == "country": + result += "- Country lyrics tell stories that flow naturally across multiple lines\n" + result += "- Connect narrative elements across phrases for authentic storytelling\n" + elif genre.lower() == "disco": + result += "- Disco lyrics work well with phrases that create rhythmic momentum\n" + result += "- Use line transitions that maintain energy and flow\n" + elif genre.lower() == "metal": + result += "- Metal lyrics can create intensity by breaking phrases at dramatic points\n" + result += "- Connect lines to build tension and release across measures\n" + else: + result += "- This genre works well with connected thoughts across multiple lines\n" + result += "- Aim for natural speech flow rather than complete thoughts per line\n" - # Early return if no beats detected - if not beat_times or len(beat_times) < 2: + return result + +def analyze_sentence_flow(lines): + """Analyze how well the lyrics create sentence flow across multiple lines""" + if not lines or len(lines) < 
2: return { - "tempo": float(tempo) if tempo is not None else 120.0, - "beat_times": beat_times, - "subbeat_times": [] + "connected_groups": 0, + "avg_lines_per_group": 0, + "flow_quality": "Insufficient lines to analyze" } - for i in range(len(beat_times) - 1): - # Get current and next beat time - try: - current_beat = float(beat_times[i]) - next_beat = float(beat_times[i + 1]) - except (IndexError, ValueError, TypeError): - continue - - # Calculate time interval between beats - interval = (next_beat - current_beat) / subdivision - - # Add the main beat - subbeat_times.append({ - "time": float(current_beat), - "type": "main", - "strength": 1.0, - "beat_index": i - }) + # Simplified analysis looking for grammatical clues of sentence continuation + continuation_starters = [ + 'and', 'but', 'or', 'nor', 'for', 'yet', 'so', # Coordinating conjunctions + 'as', 'when', 'while', 'before', 'after', 'since', 'until', 'because', 'although', 'though', # Subordinating conjunctions + 'with', 'without', 'through', 'throughout', 'beyond', 'beneath', 'under', 'over', 'into', 'onto', # Prepositions + 'to', 'from', 'by', 'at', 'in', 'on', 'of', # Common prepositions + 'where', 'how', 'who', 'whom', 'whose', 'which', 'that', # Relative pronouns + 'if', 'then', # Conditional connectors + ] + + # Check for lines that likely continue a thought from previous line + connected_lines = [] + potential_groups = [] + current_group = [0] # Start with first line + + for i in range(1, len(lines)): + # Check if line starts with a continuation word + words = lines[i].lower().split() - # Add subbeats - for j in range(1, subdivision): - subbeat_time = current_beat + j * interval - # Calculate strength based on position - # For 4/4 time, beat 3 is stronger than beats 2 and 4 - if j == subdivision // 2 and subdivision == 4: - strength = 0.8 # Stronger subbeat (e.g., beat 3 in 4/4) - else: - strength = 0.5 # Weaker subbeat + # Empty line or no words + if not words: + if len(current_group) > 1: # Only consider groups of 2+ lines + potential_groups.append(current_group.copy()) + current_group = [i] + continue - subbeat_times.append({ - "time": float(subbeat_time), - "type": "sub", - "strength": float(strength), - "beat_index": i, - "subbeat_index": j - }) - - # Add the last main beat - if beat_times: - try: - subbeat_times.append({ - "time": float(beat_times[-1]), - "type": "main", - "strength": 1.0, - "beat_index": len(beat_times) - 1 - }) - except (ValueError, TypeError): - # Skip if conversion fails - pass + # Check first word for continuation clues + first_word = words[0].strip(',.!?;:') + if first_word in continuation_starters: + connected_lines.append(i) + current_group.append(i) + # Check for absence of capitalization as continuation clue + elif not first_word[0].isupper() and first_word[0].isalpha(): + connected_lines.append(i) + current_group.append(i) + # Check if current line is very short (likely part of a continued thought) + elif len(words) <= 3 and i < len(lines) - 1: + # Look ahead to see if next line could be a continuation + if i+1 < len(lines): + next_words = lines[i+1].lower().split() + if next_words and next_words[0] in continuation_starters: + connected_lines.append(i) + current_group.append(i) + else: + # This might end a group + if len(current_group) > 1: # Only consider groups of 2+ lines + potential_groups.append(current_group.copy()) + current_group = [i] + else: + # This likely starts a new thought + if len(current_group) > 1: # Only consider groups of 2+ lines + 
potential_groups.append(current_group.copy()) + current_group = [i] + + # Add the last group if it has multiple lines + if len(current_group) > 1: + potential_groups.append(current_group) + + # Calculate metrics + connected_groups = len(potential_groups) + + if connected_groups > 0: + avg_lines_per_group = sum(len(group) for group in potential_groups) / connected_groups + + # Determine flow quality + if connected_groups >= len(lines) / 3 and avg_lines_per_group >= 2.5: + flow_quality = "Excellent - multiple connected thoughts across lines" + elif connected_groups >= len(lines) / 4 and avg_lines_per_group >= 2: + flow_quality = "Good - some connected thoughts across lines" + elif connected_groups > 0: + flow_quality = "Fair - limited connection between lines" + else: + flow_quality = "Poor - mostly independent lines" + else: + avg_lines_per_group = 0 + flow_quality = "Poor - no connected thoughts detected" return { - "tempo": float(tempo) if tempo is not None else 120.0, - "beat_times": beat_times, - "subbeat_times": subbeat_times + "connected_groups": connected_groups, + "avg_lines_per_group": avg_lines_per_group, + "flow_quality": flow_quality } -def map_beats_to_seconds(subbeat_times, duration, fps=1.0): - """ - Map beats and subbeats to second-level intervals. - - Parameters: - subbeat_times: List of dictionaries containing beat and subbeat information - duration: Total duration of the audio in seconds - fps: Frames per second (default: 1.0 for one-second intervals) - - Returns: - List of dictionaries, each containing beats within a time window - """ - # Safety check for input parameters - if not isinstance(subbeat_times, list): - print("Warning: subbeat_times is not a list") - subbeat_times = [] - - try: - duration = float(duration) - except (ValueError, TypeError): - print("Warning: duration is not convertible to float, defaulting to 30") - duration = 30.0 - - # Calculate number of time windows - num_windows = int(duration * fps) + 1 - - # Initialize time windows - time_windows = [] - - for i in range(num_windows): - # Calculate window boundaries - start_time = i / fps - end_time = (i + 1) / fps - - # Find beats and subbeats within this window - window_beats = [] - - for beat in subbeat_times: - # Safety check for beat object - if not isinstance(beat, dict): - continue - - # Safely access beat time - try: - beat_time = float(beat.get("time", 0)) - except (ValueError, TypeError): - continue - - if start_time <= beat_time < end_time: - # Safely extract beat properties with defaults - beat_type = beat.get("type", "sub") - if not isinstance(beat_type, str): - beat_type = "sub" - - # Safely handle strength - try: - strength = float(beat.get("strength", 0.5)) - except (ValueError, TypeError): - strength = 0.5 - - # Add beat to this window - window_beats.append({ - "time": beat_time, - "type": beat_type, - "strength": strength, - "relative_pos": (beat_time - start_time) / (1/fps) # Position within window (0-1) - }) - - # Add window to list - time_windows.append({ - "second": i, - "start": start_time, - "end": end_time, - "beats": window_beats - }) - - return time_windows - -def create_second_level_templates(sec_map, tempo, genre=None): +def enforce_syllable_limits(lines, max_syllables=6): """ - Create syllable templates for each second-level window. 
-def map_beats_to_seconds(subbeat_times, duration, fps=1.0):
-    """
-    Map beats and subbeats to second-level intervals.
-
-    Parameters:
-    subbeat_times: List of dictionaries containing beat and subbeat information
-    duration: Total duration of the audio in seconds
-    fps: Frames per second (default: 1.0 for one-second intervals)
-
-    Returns:
-    List of dictionaries, each containing beats within a time window
-    """
-    # Safety check for input parameters
-    if not isinstance(subbeat_times, list):
-        print("Warning: subbeat_times is not a list")
-        subbeat_times = []
-
-    try:
-        duration = float(duration)
-    except (ValueError, TypeError):
-        print("Warning: duration is not convertible to float, defaulting to 30")
-        duration = 30.0
-
-    # Calculate number of time windows
-    num_windows = int(duration * fps) + 1
-
-    # Initialize time windows
-    time_windows = []
-
-    for i in range(num_windows):
-        # Calculate window boundaries
-        start_time = i / fps
-        end_time = (i + 1) / fps
-
-        # Find beats and subbeats within this window
-        window_beats = []
-
-        for beat in subbeat_times:
-            # Safety check for beat object
-            if not isinstance(beat, dict):
-                continue
-
-            # Safely access beat time
-            try:
-                beat_time = float(beat.get("time", 0))
-            except (ValueError, TypeError):
-                continue
-
-            if start_time <= beat_time < end_time:
-                # Safely extract beat properties with defaults
-                beat_type = beat.get("type", "sub")
-                if not isinstance(beat_type, str):
-                    beat_type = "sub"
-
-                # Safely handle strength
-                try:
-                    strength = float(beat.get("strength", 0.5))
-                except (ValueError, TypeError):
-                    strength = 0.5
-
-                # Add beat to this window
-                window_beats.append({
-                    "time": beat_time,
-                    "type": beat_type,
-                    "strength": strength,
-                    "relative_pos": (beat_time - start_time) / (1/fps)  # Position within window (0-1)
-                })
-
-        # Add window to list
-        time_windows.append({
-            "second": i,
-            "start": start_time,
-            "end": end_time,
-            "beats": window_beats
-        })
-
-    return time_windows
-
-def create_second_level_templates(sec_map, tempo, genre=None):
+def enforce_syllable_limits(lines, max_syllables=6):
     """
-    Create syllable templates for each second-level window.
-
-    Parameters:
-    sec_map: List of second-level time windows with beat information
-    tempo: Tempo in BPM
-    genre: Optional genre for genre-specific adjustments
-
-    Returns:
-    List of template strings, one for each second
+    Enforce syllable limits by splitting lines that are too long.
+    Returns a modified list of lines where no line exceeds max_syllables
+    (a single word longer than the limit is kept on its own line).
     """
-    # Helper function to map tempo to base syllable count
-    def tempo_to_syllable_base(tempo):
-        """Continuous function mapping tempo to syllable base count"""
-        # Sigmoid-like function that smoothly transitions between syllable counts
-        if tempo > 180:
-            return 1.0
-        elif tempo > 140:
-            return 1.0 + (180 - tempo) * 0.02  # Gradual increase 1.0 → 1.8
-        elif tempo > 100:
-            return 1.8 + (140 - tempo) * 0.01  # Gradual increase 1.8 → 2.2
-        elif tempo > 70:
-            return 2.2 + (100 - tempo) * 0.02  # Gradual increase 2.2 → 2.8
-        else:
-            return 2.8 + max(0, (70 - tempo) * 0.04)  # Continue increasing for very slow tempos
-
-    # Calculate base syllable count from tempo
-    base_syllables = tempo_to_syllable_base(tempo)
-
-    # Apply genre-specific adjustments
-    genre_factor = 1.0
-    if genre:
-        genre_lower = genre.lower()
-        if any(term in genre_lower for term in ["rap", "hip hop", "hip-hop"]):
-            genre_factor = 1.4  # Much higher syllable density for rap
-        elif any(term in genre_lower for term in ["folk", "country", "ballad"]):
-            genre_factor = 0.8  # Lower density for folk styles
+    if not lines:
+        return []

-    # Create templates for each second
-    templates = []
+    result_lines = []

-    for window in sec_map:
-        beats = window["beats"]
-
-        # If no beats in this second, create a default template
-        if not beats:
-            templates.append("w(0.5):1")
+    for line in lines:
+        words = line.split()
+        if not words:
             continue
-
-        # Create beat patterns for this second
-        beat_patterns = []
-
-        for beat in beats:
-            # Ensure we're dealing with a dictionary and that it has a "strength" key
-            if not isinstance(beat, dict):
-                continue  # Skip this beat if it's not a dictionary

-            # Safely get beat type and strength
-            if "type" not in beat or not isinstance(beat["type"], str):
-                beat_type = "w"  # Default to weak if type is missing or not a string
-            else:
-                beat_type = "S" if beat["type"] == "main" else "m" if beat.get("strength", 0) >= 0.7 else "w"
+        # Count syllables in the line
+        syllable_count = sum(beat_analyzer.count_syllables(word) for word in words)
+
+        # If within limits, keep the line as is
+        if syllable_count <= max_syllables:
+            result_lines.append(line)
+            continue

-            # Safely get strength value with fallback
-            try:
-                strength = float(beat.get("strength", 0.5))
-            except (ValueError, TypeError):
-                strength = 0.5  # Default if conversion fails
+        # Line is too long - we need to split it
+        current_line = []
+        current_syllables = 0
+
+        for word in words:
+            word_syllables = beat_analyzer.count_syllables(word)

-            # Adjust syllable count based on beat type and strength
-            if beat_type == "S":
-                syllable_factor = 1.2  # More syllables for strong beats
-            elif beat_type == "m":
-                syllable_factor = 1.0  # Normal for medium beats
+            # If adding this word would exceed the limit, start a new line
+            if current_syllables + word_syllables > max_syllables and current_line:
+                result_lines.append(" ".join(current_line))
+                current_line = [word]
+                current_syllables = word_syllables
             else:
-                syllable_factor = 0.8  # Fewer for weak beats
-
-            # Calculate final syllable count
-            syllable_count = base_syllables * syllable_factor * genre_factor
-
-            # Round to half-syllable precision
-            syllable_count = round(syllable_count * 2) / 2
-
-            # Ensure reasonable limits
-            syllable_count = max(0.5, min(4, syllable_count))
+                # Add the word to the current line
+                current_line.append(word)
+                current_syllables += word_syllables
+
+        # Don't forget the last line if there are words left
+        if current_line:
+            result_lines.append(" ".join(current_line))
+
+    return result_lines
+
+# Create Gradio interface
+def create_interface():
+    with gr.Blocks(title="Music Analysis & Lyrics Generator") as demo:
+        gr.Markdown("# Music Analysis & Lyrics Generator")
+        gr.Markdown("Upload a music file or record audio to analyze it and generate matching lyrics")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                audio_input = gr.Audio(
+                    label="Upload or Record Audio",
+                    type="filepath",
+                    sources=["upload", "microphone"]
+                )
+                analyze_btn = gr.Button("Analyze and Generate Lyrics", variant="primary")
-            # Format with embedded strength value
-            strength_pct = round(strength * 100) / 100
-            beat_patterns.append(f"{beat_type}({strength_pct}):{syllable_count}")
+            with gr.Column(scale=2):
+                with gr.Tab("Analysis"):
+                    analysis_output = gr.Textbox(label="Music Analysis Results", lines=10)
+
+                    with gr.Row():
+                        tempo_output = gr.Number(label="Tempo (BPM)")
+                        time_sig_output = gr.Textbox(label="Time Signature")
+                        emotion_output = gr.Textbox(label="Primary Emotion")
+                        theme_output = gr.Textbox(label="Primary Theme")
+                        genre_output = gr.Textbox(label="Primary Genre")
+
+                with gr.Tab("Generated Lyrics"):
+                    lyrics_output = gr.Textbox(label="Generated Lyrics", lines=20)
+
+                with gr.Tab("Beat Matching"):
+                    beat_match_output = gr.Markdown(label="Beat & Syllable Matching Analysis")
+
+        # Set up event handlers
+        analyze_btn.click(
+            fn=process_audio,
+            inputs=[audio_input],
+            outputs=[analysis_output, lyrics_output, tempo_output, time_sig_output,
+                     emotion_output, theme_output, genre_output, beat_match_output]
+        )
-        # Join patterns with dashes - ensure we have at least one pattern
-        if not beat_patterns:
-            templates.append("w(0.5):1")  # Default if no valid patterns were created
-        else:
-            second_template = "-".join(beat_patterns)
-            templates.append(second_template)
+        # Format supported genres for display
+        supported_genres_md = "\n".join([f"- {genre.capitalize()}" for genre in beat_analyzer.supported_genres])
+
+        gr.Markdown(f"""
+        ## How it works
+        1. Upload or record a music file
+        2. The system analyzes tempo, beats, time signature and other musical features
+        3. It detects emotion, theme, and music genre
+        4. Using beat patterns and syllable stress analysis, it generates lyrics aligned to the beat
+        5. Each line of the lyrics is matched to the beat pattern of the corresponding musical phrase
+
+        ## Supported Genres
+        **Note:** Lyrics generation is currently only supported for the following genres:
+        {supported_genres_md}
+
+        These genres have consistent syllable-to-beat patterns that work well with our algorithm.
+        For other genres, only music analysis will be provided.
+        """)
-    return templates
-
-def detect_sections(y, sr):
-    """
-    Detect musical segments without classifying them by type (verse, chorus, etc.).
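# ---- [Editor's illustration - not part of the patch] ----
# Rough behavior of the enforce_syllable_limits() helper added above, shown
# here with a naive vowel-group counter standing in for
# beat_analyzer.count_syllables (whose exact rules live in beat_analysis.py):

import re

def naive_count(word):
    # Count runs of vowels as syllables; crude but adequate for a demo
    return max(1, len(re.findall(r'[aeiouy]+', word.lower())))

def split_line(line, max_syllables=6):
    out, cur, cur_syll = [], [], 0
    for word in line.split():
        s = naive_count(word)
        if cur_syll + s > max_syllables and cur:
            out.append(" ".join(cur))
            cur, cur_syll = [word], s
        else:
            cur.append(word)
            cur_syll += s
    if cur:
        out.append(" ".join(cur))
    return out

print(split_line("every shadow follows me across the silent water"))
# -> ['every shadow', 'follows me across the', 'silent water']
# ---- [End illustration] ----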
- - Parameters: - y: Audio time series - sr: Sample rate - - Returns: - A list of section dictionaries with start time, end time, and duration - """ - # Step 1: Extract rich feature set for comprehensive analysis - # ---------------------------------------------------------------------- - hop_length = 512 # Common hop length for feature extraction - - # Spectral features - S = np.abs(librosa.stft(y, hop_length=hop_length)) - contrast = librosa.feature.spectral_contrast(S=S, sr=sr) - - # Harmonic features with CQT-based chroma (better for harmonic analysis) - chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length) - - # Timbral features - mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length) - - # Energy features - rms = librosa.feature.rms(y=y, hop_length=hop_length) - - # Harmonic-percussive source separation for better rhythm analysis - y_harmonic, y_percussive = librosa.effects.hpss(y) - - # Step 2: Adaptive determination of segment count based on song complexity - # ---------------------------------------------------------------------- - duration = librosa.get_duration(y=y, sr=sr) - - # Feature preparation for adaptive segmentation - # Stack features with proper normalization (addressing the scale issue) - feature_stack = np.vstack([ - librosa.util.normalize(contrast), - librosa.util.normalize(chroma), - librosa.util.normalize(mfcc), - librosa.util.normalize(rms) - ]) - - # Transpose to get time as first dimension - feature_matrix = feature_stack.T - - # Step 3: Feature fusion using dimensionality reduction - # ---------------------------------------------------------------------- - from sklearn.decomposition import PCA - - # Handle very short audio files - n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1]) - - if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0: - try: - pca = PCA(n_components=n_components) - reduced_features = pca.fit_transform(feature_matrix) - except Exception as e: - print(f"PCA failed, falling back to original features: {e}") - # Fallback to simpler approach if PCA fails - reduced_features = feature_matrix - else: - # Not enough data for PCA - reduced_features = feature_matrix - - # Step 4: Adaptive determination of optimal segment count - # ---------------------------------------------------------------------- - - # Initialize range of segment counts to try - min_segments = max(2, int(duration / 60)) # At least 2 segments, roughly 1 per minute - max_segments = min(10, int(duration / 20)) # At most 10 segments, roughly 1 per 20 seconds - - # Ensure reasonable bounds - min_segments = max(2, min(min_segments, 4)) - max_segments = max(min_segments + 1, min(max_segments, 8)) - - # Try different segment counts and evaluate with silhouette score - best_segments = min_segments - best_score = -1 - - from sklearn.metrics import silhouette_score - from sklearn.cluster import AgglomerativeClustering - - # Only do this analysis if we have enough data - if reduced_features.shape[0] > max_segments: - for n_segments in range(min_segments, max_segments + 1): - try: - # Perform agglomerative clustering - clustering = AgglomerativeClustering(n_clusters=n_segments) - labels = clustering.fit_predict(reduced_features) - - # Calculate silhouette score if we have enough samples - if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1: - score = silhouette_score(reduced_features, labels) - - if score > best_score: - best_score = score - best_segments = n_segments - except Exception as e: - 
print(f"Clustering with {n_segments} segments failed: {e}") - continue - - # Use the optimal segment count for final segmentation - n_segments = best_segments - - # Step 5: Final segmentation using the optimal segment count - # ---------------------------------------------------------------------- - - # Method 1: Use agglomerative clustering on the reduced features - try: - clustering = AgglomerativeClustering(n_clusters=n_segments) - labels = clustering.fit_predict(reduced_features) - - # Convert cluster labels to boundaries by finding where labels change - boundaries = [0] # Start with the beginning - - for i in range(1, len(labels)): - if labels[i] != labels[i-1]: - boundaries.append(i) - - boundaries.append(len(labels)) # Add the end - - # Convert to frames - bounds_frames = np.array(boundaries) - - except Exception as e: - print(f"Final clustering failed: {e}") - # Fallback to librosa's agglomerative clustering on original features - bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments) - - # Step 6: Convert boundaries to time and create sections - # ---------------------------------------------------------------------- - bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length) - - # Create sections from the boundaries - sections = [] - - for i in range(len(bounds_times) - 1): - start = bounds_times[i] - end = bounds_times[i+1] - duration = end - start - - # Skip extremely short sections - if duration < 4 and i > 0 and i < len(bounds_times) - 2: - continue - - # Add section to the list (without classifying as verse/chorus/etc) - sections.append({ - "type": "segment", # Generic type instead of verse/chorus/etc - "start": start, - "end": end, - "duration": duration - }) - - # Filter out any remaining extremely short sections - sections = [s for s in sections if s["duration"] >= 5] - - return sections - -def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'): - """ - Create enhanced syllable templates based on beat patterns with improved musical intelligence. 
- - Parameters: - beats_info: Dictionary containing beat analysis data - genre: Optional genre to influence template creation - phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation - - Returns: - String of syllable templates with embedded strength values and flexible timing - """ - import numpy as np - from sklearn.cluster import KMeans - - # Convert any numpy values to native Python types for safety - directly handle conversions - # Process the dictionary to convert numpy values to Python native types - if isinstance(beats_info, dict): - processed_beats_info = {} - for k, v in beats_info.items(): - if isinstance(v, np.ndarray): - if v.size == 1: - processed_beats_info[k] = float(v.item()) - else: - processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v] - elif isinstance(v, np.number): - processed_beats_info[k] = float(v) - elif isinstance(v, list): - processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v] - else: - processed_beats_info[k] = v - beats_info = processed_beats_info - - # Extract basic beat information - beat_times = beats_info.get("beat_times", []) - beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times)) - tempo = beats_info.get("tempo", 120) - time_signature = beats_info.get("time_signature", 4) - - # Early return for insufficient data - if len(beat_times) < 2: - return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1" # Default fallback pattern - - # Step 1: Improved adaptive thresholding using k-means clustering - # ---------------------------------------------------------------------- - if len(beat_strengths) >= 6: # Need enough data points for clustering - # Reshape for k-means - X = np.array(beat_strengths).reshape(-1, 1) - - # Use k-means with 3 clusters for Strong, Medium, Weak classification - kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X) - - # Find the centroid values and sort them - centroids = sorted([float(c[0]) for c in kmeans.cluster_centers_]) - - # Map to thresholds (using the midpoints between centroids) - if len(centroids) >= 3: - medium_threshold = (centroids[0] + centroids[1]) / 2 - strong_threshold = (centroids[1] + centroids[2]) / 2 - else: - # Fallback if clustering doesn't work well - medium_threshold = np.percentile(beat_strengths, 33) - strong_threshold = np.percentile(beat_strengths, 66) - else: - # For limited data, use percentile-based approach - medium_threshold = np.percentile(beat_strengths, 33) - strong_threshold = np.percentile(beat_strengths, 66) - - # Step 2: Create or refine phrases based on mode - # ---------------------------------------------------------------------- - phrases = beats_info.get("phrases", []) - - if phrase_mode == 'auto' or not phrases: - # Create phrases based on time signature and beat strengths - phrases = [] - current_phrase = [] - - for i in range(len(beat_times)): - current_phrase.append(i) - - # Check for natural phrase endings - if (i + 1) % time_signature == 0 or i == len(beat_times) - 1: - if len(current_phrase) >= 2: # Ensure minimum phrase length - phrases.append(current_phrase) - current_phrase = [] - - # Add any remaining beats - if current_phrase and len(current_phrase) >= 2: - phrases.append(current_phrase) - - # Step 3: Improved continuous tempo-to-syllable mapping function - # ---------------------------------------------------------------------- - def tempo_to_syllable_base(tempo): - """Continuous function mapping tempo to syllable base count with scientific curve""" - # Sigmoid-like function with 
more scientific parameters - # Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions - if tempo < 40: # Very slow tempos - return 3.5 # Maximum syllables for extremely slow tempos - elif tempo > 200: # Very fast tempos - return 0.8 # Minimum syllables for extremely fast tempos - else: - # Scientific logistic function for middle range (40-200 BPM) - L = 3.5 # Upper limit - k = 0.04 # Steepness of curve - x0 = 120 # Midpoint (inflection point at normal tempo) - return L / (1 + np.exp(k * (tempo - x0))) - - # Step 4: Generate enhanced templates with flexible timing - # ---------------------------------------------------------------------- - syllable_templates = [] - - for phrase in phrases: - # Skip empty phrases - if not phrase: - continue - - # Extract beat strengths for this phrase - phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)] - if not phrase_strengths: - phrase_strengths = [1.0] * len(phrase) - - # Apply improved adaptive thresholding for stress pattern detection - stress_pattern = [] - for i, strength in enumerate(phrase_strengths): - # Consider both strength and metrical position with improved weighting - metrical_position = i % time_signature - - # Apply improved position boosting based on musical theory - # In common time signatures, first beat gets strong emphasis, - # third beat gets moderate emphasis (in 4/4) - if metrical_position == 0: # Downbeat (first beat) - position_boost = 0.18 # Stronger boost for downbeats - elif time_signature == 4 and metrical_position == 2: # Third beat in 4/4 - position_boost = 0.1 # Moderate boost for third beat - elif time_signature == 3 and metrical_position == 1: # Second beat in 3/4 - position_boost = 0.05 # Slight boost for second beat in 3/4 - else: - position_boost = 0 # No boost for other beats - - effective_strength = strength + position_boost - - if effective_strength >= strong_threshold: - stress_pattern.append(("S", effective_strength)) # Strong beat with strength - elif effective_strength >= medium_threshold: - stress_pattern.append(("m", effective_strength)) # Medium beat with strength - else: - stress_pattern.append(("w", effective_strength)) # Weak beat with strength - - # Step 5: Calculate syllable counts using improved continuous function - # ---------------------------------------------------------------------- - detailed_template = [] - - for i, (stress_type, strength) in enumerate(stress_pattern): - # Get base syllable count from tempo with more nuanced mapping - base_syllables = tempo_to_syllable_base(tempo) - - # Adjust based on both stress type AND metrical position - metrical_position = i % time_signature - position_factor = 1.2 if metrical_position == 0 else 1.0 - - # More nuanced adjustment based on stress type - if stress_type == "S": - syllable_factor = 1.2 * position_factor # Emphasize strong beats more - elif stress_type == "m": - syllable_factor = 1.0 * position_factor # Medium beats - else: - syllable_factor = 0.8 # Weak beats - - # Apply improved genre-specific adjustments with more granular factors - genre_factor = 1.0 - if genre: - genre = genre.lower() - if "rap" in genre or "hip" in genre: - genre_factor = 1.5 # Significantly higher syllable density for rap - elif "folk" in genre or "country" in genre or "ballad" in genre: - genre_factor = 0.7 # Lower density for folk styles - elif "metal" in genre or "rock" in genre: - genre_factor = 1.1 # Slightly higher density for rock/metal - elif "jazz" in genre: - genre_factor = 1.2 # Higher density for jazz (complex 
rhythms) - elif "classical" in genre: - genre_factor = 0.9 # More moderate for classical - - # Calculate adjusted syllable count with scientific weighting - raw_count = base_syllables * syllable_factor * genre_factor - - # Use more precise rounding that preserves subtle differences - # Round to quarters rather than halves for more precision - rounded_count = round(raw_count * 4) / 4 - - # Limit to reasonable range (0.5 to 4) with improved bounds - syllable_count = max(0.5, min(4, rounded_count)) - - # Format with embedded strength value for reversibility - # Convert strength to 2-decimal precision percentage - strength_pct = round(strength * 100) / 100 - detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}") - - # Join beat templates for this phrase - phrase_template = "-".join(detailed_template) - syllable_templates.append(phrase_template) - - # Step 6: Ensure valid output with improved defaults - # ---------------------------------------------------------------------- - if not syllable_templates: - # Create sensible defaults based on time signature that reflect musical theory - if time_signature == 3: # 3/4 time - waltz pattern - syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"] # 3/4 default - elif time_signature == 2: # 2/4 time - march pattern - syllable_templates = ["S(0.95):1.5-w(0.4):1"] # 2/4 default - else: # 4/4 time - common time - syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"] # 4/4 default - - # Join all phrase templates with the original separator for compatibility - return "|".join(syllable_templates) - -def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_wrap=10, - structured_output=False, beat_types=None): - """ - Convert technical syllable templates into clear, human-readable instructions with - enhanced flexibility and customization options. 
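# ---- [Editor's illustration - not part of the patch] ----
# The removed template builder above emits strings that pack beat type,
# strength, and syllable count together, e.g. "S(0.95):2-w(0.4):1", with "|"
# separating phrases. A small sketch of how such a string can be decoded
# (parse_template is an illustrative name, not part of the codebase):

import re

BEAT_RE = re.compile(r'([SmwXL])\(([\d.]+)\):([\d.]+)')

def parse_template(template):
    phrases = []
    for phrase in template.split('|'):
        beats = [
            {"type": t, "strength": float(s), "syllables": float(c)}
            for t, s, c in BEAT_RE.findall(phrase)
        ]
        phrases.append(beats)
    return phrases

print(parse_template("S(0.95):2-w(0.4):1"))
# -> [[{'type': 'S', 'strength': 0.95, 'syllables': 2.0},
#      {'type': 'w', 'strength': 0.4, 'syllables': 1.0}]]
# ---- [End illustration] ----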
- - Parameters: - syllable_templates: String or list of templates - arrow: Symbol to use between beats (default: "→") - line_wrap: Number of beats before automatic line wrapping (0 = no wrapping) - structured_output: If True, return structured data instead of text - beat_types: Custom mapping for beat types (default: None, uses standard mapping) - - Returns: - Human-readable instructions or structured data depending on parameters - """ - if not syllable_templates: - return {} if structured_output else "" - - # Define standard beat type mapping (extensible) - default_beat_types = { - "S": {"name": "STRONG", "description": "stressed syllable"}, - "m": {"name": "medium", "description": "medium-stressed syllable"}, - "w": {"name": "weak", "description": "unstressed syllable"}, - "X": {"name": "EXTRA", "description": "extra strong syllable"}, - "L": {"name": "legato", "description": "connected/tied syllable"} - } - - # Use custom mapping if provided, otherwise use default - beat_types = beat_types or default_beat_types - - # Initialize structured output if requested - structured_data = {"lines": [], "explanations": []} if structured_output else None - - # Improved format detection - more robust than just checking for "|" - is_enhanced_format = False - - # Check if it's a string with enhanced format patterns - if isinstance(syllable_templates, str): - # Look for enhanced format patterns - check for beat type indicators - if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates - for bt in beat_types.keys()): - is_enhanced_format = True - # Secondary check for the "|" delimiter between phrases - elif "|" in syllable_templates: - is_enhanced_format = True - - # Initialize the output with a brief explanatory header - output = [] - - if is_enhanced_format: - # Split into individual phrase templates - phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates] - - # Process each phrase into human-readable instructions - for i, phrase in enumerate(phrases): - # Check for special annotations - has_swing = "(swing)" in phrase - if has_swing: - phrase = phrase.replace("(swing)", "") # Remove annotation for processing - - beats = phrase.split("-") - beat_instructions = [] - - # Process each beat in the phrase - for j, beat in enumerate(beats): - # Extract beat type and information - beat_info = {"original": beat, "type": None, "count": None, "strength": None} - - # Handle enhanced format with embedded strength values: S(0.95):2 - if "(" in beat and ")" in beat and ":" in beat: - parts = beat.split(":") - beat_type = parts[0].split("(")[0] # Extract beat type - strength = parts[0].split("(")[1].rstrip(")") # Extract strength value - count = parts[1] # Extract syllable count - - beat_info["type"] = beat_type - beat_info["count"] = count - beat_info["strength"] = strength - - # Handle simpler format: S2, m1, w1 - elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1: - beat_type = beat[0] - count = beat[1:] - - beat_info["type"] = beat_type - beat_info["count"] = count - - # Fallback for any other format - else: - beat_instructions.append(beat) - continue - - # Format the beat instruction based on type - if beat_info["type"] in beat_types: - type_name = beat_types[beat_info["type"]]["name"] - if beat_info["strength"]: - beat_instructions.append(f"{type_name}({beat_info['count']}) [{beat_info['strength']}]") - else: - beat_instructions.append(f"{type_name}({beat_info['count']})") - else: - # Unknown beat 
type, use as-is - beat_instructions.append(beat) - - # Handle line wrapping for readability - if line_wrap > 0 and len(beat_instructions) > line_wrap: - wrapped_instructions = [] - for k in range(0, len(beat_instructions), line_wrap): - section = beat_instructions[k:k+line_wrap] - wrapped_instructions.append(f"{arrow} ".join(section)) - line_desc = f"\n {arrow} ".join(wrapped_instructions) - else: - line_desc = f" {arrow} ".join(beat_instructions) - - # Add swing notation if present - if has_swing: - line_desc += " [with swing feel]" - - # Add to output - line_output = f"Line {i+1}: {line_desc}" - output.append(line_output) - - if structured_output: - structured_data["lines"].append({ - "line_number": i+1, - "beats": [{"original": beats[j], - "type": beat_info.get("type"), - "count": beat_info.get("count"), - "strength": beat_info.get("strength")} - for j, beat_info in enumerate([b for b in beats if isinstance(b, dict)])], - "has_swing": has_swing - }) - - # Add explanation of notation after the lines - explanation = [ - "\n📝 UNDERSTANDING THE NOTATION:" - ] - - # Add descriptions for each beat type that was actually used - used_beat_types = set() - for phrase in phrases: - for beat in phrase.split("-"): - for bt in beat_types.keys(): - if beat.startswith(bt): - used_beat_types.add(bt) - - for bt in used_beat_types: - if bt in beat_types: - name = beat_types[bt]["name"] - desc = beat_types[bt]["description"] - explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables") - - explanation.extend([ - f"- {arrow}: Indicates flow from one beat to the next", - "- [0.xx]: Beat strength value (higher = more emphasis needed)" - ]) - - output.extend(explanation) - - if structured_output: - structured_data["explanations"] = explanation - - # Add examples for half-syllable values if they appear in the templates - has_half_syllables = any((".5" in beat) for phrase in phrases for beat in phrase.split("-")) - if has_half_syllables: - half_syllable_examples = [ - "\n🎵 HALF-SYLLABLE EXAMPLES:", - "- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable", - " Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick", - "- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables", - " Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick" - ] - output.extend(half_syllable_examples) - - if structured_output: - structured_data["half_syllable_examples"] = half_syllable_examples - - # Add swing explanation if needed - if any("swing" in phrase for phrase in phrases): - swing_guide = [ - "\n🎶 SWING RHYTHM GUIDE:", - "- In swing, syllables should be unevenly timed (long-short pattern)", - "- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay" - ] - output.extend(swing_guide) - - if structured_output: - structured_data["swing_guide"] = swing_guide - - # Handle the original format or segment dictionaries - else: - formatted_lines = [] - - if isinstance(syllable_templates, list): - for i, template in enumerate(syllable_templates): - if isinstance(template, dict) and "syllable_template" in template: - line = f"Line {i+1}: {template['syllable_template']} syllables" - formatted_lines.append(line) - - if structured_output: - structured_data["lines"].append({ - "line_number": i+1, - "syllable_count": template["syllable_template"] - }) - elif isinstance(template, str): - line = f"Line {i+1}: {template} syllables" - formatted_lines.append(line) - - if structured_output: - 
structured_data["lines"].append({ - "line_number": i+1, - "syllable_count": template - }) - - output = formatted_lines - else: - output = [str(syllable_templates)] - - if structured_output: - structured_data["raw_content"] = str(syllable_templates) - - # Add general application advice - application_tips = [ - "\n💡 APPLICATION TIPS:", - "1. Strong beats need naturally stressed syllables (like the START of \"RE-mem-ber\")", - "2. Place important words on strong beats for natural emphasis", - "3. Vowel sounds work best for sustained or emphasized syllables", - "4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats" - ] - output.extend(application_tips) - - if structured_output: - structured_data["application_tips"] = application_tips - return structured_data - - return "\n".join(output) - -def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=None): - """ - Enhanced verification of syllable counts and stress patterns with precise alignment analysis - for both phrase-level and second-level templates. - """ - import re - import pronouncing - import numpy as np - import functools - from itertools import chain - - print(f"DEBUG: In verify_flexible_syllable_counts, type of lyrics={type(lyrics)}") - print(f"DEBUG: Type of templates={type(templates)}") - - # Ensure lyrics is a string - if not isinstance(lyrics, str): - print(f"DEBUG: lyrics is not a string, it's {type(lyrics)}") - # Convert to string if possible - try: - lyrics = str(lyrics) - except Exception as e: - print(f"DEBUG: Cannot convert lyrics to string: {str(e)}") - return "Error: Cannot process non-string lyrics" - - # Ensure templates is a list - if not isinstance(templates, list): - print(f"DEBUG: templates is not a list, it's {type(templates)}") - # If it's not a list, create a single-item list - if templates is not None: - templates = [templates] - else: - templates = [] - - # Split lyrics into lines - lines = [line.strip() for line in lyrics.split("\n") if line.strip()] - - # Initialize tracking variables - verification_notes = [] - detailed_analysis = [] - stress_misalignments = [] - total_mismatch_count = 0 - - # Process each lyric line against its template - for i, line in enumerate(lines): - if i >= len(templates): - break - - template = templates[i] - print(f"DEBUG: Processing template {i+1}, type={type(template)}") - - # Extract the template string from different possible formats - template_str = None - if isinstance(template, dict) and "syllable_template" in template: - template_str = template["syllable_template"] - elif isinstance(template, str): - template_str = template - else: - print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template") - continue - - if not isinstance(template_str, str): - print(f"DEBUG: template_str is not a string, it's {type(template_str)}") - continue - - # Handle multiple phrases in template - process ALL phrases, not just the first - template_phrases = [template_str] - if "|" in template_str: - template_phrases = template_str.split("|") - - # Check against all phrases and find the best match - best_match_diff = float('inf') - best_match_phrase = None - best_phrase_beats = None - actual_count = count_syllables(line) - - for phrase_idx, phrase in enumerate(template_phrases): - # Extract beat patterns and expected syllable counts from template - beats_info = [] - total_expected = 0 - - # Enhanced template parsing - if "-" in phrase: - beat_templates = phrase.split("-") - - # Parse each beat template - for beat in beat_templates: - 
beat_info = {"original": beat, "type": None, "count": 1, "strength": None} - - # Handle templates with embedded strength values: S(0.95):2 - if "(" in beat and ")" in beat and ":" in beat: - parts = beat.split(":") - beat_type = parts[0].split("(")[0] - try: - strength = float(parts[0].split("(")[1].rstrip(")")) - except ValueError: - strength = 1.0 - - # Handle potential float syllable counts - try: - count = float(parts[1]) - # Convert to int if it's a whole number - if count == int(count): - count = int(count) - except ValueError: - count = 1 - - beat_info.update({ - "type": beat_type, - "count": count, - "strength": strength - }) - - # Handle simple format: S2, m1, w1 - elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]): - beat_type = beat[0] - - # Extract count, supporting float values - try: - count_str = beat[1:] - count = float(count_str) - if count == int(count): - count = int(count) - except ValueError: - count = 1 - - beat_info.update({ - "type": beat_type, - "count": count - }) - - # Legacy format - just numbers - else: - try: - count = float(beat) - if count == int(count): - count = int(count) - beat_info["count"] = count - except ValueError: - pass - - beats_info.append(beat_info) - total_expected += beat_info["count"] - - # Compare this phrase to actual syllable count - phrase_diff = abs(actual_count - total_expected) - - # Adaptive threshold based on expected syllables - expected_ratio = 0.15 if total_expected > 10 else 0.25 - phrase_threshold = max(1, round(total_expected * expected_ratio)) - - # If this is the best match so far, store it - if phrase_diff < best_match_diff: - best_match_diff = phrase_diff - best_match_phrase = phrase - best_phrase_beats = beats_info - - # For very simple templates without "-" - else: - try: - total_expected = float(phrase) - phrase_diff = abs(actual_count - total_expected) - if phrase_diff < best_match_diff: - best_match_diff = phrase_diff - best_match_phrase = phrase - best_phrase_beats = [{"count": total_expected}] - except ValueError: - pass - - # If we found a reasonable match, proceed with analysis - if best_match_phrase and best_phrase_beats: - total_expected = sum(beat["count"] for beat in best_phrase_beats) - - # Calculate adaptive threshold based on expected syllables - expected_ratio = 0.15 if total_expected > 10 else 0.25 - threshold = max(1, round(total_expected * expected_ratio)) - - # Check if total syllable count is significantly off - if total_expected > 0 and best_match_diff > threshold: - verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}") - total_mismatch_count += 1 - - # Extract words and perform detailed alignment analysis - words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) - - # Get syllable count and stress for each word - word_analysis = [] - cumulative_syllables = 0 - - for word in words: - syllable_count = count_syllables_for_word(word) - - # Get stress pattern - stress_pattern = get_word_stress(word) - - word_analysis.append({ - "word": word, - "syllables": syllable_count, - "stress_pattern": stress_pattern, - "position": cumulative_syllables - }) - - cumulative_syllables += syllable_count - - # Analyze alignment with beats - only if there are beat types - if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b): - # Identify positions where strong syllables should fall - strong_positions = [] - current_pos = 0 - - for beat in best_phrase_beats: - if beat.get("type") == "S": - strong_positions.append(current_pos) - 
current_pos += beat.get("count", 1) - - # Check if strong syllables align with strong beats - alignment_issues = [] - - for pos in strong_positions: - # Find which word contains this position - misaligned_word = None - - for word_info in word_analysis: - word_start = word_info["position"] - word_end = word_start + word_info["syllables"] - - if word_start <= pos < word_end: - # Check if a stressed syllable falls on this position - syllable_in_word = pos - word_start - - # Get stress pattern for this word - stress = word_info["stress_pattern"] - - # If we have stress information and this syllable isn't stressed - if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': - misaligned_word = word_info["word"] - alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)") - stress_misalignments.append({ - "line": i+1, - "word": word_info["word"], - "position": pos, - "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word) - }) - break - - if alignment_issues: - verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}") - - # Generate a visual alignment map for better understanding - alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis) - if alignment_map: - detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}") - else: - # If no matching template was found - verification_notes.append(f"Line {i+1}: Unable to find matching template pattern") - - # Add second-level verification if templates are provided - if second_level_templates: - verification_notes.append("\n=== SECOND-LEVEL VERIFICATION ===\n") - - # Check each second against corresponding line - for i, template in enumerate(second_level_templates): - if i >= len(lines): - break - - line = lines[i] - - # Skip section headers - if line.startswith('[') and ']' in line: - continue - - actual_count = count_syllables(line) - - # Parse template to get expected syllable count - total_expected = 0 - beat_patterns = [] - - # Handle templates with beat patterns like "S(0.95):2-w(0.4):1" - if isinstance(template, str) and "-" in template: - for beat in template.split("-"): - if ":" in beat: - try: - count_part = beat.split(":")[1] - count = float(count_part) - total_expected += count - - # Extract beat type for alignment check - beat_type = beat.split("(")[0] if "(" in beat else beat[0] - beat_patterns.append((beat_type, count)) - except (IndexError, ValueError): - pass - - # Compare actual vs expected count - if total_expected > 0: - # Calculate adaptive threshold based on expected syllables - expected_ratio = 0.2 # More strict at second level - threshold = max(0.5, round(total_expected * expected_ratio)) - - difference = abs(actual_count - total_expected) - - if difference > threshold: - verification_notes.append(f"Second {i+1}: Expected {total_expected} syllables, got {actual_count}") - total_mismatch_count += 1 - - # Check for stress misalignment in this second - words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) - word_analysis = [] - cumulative_syllables = 0 - - for word in words: - syllable_count = count_syllables_for_word(word) - stress_pattern = get_word_stress(word) - - word_analysis.append({ - "word": word, - "syllables": syllable_count, - "stress_pattern": stress_pattern, - "position": cumulative_syllables - }) - - cumulative_syllables += syllable_count - - # Check if stressed syllables align with strong beats - if beat_patterns: - strong_positions = [] - current_pos = 0 - - for 
beat_type, count in beat_patterns: - if beat_type == "S": - strong_positions.append(current_pos) - current_pos += count - - # Look for misalignments - for pos in strong_positions: - for word_info in word_analysis: - word_start = word_info["position"] - word_end = word_start + word_info["syllables"] - - if word_start <= pos < word_end: - # Check if a stressed syllable falls on this position - syllable_in_word = int(pos - word_start) - stress = word_info["stress_pattern"] - - if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': - verification_notes.append(f" → In second {i+1}, '{word_info['word']}' has unstressed syllable on strong beat") - break - - # Only add detailed analysis if we have rhythm mismatches - if verification_notes: - lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n" - lyrics += "\n".join(verification_notes) - - if detailed_analysis: - lyrics += "\n\n[Detailed Alignment Analysis:]\n" - lyrics += "\n\n".join(detailed_analysis) - - lyrics += "\n\n[How to fix rhythm mismatches:]\n" - lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n" - lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n" - lyrics += "3. Try using words where natural stress aligns with musical rhythm\n" - - # Add specific word substitution suggestions if we found stress misalignments - if stress_misalignments: - lyrics += "\n[Specific word replacement suggestions:]\n" - for issue in stress_misalignments[:5]: # Limit to first 5 issues - if issue["suggestion"]: - lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n" - - return lyrics - -def generate_alignment_visualization(line, beats_info, word_analysis): - """Generate a visual representation of syllable alignment with beats.""" - if not beats_info or not word_analysis: - return None - - # Create a syllable breakdown with stress information - syllable_breakdown = [] - syllable_stresses = [] - - for word_info in word_analysis: - word = word_info["word"] - syllables = word_info["syllables"] - stress = word_info["stress_pattern"] or "" - - # Extend stress pattern if needed - while len(stress) < syllables: - stress += "0" - - # Get syllable breakdown - parts = naive_syllable_split(word, syllables) - - for i, part in enumerate(parts): - syllable_breakdown.append(part) - if i < len(stress): - syllable_stresses.append(stress[i]) - else: - syllable_stresses.append("0") - - # Create beat pattern - beat_types = [] - current_pos = 0 - - for beat in beats_info: - beat_type = beat.get("type", "-") - count = beat.get("count", 1) - - # Handle whole numbers and half syllables - if isinstance(count, int): - beat_types.extend([beat_type] * count) - else: - # For half syllables, round up and use markers - whole_part = int(count) - frac_part = count - whole_part - - if whole_part > 0: - beat_types.extend([beat_type] * whole_part) - - if frac_part > 0: - beat_types.append(f"{beat_type}½") - - # Ensure we have enough beat types - while len(beat_types) < len(syllable_breakdown): - beat_types.append("-") - - # Trim beat types if too many - beat_types = beat_types[:len(syllable_breakdown)] - - # Generate the visualization with highlighted misalignments - result = [] - - # First line: syllable breakdown with stress indicators - syllable_display = [] - for i, syllable in enumerate(syllable_breakdown): - if i < len(syllable_stresses) and syllable_stresses[i] == "1": - 
syllable_display.append(syllable.upper()) # Uppercase for stressed syllables - else: - syllable_display.append(syllable.lower()) # Lowercase for unstressed - - result.append(" - ".join(syllable_display)) - - # Second line: beat indicators with highlighting for misalignments - beat_indicators = [] - for i, (syllable, beat_type) in enumerate(zip(syllable_stresses, beat_types)): - if beat_type == "S" or beat_type.startswith("S"): - if syllable == "1": - beat_indicators.append("↑") # Aligned strong beat - else: - beat_indicators.append("❌") # Misaligned strong beat - elif beat_type == "m" or beat_type.startswith("m"): - beat_indicators.append("•") # Medium beat - elif beat_type == "w" or beat_type.startswith("w"): - beat_indicators.append("·") # Weak beat - else: - beat_indicators.append(" ") - - result.append(" ".join(beat_indicators)) - - # Third line: beat types - result.append(" - ".join(beat_types)) - - return "\n".join(result) - -@functools.lru_cache(maxsize=256) -def naive_syllable_split(word, syllable_count): - """Naively split a word into the specified number of syllables, with caching for performance.""" - if syllable_count <= 1: - return [word] - - # Common syllable break patterns - vowels = "aeiouy" - consonants = "bcdfghjklmnpqrstvwxz" - - # Find potential split points - splits = [] - for i in range(1, len(word) - 1): - if word[i] in consonants and word[i-1] in vowels: - splits.append(i) - elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants: - splits.append(i+1) - - # Ensure we have enough split points - while len(splits) < syllable_count - 1: - for i in range(1, len(word)): - if i not in splits: - splits.append(i) - break - - # Sort and limit - splits.sort() - splits = splits[:syllable_count - 1] - - # Split the word - result = [] - prev = 0 - for pos in splits: - result.append(word[prev:pos]) - prev = pos - - result.append(word[prev:]) - return result - -def get_stress_aligned_alternatives(word, position_to_stress): - """Suggest alternative words with proper stress at the required position.""" - # This would ideally use a more sophisticated dictionary lookup, - # but here's a simple implementation with common word patterns - syllable_count = count_syllables_for_word(word) - - # Common synonyms/replacements by syllable count with stress position - if syllable_count == 2: - if position_to_stress == 0: # Need stress on first syllable - first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing", - "heart-beat", "sun-light", "moon-light", "star-light"] - return ", ".join(first_stress[:3]) - else: # Need stress on second syllable - second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE", - "a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"] - return ", ".join(second_stress[:3]) - elif syllable_count == 3: - if position_to_stress == 0: # First syllable stress - return "MEM-o-ry, WON-der-ful, BEAU-ti-ful" - elif position_to_stress == 1: # Second syllable stress - return "a-MAZE-ing, to-GE-ther, for-EV-er" - else: # Third syllable stress - return "un-der-STAND, o-ver-COME, ne-ver-MORE" - - # For other cases, just provide general guidance - return f"a word with stress on syllable {position_to_stress + 1}" - -def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyrics_requirements=None): - """ - Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment. 
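# ---- [Editor's illustration - not part of the patch] ----
# A note on the removed naive_syllable_split() above: it is memoized with
# functools.lru_cache but returns a list, so repeated calls hand back the same
# mutable object; callers that modify the result would corrupt the cache.
# Returning a tuple, as in this sketch (illustrative name and logic), avoids
# that pitfall:

import functools

@functools.lru_cache(maxsize=256)
def cached_split(word, n):
    # Crude fixed-width stand-in for naive_syllable_split(word, n)
    step = max(1, len(word) // n)
    return tuple(word[i:i + step] for i in range(0, len(word), step))

print(cached_split("remember", 3))        # ('re', 'me', 'mb', 'er') -- naive cut
print(cached_split.cache_info().hits)     # 0 after the first (miss) call
# ---- [End illustration] ----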
- - This improved version uses advanced template creation, better formatting, and verification with - potential refinement for lyrics that perfectly match the musical rhythm patterns. - - Parameters: - genre: Musical genre of the audio - duration: Duration of the audio in seconds - emotion_results: Dictionary containing emotional analysis results - song_structure: Optional dictionary containing song structure analysis - lyrics_requirements: Optional user-provided requirements for the lyrics - - Returns: - Generated lyrics aligned with the rhythm patterns of the music - """ - # Safety check for strings - def is_safe_dict_access(obj, key): - """Safe dictionary key access with type checking""" - if not isinstance(obj, dict): - print(f"WARNING: Attempted to access key '{key}' on non-dictionary object of type {type(obj)}") - return False - return key in obj - - # Ensure emotion_results is a dictionary with the expected structure - if not isinstance(emotion_results, dict): - emotion_results = { - "emotion_analysis": {"primary_emotion": "Unknown"}, - "theme_analysis": {"primary_theme": "Unknown"}, - "rhythm_analysis": {"tempo": 0}, - "tonal_analysis": {"key": "Unknown", "mode": ""}, - "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} - } - - # Ensure song_structure is properly structured - if song_structure is not None and not isinstance(song_structure, dict): - print(f"WARNING: song_structure is not a dict, it's {type(song_structure)}") - song_structure = None - - print(f"DEBUG: Starting generate_lyrics with genre={genre}, duration={duration}") - print(f"DEBUG: Type of song_structure={type(song_structure)}") - print(f"DEBUG: Type of emotion_results={type(emotion_results)}") - - # Helper function to safely access dictionary with string keys - def safe_dict_get(d, key, default=None): - """Safely get a value from a dictionary, handling non-dictionary objects.""" - if not isinstance(d, dict): - print(f"WARNING: Attempted to access key '{key}' in non-dictionary object of type {type(d)}") - return default - return d.get(key, default) - - # Extract emotion and theme data with safe defaults - primary_emotion = safe_dict_get(safe_dict_get(emotion_results, "emotion_analysis", {}), "primary_emotion", "Unknown") - primary_theme = safe_dict_get(safe_dict_get(emotion_results, "theme_analysis", {}), "primary_theme", "Unknown") - - # Extract numeric values safely with fallbacks - try: - tempo = float(safe_dict_get(safe_dict_get(emotion_results, "rhythm_analysis", {}), "tempo", 0.0)) - except (ValueError, TypeError): - tempo = 0.0 - - key = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "key", "Unknown") - mode = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "mode", "") - - # Format syllable templates for the prompt - syllable_guidance = "" - templates_for_verification = [] - - # Create a structure visualization to help with lyrics-music matching - structure_visualization = "=== MUSIC-LYRICS STRUCTURE MATCHING ===\n\n" - structure_visualization += f"Song Duration: {duration:.1f} seconds\n" - structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n" - - # Add second-level template guidance if available - if song_structure and is_safe_dict_access(song_structure, "second_level") and is_safe_dict_access(song_structure.get("second_level", {}), "templates"): - print(f"DEBUG: Using second-level templates") - second_level_templates = song_structure.get("second_level", {}).get("templates", []) - - # Create second-level 
guidance - second_level_guidance = "\nSECOND-BY-SECOND RHYTHM INSTRUCTIONS:\n" - second_level_guidance += "Each line below corresponds to ONE SECOND of audio. Follow these rhythm patterns EXACTLY:\n\n" - - # Format each second's template - formatted_second_templates = [] - for i, template in enumerate(second_level_templates): - if i < min(60, len(second_level_templates)): # Limit to 60 seconds to avoid overwhelming the LLM - formatted_template = format_syllable_templates_for_prompt(template, arrow="→", line_wrap=0) - formatted_second_templates.append(f"Second {i+1}: {formatted_template}") - - second_level_guidance += "\n".join(formatted_second_templates) - - # Add critical instructions for second-level alignment - second_level_guidance += "\n\nCRITICAL: Create ONE LINE of lyrics for EACH SECOND, following the exact rhythm pattern." - second_level_guidance += "\nIf a second has no beats, use it for a breath or pause in the lyrics." - second_level_guidance += "\nThe first line of your lyrics MUST match Second 1, the second line matches Second 2, and so on." - - # Add to syllable guidance - syllable_guidance = second_level_guidance - - # Store templates for verification - templates_for_verification = second_level_templates - - elif song_structure: - print(f"DEBUG: Checking flexible structure") - # Try to use flexible structure if available - if is_safe_dict_access(song_structure, "flexible_structure"): - print(f"DEBUG: Using flexible structure") - flexible = song_structure.get("flexible_structure", {}) - if is_safe_dict_access(flexible, "segments") and len(flexible.get("segments", [])) > 0: - print(f"DEBUG: Found segments in flexible structure") - # Get the segments - segments = flexible.get("segments", []) - - # Add structure visualization - structure_visualization += f"Total segments: {len(segments)}\n" - structure_visualization += "Each segment represents one musical phrase for which you should write ONE line of lyrics.\n\n" - - # Process each segment to create enhanced rhythmic templates - enhanced_templates = [] - - for i, segment in enumerate(segments): - if i < 30: # Extend limit to 30 lines to handle longer songs - # Get the beat information for this segment - segment_start = segment["start"] - segment_end = segment["end"] - - # Add segment info to visualization - structure_visualization += f"Segment {i+1}: {segment_start:.1f}s - {segment_end:.1f}s (duration: {segment_end-segment_start:.1f}s)\n" - - # Find beats within this segment - segment_beats = [] - - # Add type checking for beat_times access - print(f"DEBUG: Checking beat_times in flexible structure") - if is_safe_dict_access(flexible, "beats") and is_safe_dict_access(flexible.get("beats", {}), "beat_times"): - beat_times = flexible.get("beats", {}).get("beat_times", []) - if isinstance(beat_times, list): - beat_strengths = flexible.get("beats", {}).get("beat_strengths", []) - - for j, beat_time in enumerate(beat_times): - if segment_start <= beat_time < segment_end: - # Add this beat to the segment - segment_beats.append(j) - - # Create segment-specific beat info - segment_beats_info = { - "beat_times": [beat_times[j] for j in segment_beats if j < len(beat_times)], - "tempo": flexible.get("beats", {}).get("tempo", 120) - } - - if beat_strengths and isinstance(beat_strengths, list): - segment_beats_info["beat_strengths"] = [ - beat_strengths[j] for j in segment_beats - if j < len(beat_strengths) - ] - - # Create a phrase structure for this segment - segment_beats_info["phrases"] = [segment_beats] - - # Generate enhanced template 
with genre awareness and auto phrasing - print(f"DEBUG: Creating flexible syllable template for segment {i+1}") - enhanced_template = create_flexible_syllable_templates( - segment_beats_info, - genre=genre, - phrase_mode='auto' if i == 0 else 'default' - ) - enhanced_templates.append(enhanced_template) - templates_for_verification.append(enhanced_template) - - # Add template to visualization - structure_visualization += f" Template: {enhanced_template}\n" - else: - print(f"DEBUG: beat_times is not a list, it's {type(beat_times)}") - else: - print(f"DEBUG: beats or beat_times not found in flexible structure") - # Skip segment if we don't have beat information - continue - - # Use these templates to determine rhythm patterns, without classifying as verse/chorus - pattern_groups = {} - - for i, template in enumerate(enhanced_templates): - # Create simplified version for pattern matching - simple_pattern = template.replace("(", "").replace(")", "").replace(":", "") - - # Check if this pattern is similar to any we've seen - found_match = False - for group, patterns in pattern_groups.items(): - if any(simple_pattern == p.replace("(", "").replace(")", "").replace(":", "") for p in patterns): - pattern_groups[group].append(template) - found_match = True - break - - if not found_match: - # New pattern type - group_name = f"Group_{len(pattern_groups) + 1}" - pattern_groups[group_name] = [template] - - # Format templates with improved formatting for the prompt - syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n" - syllable_guidance += "Each line of lyrics MUST match exactly with one musical phrase/segment.\n" - syllable_guidance += "Follow these rhythm patterns for each line (STRONG beats need stressed syllables):\n\n" - - # Add formatted templates without section labels - formatted_templates = [] - for i, template in enumerate(enhanced_templates): - formatted_templates.append(format_syllable_templates_for_prompt([template], arrow="→", line_wrap=8)) - - syllable_guidance += "\n".join(formatted_templates) - - # Store info for later use in traditional sections approach - use_sections = True - - # Use the detected section structure for traditional approach - if verse_lines > 0: - verse_lines = min(verse_lines, total_lines // 2) # Ensure reasonable limits - else: - verse_lines = total_lines // 2 - - if chorus_lines > 0: - chorus_lines = min(chorus_lines, total_lines // 3) - else: - chorus_lines = total_lines // 3 - - if bridge_lines > 0: - bridge_lines = min(bridge_lines, total_lines // 6) - else: - bridge_lines = 0 - - # Fallback to traditional sections if needed - elif song_structure and is_safe_dict_access(song_structure, "syllables") and song_structure.get("syllables"): - syllable_guidance = "RHYTHM PATTERN INSTRUCTIONS:\n" - syllable_guidance += "Follow these syllable patterns for each section. 
Each line should match ONE phrase:\n\n" - - # Count sections for visualization - section_counts = {"verse": 0, "chorus": 0, "bridge": 0, "intro": 0, "outro": 0} - - for section in song_structure.get("syllables", []): - if not isinstance(section, dict): - continue - - section_type = section.get("type", "verse") - section_counts[section_type] = section_counts.get(section_type, 0) + 1 - - if is_safe_dict_access(section, "syllable_template"): - # Process to create enhanced template - if is_safe_dict_access(song_structure, "beats") and is_safe_dict_access(song_structure.get("beats", {}), "beat_times"): - section_beats_info = { - "beat_times": [beat for beat in song_structure.get("beats", {}).get("beat_times", []) - if section.get("start", 0) <= beat < section.get("end", 0)], - "tempo": song_structure.get("beats", {}).get("tempo", 120) - } - - if is_safe_dict_access(song_structure.get("beats", {}), "beat_strengths"): - section_beats_info["beat_strengths"] = [ - strength for i, strength in enumerate(song_structure.get("beats", {}).get("beat_strengths", [])) - if i < len(song_structure.get("beats", {}).get("beat_times", [])) and - section.get("start", 0) <= song_structure.get("beats", {}).get("beat_times", [])[i] < section.get("end", 0) - ] - - # Create a phrase structure for this section - section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] - - # Create a phrase structure for this section - section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] - - # Generate enhanced template with genre awareness - enhanced_template = create_flexible_syllable_templates( - section_beats_info, - genre=genre, - phrase_mode='auto' if section['type'] == 'verse' else 'default' - ) - - syllable_guidance += f"[{section['type'].capitalize()}]:\n" - syllable_guidance += format_syllable_templates_for_prompt( - enhanced_template, - arrow="→", - line_wrap=6 - ) + "\n\n" - templates_for_verification.append(section) - elif "syllable_count" in section: - syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n" - - # Create structure visualization - structure_visualization += "Using traditional section-based structure:\n" - for section_type, count in section_counts.items(): - if count > 0: - structure_visualization += f"{section_type.capitalize()}: {count} sections\n" - - # Set traditional section counts - verse_lines = max(2, section_counts.get("verse", 0) * 4) - chorus_lines = max(2, section_counts.get("chorus", 0) * 4) - bridge_lines = max(0, section_counts.get("bridge", 0) * 2) - - # Use sections approach - use_sections = True - - # If we couldn't get specific templates, use general guidance - if not syllable_guidance: - syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n" - syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n" - syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n" - syllable_guidance += "3. 
Use appropriate syllable counts based on tempo:\n" - syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n" - syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n" - syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n" - - # Create basic structure visualization - structure_visualization += "Using estimated structure (no detailed analysis available):\n" - - # Calculate rough section counts based on duration - estimated_lines = max(8, int(duration / 10)) - structure_visualization += f"Estimated total lines: {estimated_lines}\n" - - # Set traditional section counts based on duration - verse_lines = estimated_lines // 2 - chorus_lines = estimated_lines // 3 - bridge_lines = estimated_lines // 6 if estimated_lines > 12 else 0 - - # Use sections approach - use_sections = True - - # Add examples of syllable-beat alignment with enhanced format - syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n" - syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n" - syllable_guidance += "Lyric: 'HEAR the MU-sic PLAY'\n" - syllable_guidance += " ↑ ↑ ↑ ↑\n" - syllable_guidance += " S w m w <- BEAT TYPE\n\n" - - syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n" - syllable_guidance += "Lyric: 'DANC-ing TO the RHYTHM of LOVE'\n" - syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" - syllable_guidance += " S S w S w w <- BEAT TYPE\n\n" - - syllable_guidance += "Pattern: S(0.92):1 → m(0.65):2 → S(0.88):1 → w(0.35):1\n" - syllable_guidance += "Lyric: 'TIME keeps FLOW-ing ON and ON'\n" - syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" - syllable_guidance += " S m m S w w <- BEAT TYPE\n\n" - - # Add genre-specific guidance based on the detected genre - genre_guidance = "" - if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]): - genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n" - genre_guidance += "- Use more syllables per beat for rapid-fire sections\n" - genre_guidance += "- Create internal rhymes within lines, not just at line endings\n" - genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n" - elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]): - genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n" - genre_guidance += "- Use repetitive phrases that build and release tension\n" - genre_guidance += "- Match syllables precisely to the beat grid\n" - genre_guidance += "- Use short, percussive words on strong beats\n" - elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]): - genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n" - genre_guidance += "- Use powerful, emotive words on downbeats\n" - genre_guidance += "- Create contrast between verse and chorus energy levels\n" - genre_guidance += "- Emphasize hooks with simple, memorable phrases\n" - elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]): - genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n" - genre_guidance += "- Focus on storytelling with clear narrative flow\n" - genre_guidance += "- Use natural speech patterns that flow conversationally\n" - genre_guidance += "- Place important words at the start of phrases\n" - - # Add genre guidance to the main guidance - syllable_guidance += genre_guidance - - # Store the syllable guidance for later use - syllable_guidance_text = syllable_guidance - - # Determine if we should use traditional sections or 
second-level alignment - use_sections = True - use_second_level = False - - if song_structure and "second_level" in song_structure and song_structure["second_level"]: - use_second_level = True - # If we have second-level templates, prioritize those over traditional sections - if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: - templates = song_structure["second_level"]["templates"] - if isinstance(templates, list) and len(templates) > 0: - use_sections = False - elif song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]: - # If we have more than 4 segments, it's likely not a traditional song structure - if "segments" in song_structure["flexible_structure"]: - segments = song_structure["flexible_structure"]["segments"] - if len(segments) > 4: - use_sections = False - - # Create enhanced prompt with better rhythm alignment instructions - if use_second_level: - # Second-level approach with per-second alignment - content = f""" -You are a talented songwriter who specializes in {genre} music. -Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. - -IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. - -Music analysis has detected the following qualities: -- Tempo: {tempo:.1f} BPM -- Key: {key} {mode} -- Primary emotion: {primary_emotion} -- Primary theme: {primary_theme} - -{syllable_guidance} - -CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: -1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) -2. Natural word stress patterns must match the beat strength (strong words on strong beats) -3. Line breaks should occur at phrase endings for natural breathing -4. Consonant clusters should be avoided on fast notes and strong beats -5. Open vowels (a, e, o) work better for sustained notes and syllables -6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) -7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels - -The lyrics should: -- Perfectly capture the essence and style of {genre} music -- Express the {primary_emotion} emotion and {primary_theme} theme -- Be completely original -- Maintain a consistent theme throughout -- Match the audio segment duration of {duration:.1f} seconds - -Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. - -IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. - -IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" -where you analyze how well the lyrics align with the musical rhythm. This section MUST appear -even if there are no rhythm issues. Include the following in your analysis: -1. Syllable counts for each line and how they match the rhythm pattern -2. Where stressed syllables align with strong beats -3. Any potential misalignments or improvements - -Your lyrics: -""" - - # Add user requirements if provided - if lyrics_requirements and lyrics_requirements.strip(): - content += f""" -USER REQUIREMENTS: -{lyrics_requirements.strip()} - -The lyrics MUST incorporate these user requirements while still following the rhythm patterns. -""" - - content += """ -Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. 
- -IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. - -IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" -where you analyze how well the lyrics align with the musical rhythm. This section MUST appear -even if there are no rhythm issues. Include the following in your analysis: -1. Syllable counts for each line and how they match the rhythm pattern -2. Where stressed syllables align with strong beats -3. Any potential misalignments or improvements - -Your lyrics: -""" - elif use_sections: - # Traditional approach with sections - content = f""" -You are a talented songwriter who specializes in {genre} music. -Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. - -IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. - -Music analysis has detected the following qualities in the music: -- Tempo: {tempo:.1f} BPM -- Key: {key} {mode} -- Primary emotion: {primary_emotion} -- Primary theme: {primary_theme} - -{syllable_guidance} - -CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: -1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) -2. Natural word stress patterns must match the beat strength (strong words on strong beats) -3. Line breaks should occur at phrase endings for natural breathing -4. Consonant clusters should be avoided on fast notes and strong beats -5. Open vowels (a, e, o) work better for sustained notes and syllables -6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) -7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels - -The lyrics should: -- Perfectly capture the essence and style of {genre} music -- Express the {primary_emotion} emotion and {primary_theme} theme -- Follow the structure patterns provided above -- Be completely original -- Match the song duration of {duration:.1f} seconds -""" - - # Add user requirements if provided - if lyrics_requirements and lyrics_requirements.strip(): - content += f""" -USER REQUIREMENTS: -{lyrics_requirements.strip()} - -The lyrics MUST incorporate these user requirements while still following the rhythm patterns. -""" - - content += """ -IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. - -IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" -where you analyze how well the lyrics align with the musical rhythm. This section MUST appear -even if there are no rhythm issues. Include the following in your analysis: -1. Syllable counts for each line and how they match the rhythm pattern -2. Where stressed syllables align with strong beats -3. Any potential misalignments or improvements - -Your lyrics: -""" - else: - # Flexible approach without traditional sections - content = f""" -You are a talented songwriter who specializes in {genre} music. -Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. - -IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. 
- -Music analysis has detected the following qualities: -- Tempo: {tempo:.1f} BPM -- Key: {key} {mode} -- Primary emotion: {primary_emotion} -- Primary theme: {primary_theme} - -{syllable_guidance} - -CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: -1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) -2. Natural word stress patterns must match the beat strength (strong words on strong beats) -3. Line breaks should occur at phrase endings for natural breathing -4. Consonant clusters should be avoided on fast notes and strong beats -5. Open vowels (a, e, o) work better for sustained notes and syllables -6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) -7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels - -The lyrics should: -- Perfectly capture the essence and style of {genre} music -- Express the {primary_emotion} emotion and {primary_theme} theme -- Be completely original -- Maintain a consistent theme throughout -- Match the audio segment duration of {duration:.1f} seconds -""" - - # Add user requirements if provided - if lyrics_requirements and lyrics_requirements.strip(): - content += f""" -USER REQUIREMENTS: -{lyrics_requirements.strip()} - -The lyrics MUST incorporate these user requirements while still following the rhythm patterns. -""" - - content += """ -Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above. -Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. - -IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. - -IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" -where you analyze how well the lyrics align with the musical rhythm. This section MUST appear -even if there are no rhythm issues. Include the following in your analysis: -1. Syllable counts for each line and how they match the rhythm pattern -2. Where stressed syllables align with strong beats -3. Any potential misalignments or improvements - -Your lyrics: -""" - - # Format as a chat message for the LLM - messages = [ - {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. 
Be concise and direct."},
-        {"role": "user", "content": content}
-    ]
-    
-    # Apply standard chat template without thinking enabled
-    text = llm_tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-    
-    # Generate lyrics using the LLM
-    model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)
-    
-    # Configure generation parameters based on model capability
-    generation_params = {
-        "do_sample": True,
-        "temperature": 0.5,       # Lower for more consistent and direct output
-        "top_p": 0.85,            # Slightly lower for more predictable responses
-        "top_k": 50,
-        "repetition_penalty": 1.2,
-        "max_new_tokens": 2048,
-        "num_return_sequences": 1
-    }
-    
-    # Add specific stop sequences to prevent excessive explanation
-    if hasattr(llm_model.generation_config, "stopping_criteria"):
-        thinking_stops = ["Let me think", "First, I need to", "Let's analyze", "I'll approach this", "Step 1:", "To start,"]
-        for stop in thinking_stops:
-            if stop not in llm_model.generation_config.stopping_criteria:
-                llm_model.generation_config.stopping_criteria.append(stop)
-    
-    # Generate output
-    generated_ids = llm_model.generate(
-        **model_inputs,
-        **generation_params
-    )
-    
-    # Extract output tokens
-    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
-    
-    # Get the raw output and strip any thinking process
-    lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
-    
-    # Enhanced thinking process removal - handle multiple formats
-    # First check for standard thinking tags
-    if "<think>" in lyrics and "</think>" in lyrics:
-        lyrics = lyrics.split("</think>")[1].strip()
-    
-    # Check for alternative thinking indicators with improved detection
-    thinking_markers = [
-        "<think>", "</think>",
-        "[thinking]", "[/thinking]",
-        "I'll think step by step:",
-        "First, I need to understand",
-        "Let me think about",
-        "Let's tackle this query",
-        "Okay, let's tackle this query",
-        "First, I need to understand the requirements",
-        "Looking at the rhythm patterns"
-    ]
-    
-    # First try to find clear section breaks
-    for marker in thinking_markers:
-        if marker in lyrics:
-            parts = lyrics.split(marker)
-            if len(parts) > 1:
-                lyrics = parts[-1].strip()  # Take the last part after any thinking marker
-    
-    # Look for long analytical sections followed by clear lyrics
-    analytical_patterns = [
-        "Let me analyze",
-        "I need to understand",
-        "The tempo is",
-        "First, let's look at",
-        "Wait, maybe",
-        "Considering the emotional tone",
-        "Starting with the first line",
-        "Let me check the examples"
-    ]
-    
-    # Check if lyrics begin with any analytical patterns
-    for pattern in analytical_patterns:
-        if lyrics.startswith(pattern):
-            # Try to find where the actual lyrics start - look for common lyrics markers
-            lyrics_markers = [
-                "\n\n[Verse",
-                "\n\n[Chorus",
-                "\n\nVerse",
-                "\n\nChorus",
-                "\n\n[Verse 1]",
-                "\n\n[Intro]"
-            ]
-            
-            for marker in lyrics_markers:
-                if marker in lyrics:
-                    lyrics = lyrics[lyrics.index(marker):].strip()
-                    break
-    
-    # One last effort to clean up - if the text is very long and contains obvious thinking
-    # before getting to actual lyrics, try to find a clear starting point
-    if len(lyrics.split()) > 100 and "\n\n" in lyrics:
-        paragraphs = lyrics.split("\n\n")
-        for i, paragraph in enumerate(paragraphs):
-            # Look for typical song structure indicators in a paragraph
-            if any(marker in paragraph for marker in ["[Verse", "[Chorus", "Verse 1", "Chorus:"]):
-                lyrics = "\n\n".join(paragraphs[i:])
-                break
-    
-    # Clean up any remaining thinking artifacts at the beginning
-    lines = 
lyrics.split('\n') - clean_lines = [] - lyrics_started = False - - for line in lines: - # Skip initial commentary/thinking lines until we hit what looks like lyrics - if not lyrics_started: - if (line.strip().startswith('[') and ']' in line) or not any(thinking in line.lower() for thinking in ["i think", "let me", "maybe", "perhaps", "alternatively", "checking"]): - lyrics_started = True - - if lyrics_started: - clean_lines.append(line) - - # Only use the cleaning logic if we found some actual lyrics - if clean_lines: - lyrics = '\n'.join(clean_lines) - - # Special handling for second-level templates - second_level_verification = None - if song_structure and "second_level" in song_structure and song_structure["second_level"]: - if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: - second_level_verification = song_structure["second_level"]["templates"] - if not isinstance(second_level_verification, list): - second_level_verification = None - - # Verify syllable counts with enhanced verification - pass second-level templates if available - if templates_for_verification: - # Convert any NumPy values to native types before verification - directly handle conversions - # Simple conversion for basic templates (non-recursive) - if isinstance(templates_for_verification, list): - safe_templates = [] - for template in templates_for_verification: - if isinstance(template, dict): - processed_template = {} - for k, v in template.items(): - if isinstance(v, np.ndarray): - if v.size == 1: - processed_template[k] = float(v.item()) - else: - processed_template[k] = [float(x) if isinstance(x, np.number) else x for x in v] - elif isinstance(v, np.number): - processed_template[k] = float(v) - else: - processed_template[k] = v - safe_templates.append(processed_template) - else: - safe_templates.append(template) - else: - safe_templates = templates_for_verification - - # Wrap verification in try-except to handle any potential string indices errors - try: - print(f"DEBUG: Calling verify_flexible_syllable_counts") - print(f"DEBUG: Type of lyrics: {type(lyrics)}") - print(f"DEBUG: Type of safe_templates: {type(safe_templates)}") - print(f"DEBUG: Type of second_level_verification: {type(second_level_verification)}") - - verified_lyrics = verify_flexible_syllable_counts(lyrics, safe_templates, second_level_verification) - print(f"DEBUG: Type of verified_lyrics: {type(verified_lyrics)}") - - except Exception as e: - print(f"ERROR in verify_flexible_syllable_counts: {str(e)}") - # Return the original lyrics if verification fails - return { - "lyrics": lyrics if isinstance(lyrics, str) else str(lyrics), - "rhythm_analysis": f"Error in rhythm analysis: {str(e)}", - "syllable_analysis": "Error performing syllable analysis", - "prompt_template": "Error generating prompt template" - } - - if isinstance(verified_lyrics, str) and "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics: - # Extract the original lyrics (before the notes section) - original_lyrics = lyrics.split("[Note:")[0].strip() if isinstance(lyrics, str) else str(lyrics) - - # Extract the analysis - analysis = verified_lyrics.split("[Note:")[1] if "[Note:" in verified_lyrics else "" - - # If we have serious alignment issues, consider a refinement step - if "stress misalignments" in analysis and len(templates_for_verification) > 0: - # Add a refinement prompt with the specific analysis - refinement_prompt = f""" -You need to fix rhythm issues in these 
lyrics. Here's the analysis of the problems: - -{analysis} - -Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme. -Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats. - -Original lyrics: -{original_lyrics} - -Improved lyrics with fixed rhythm: -""" - # Format as a chat message for refinement - refinement_messages = [ - {"role": "user", "content": refinement_prompt} - ] - - # Use standard template for refinement (no thinking mode needed) - refinement_text = llm_tokenizer.apply_chat_template( - refinement_messages, - tokenize=False, - add_generation_prompt=True - ) - - try: - # Generate refined lyrics with more focus on rhythm alignment - refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device) - - # Use stricter parameters for refinement - refinement_params = { - "do_sample": True, - "temperature": 0.4, # Lower temperature for more precise refinement - "top_p": 0.9, - "repetition_penalty": 1.3, - "max_new_tokens": 1024 - } - - refined_ids = llm_model.generate( - **refinement_inputs, - **refinement_params - ) - - # Extract refined lyrics - refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist() - refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip() - - # Verify the refined lyrics - try: - refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, safe_templates, second_level_verification) - - # Only use refined lyrics if they're better (fewer notes) - if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics: - lyrics = refined_lyrics - elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"): - lyrics = refined_verified_lyrics - else: - lyrics = verified_lyrics - except Exception as e: - print(f"Error in refined lyrics verification: {str(e)}") - lyrics = verified_lyrics - except Exception as e: - print(f"Error in lyrics refinement: {str(e)}") - lyrics = verified_lyrics - else: - # Minor issues, just use the verification notes - lyrics = verified_lyrics - else: - # No significant issues detected - lyrics = verified_lyrics - - # Check if we have the [RHYTHM_ANALYSIS_SECTION] tag - if "[RHYTHM_ANALYSIS_SECTION]" in lyrics: - # Split at our custom marker - parts = lyrics.split("[RHYTHM_ANALYSIS_SECTION]") - clean_lyrics = parts[0].strip() - rhythm_analysis = parts[1].strip() - - # Add our standard marker for compatibility with existing code - lyrics = clean_lyrics + "\n\n[Note: Rhythm Analysis]\n" + rhythm_analysis - - # For backwards compatibility - if we have the old format, still handle it - elif "[Note: Potential rhythm mismatches" in lyrics: - # Keep it as is, the existing parsing code can handle this format - pass - else: - # No analysis found, add a minimal one - lyrics = lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected. All syllables align well with the beat pattern." 
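# --- Editor's sketch (not part of the diff above) ---------------------------
# A minimal, self-contained illustration of the marker normalisation the
# removed block performs: the LLM is asked to append a
# "[RHYTHM_ANALYSIS_SECTION]" tag, which downstream parsers expect as
# "[Note: Rhythm Analysis]". The helper name is hypothetical; only the
# marker strings come from the code above.
def normalize_rhythm_marker(lyrics: str) -> str:
    if "[RHYTHM_ANALYSIS_SECTION]" in lyrics:
        # Split off the model's analysis and re-tag it with the standard marker
        clean, analysis = lyrics.split("[RHYTHM_ANALYSIS_SECTION]", 1)
        return clean.strip() + "\n\n[Note: Rhythm Analysis]\n" + analysis.strip()
    if "[Note: Potential rhythm mismatches" in lyrics:
        return lyrics  # legacy format; the existing parsing code handles it
    # No analysis found: append a minimal note so downstream splitting still works
    return lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected."
# -----------------------------------------------------------------------------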
- - # Before returning, add syllable analysis and prompt template - if isinstance(lyrics, str): - # Extract clean lyrics and analysis - if "[Note: Rhythm Analysis]" in lyrics: - clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() - rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] - elif "[Note: Potential rhythm mismatches" in lyrics: - clean_lyrics = lyrics.split("[Note:")[0].strip() - rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] - else: - clean_lyrics = lyrics - rhythm_analysis = "No rhythm analysis available" - - # Create syllable analysis - syllable_analysis = "=== SYLLABLE ANALYSIS ===\n\n" - if templates_for_verification: - syllable_analysis += "Template Analysis:\n" - for i, template in enumerate(templates_for_verification): - if i < min(len(templates_for_verification), 30): # Limit to 30 to avoid overwhelming output - syllable_analysis += f"Line {i+1}:\n" - if isinstance(template, dict): - if "syllable_template" in template: - syllable_analysis += f" Template: {template['syllable_template']}\n" - if "syllable_count" in template: - syllable_analysis += f" Expected syllables: {template['syllable_count']}\n" - elif isinstance(template, str): - syllable_analysis += f" Template: {template}\n" - syllable_analysis += "\n" - - if len(templates_for_verification) > 30: - syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n" - - # Add second-level analysis if available - if second_level_verification: - syllable_analysis += "\nSecond-Level Template Analysis:\n" - for i, template in enumerate(second_level_verification): - if i < min(len(second_level_verification), 30): # Limit to 30 seconds - syllable_analysis += f"Second {i+1}: {template}\n" - - if len(second_level_verification) > 30: - syllable_analysis += f"... 
and {len(second_level_verification) - 30} more seconds\n"
-        
-        # Add structure visualization to syllable analysis
-        syllable_analysis += "\n" + structure_visualization
-        
-        # Create prompt template
-        prompt_template = "=== PROMPT TEMPLATE ===\n\n"
-        prompt_template += "Genre: " + genre + "\n"
-        prompt_template += f"Duration: {duration:.1f} seconds\n"
-        prompt_template += f"Tempo: {tempo:.1f} BPM\n"
-        prompt_template += f"Key: {key} {mode}\n"
-        prompt_template += f"Primary Emotion: {primary_emotion}\n"
-        prompt_template += f"Primary Theme: {primary_theme}\n\n"
-        prompt_template += "Syllable Guidance:\n" + syllable_guidance_text
-        
-        # Return all components
-        return {
-            "lyrics": clean_lyrics,
-            "rhythm_analysis": rhythm_analysis,
-            "syllable_analysis": syllable_analysis,
-            "prompt_template": prompt_template
-        }
-    
-    return {
-        "lyrics": lyrics,
-        "rhythm_analysis": "No rhythm analysis available",
-        "syllable_analysis": "No syllable analysis available",
-        "prompt_template": "No prompt template available"
-    }
-
-def process_audio(audio_file, lyrics_requirements=None):
-    """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
-    if audio_file is None:
-        return "Please upload an audio file.", None, None
-        
-    try:
-        print("Step 1/5: Extracting audio features...")
-        # Extract audio features
-        audio_data = extract_audio_features(audio_file)
-        
-        print("Step 2/5: Verifying audio contains music...")
-        # First check if it's music
-        try:
-            is_music, ast_results = detect_music(audio_data)
-        except Exception as e:
-            print(f"Error in music detection: {str(e)}")
-            # ast_results is unbound if detect_music raised, so return an empty list
-            return f"Error in music detection: {str(e)}", None, []
-        
-        if not is_music:
-            return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
-        
-        print("Step 3/5: Classifying music genre...")
-        # Classify genre
-        try:
-            top_genres = classify_genre(audio_data)
-            # Format genre results using utility function
-            genre_results = format_genre_results(top_genres)
-            if not isinstance(top_genres, list) or len(top_genres) == 0:
-                # Fallback if we don't have valid top_genres
-                top_genres = [("rock", 1.0)]
-        except Exception as e:
-            print(f"Error in genre classification: {str(e)}")
-            top_genres = [("rock", 1.0)]  # Ensure we have a default even when exception happens
-            return f"Error in genre classification: {str(e)}", None, ast_results
-        
-        # Initialize default values
-        ast_results = ast_results if ast_results else []
-        song_structure = None
-        emotion_results = {
-            "emotion_analysis": {"primary_emotion": "Unknown"},
-            "theme_analysis": {"primary_theme": "Unknown"},
-            "rhythm_analysis": {"tempo": 0},
-            "tonal_analysis": {"key": "Unknown", "mode": ""},
-            "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
-        }
-        
-        print("Step 4/5: Analyzing music emotions, themes, and structure...")
-        # Analyze music emotions and themes
-        try:
-            emotion_results = music_analyzer.analyze_music(audio_file)
-        except Exception as e:
-            print(f"Error in emotion analysis: {str(e)}")
-            # Continue with default emotion_results
-        
-        # Calculate detailed song structure for better lyrics alignment
-        try:
-            # Load audio data
-            y, sr = load_audio(audio_file, SAMPLE_RATE)
-            
-            # Analyze beats and phrases for music-aligned lyrics
-            beats_info = detect_beats(y, sr)
-            sections_info = detect_sections(y, sr)
-            
-            # Create structured segments for precise line-by-line matching
-            segments = []
-            
-            # Try to break audio into meaningful segments based on 
sections - # Each segment will correspond to one line of lyrics - if sections_info and len(sections_info) > 1: - min_segment_duration = 1.5 # Minimum 1.5 seconds per segment - - for section in sections_info: - section_start = section["start"] - section_end = section["end"] - section_duration = section["duration"] - - # For very short sections, add as a single segment - if section_duration < min_segment_duration * 1.5: - segments.append({ - "start": section_start, - "end": section_end - }) - else: - # Calculate ideal number of segments for this section - # based on its duration - aiming for 2-4 second segments - ideal_segment_duration = 3.0 # Target 3 seconds per segment - segment_count = max(1, int(section_duration / ideal_segment_duration)) - - # Create evenly-spaced segments within this section - segment_duration = section_duration / segment_count - for i in range(segment_count): - segment_start = section_start + i * segment_duration - segment_end = segment_start + segment_duration - segments.append({ - "start": segment_start, - "end": segment_end - }) - # If no good sections found, create segments based on beats - elif beats_info and len(beats_info["beat_times"]) > 4: - beats = beats_info["beat_times"] - time_signature = beats_info.get("time_signature", 4) - - # Target one segment per musical measure (typically 4 beats) - measure_size = time_signature - for i in range(0, len(beats), measure_size): - if i + 1 < len(beats): # Need at least 2 beats for a meaningful segment - measure_start = beats[i] - # If we have enough beats for the full measure - if i + measure_size < len(beats): - measure_end = beats[i + measure_size] - else: - # Use available beats and extrapolate for the last measure - if i > 0: - beat_interval = beats[i] - beats[i-1] - measure_end = beats[-1] + (beat_interval * (measure_size - (len(beats) - i))) - else: - measure_end = audio_data["duration"] - - segments.append({ - "start": measure_start, - "end": measure_end - }) - # Last resort: simple time-based segments - else: - # Create segments of approximately 3 seconds each - segment_duration = 3.0 - total_segments = max(4, int(audio_data["duration"] / segment_duration)) - segment_duration = audio_data["duration"] / total_segments - - for i in range(total_segments): - segment_start = i * segment_duration - segment_end = segment_start + segment_duration - segments.append({ - "start": segment_start, - "end": segment_end - }) - - # Create flexible structure with the segments - flexible_structure = { - "beats": beats_info, - "segments": segments - } - - # Create song structure object - song_structure = { - "beats": beats_info, - "sections": sections_info, - "flexible_structure": flexible_structure, - "syllables": [] - } - - # Add syllable counts to each section - for section in sections_info: - # Create syllable templates for sections - section_beats_info = { - "beat_times": [beat for beat in beats_info["beat_times"] - if section["start"] <= beat < section["end"]], - "tempo": beats_info.get("tempo", 120) - } - if "beat_strengths" in beats_info: - section_beats_info["beat_strengths"] = [ - strength for i, strength in enumerate(beats_info["beat_strengths"]) - if i < len(beats_info["beat_times"]) and - section["start"] <= beats_info["beat_times"][i] < section["end"] - ] - - # Get a syllable count based on section duration and tempo - syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) - - section_info = { - "type": section["type"], - "start": section["start"], - "end": section["end"], - 
"duration": section["duration"], - "syllable_count": syllable_count, - "beat_count": len(section_beats_info["beat_times"]) - } - - # Try to create a more detailed syllable template - if len(section_beats_info["beat_times"]) >= 2: - # Ensure top_genres is a list with at least one element - if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): - genre_name = top_genres[0][0] - else: - genre_name = "unknown" # Default genre if top_genres is invalid - - section_info["syllable_template"] = create_flexible_syllable_templates( - section_beats_info, - genre=genre_name - ) - - song_structure["syllables"].append(section_info) - - # Add second-level beat analysis - try: - # Get enhanced beat information with subbeats - subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) - - # Map beats to second-level windows - sec_map = map_beats_to_seconds( - subbeat_info["subbeat_times"], - audio_data["duration"] - ) - - # Create second-level templates - # Ensure top_genres is a list with at least one element - genre_name = "unknown" - if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): - genre_name = top_genres[0][0] - - second_level_templates = create_second_level_templates( - sec_map, - subbeat_info["tempo"], - genre_name # Use top genre with safety check - ) - - # Add to song structure - song_structure["second_level"] = { - "sec_map": sec_map, - "templates": second_level_templates - } - - except Exception as e: - print(f"Error in second-level beat analysis: {str(e)}") - # Continue without second-level data - - except Exception as e: - print(f"Error analyzing song structure: {str(e)}") - # Continue without song structure - - print("Step 5/5: Generating rhythmically aligned lyrics...") - # Generate lyrics based on top genre, emotion analysis, and song structure - try: - # Ensure top_genres is a list with at least one element before accessing - primary_genre = "unknown" - if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): - primary_genre, _ = top_genres[0] - - # CRITICAL FIX: Create a sanitized version of song_structure to prevent string indices error - sanitized_song_structure = None - if song_structure: - sanitized_song_structure = {} - - # Safely copy beats data - if "beats" in song_structure and isinstance(song_structure["beats"], dict): - sanitized_song_structure["beats"] = song_structure["beats"] - - # Safely copy sections data - if "sections" in song_structure and isinstance(song_structure["sections"], list): - sanitized_song_structure["sections"] = song_structure["sections"] - - # Safely handle flexible structure - if "flexible_structure" in song_structure and isinstance(song_structure["flexible_structure"], dict): - flex_struct = song_structure["flexible_structure"] - sanitized_flex = {} - - # Safely handle segments - if "segments" in flex_struct and isinstance(flex_struct["segments"], list): - sanitized_flex["segments"] = flex_struct["segments"] - - # Safely handle beats - if "beats" in flex_struct and isinstance(flex_struct["beats"], dict): - sanitized_flex["beats"] = flex_struct["beats"] - - sanitized_song_structure["flexible_structure"] = sanitized_flex - - # Safely handle syllables - if "syllables" in song_structure and isinstance(song_structure["syllables"], list): - sanitized_song_structure["syllables"] = song_structure["syllables"] - - # Safely handle second-level - if "second_level" in song_structure and isinstance(song_structure["second_level"], dict): - 
second_level = song_structure["second_level"] - sanitized_second = {} - - if "templates" in second_level and isinstance(second_level["templates"], list): - sanitized_second["templates"] = second_level["templates"] - - if "sec_map" in second_level and isinstance(second_level["sec_map"], list): - sanitized_second["sec_map"] = second_level["sec_map"] - - sanitized_song_structure["second_level"] = sanitized_second - - try: - print("Calling generate_lyrics function...") - # Pass lyrics_requirements to generate_lyrics function - lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, - sanitized_song_structure, lyrics_requirements) - print(f"Type of lyrics_result: {type(lyrics_result)}") - - # Handle both old and new return formats with robust type checking - if isinstance(lyrics_result, dict) and all(k in lyrics_result for k in ["lyrics"]): - lyrics = lyrics_result.get("lyrics", "No lyrics generated") - rhythm_analysis = lyrics_result.get("rhythm_analysis", "No rhythm analysis available") - syllable_analysis = lyrics_result.get("syllable_analysis", "No syllable analysis available") - prompt_template = lyrics_result.get("prompt_template", "No prompt template available") - else: - # Convert to string regardless of the type - lyrics = str(lyrics_result) if lyrics_result is not None else "No lyrics generated" - rhythm_analysis = "No detailed rhythm analysis available" - syllable_analysis = "No syllable analysis available" - prompt_template = "No prompt template available" - except Exception as inner_e: - print(f"Inner error in lyrics generation: {str(inner_e)}") - # Create a simplified fallback result with just the error message - lyrics = f"Error generating lyrics: {str(inner_e)}" - rhythm_analysis = "Error in rhythm analysis" - syllable_analysis = "Error in syllable analysis" - prompt_template = "Error in prompt template generation" - - except Exception as e: - print(f"Outer error in lyrics generation: {str(e)}") - lyrics = f"Error generating lyrics: {str(e)}" - rhythm_analysis = "No rhythm analysis available" - syllable_analysis = "No syllable analysis available" - prompt_template = "No prompt template available" - # Prepare results dictionary with additional rhythm analysis - results = { - "genre_results": genre_results, - "lyrics": lyrics, - "rhythm_analysis": rhythm_analysis, - "syllable_analysis": syllable_analysis, - "prompt_template": prompt_template, - "ast_results": ast_results - } - - return results - - except Exception as e: - error_msg = f"Error processing audio: {str(e)}" - print(error_msg) - return error_msg, None, [] - -def format_complete_beat_timeline(audio_file, lyrics=None): - """Creates a complete formatted timeline showing all beat timings and their syllable patterns without truncation""" - if audio_file is None: - return "Please upload an audio file to see beat timeline." 
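    # Editor's note: the timeline string assembled below has seven parts, in
    # order: (1) a tempo/meter header with confidence figures, (2) a per-beat
    # table mapping each beat to a STRONG/MEDIUM/WEAK class and a syllable
    # weight, (3) an ASCII beat timeline at two characters per second,
    # (4) measure markers, (5) per-phrase syllable templates, (6) a
    # second-level (per-second) script pairing beat patterns with lyric
    # lines, and (7) a lyrics-beat alignment report with word-stress analysis.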
- - try: - # Extract audio data - y, sr = load_audio(audio_file, SAMPLE_RATE) - - # Get beat information - beats_info = detect_beats(y, sr) - - # Helper function to convert numpy values to floats - FIXED - def ensure_float(value): - if isinstance(value, np.ndarray) or isinstance(value, np.number): - return float(value) - return value - - # Format the timeline with enhanced scientific headers - timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n" - - tempo = ensure_float(beats_info['tempo']) - tempo_confidence = ensure_float(beats_info.get('tempo_confidence', 90.0)) - time_sig_confidence = ensure_float(beats_info.get('time_sig_confidence', 85.0)) - beat_periodicity = ensure_float(beats_info.get('beat_periodicity', 60 / tempo)) - - timeline += f"Tempo: {tempo:.1f} BPM (±{tempo_confidence:.1f}%)\n" - timeline += f"Time Signature: {beats_info['time_signature']}/4 (Confidence: {time_sig_confidence:.1f}%)\n" - timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n" - timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n" - timeline += f"Total Beats: {beats_info['beat_count']}\n" - - # Add musicological context based on tempo classification - if tempo < 60: - tempo_class = "Largo (very slow)" - elif tempo < 76: - tempo_class = "Adagio (slow)" - elif tempo < 108: - tempo_class = "Andante (walking pace)" - elif tempo < 132: - tempo_class = "Moderato (moderate)" - elif tempo < 168: - tempo_class = "Allegro (fast)" - else: - tempo_class = "Presto (very fast)" - - timeline += f"Tempo Classification: {tempo_class}\n\n" - - # Create an enhanced table header with better column descriptions - timeline += "| Beat # | Time (s) | Beat Strength | Syllable Pattern |\n" - timeline += "|--------|----------|--------------|------------------|\n" - - # Add beat-by-beat information with improved classification - for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])): - # Convert numpy values to Python float if needed - time = ensure_float(time) - strength = ensure_float(strength) - - # More scientific determination of beat type based on both strength and metrical position - metrical_position = i % beats_info['time_signature'] - - if metrical_position == 0: # Downbeat (first beat of measure) - beat_type = "STRONG" - syllable_value = 1.5 - elif metrical_position == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 2: - # Secondary strong beat (e.g., beat 3 in 4/4 time) - beat_type = "MEDIUM" if strength < 0.8 else "STRONG" - syllable_value = 1.0 if strength < 0.8 else 1.5 - else: - # Other beats - classified by actual strength value - if strength >= 0.8: - beat_type = "STRONG" - syllable_value = 1.5 - elif strength >= 0.5: - beat_type = "MEDIUM" - syllable_value = 1.0 - else: - beat_type = "WEAK" - syllable_value = 1.0 - - # Determine pattern letter based on beat type for consistency - if beat_type == "STRONG": - pattern = "S" - elif beat_type == "MEDIUM": - pattern = "m" - else: - pattern = "w" - - # Add row to table with the correct beat classification - timeline += f"| {i+1:<6} | {time:.2f}s | {beat_type:<12} | {pattern}:{syllable_value} |\n" - - # No truncation - show all beats - - # Add a visual timeline of beats - timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n" - timeline += "Each character represents 0.5 seconds. 
Beats are marked as:\n" - timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" - - # Calculate total duration and create time markers - if 'beat_times' in beats_info and len(beats_info['beat_times']) > 0: - # Get the max value safely - max_beat_time = max([ensure_float(t) for t in beats_info['beat_times']]) - total_duration = max_beat_time + 2 # Add 2 seconds of padding - else: - total_duration = 30 # Default duration if no beats found - - time_markers = "" - for i in range(0, int(total_duration) + 1, 5): - time_markers += f"{i:<5}" - timeline += time_markers + " (seconds)\n" - - # Create a ruler for easier time tracking - ruler = "" - for i in range(0, int(total_duration) + 1): - if i % 5 == 0: - ruler += "+" - else: - ruler += "-" - ruler += "-" * 9 # Each second is 10 characters wide - timeline += ruler + "\n" - - # Create a visualization of beats with symbols - beat_line = ["·"] * int(total_duration * 2) # 2 characters per second - - for i, time in enumerate(beats_info['beat_times']): - if i >= len(beats_info['beat_strengths']): - break - - # Convert to float if it's a numpy array - time_val = ensure_float(time) - - # Determine position in the timeline - pos = int(time_val * 2) # Convert to position in the beat_line - if pos >= len(beat_line): - continue - - # Determine beat type based on strength and position - strength = beats_info['beat_strengths'][i] - # Convert to float if it's a numpy array - strength = ensure_float(strength) - - if i % beats_info['time_signature'] == 0: - beat_line[pos] = "S" # Strong beat at start of measure - elif strength >= 0.8: - beat_line[pos] = "S" # Strong beat - elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3: - beat_line[pos] = "m" # Medium beat (3rd beat in 4/4) - elif strength >= 0.5: - beat_line[pos] = "m" # Medium beat - else: - beat_line[pos] = "w" # Weak beat - - # Format and add to timeline - beat_visualization = "" - for i in range(0, len(beat_line), 10): - beat_visualization += "".join(beat_line[i:i+10]) - if i + 10 < len(beat_line): - beat_visualization += " " # Add space every 5 seconds - timeline += beat_visualization + "\n\n" - - # Add measure markers - timeline += "=== MEASURE MARKERS ===\n\n" - - # Create a list to track measure start times - measure_starts = [] - for i, time in enumerate(beats_info['beat_times']): - if i % beats_info['time_signature'] == 0: # Start of measure - # Convert to float if it's a numpy array - time_val = ensure_float(time) - measure_starts.append((i // beats_info['time_signature'] + 1, time_val)) - - # Format measure information - if measure_starts: - timeline += "| Measure # | Start Time | Duration |\n" - timeline += "|-----------|------------|----------|\n" - - for i in range(len(measure_starts)): - measure_num, start_time = measure_starts[i] - - # Calculate end time (start of next measure or end of song) - if i < len(measure_starts) - 1: - end_time = measure_starts[i+1][1] - elif 'beat_times' in beats_info and len(beats_info['beat_times']) > 0: - # Get the last beat time and convert to float if needed - last_beat = beats_info['beat_times'][-1] - end_time = ensure_float(last_beat) - else: - end_time = start_time + 2.0 # Default 2 seconds if no next measure - - duration = end_time - start_time - - timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n" - - # No truncation - show all measures - - # Add phrase information - if 'phrases' in beats_info and beats_info['phrases']: - timeline += "\n=== 
MUSICAL PHRASES ===\n\n" - for i, phrase in enumerate(beats_info['phrases']): - # Show all phrases, not just the first 10 - if not phrase: - continue - - # Safely check phrase indices - if not (len(phrase) > 0 and len(beats_info['beat_times']) > 0): - continue - - start_beat = min(phrase[0], len(beats_info['beat_times'])-1) - end_beat = min(phrase[-1], len(beats_info['beat_times'])-1) - - # Convert to float if needed - phrase_start = ensure_float(beats_info['beat_times'][start_beat]) - phrase_end = ensure_float(beats_info['beat_times'][end_beat]) - - timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n" - - # Create syllable template for this phrase with simplified numpy handling - phrase_beats = { - "beat_times": [ensure_float(beats_info['beat_times'][j]) - for j in phrase if j < len(beats_info['beat_times'])], - "beat_strengths": [ensure_float(beats_info['beat_strengths'][j]) - for j in phrase if j < len(beats_info['beat_strengths'])], - "tempo": ensure_float(beats_info['tempo']), - "time_signature": beats_info['time_signature'], - "phrases": [list(range(len(phrase)))] - } - - template = create_flexible_syllable_templates(phrase_beats) - timeline += f" Syllable Template: {template}\n" - - # Create a visual representation of this phrase - if phrase_start < total_duration and phrase_end < total_duration: - # Create a timeline for this phrase - phrase_visualization = ["·"] * int(total_duration * 2) - - # Mark the phrase boundaries - start_pos = int(phrase_start * 2) - end_pos = int(phrase_end * 2) - - if start_pos < len(phrase_visualization): - phrase_visualization[start_pos] = "[" - - if end_pos < len(phrase_visualization): - phrase_visualization[end_pos] = "]" - - # Mark the beats in this phrase - for j in phrase: - if j < len(beats_info['beat_times']): - beat_time = ensure_float(beats_info['beat_times'][j]) - beat_pos = int(beat_time * 2) - - if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos: - # Determine beat type - if j % beats_info['time_signature'] == 0: - phrase_visualization[beat_pos] = "S" - elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2: - phrase_visualization[beat_pos] = "m" - else: - phrase_visualization[beat_pos] = "w" - - # Format and add visualization - phrase_visual = "" - for k in range(0, len(phrase_visualization), 10): - phrase_visual += "".join(phrase_visualization[k:k+10]) - if k + 10 < len(phrase_visualization): - phrase_visual += " " - - timeline += f" Timeline: {phrase_visual}\n\n" - - # Add second-level script display - try: - # Get second-level beat information - subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) - duration = librosa.get_duration(y=y, sr=sr) - - # Map to seconds - sec_map = map_beats_to_seconds(subbeat_info["subbeat_times"], duration) - - # Create templates - templates = create_second_level_templates(sec_map, subbeat_info["tempo"]) - - # Add to timeline - timeline += "\n=== SECOND-LEVEL SCRIPT ===\n\n" - timeline += "Each line below represents ONE SECOND of audio with matching lyric content.\n" - timeline += "| Second | Beat Pattern | Lyric Content |\n" - timeline += "|--------|-------------|---------------|\n" - - # Get clean lyrics (without analysis notes) - clean_lyrics = lyrics - if isinstance(lyrics, str): - if "[Note: Rhythm Analysis]" in lyrics: - clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() - elif "[Note: Potential rhythm mismatches" in lyrics: - clean_lyrics = 
lyrics.split("[Note:")[0].strip() - - # Get lyric lines - lines = clean_lyrics.strip().split('\n') if clean_lyrics else [] - - for i, template in enumerate(templates): - # Get corresponding lyric line if available - lyric = lines[i] if i < len(lines) else "" - if lyric.startswith('[') and ']' in lyric: - lyric = "" # Skip section headers - - # Format nicely for display - timeline += f"| {i+1:<6} | {template:<30} | {lyric[:40]} |\n" - - # Add ASCII visualization of second-level beats - timeline += "\n=== SECOND-LEVEL VISUALIZATION ===\n\n" - timeline += "Each row represents ONE SECOND. Beat types:\n" - timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" - - for i, window in enumerate(sec_map): - beats = window["beats"] - - # Create ASCII visualization - beat_viz = ["·"] * 20 # 20 columns for visualization - - for beat in beats: - # Calculate position in visualization - pos = int(beat["relative_pos"] * 19) # Map 0-1 to 0-19 - if 0 <= pos < len(beat_viz): - # Set marker based on beat type - if beat["type"] == "main": - beat_viz[pos] = "S" - elif beat["strength"] >= 0.7: - beat_viz[pos] = "m" - else: - beat_viz[pos] = "w" - - # Get corresponding lyric - lyric = lines[i] if i < len(lines) else "" - if lyric.startswith('[') and ']' in lyric: - lyric = "" - - # Format visualization line - viz_line = f"Second {i+1:2d}: [" + "".join(beat_viz) + "]" - if lyric: - viz_line += f" → {lyric[:40]}" - - timeline += viz_line + "\n" - - except Exception as e: - timeline += f"\n[Error generating second-level analysis: {str(e)}]" - - # Add a section showing alignment if lyrics were generated - if lyrics and isinstance(lyrics, str): - timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n" - # Remove rhythm analysis notes from lyrics if present - if "[Note:" in lyrics: - clean_lyrics = lyrics.split("[Note:")[0].strip() - else: - clean_lyrics = lyrics - - lines = clean_lyrics.strip().split('\n') - - # Show alignment for ALL lines, not just the first 10 - for i, line in enumerate(lines): - if not line.strip() or line.startswith('['): - continue - - timeline += f"Line: \"{line}\"\n" - - # Count syllables - syllable_count = count_syllables(line) - timeline += f" Syllables: {syllable_count}\n" - - # Create adaptive phrase matching - if we don't have a direct phrase match, - # try to find the closest matching phrase by time or measure - matching_phrase = None - if 'phrases' in beats_info and beats_info['phrases']: - # First try direct index matching - if i < len(beats_info['phrases']) and beats_info['phrases'][i]: - matching_phrase = beats_info['phrases'][i] - else: - # If no direct match, try to find a phrase by musical position - # Calculate which section of the song we're in - if len(beats_info['phrases']) > 0: - section_size = max(1, len(beats_info['phrases']) // 4) - section_index = min(i // section_size, 3) # Limit to 4 sections - section_start = section_index * section_size - section_end = min(section_start + section_size, len(beats_info['phrases'])) - - # Try to find a phrase within this section - candidate_phrases = [phrase for j, phrase in enumerate(beats_info['phrases']) - if section_start <= j < section_end and phrase] - - if candidate_phrases: - matching_phrase = candidate_phrases[min(i % section_size, len(candidate_phrases)-1)] - elif beats_info['phrases']: - # Fallback to cycling through available phrases - phrase_index = i % len(beats_info['phrases']) - if beats_info['phrases'][phrase_index]: - matching_phrase = beats_info['phrases'][phrase_index] - - # Show timing and 
detailed alignment if we found a matching phrase - if matching_phrase and len(matching_phrase) > 0 and len(beats_info['beat_times']) > 0: - # Safely check if phrase has elements and indices are valid - if len(matching_phrase) > 0 and len(beats_info['beat_times']) > 0: - start_beat = min(matching_phrase[0], len(beats_info['beat_times'])-1) - end_beat = min(matching_phrase[-1], len(beats_info['beat_times'])-1) - - start_time = ensure_float(beats_info['beat_times'][start_beat]) - end_time = ensure_float(beats_info['beat_times'][end_beat]) - - timeline += f" Timing: {start_time:.2f}s - {end_time:.2f}s\n" - - # Create an enhanced visualization of syllable alignment - timeline += " Alignment: " - - # Create a timeline focused on just this phrase - phrase_duration = end_time - start_time - syllable_viz = [] - - # Initialize with beat markers for this phrase using improved algorithm - for j, beat_idx in enumerate(matching_phrase): - if beat_idx < len(beats_info['beat_times']): - beat_time = ensure_float(beats_info['beat_times'][beat_idx]) - - # Handle edge case where phrase_duration is very small - if phrase_duration > 0.001: # Avoid division by very small numbers - # Use non-linear mapping for more musical alignment - # This accounts for natural speech rhythms not being strictly linear - normalized_pos = (beat_time - start_time) / phrase_duration - # Apply slight curve to map syllable positions more naturally - curved_pos = min(1.0, normalized_pos * (1.0 + 0.1 * (normalized_pos - 0.5))) - relative_pos = int(curved_pos * syllable_count) - else: - relative_pos = j # Default to sequential if duration is too small - - # Ensure we have enough space - while len(syllable_viz) <= relative_pos: - syllable_viz.append("·") - - # Determine beat type with metrical context - metrical_pos = beat_idx % beats_info['time_signature'] - beat_strength = beats_info['beat_strengths'][beat_idx] if beat_idx < len(beats_info['beat_strengths']) else 0 - - if metrical_pos == 0 or beat_strength >= 0.8: - syllable_viz[relative_pos] = "S" # Strong beat - elif metrical_pos == beats_info['time_signature'] // 2 or beat_strength >= 0.5: - syllable_viz[relative_pos] = "m" # Medium beat - else: - syllable_viz[relative_pos] = "w" # Weak beat - - # Fill in any gaps - while len(syllable_viz) < syllable_count: - syllable_viz.append("·") - - # Trim if too long - syllable_viz = syllable_viz[:syllable_count] - - # Add alignment visualization with word stress analysis - timeline += "".join(syllable_viz) + "\n" - - # Add word stress analysis - words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) - if words: - word_stresses = [] - cumulative_syllables = 0 - - for word in words: - syllable_count_word = count_syllables_for_word(word) - stress_pattern = get_word_stress(word) - - # Ensure stress pattern is as long as syllable count - while len(stress_pattern) < syllable_count_word: - stress_pattern += "0" - - for j in range(syllable_count_word): - stress_char = "S" if j < len(stress_pattern) and stress_pattern[j] == "1" else "_" - word_stresses.append(stress_char) - - cumulative_syllables += syllable_count_word - - # Add word stress information - timeline += " Word stress: " + "".join(word_stresses) + "\n" - - # Check if stressed syllables align with strong beats - alignment_score = 0 - alignment_issues = [] - - for j, (stress, beat) in enumerate(zip(word_stresses, syllable_viz)): - if (stress == "S" and beat == "S") or (stress != "S" and beat != "S"): - alignment_score += 1 - elif stress == "S" and beat != "S": - alignment_issues.append(f"Syllable 
{j+1} has stress but weak beat") - elif stress != "S" and beat == "S": - alignment_issues.append(f"Syllable {j+1} has no stress but strong beat") - - if word_stresses: - alignment_percent = (alignment_score / len(word_stresses)) * 100 - timeline += f" Stress alignment: {alignment_percent:.1f}% match\n" - - if alignment_issues and len(alignment_issues) <= 3: - timeline += " Issues: " + "; ".join(alignment_issues) + "\n" - else: - timeline += " No matching phrase found for alignment\n" - - timeline += "\n" - - return timeline - - except Exception as e: - print(f"Error generating complete beat timeline: {str(e)}") - return f"Error generating complete beat timeline: {str(e)}" - -def display_results(audio_file, lyrics_requirements=None): - """Process audio file and return formatted results for display in the UI.""" - # Default error response - error_response = ("Please upload an audio file.", - "No emotion analysis available.", - "No audio classification available.", - "No lyrics generated.", - "No beat timeline available.") - - if audio_file is None: - return error_response - - try: - # Process audio and get results - pass user requirements - results = process_audio(audio_file, lyrics_requirements) - - # Check if we got an error message - if isinstance(results, str) and "Error" in results: - return results, *error_response[1:] - elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]: - return results[0], *error_response[1:] - - # Extract results - if isinstance(results, dict): - # New format - genre_results = results.get("genre_results", "Genre classification failed") - lyrics = results.get("lyrics", "Lyrics generation failed") - ast_results = results.get("ast_results", []) - else: - # Old tuple format - genre_results, lyrics, ast_results = results - - # Get clean lyrics (without analysis notes) - clean_lyrics = lyrics - if isinstance(lyrics, str): - if "[Note: Rhythm Analysis]" in lyrics: - clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() - elif "[Note: Potential rhythm mismatches" in lyrics: - clean_lyrics = lyrics.split("[Note:")[0].strip() - - # Generate beat timeline - use the complete timeline function that shows all beats - beat_timeline = format_complete_beat_timeline(audio_file, clean_lyrics) - - # Format emotion analysis results - emotion_text = "No emotion analysis available." - try: - emotion_results = music_analyzer.analyze_music(audio_file) - emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n" - f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n" - f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n" - f"Primary Theme: {emotion_results['summary']['primary_theme']}") - - # Keep basic beat analysis without section information - y, sr = load_audio(audio_file, SAMPLE_RATE) - beats_info = detect_beats(y, sr) - - # Add beat analysis info - emotion_text += f"\n\nBeat Analysis:\n" - emotion_text += f"- Tempo: {beats_info.get('tempo', 0):.1f} BPM\n" - emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n" - emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n" - - except Exception as e: - print(f"Error in emotion analysis: {str(e)}") - - # Format audio classification results - ast_text = "No valid audio classification results available." 
-        if ast_results and isinstance(ast_results, list):
-            ast_text = "Audio Classification Results:\n"
-            for result in ast_results[:5]: # Show top 5 results
-                ast_text += f"{result['label']}: {result['score']*100:.2f}%\n"
-
-        # Return all results
-        return genre_results, emotion_text, ast_text, clean_lyrics, beat_timeline
-
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        print(error_msg)
-        return error_msg, *error_response[1:]
-
-# Create enhanced Gradio interface with tabs for better organization
-with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
-    gr.Markdown("# Music Genre Classifier & Lyrics Generator")
-    gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.")
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            audio_input = gr.Audio(label="Upload Music", type="filepath")
-
-            # Add the new lyrics requirements input
-            lyrics_requirements_input = gr.Textbox(
-                label="Lyrics Requirements (optional)",
-                placeholder="Enter specific themes, topics, words, or styles you want in the lyrics",
-                lines=3
-            )
-
-            submit_btn = gr.Button("Analyze & Generate", variant="primary")
-
-            # Add genre info box
-            with gr.Accordion("About Music Genres", open=False):
-                gr.Markdown("""
-                The system recognizes various music genres including:
-                - Pop, Rock, Hip-Hop, R&B
-                - Electronic, Dance, Techno, House
-                - Jazz, Blues, Classical
-                - Folk, Country, Acoustic
-                - Metal, Punk, Alternative
-                - And many others!
-
-                For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
-                """)
-
-        with gr.Column(scale=2):
-            # Use tabs for better organization of outputs
-            with gr.Tabs():
-                with gr.TabItem("Analysis Results"):
-                    genre_output = gr.Textbox(label="Detected Genres", lines=4)
-
-                    # Create 2 columns for emotion and audio classification
-                    with gr.Row():
-                        with gr.Column():
-                            emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8)
-                        with gr.Column():
-                            ast_output = gr.Textbox(label="Audio Classification", lines=8)
-
-                with gr.TabItem("Generated Lyrics"):
-                    lyrics_output = gr.Textbox(label="Lyrics", lines=18)
-
-                with gr.TabItem("Beat & Syllable Timeline"):
-                    beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
-
-    # Connect the button to the display function with updated inputs
-    submit_btn.click(
-        fn=display_results,
-        inputs=[audio_input, lyrics_requirements_input],
-        outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
-    )
-
-    # Enhanced explanation of how the system works
-    with gr.Accordion("How it works", open=False):
-        gr.Markdown("""
-        ## Advanced Lyrics Generation Process
-
-        1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models.
-
-        2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio.
-
-        3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music.
-
-        4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying:
-           - Strong and weak beats
-           - Natural phrase boundaries
-           - Time signature and tempo variations
-           - Beat subdivisions (half and quarter beats)
-
-        5. **Second-Level Alignment**: The system maps beats and subbeats to each second of audio, creating precise templates for perfect alignment.
-
-        6. **Syllable Template Creation**: For each second of audio, the system generates precise syllable templates that reflect:
-           - Beat stress patterns (strong, medium, weak)
-           - Appropriate syllable counts based on tempo
-           - Genre-specific rhythmic qualities
-           - Half-beat and quarter-beat subdivisions
-
-        7. **Lyrics Generation**: Using the detected genre, emotion, rhythm patterns, and your custom requirements, a large language model generates lyrics that:
-           - Match the emotional quality of the music
-           - Follow the precise syllable templates for each second
-           - Align stressed syllables with strong beats
-           - Maintain genre-appropriate style and themes
-           - Incorporate your specific requirements and preferences
-
-        8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
-           - Syllable count accuracy
-           - Stress alignment with strong beats
-           - Word stress patterns
-           - Second-by-second alignment precision
-
-        9. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment.
-
-        This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it.
-        """)
+    return demo

 # Launch the app
-demo.launch()
\ No newline at end of file
+demo = create_interface()
+
+if __name__ == "__main__":
+    demo.launch()
+else:
+    # For Hugging Face Spaces
+    app = demo
\ No newline at end of file
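Two details of the removed timeline code are easy to lose in a diff this large, so brief standalone restatements follow. First, the stress/beat alignment score: a syllable counts as aligned when a stressed syllable ("S") lands on a strong beat ("S"), or an unstressed syllable lands on a medium/weak beat, and the result is reported as a percentage of syllables. The sketch below is illustrative only; the helper name stress_alignment is hypothetical and is not part of app.py.

# Minimal, illustrative restatement of the stress/beat alignment scoring
# performed inline by the removed timeline code; not part of app.py.

def stress_alignment(word_stresses, beat_pattern):
    """Percentage of syllables whose stress agrees with the beat strength.

    A syllable is aligned when a stressed syllable ("S") falls on a strong
    beat ("S"), or an unstressed syllable falls on a medium/weak beat.
    """
    if not word_stresses:
        return 0.0
    aligned = sum(
        1 for stress, beat in zip(word_stresses, beat_pattern)
        if (stress == "S") == (beat == "S")
    )
    return aligned / len(word_stresses) * 100

# Stresses S _ S against beats S w m: 2 of 3 syllables agree.
print(f"{stress_alignment(list('S_S'), list('Swm')):.1f}% match")  # 66.7% match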
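Second, the non-linear mapping that assigned each beat a syllable slot in the alignment visualization: it normalizes the beat time within the phrase, applies a gentle curve that nudges positions past the phrase midpoint slightly forward (capped at 1.0), then quantizes to a slot index. Again a minimal illustrative sketch: syllable_slot is a hypothetical name, and where the removed code fell back to the beat's sequential index for near-zero phrase durations, this standalone version simply returns slot 0.

# Minimal, illustrative restatement of the curved beat-time-to-syllable-slot
# mapping from the removed visualization code; helper name is hypothetical.

def syllable_slot(beat_time, start, end, n_syllables):
    """Map a beat time within [start, end] to a syllable slot index."""
    duration = end - start
    if duration <= 0.001:   # degenerate phrase; the removed code fell back
        return 0            # to the beat's sequential index here
    pos = (beat_time - start) / duration  # linear position in [0, 1]
    # Gentle curve: positions past the midpoint are pushed slightly forward,
    # positions before it pulled slightly back, capped at 1.0.
    curved = min(1.0, pos * (1.0 + 0.1 * (pos - 0.5)))
    return int(curved * n_syllables)

# A beat 1.2 s into a 2.0 s phrase with 8 syllable slots lands in slot 4:
print(syllable_slot(1.2, 0.0, 2.0, 8))  # 4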