root committed on
Commit
651b0cd
·
1 Parent(s): bddf9c4
Files changed (3) hide show
  1. app.py +285 -30
  2. appp.py +68 -10
  3. requirements.txt +1 -0
app.py CHANGED
@@ -24,6 +24,17 @@ from utils import (
24
  )
25
  from emotionanalysis import MusicAnalyzer
26
  import librosa
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # Login to Hugging Face Hub if token is provided
29
  if "HF_TOKEN" in os.environ:
@@ -1180,12 +1191,12 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
1180
  # Sigmoid-like function with more scientific parameters
1181
  # Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions
1182
  if tempo < 40: # Very slow tempos
1183
- return 3.5 # Maximum syllables for extremely slow tempos
1184
  elif tempo > 200: # Very fast tempos
1185
- return 0.8 # Minimum syllables for extremely fast tempos
1186
  else:
1187
  # Scientific logistic function for middle range (40-200 BPM)
1188
- L = 3.5 # Upper limit
1189
  k = 0.04 # Steepness of curve
1190
  x0 = 120 # Midpoint (inflection point at normal tempo)
1191
  return L / (1 + np.exp(k * (tempo - x0)))
@@ -1235,6 +1246,32 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
1235
  # ----------------------------------------------------------------------
1236
  detailed_template = []
1237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1238
  for i, (stress_type, strength) in enumerate(stress_pattern):
1239
  # Get base syllable count from tempo with more nuanced mapping
1240
  base_syllables = tempo_to_syllable_base(tempo)
@@ -1281,6 +1318,60 @@ def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='defa
1281
  strength_pct = round(strength * 100) / 100
1282
  detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")
1283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1284
  # Join beat templates for this phrase
1285
  phrase_template = "-".join(detailed_template)
1286
  syllable_templates.append(phrase_template)
@@ -1572,6 +1663,16 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
1572
  # Split lyrics into lines
1573
  lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
1574
 
 
 
 
 
 
 
 
 
 
 
1575
  # Initialize tracking variables
1576
  verification_notes = []
1577
  detailed_analysis = []
@@ -1747,11 +1848,14 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
1747
 
1748
  for beat in best_phrase_beats:
1749
  if beat.get("type") == "S":
 
1750
  strong_positions.append(current_pos)
1751
  current_pos += beat.get("count", 1)
1752
 
1753
  # Check if strong syllables align with strong beats
1754
  alignment_issues = []
 
 
1755
 
1756
  for pos in strong_positions:
1757
  # Find which word contains this position
@@ -1768,18 +1872,31 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
1768
  # Get stress pattern for this word
1769
  stress = word_info["stress_pattern"]
1770
 
1771
- # If we have stress information and this syllable isn't stressed
1772
- if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
1773
- misaligned_word = word_info["word"]
1774
- alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)")
1775
- stress_misalignments.append({
1776
- "line": i+1,
1777
- "word": word_info["word"],
1778
- "position": pos,
1779
- "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word)
1780
- })
 
 
 
 
 
1781
  break
1782
 
 
 
 
 
 
 
 
 
1783
  if alignment_issues:
1784
  verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}")
1785
 
@@ -2452,7 +2569,7 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyric
2452
 
2453
  # Create enhanced prompt with better rhythm alignment instructions
2454
  if use_second_level:
2455
- # Second-level approach with per-second alignment
2456
  content = f"""
2457
  You are a talented songwriter who specializes in {genre} music.
2458
  Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
@@ -2634,7 +2751,7 @@ Your lyrics:
2634
 
2635
  # Format as a chat message for the LLM
2636
  messages = [
2637
- {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. Be concise and direct."},
2638
  {"role": "user", "content": content}
2639
  ]
2640
 
@@ -2684,6 +2801,23 @@ Your lyrics:
2684
  lyrics = lyrics.split("</thinking>")[1].strip()
2685
 
2686
  # Check for alternative thinking indicators with improved detection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2687
  thinking_markers = [
2688
  "<think>", "</think>",
2689
  "[thinking]", "[/thinking]",
@@ -2769,6 +2903,24 @@ Your lyrics:
2769
  if not isinstance(second_level_verification, list):
2770
  second_level_verification = None
2771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2772
  # Verify syllable counts with enhanced verification - pass second-level templates if available
2773
  if templates_for_verification:
2774
  # Convert any NumPy values to native types before verification - directly handle conversions
@@ -2983,17 +3135,58 @@ Improved lyrics with fixed rhythm:
2983
  "prompt_template": "No prompt template available"
2984
  }
2985
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2986
  def process_audio(audio_file, lyrics_requirements=None):
2987
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
2988
  if audio_file is None:
2989
  return "Please upload an audio file.", None, None
2990
 
2991
  try:
2992
- print("Step 1/5: Extracting audio features...")
2993
  # Extract audio features
2994
  audio_data = extract_audio_features(audio_file)
2995
 
2996
- print("Step 2/5: Verifying audio contains music...")
2997
  # First check if it's music
2998
  try:
2999
  is_music, ast_results = detect_music(audio_data)
@@ -3004,7 +3197,11 @@ def process_audio(audio_file, lyrics_requirements=None):
3004
  if not is_music:
3005
  return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
3006
 
3007
- print("Step 3/5: Classifying music genre...")
 
 
 
 
3008
  # Classify genre
3009
  try:
3010
  top_genres = classify_genre(audio_data)
@@ -3029,7 +3226,7 @@ def process_audio(audio_file, lyrics_requirements=None):
3029
  "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
3030
  }
3031
 
3032
- print("Step 4/5: Analyzing music emotions, themes, and structure...")
3033
  # Analyze music emotions and themes
3034
  try:
3035
  emotion_results = music_analyzer.analyze_music(audio_file)
@@ -3046,12 +3243,15 @@ def process_audio(audio_file, lyrics_requirements=None):
3046
  beats_info = detect_beats(y, sr)
3047
  sections_info = detect_sections(y, sr)
3048
 
3049
- # Create structured segments for precise line-by-line matching
3050
  segments = []
3051
 
3052
- # Try to break audio into meaningful segments based on sections
3053
- # Each segment will correspond to one line of lyrics
3054
- if sections_info and len(sections_info) > 1:
 
 
 
3055
  min_segment_duration = 1.5 # Minimum 1.5 seconds per segment
3056
 
3057
  for section in sections_info:
@@ -3063,7 +3263,8 @@ def process_audio(audio_file, lyrics_requirements=None):
3063
  if section_duration < min_segment_duration * 1.5:
3064
  segments.append({
3065
  "start": section_start,
3066
- "end": section_end
 
3067
  })
3068
  else:
3069
  # Calculate ideal number of segments for this section
@@ -3078,7 +3279,8 @@ def process_audio(audio_file, lyrics_requirements=None):
3078
  segment_end = segment_start + segment_duration
3079
  segments.append({
3080
  "start": segment_start,
3081
- "end": segment_end
 
3082
  })
3083
  # If no good sections found, create segments based on beats
3084
  elif beats_info and len(beats_info["beat_times"]) > 4:
@@ -3136,6 +3338,15 @@ def process_audio(audio_file, lyrics_requirements=None):
3136
 
3137
  # Add syllable counts to each section
3138
  for section in sections_info:
 
 
 
 
 
 
 
 
 
3139
  # Create syllable templates for sections
3140
  section_beats_info = {
3141
  "beat_times": [beat for beat in beats_info["beat_times"]
@@ -3150,19 +3361,21 @@ def process_audio(audio_file, lyrics_requirements=None):
3150
  ]
3151
 
3152
  # Get a syllable count based on section duration and tempo
3153
- syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5)
 
3154
 
3155
  section_info = {
3156
  "type": section["type"],
3157
  "start": section["start"],
3158
  "end": section["end"],
3159
  "duration": section["duration"],
 
3160
  "syllable_count": syllable_count,
3161
  "beat_count": len(section_beats_info["beat_times"])
3162
  }
3163
 
3164
- # Try to create a more detailed syllable template
3165
- if len(section_beats_info["beat_times"]) >= 2:
3166
  # Ensure top_genres is a list with at least one element
3167
  if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
3168
  genre_name = top_genres[0][0]
@@ -3213,7 +3426,7 @@ def process_audio(audio_file, lyrics_requirements=None):
3213
  print(f"Error analyzing song structure: {str(e)}")
3214
  # Continue without song structure
3215
 
3216
- print("Step 5/5: Generating rhythmically aligned lyrics...")
3217
  # Generate lyrics based on top genre, emotion analysis, and song structure
3218
  try:
3219
  # Ensure top_genres is a list with at least one element before accessing
@@ -3306,7 +3519,8 @@ def process_audio(audio_file, lyrics_requirements=None):
3306
  "rhythm_analysis": rhythm_analysis,
3307
  "syllable_analysis": syllable_analysis,
3308
  "prompt_template": prompt_template,
3309
- "ast_results": ast_results
 
3310
  }
3311
 
3312
  return results
@@ -3328,6 +3542,13 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3328
  # Get beat information
3329
  beats_info = detect_beats(y, sr)
3330
 
 
 
 
 
 
 
 
3331
  # Helper function to convert numpy values to floats - FIXED
3332
  def ensure_float(value):
3333
  if isinstance(value, np.ndarray) or isinstance(value, np.number):
@@ -3347,6 +3568,14 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3347
  timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
3348
  timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
3349
  timeline += f"Total Beats: {beats_info['beat_count']}\n"
 
 
 
 
 
 
 
 
3350
 
3351
  # Add musicological context based on tempo classification
3352
  if tempo < 60:
@@ -3374,6 +3603,13 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3374
  time = ensure_float(time)
3375
  strength = ensure_float(strength)
3376
 
 
 
 
 
 
 
 
3377
  # More scientific determination of beat type based on both strength and metrical position
3378
  metrical_position = i % beats_info['time_signature']
3379
 
@@ -3395,6 +3631,10 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3395
  else:
3396
  beat_type = "WEAK"
3397
  syllable_value = 1.0
 
 
 
 
3398
 
3399
  # Determine pattern letter based on beat type for consistency
3400
  if beat_type == "STRONG":
@@ -3851,9 +4091,16 @@ def display_results(audio_file, lyrics_requirements=None):
3851
  genre_results = results.get("genre_results", "Genre classification failed")
3852
  lyrics = results.get("lyrics", "Lyrics generation failed")
3853
  ast_results = results.get("ast_results", [])
 
3854
  else:
3855
  # Old tuple format
3856
  genre_results, lyrics, ast_results = results
 
 
 
 
 
 
3857
 
3858
  # Get clean lyrics (without analysis notes)
3859
  clean_lyrics = lyrics
@@ -3885,6 +4132,14 @@ def display_results(audio_file, lyrics_requirements=None):
3885
  emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
3886
  emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
3887
 
 
 
 
 
 
 
 
 
3888
  except Exception as e:
3889
  print(f"Error in emotion analysis: {str(e)}")
3890
 
 
24
  )
25
  from emotionanalysis import MusicAnalyzer
26
  import librosa
27
+ from pyannote.audio import Pipeline
28
+ import tempfile
29
+ import os
30
+ import soundfile as sf
31
+ import warnings
32
+ import json
33
+ import math
34
+ from collections import defaultdict
35
+ import matplotlib.pyplot as plt
36
+ from gradio_client import Client
37
+ from transformers import pipeline as hf_pipeline
38
 
39
  # Login to Hugging Face Hub if token is provided
40
  if "HF_TOKEN" in os.environ:
 
1191
  # Sigmoid-like function with more scientific parameters
1192
  # Using logistic function: L/(1+e^(-k(x-x0))) to create smooth transitions
1193
  if tempo < 40: # Very slow tempos
1194
+ return 1.8 # Further reduced maximum syllables for extremely slow tempos
1195
  elif tempo > 200: # Very fast tempos
1196
+ return 0.7 # Minimum syllables for extremely fast tempos
1197
  else:
1198
  # Scientific logistic function for middle range (40-200 BPM)
1199
+ L = 2.0 # Significantly reduced upper limit to prevent excessive syllables
1200
  k = 0.04 # Steepness of curve
1201
  x0 = 120 # Midpoint (inflection point at normal tempo)
1202
  return L / (1 + np.exp(k * (tempo - x0)))
 
1246
  # ----------------------------------------------------------------------
1247
  detailed_template = []
1248
 
1249
+ # Calculate phrase duration if beat times are available
1250
+ phrase_duration = 0
1251
+ if phrase and len(phrase) > 1 and len(beat_times) > 0:
1252
+ # Get first and last beat indices from the phrase
1253
+ first_idx = phrase[0]
1254
+ last_idx = phrase[-1]
1255
+
1256
+ # Check if indices are within bounds
1257
+ if first_idx < len(beat_times) and last_idx < len(beat_times):
1258
+ phrase_duration = beat_times[last_idx] - beat_times[first_idx]
1259
+
1260
+ # Calculate a maximum reasonable syllable count based on duration
1261
+ # Aim for 3-4 syllables per second maximum for singability (reduced from 5-6)
1262
+ max_reasonable_syllables = 100 # Default high value
1263
+ if phrase_duration > 0:
1264
+ # Use a more conservative syllable rate based on tempo
1265
+ if tempo < 80: # Slow tempo
1266
+ syllable_rate = 3.0 # Maximum 3 syllables per second for slow tempos
1267
+ elif tempo < 120: # Medium tempo
1268
+ syllable_rate = 3.5 # Maximum 3.5 syllables per second for medium tempos
1269
+ else: # Fast tempo
1270
+ syllable_rate = 4.0 # Maximum 4 syllables per second for fast tempos
1271
+
1272
+ # Calculate max syllables and ensure it's at least 2 for any phrase
1273
+ max_reasonable_syllables = max(2, int(phrase_duration * syllable_rate))
1274
+
1275
  for i, (stress_type, strength) in enumerate(stress_pattern):
1276
  # Get base syllable count from tempo with more nuanced mapping
1277
  base_syllables = tempo_to_syllable_base(tempo)
 
1318
  strength_pct = round(strength * 100) / 100
1319
  detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")
1320
 
1321
+ # Calculate total expected syllables for this phrase
1322
+ total_expected_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template])
1323
+
1324
+ # If total syllables exceed our reasonable limit, scale them down
1325
+ if total_expected_syllables > max_reasonable_syllables and max_reasonable_syllables > 0:
1326
+ scale_factor = max_reasonable_syllables / total_expected_syllables
1327
+ adjusted_template = []
1328
+
1329
+ # Stronger scaling for very short phrases (less than 0.8 seconds)
1330
+ if phrase_duration < 0.8 and phrase_duration > 0:
1331
+ # Further reduce for extremely short phrases
1332
+ scale_factor *= 0.8
1333
+
1334
+ for beat in detailed_template:
1335
+ if ':' in beat:
1336
+ beat_type_part = beat.split(':')[0]
1337
+ syllable_count = float(beat.split(':')[1])
1338
+ # Scale down and round to nearest 0.25
1339
+ new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4)
1340
+
1341
+ # Extra check for very short phrases - cap at 1.0 for S beats and 0.5 for others
1342
+ if phrase_duration < 0.6 and phrase_duration > 0:
1343
+ if beat_type_part.startswith("S"):
1344
+ new_count = min(new_count, 1.0)
1345
+ else:
1346
+ new_count = min(new_count, 0.5)
1347
+
1348
+ adjusted_template.append(f"{beat_type_part}:{new_count}")
1349
+ else:
1350
+ adjusted_template.append(beat)
1351
+
1352
+ detailed_template = adjusted_template
1353
+
1354
+ # Extra check to avoid having too many total syllables in a phrase
1355
+ if len(detailed_template) > 0:
1356
+ total_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template if ':' in beat])
1357
+ if phrase_duration > 0 and (total_syllables / phrase_duration) > 5.0:
1358
+ # If we have more than 5 syllables per second, apply additional scaling
1359
+ target_syllables = phrase_duration * 4.0 # Target 4 syllables per second max
1360
+ scale_factor = target_syllables / total_syllables
1361
+ adjusted_template = []
1362
+
1363
+ for beat in detailed_template:
1364
+ if ':' in beat:
1365
+ beat_type_part = beat.split(':')[0]
1366
+ syllable_count = float(beat.split(':')[1])
1367
+ # Scale down and round to nearest 0.25
1368
+ new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4)
1369
+ adjusted_template.append(f"{beat_type_part}:{new_count}")
1370
+ else:
1371
+ adjusted_template.append(beat)
1372
+
1373
+ detailed_template = adjusted_template
1374
+
1375
  # Join beat templates for this phrase
1376
  phrase_template = "-".join(detailed_template)
1377
  syllable_templates.append(phrase_template)
 
1663
  # Split lyrics into lines
1664
  lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
1665
 
1666
+ # Remove any lines that are clearly not lyrics, like explanations or meta-content
1667
+ filtered_lines = []
1668
+ for line in lines:
1669
+ # Skip explanatory content or meta-text
1670
+ if line.startswith('**') or line.startswith('[Note:') or 'alignment:' in line.lower():
1671
+ continue
1672
+ filtered_lines.append(line)
1673
+
1674
+ lines = filtered_lines
1675
+
1676
  # Initialize tracking variables
1677
  verification_notes = []
1678
  detailed_analysis = []
 
1848
 
1849
  for beat in best_phrase_beats:
1850
  if beat.get("type") == "S":
1851
+ # If the count is greater than 1, only the first syllable should be stressed
1852
  strong_positions.append(current_pos)
1853
  current_pos += beat.get("count", 1)
1854
 
1855
  # Check if strong syllables align with strong beats
1856
  alignment_issues = []
1857
+ aligned_stress_count = 0
1858
+ total_stress_positions = len(strong_positions)
1859
 
1860
  for pos in strong_positions:
1861
  # Find which word contains this position
 
1872
  # Get stress pattern for this word
1873
  stress = word_info["stress_pattern"]
1874
 
1875
+ # If we have stress information, check if the syllable is stressed
1876
+ if stress and syllable_in_word < len(stress):
1877
+ if stress[syllable_in_word] == '1':
1878
+ # Syllable is stressed and properly aligned
1879
+ aligned_stress_count += 1
1880
+ else:
1881
+ # Syllable is not stressed but should be
1882
+ misaligned_word = word_info["word"]
1883
+ alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)")
1884
+ stress_misalignments.append({
1885
+ "line": i+1,
1886
+ "word": word_info["word"],
1887
+ "position": pos,
1888
+ "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word)
1889
+ })
1890
  break
1891
 
1892
+ # Calculate alignment percentage
1893
+ alignment_percentage = 0
1894
+ if total_stress_positions > 0:
1895
+ alignment_percentage = (aligned_stress_count / total_stress_positions) * 100
1896
+
1897
+ # Add alignment percentage to notes
1898
+ verification_notes.append(f" → Stress alignment: {alignment_percentage:.1f}% ({aligned_stress_count}/{total_stress_positions})")
1899
+
1900
  if alignment_issues:
1901
  verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}")
1902
 
 
2569
 
2570
  # Create enhanced prompt with better rhythm alignment instructions
2571
  if use_second_level:
2572
+ # Second-level approach with per-second alignment - enhanced for better syllable distribution
2573
  content = f"""
2574
  You are a talented songwriter who specializes in {genre} music.
2575
  Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.
 
2751
 
2752
  # Format as a chat message for the LLM
2753
  messages = [
2754
+ {"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns EXACTLY. Be extremely concise - use only the EXACT number of syllables specified for each line. For short phrases (1 second or less), use just 2-3 MAXIMUM syllables. Include lyrics for EVERY musical section - do not leave any section empty. Use one-syllable words whenever possible for better singability. Avoid complex vocabulary. For all beat patterns, use fewer syllables than you think you need. Start with the lyrics immediately without any explanation or thinking."},
2755
  {"role": "user", "content": content}
2756
  ]
2757
 
 
2801
  lyrics = lyrics.split("</thinking>")[1].strip()
2802
 
2803
  # Check for alternative thinking indicators with improved detection
2804
+
2805
+ # Clean up lyrics: Remove meta-content and explanations
2806
+ if lyrics:
2807
+ # Remove any line that starts with **
2808
+ cleaned_lines = []
2809
+ for line in lyrics.split('\n'):
2810
+ if not line.strip().startswith('**') and not 'alignment:' in line.lower():
2811
+ cleaned_lines.append(line)
2812
+ lyrics = '\n'.join(cleaned_lines)
2813
+
2814
+ # Check for excessively long lines (likely explanations)
2815
+ max_reasonable_line_length = 80
2816
+ final_lines = []
2817
+ for line in lyrics.split('\n'):
2818
+ if len(line) <= max_reasonable_line_length or '[' in line or ']' in line:
2819
+ final_lines.append(line)
2820
+ lyrics = '\n'.join(final_lines)
2821
  thinking_markers = [
2822
  "<think>", "</think>",
2823
  "[thinking]", "[/thinking]",
 
2903
  if not isinstance(second_level_verification, list):
2904
  second_level_verification = None
2905
 
2906
+ # Ensure all second-level templates have lyrics
2907
+ if song_structure and "second_level" in song_structure and song_structure["second_level"]:
2908
+ if "templates" in song_structure["second_level"] and isinstance(song_structure["second_level"]["templates"], list):
2909
+ # Count how many seconds have lyrics
2910
+ if lyrics:
2911
+ lines = [line.strip() for line in lyrics.split('\n') if line.strip()]
2912
+
2913
+ # If we have fewer lines than seconds, try to distribute them better
2914
+ second_count = len(song_structure["second_level"]["templates"])
2915
+ if 0 < len(lines) < second_count:
2916
+ # Simple distribution - repeat existing lines to fill all seconds
2917
+ distributed_lines = []
2918
+ for i in range(second_count):
2919
+ distributed_lines.append(lines[i % len(lines)])
2920
+
2921
+ # Replace the lyrics with the distributed version
2922
+ lyrics = '\n'.join(distributed_lines)
2923
+
2924
  # Verify syllable counts with enhanced verification - pass second-level templates if available
2925
  if templates_for_verification:
2926
  # Convert any NumPy values to native types before verification - directly handle conversions
 
3135
  "prompt_template": "No prompt template available"
3136
  }
3137
 
3138
+ def detect_voice_activity(audio_file):
3139
+ """
3140
+ Detect segments with voice/singing in audio using pyannote/voice-activity-detection
3141
+
3142
+ Args:
3143
+ audio_file: Path to audio file
3144
+
3145
+ Returns:
3146
+ List of dictionaries with start and end times of voice segments
3147
+ """
3148
+ try:
3149
+ print("Detecting voice activity in audio...")
3150
+ # Get HF_TOKEN from environment or set your token here
3151
+ hf_token = os.environ.get("HF_TOKEN", None)
3152
+
3153
+ # Initialize the voice activity detection pipeline
3154
+ vad_pipeline = Pipeline.from_pretrained(
3155
+ "pyannote/voice-activity-detection",
3156
+ use_auth_token=hf_token
3157
+ )
3158
+
3159
+ # Process the audio file
3160
+ output = vad_pipeline(audio_file)
3161
+
3162
+ # Extract voice segments
3163
+ voice_segments = []
3164
+ for speech in output.get_timeline().support():
3165
+ voice_segments.append({
3166
+ "start": speech.start,
3167
+ "end": speech.end,
3168
+ "duration": speech.end - speech.start
3169
+ })
3170
+
3171
+ print(f"Detected {len(voice_segments)} voice segments")
3172
+ return voice_segments
3173
+
3174
+ except Exception as e:
3175
+ print(f"Error detecting voice activity: {str(e)}")
3176
+ # Return empty list if detection fails
3177
+ return []
3178
+
3179
  def process_audio(audio_file, lyrics_requirements=None):
3180
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
3181
  if audio_file is None:
3182
  return "Please upload an audio file.", None, None
3183
 
3184
  try:
3185
+ print("Step 1/6: Extracting audio features...")
3186
  # Extract audio features
3187
  audio_data = extract_audio_features(audio_file)
3188
 
3189
+ print("Step 2/6: Verifying audio contains music...")
3190
  # First check if it's music
3191
  try:
3192
  is_music, ast_results = detect_music(audio_data)
 
3197
  if not is_music:
3198
  return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results
3199
 
3200
+ print("Step 3/6: Detecting voice activity segments...")
3201
+ # Detect voice activity segments
3202
+ voice_segments = detect_voice_activity(audio_file)
3203
+
3204
+ print("Step 4/6: Classifying music genre...")
3205
  # Classify genre
3206
  try:
3207
  top_genres = classify_genre(audio_data)
 
3226
  "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
3227
  }
3228
 
3229
+ print("Step 5/6: Analyzing music emotions, themes, and structure...")
3230
  # Analyze music emotions and themes
3231
  try:
3232
  emotion_results = music_analyzer.analyze_music(audio_file)
 
3243
  beats_info = detect_beats(y, sr)
3244
  sections_info = detect_sections(y, sr)
3245
 
3246
+ # Create structured segments based on voice activity detection
3247
  segments = []
3248
 
3249
+ # If we have voice segments, use them as our primary segments
3250
+ if voice_segments and len(voice_segments) > 0:
3251
+ segments = voice_segments
3252
+ print(f"Using {len(segments)} voice segments for lyrics generation")
3253
+ # If no voice segments detected or detection failed, fall back to previous methods
3254
+ elif sections_info and len(sections_info) > 1:
3255
  min_segment_duration = 1.5 # Minimum 1.5 seconds per segment
3256
 
3257
  for section in sections_info:
 
3263
  if section_duration < min_segment_duration * 1.5:
3264
  segments.append({
3265
  "start": section_start,
3266
+ "end": section_end,
3267
+ "duration": section_duration
3268
  })
3269
  else:
3270
  # Calculate ideal number of segments for this section
 
3279
  segment_end = segment_start + segment_duration
3280
  segments.append({
3281
  "start": segment_start,
3282
+ "end": segment_end,
3283
+ "duration": segment_duration
3284
  })
3285
  # If no good sections found, create segments based on beats
3286
  elif beats_info and len(beats_info["beat_times"]) > 4:
 
3338
 
3339
  # Add syllable counts to each section
3340
  for section in sections_info:
3341
+ # Check if this section overlaps with any voice segments
3342
+ section_has_voice = False
3343
+ for voice_segment in voice_segments:
3344
+ # Check for overlap between section and voice segment
3345
+ if (section["start"] <= voice_segment["end"] and
3346
+ section["end"] >= voice_segment["start"]):
3347
+ section_has_voice = True
3348
+ break
3349
+
3350
  # Create syllable templates for sections
3351
  section_beats_info = {
3352
  "beat_times": [beat for beat in beats_info["beat_times"]
 
3361
  ]
3362
 
3363
  # Get a syllable count based on section duration and tempo
3364
+ # If section has voice, use normal count, otherwise set to 0
3365
+ syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) if section_has_voice else 0
3366
 
3367
  section_info = {
3368
  "type": section["type"],
3369
  "start": section["start"],
3370
  "end": section["end"],
3371
  "duration": section["duration"],
3372
+ "has_voice": section_has_voice,
3373
  "syllable_count": syllable_count,
3374
  "beat_count": len(section_beats_info["beat_times"])
3375
  }
3376
 
3377
+ # Try to create a more detailed syllable template, but only for sections with voice
3378
+ if len(section_beats_info["beat_times"]) >= 2 and section_has_voice:
3379
  # Ensure top_genres is a list with at least one element
3380
  if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple):
3381
  genre_name = top_genres[0][0]
 
3426
  print(f"Error analyzing song structure: {str(e)}")
3427
  # Continue without song structure
3428
 
3429
+ print("Step 6/6: Generating rhythmically aligned lyrics...")
3430
  # Generate lyrics based on top genre, emotion analysis, and song structure
3431
  try:
3432
  # Ensure top_genres is a list with at least one element before accessing
 
3519
  "rhythm_analysis": rhythm_analysis,
3520
  "syllable_analysis": syllable_analysis,
3521
  "prompt_template": prompt_template,
3522
+ "ast_results": ast_results,
3523
+ "voice_segments": voice_segments
3524
  }
3525
 
3526
  return results
 
3542
  # Get beat information
3543
  beats_info = detect_beats(y, sr)
3544
 
3545
+ # Get voice activity segments
3546
+ try:
3547
+ voice_segments = detect_voice_activity(audio_file)
3548
+ except Exception as e:
3549
+ print(f"Error detecting voice segments: {str(e)}")
3550
+ voice_segments = []
3551
+
3552
  # Helper function to convert numpy values to floats - FIXED
3553
  def ensure_float(value):
3554
  if isinstance(value, np.ndarray) or isinstance(value, np.number):
 
3568
  timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n"
3569
  timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n"
3570
  timeline += f"Total Beats: {beats_info['beat_count']}\n"
3571
+
3572
+ # Add voice activity segments information
3573
+ if voice_segments:
3574
+ timeline += f"\nVoice Activity Segments: {len(voice_segments)}\n"
3575
+ for i, segment in enumerate(voice_segments[:5]): # Show first 5 segments
3576
+ timeline += f" Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n"
3577
+ if len(voice_segments) > 5:
3578
+ timeline += f" ... and {len(voice_segments) - 5} more segments\n"
3579
 
3580
  # Add musicological context based on tempo classification
3581
  if tempo < 60:
 
3603
  time = ensure_float(time)
3604
  strength = ensure_float(strength)
3605
 
3606
+ # Check if this beat is during voice activity
3607
+ in_voice_segment = False
3608
+ for segment in voice_segments:
3609
+ if segment['start'] <= time <= segment['end']:
3610
+ in_voice_segment = True
3611
+ break
3612
+
3613
  # More scientific determination of beat type based on both strength and metrical position
3614
  metrical_position = i % beats_info['time_signature']
3615
 
 
3631
  else:
3632
  beat_type = "WEAK"
3633
  syllable_value = 1.0
3634
+
3635
+ # Mark the beat type if it's in a voice segment
3636
+ if in_voice_segment:
3637
+ beat_type = f"{beat_type} (VOICE)"
3638
 
3639
  # Determine pattern letter based on beat type for consistency
3640
  if beat_type == "STRONG":
 
4091
  genre_results = results.get("genre_results", "Genre classification failed")
4092
  lyrics = results.get("lyrics", "Lyrics generation failed")
4093
  ast_results = results.get("ast_results", [])
4094
+ voice_segments = results.get("voice_segments", [])
4095
  else:
4096
  # Old tuple format
4097
  genre_results, lyrics, ast_results = results
4098
+ # Get voice segments
4099
+ try:
4100
+ voice_segments = detect_voice_activity(audio_file)
4101
+ except Exception as e:
4102
+ print(f"Error detecting voice segments: {str(e)}")
4103
+ voice_segments = []
4104
 
4105
  # Get clean lyrics (without analysis notes)
4106
  clean_lyrics = lyrics
 
4132
  emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
4133
  emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
4134
 
4135
+ # Add voice activity segments if available
4136
+ if voice_segments:
4137
+ emotion_text += f"\n\nVoice Activity Segments ({len(voice_segments)}):\n"
4138
+ for i, segment in enumerate(voice_segments[:10]): # Show up to 10 segments
4139
+ emotion_text += f"- Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n"
4140
+ if len(voice_segments) > 10:
4141
+ emotion_text += f"... and {len(voice_segments) - 10} more segments\n"
4142
+
4143
  except Exception as e:
4144
  print(f"Error in emotion analysis: {str(e)}")
4145
 
appp.py CHANGED
@@ -32,7 +32,7 @@ if "HF_TOKEN" in os.environ:
32
  # Constants
33
  GENRE_MODEL_NAME = "dima806/music_genres_classification"
34
  MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
35
- LLM_MODEL_NAME = "Qwen/Qwen3-14B"
36
  SAMPLE_RATE = 22050 # Standard sample rate for audio processing
37
 
38
  # Check CUDA availability (for informational purposes)
@@ -2063,7 +2063,7 @@ def get_stress_aligned_alternatives(word, position_to_stress):
2063
  # For other cases, just provide general guidance
2064
  return f"a word with stress on syllable {position_to_stress + 1}"
2065
 
2066
- def generate_lyrics(genre, duration, emotion_results, song_structure=None):
2067
  """
2068
  Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
2069
 
@@ -2075,6 +2075,7 @@ def generate_lyrics(genre, duration, emotion_results, song_structure=None):
2075
  duration: Duration of the audio in seconds
2076
  emotion_results: Dictionary containing emotional analysis results
2077
  song_structure: Optional dictionary containing song structure analysis
 
2078
 
2079
  Returns:
2080
  Generated lyrics aligned with the rhythm patterns of the music
@@ -2493,6 +2494,30 @@ even if there are no rhythm issues. Include the following in your analysis:
2493
  2. Where stressed syllables align with strong beats
2494
  3. Any potential misalignments or improvements
2495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2496
  Your lyrics:
2497
  """
2498
  elif use_sections:
@@ -2526,7 +2551,18 @@ The lyrics should:
2526
  - Follow the structure patterns provided above
2527
  - Be completely original
2528
  - Match the song duration of {duration:.1f} seconds
 
 
 
 
 
 
 
 
 
 
2529
 
 
2530
  IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2531
 
2532
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
@@ -2569,7 +2605,18 @@ The lyrics should:
2569
  - Be completely original
2570
  - Maintain a consistent theme throughout
2571
  - Match the audio segment duration of {duration:.1f} seconds
 
 
 
 
 
 
 
 
 
 
2572
 
 
2573
  Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
2574
  Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2575
 
@@ -2936,7 +2983,7 @@ Improved lyrics with fixed rhythm:
2936
  "prompt_template": "No prompt template available"
2937
  }
2938
 
2939
- def process_audio(audio_file):
2940
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
2941
  if audio_file is None:
2942
  return "Please upload an audio file.", None, None
@@ -3221,7 +3268,9 @@ def process_audio(audio_file):
3221
 
3222
  try:
3223
  print("Calling generate_lyrics function...")
3224
- lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, sanitized_song_structure)
 
 
3225
  print(f"Type of lyrics_result: {type(lyrics_result)}")
3226
 
3227
  # Handle both old and new return formats with robust type checking
@@ -3774,7 +3823,7 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
3774
  print(f"Error generating complete beat timeline: {str(e)}")
3775
  return f"Error generating complete beat timeline: {str(e)}"
3776
 
3777
- def display_results(audio_file):
3778
  """Process audio file and return formatted results for display in the UI."""
3779
  # Default error response
3780
  error_response = ("Please upload an audio file.",
@@ -3787,8 +3836,8 @@ def display_results(audio_file):
3787
  return error_response
3788
 
3789
  try:
3790
- # Process audio and get results
3791
- results = process_audio(audio_file)
3792
 
3793
  # Check if we got an error message
3794
  if isinstance(results, str) and "Error" in results:
@@ -3862,6 +3911,14 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
3862
  with gr.Row():
3863
  with gr.Column(scale=1):
3864
  audio_input = gr.Audio(label="Upload Music", type="filepath")
 
 
 
 
 
 
 
 
3865
  submit_btn = gr.Button("Analyze & Generate", variant="primary")
3866
 
3867
  # Add genre info box
@@ -3897,10 +3954,10 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
3897
  with gr.TabItem("Beat & Syllable Timeline"):
3898
  beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
3899
 
3900
- # Connect the button to the display function with updated outputs
3901
  submit_btn.click(
3902
  fn=display_results,
3903
- inputs=[audio_input],
3904
  outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
3905
  )
3906
 
@@ -3929,11 +3986,12 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
3929
  - Genre-specific rhythmic qualities
3930
  - Half-beat and quarter-beat subdivisions
3931
 
3932
- 7. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that:
3933
  - Match the emotional quality of the music
3934
  - Follow the precise syllable templates for each second
3935
  - Align stressed syllables with strong beats
3936
  - Maintain genre-appropriate style and themes
 
3937
 
3938
  8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
3939
  - Syllable count accuracy
 
32
  # Constants
33
  GENRE_MODEL_NAME = "dima806/music_genres_classification"
34
  MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
35
+ LLM_MODEL_NAME = "Qwen/Qwen3-32B"
36
  SAMPLE_RATE = 22050 # Standard sample rate for audio processing
37
 
38
  # Check CUDA availability (for informational purposes)
 
2063
  # For other cases, just provide general guidance
2064
  return f"a word with stress on syllable {position_to_stress + 1}"
2065
 
2066
+ def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyrics_requirements=None):
2067
  """
2068
  Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment.
2069
 
 
2075
  duration: Duration of the audio in seconds
2076
  emotion_results: Dictionary containing emotional analysis results
2077
  song_structure: Optional dictionary containing song structure analysis
2078
+ lyrics_requirements: Optional user-provided requirements for the lyrics
2079
 
2080
  Returns:
2081
  Generated lyrics aligned with the rhythm patterns of the music
 
2494
  2. Where stressed syllables align with strong beats
2495
  3. Any potential misalignments or improvements
2496
 
2497
+ Your lyrics:
2498
+ """
2499
+
2500
+ # Add user requirements if provided
2501
+ if lyrics_requirements and lyrics_requirements.strip():
2502
+ content += f"""
2503
+ USER REQUIREMENTS:
2504
+ {lyrics_requirements.strip()}
2505
+
2506
+ The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
2507
+ """
2508
+
2509
+ content += """
2510
+ Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2511
+
2512
+ IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2513
+
2514
+ IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
2515
+ where you analyze how well the lyrics align with the musical rhythm. This section MUST appear
2516
+ even if there are no rhythm issues. Include the following in your analysis:
2517
+ 1. Syllable counts for each line and how they match the rhythm pattern
2518
+ 2. Where stressed syllables align with strong beats
2519
+ 3. Any potential misalignments or improvements
2520
+
2521
  Your lyrics:
2522
  """
2523
  elif use_sections:
 
2551
  - Follow the structure patterns provided above
2552
  - Be completely original
2553
  - Match the song duration of {duration:.1f} seconds
2554
+ """
2555
+
2556
+ # Add user requirements if provided
2557
+ if lyrics_requirements and lyrics_requirements.strip():
2558
+ content += f"""
2559
+ USER REQUIREMENTS:
2560
+ {lyrics_requirements.strip()}
2561
+
2562
+ The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
2563
+ """
2564
 
2565
+ content += """
2566
  IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics.
2567
 
2568
  IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]"
 
2605
  - Be completely original
2606
  - Maintain a consistent theme throughout
2607
  - Match the audio segment duration of {duration:.1f} seconds
2608
+ """
2609
+
2610
+ # Add user requirements if provided
2611
+ if lyrics_requirements and lyrics_requirements.strip():
2612
+ content += f"""
2613
+ USER REQUIREMENTS:
2614
+ {lyrics_requirements.strip()}
2615
+
2616
+ The lyrics MUST incorporate these user requirements while still following the rhythm patterns.
2617
+ """
2618
 
2619
+ content += """
2620
  Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above.
2621
  Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY.
2622
 
 
2983
  "prompt_template": "No prompt template available"
2984
  }
2985
 
2986
+ def process_audio(audio_file, lyrics_requirements=None):
2987
  """Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis."""
2988
  if audio_file is None:
2989
  return "Please upload an audio file.", None, None
 
3268
 
3269
  try:
3270
  print("Calling generate_lyrics function...")
3271
+ # Pass lyrics_requirements to generate_lyrics function
3272
+ lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results,
3273
+ sanitized_song_structure, lyrics_requirements)
3274
  print(f"Type of lyrics_result: {type(lyrics_result)}")
3275
 
3276
  # Handle both old and new return formats with robust type checking
 
3823
  print(f"Error generating complete beat timeline: {str(e)}")
3824
  return f"Error generating complete beat timeline: {str(e)}"
3825
 
3826
+ def display_results(audio_file, lyrics_requirements=None):
3827
  """Process audio file and return formatted results for display in the UI."""
3828
  # Default error response
3829
  error_response = ("Please upload an audio file.",
 
3836
  return error_response
3837
 
3838
  try:
3839
+ # Process audio and get results - pass user requirements
3840
+ results = process_audio(audio_file, lyrics_requirements)
3841
 
3842
  # Check if we got an error message
3843
  if isinstance(results, str) and "Error" in results:
 
3911
  with gr.Row():
3912
  with gr.Column(scale=1):
3913
  audio_input = gr.Audio(label="Upload Music", type="filepath")
3914
+
3915
+ # Add the new lyrics requirements input
3916
+ lyrics_requirements_input = gr.Textbox(
3917
+ label="Lyrics Requirements (optional)",
3918
+ placeholder="Enter specific themes, topics, words, or styles you want in the lyrics",
3919
+ lines=3
3920
+ )
3921
+
3922
  submit_btn = gr.Button("Analyze & Generate", variant="primary")
3923
 
3924
  # Add genre info box
 
3954
  with gr.TabItem("Beat & Syllable Timeline"):
3955
  beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40)
3956
 
3957
+ # Connect the button to the display function with updated inputs
3958
  submit_btn.click(
3959
  fn=display_results,
3960
+ inputs=[audio_input, lyrics_requirements_input],
3961
  outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output]
3962
  )
3963
 
 
3986
  - Genre-specific rhythmic qualities
3987
  - Half-beat and quarter-beat subdivisions
3988
 
3989
+ 7. **Lyrics Generation**: Using the detected genre, emotion, rhythm patterns, and your custom requirements, a large language model generates lyrics that:
3990
  - Match the emotional quality of the music
3991
  - Follow the precise syllable templates for each second
3992
  - Align stressed syllables with strong beats
3993
  - Maintain genre-appropriate style and themes
3994
+ - Incorporate your specific requirements and preferences
3995
 
3996
  8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing:
3997
  - Syllable count accuracy
requirements.txt CHANGED
@@ -13,3 +13,4 @@ scipy>=1.12.0
13
  soundfile>=0.12.1
14
  matplotlib>=3.7.0
15
  pronouncing>=0.2.0
 
 
13
  soundfile>=0.12.1
14
  matplotlib>=3.7.0
15
  pronouncing>=0.2.0
16
+ pyannote.audio>=2.1.1