root committed on
Commit 5d5eb0f · 1 Parent(s): 651b0cd
Files changed (1)
  1. app.py +197 -34
app.py CHANGED
@@ -1656,9 +1656,36 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
        print(f"DEBUG: templates is not a list, it's {type(templates)}")
        # If it's not a list, create a single-item list
        if templates is not None:
-            templates = [templates]
+            if isinstance(templates, str):
+                # A bare template string becomes a single-item list
+                templates = [templates]
+            elif isinstance(templates, dict):
+                # If it's a dict, extract the relevant information
+                if "templates" in templates:
+                    templates = templates["templates"]
+                    if not isinstance(templates, list):
+                        templates = [templates]
+                else:
+                    # Wrap the dict itself in a single-element list
+                    templates = [templates]
+            else:
+                templates = [templates]
        else:
            templates = []
+
+    # Ensure all templates are strings or properly formatted dicts
+    for i, template in enumerate(templates[:]):
+        if isinstance(template, dict):
+            if "syllable_template" not in template and "text" in template:
+                # Fall back to the text field when syllable_template is missing
+                template["syllable_template"] = template["text"]
+        elif not isinstance(template, str):
+            # Convert non-string, non-dict templates to strings if possible
+            try:
+                templates[i] = str(template)
+            except Exception:
+                # Remove this template if it can't be converted
+                templates.pop(i)

    # Split lyrics into lines
    lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
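The normalization above accepts a bare string, a dict (optionally carrying a `templates` key), a list, or `None`. A minimal standalone sketch of the same branching, using a hypothetical `normalize_templates` helper that is not part of app.py:

```python
# Hypothetical helper mirroring the branching above, for illustration only.
def normalize_templates(templates):
    if isinstance(templates, list):
        return templates
    if templates is None:
        return []
    if isinstance(templates, dict) and "templates" in templates:
        inner = templates["templates"]
        return inner if isinstance(inner, list) else [inner]
    return [templates]

print(normalize_templates("4-3-4"))               # ['4-3-4']
print(normalize_templates({"templates": "4-3"}))  # ['4-3']
print(normalize_templates(None))                  # []
```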
@@ -1689,18 +1716,32 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No

        # Extract the template string from different possible formats
        template_str = None
-        if isinstance(template, dict) and "syllable_template" in template:
-            template_str = template["syllable_template"]
+        if isinstance(template, dict):
+            # Try the various keys that might contain template information
+            for key in ["syllable_template", "template", "text", "pattern"]:
+                if key in template and template[key] is not None:
+                    template_str = template[key]
+                    break
        elif isinstance(template, str):
            template_str = template
        else:
-            print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
-            continue
+            # Try to convert the template to a string
+            try:
+                template_str = str(template)
+                print(f"DEBUG: Converted template {i+1} from {type(template)} to string")
+            except Exception:
+                print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
+                continue

        if not isinstance(template_str, str):
            print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
            continue

+        # Safety check for empty strings
+        if not template_str.strip():
+            print(f"DEBUG: Skipping empty template {i+1}")
+            continue
+
        # Handle multiple phrases in template - process ALL phrases, not just the first
        template_phrases = [template_str]
        if "|" in template_str:
@@ -3148,28 +3189,93 @@ def detect_voice_activity(audio_file):
    try:
        print("Detecting voice activity in audio...")
        # Get HF_TOKEN from environment or set your token here
        hf_token = os.environ.get("HF_TOKEN", None)
+
+        if not hf_token:
+            print("Warning: No Hugging Face token provided. Voice activity detection requires authentication.")
+            print("To use voice activity detection:")
+            print("1. Create an account at https://huggingface.co")
+            print("2. Generate a token at https://huggingface.co/settings/tokens")
+            print("3. Accept the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
+            print("4. Set the HF_TOKEN environment variable or provide it directly in the code")
+
+            # Create fallback segments based on the audio duration:
+            # roughly one segment every 5 seconds
+            y, sr = load_audio(audio_file, SAMPLE_RATE)
+            duration = extract_audio_duration(y, sr)
+
+            # Create segments of 4-5 seconds each, with small gaps between them
+            estimated_segments = []
+            segment_duration = 4.5
+            gap_duration = 1.0
+
+            current_pos = 0.0
+            while current_pos < duration:
+                segment_end = min(current_pos + segment_duration, duration)
+                estimated_segments.append({
+                    "start": current_pos,
+                    "end": segment_end,
+                    "duration": segment_end - current_pos
+                })
+                current_pos = segment_end + gap_duration
+                if current_pos >= duration:
+                    break
+
+            print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
+            return estimated_segments

        # Initialize the voice activity detection pipeline
-        vad_pipeline = Pipeline.from_pretrained(
-            "pyannote/voice-activity-detection",
-            use_auth_token=hf_token
-        )
-
-        # Process the audio file
-        output = vad_pipeline(audio_file)
-
-        # Extract voice segments
-        voice_segments = []
-        for speech in output.get_timeline().support():
-            voice_segments.append({
-                "start": speech.start,
-                "end": speech.end,
-                "duration": speech.end - speech.start
-            })
-
-        print(f"Detected {len(voice_segments)} voice segments")
-        return voice_segments
+        try:
+            vad_pipeline = Pipeline.from_pretrained(
+                "pyannote/voice-activity-detection",
+                use_auth_token=hf_token
+            )
+
+            # Process the audio file
+            output = vad_pipeline(audio_file)
+
+            # Extract voice segments
+            voice_segments = []
+            for speech in output.get_timeline().support():
+                voice_segments.append({
+                    "start": speech.start,
+                    "end": speech.end,
+                    "duration": speech.end - speech.start
+                })
+
+            print(f"Detected {len(voice_segments)} voice segments")
+            return voice_segments
+
+        except Exception as auth_error:
+            print(f"Authentication error with pyannote models: {str(auth_error)}")
+            print("Make sure you have:")
+            print("1. Created a Hugging Face account")
+            print("2. Generated a token at https://huggingface.co/settings/tokens")
+            print("3. Accepted the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
+
+            # Create fallback segments as above
+            y, sr = load_audio(audio_file, SAMPLE_RATE)
+            duration = extract_audio_duration(y, sr)
+
+            # Create segments of 4-5 seconds each with small gaps
+            estimated_segments = []
+            segment_duration = 4.5
+            gap_duration = 1.0
+
+            current_pos = 0.0
+            while current_pos < duration:
+                segment_end = min(current_pos + segment_duration, duration)
+                estimated_segments.append({
+                    "start": current_pos,
+                    "end": segment_end,
+                    "duration": segment_end - current_pos
+                })
+                current_pos = segment_end + gap_duration
+                if current_pos >= duration:
+                    break
+
+            print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
+            return estimated_segments

    except Exception as e:
        print(f"Error detecting voice activity: {str(e)}")
@@ -3551,8 +3657,14 @@ def format_complete_beat_timeline(audio_file, lyrics=None):

    # Helper function to convert numpy values to floats - FIXED
    def ensure_float(value):
-        if isinstance(value, np.ndarray) or isinstance(value, np.number):
+        if isinstance(value, np.ndarray):
+            if value.size == 1:
+                return float(value.item())
+            return float(value[0]) if value.size > 0 else 0.0
+        elif isinstance(value, np.number):
            return float(value)
+        elif value is None:
+            return 0.0
        return value

    # Format the timeline with enhanced scientific headers
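To see what the revised helper returns for the numpy values that previously broke `float()` conversion, a quick check (values are illustrative; the function body is copied from the hunk above):

```python
import numpy as np

def ensure_float(value):  # copy of the helper above, for demonstration only
    if isinstance(value, np.ndarray):
        if value.size == 1:
            return float(value.item())
        return float(value[0]) if value.size > 0 else 0.0
    elif isinstance(value, np.number):
        return float(value)
    elif value is None:
        return 0.0
    return value

print(ensure_float(np.array([120.5])))  # 120.5 (single-element array)
print(ensure_float(np.float64(95.0)))   # 95.0  (numpy scalar)
print(ensure_float(None))               # 0.0   (None guard)
```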
@@ -4117,10 +4229,38 @@ def display_results(audio_file, lyrics_requirements=None):
    emotion_text = "No emotion analysis available."
    try:
        emotion_results = music_analyzer.analyze_music(audio_file)
-        emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
-                        f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
-                        f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
-                        f"Primary Theme: {emotion_results['summary']['primary_theme']}")
+
+        # Safe formatting helper to handle any value type
+        def safe_format(value, format_spec=None):
+            if value is None:
+                return "N/A"
+            try:
+                if isinstance(value, (int, float)):
+                    if format_spec:
+                        return format(value, format_spec)
+                    return str(value)
+                if isinstance(value, np.ndarray):
+                    if value.size == 1:
+                        val = value.item()
+                        if format_spec:
+                            return format(val, format_spec)
+                        return str(val)
+                    return str(value[0]) if value.size > 0 else "N/A"
+                return str(value)
+            except Exception:
+                return "N/A"
+
+        # Get summary values safely
+        tempo = emotion_results.get('summary', {}).get('tempo', 0)
+        key = emotion_results.get('summary', {}).get('key', 'Unknown')
+        mode = emotion_results.get('summary', {}).get('mode', '')
+        primary_emotion = emotion_results.get('summary', {}).get('primary_emotion', 'Unknown')
+        primary_theme = emotion_results.get('summary', {}).get('primary_theme', 'Unknown')
+
+        emotion_text = (f"Tempo: {safe_format(tempo, '.1f')} BPM\n"
+                        f"Key: {key} {mode}\n"
+                        f"Primary Emotion: {primary_emotion}\n"
+                        f"Primary Theme: {primary_theme}")

        # Keep basic beat analysis without section information
        y, sr = load_audio(audio_file, SAMPLE_RATE)
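Assuming `safe_format` were hoisted out of `display_results` to module level, its behavior on the value types it guards against would look like this (values are illustrative):

```python
import numpy as np

# Assumes safe_format from the hunk above is in scope at module level.
print(safe_format(np.array([123.456]), '.1f'))  # '123.5' (unwraps the array)
print(safe_format(None))                        # 'N/A'
print(safe_format("C# minor"))                  # 'C# minor'
```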
@@ -4128,9 +4268,15 @@ def display_results(audio_file, lyrics_requirements=None):

        # Add beat analysis info
        emotion_text += f"\n\nBeat Analysis:\n"
-        emotion_text += f"- Tempo: {beats_info.get('tempo', 0):.1f} BPM\n"
-        emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
-        emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
+
+        # Get beat info values safely
+        tempo = beats_info.get('tempo', 0)
+        time_sig = beats_info.get('time_signature', 4)
+        beat_count = beats_info.get('beat_count', 0)
+
+        emotion_text += f"- Tempo: {safe_format(tempo, '.1f')} BPM\n"
+        emotion_text += f"- Time Signature: {time_sig}/4\n"
+        emotion_text += f"- Total Beats: {beat_count}\n"

        # Add voice activity segments if available
        if voice_segments:
@@ -4189,6 +4335,23 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:

            For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
            """)
+
+            # Add voice detection info box
+            with gr.Accordion("Voice Activity Detection", open=True):
+                gr.Markdown("""
+                ### Voice Detection Authentication Required
+
+                This app uses pyannote/voice-activity-detection to identify vocal segments in music.
+
+                **Important:** This model requires Hugging Face authentication:
+
+                1. Create an account at [huggingface.co](https://huggingface.co)
+                2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+                3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
+                4. Set the HF_TOKEN environment variable
+
+                Without authentication, the app will use estimated segments based on audio duration.
+                """)

        with gr.Column(scale=2):
            # Use tabs for better organization of outputs
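Since the accordion tells users to set `HF_TOKEN`, a startup check along these lines could surface the problem before the first analysis runs. This is a sketch, not part of the commit:

```python
import os

# Hedged sketch: warn early if the token described in the accordion is missing.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; voice activity detection will fall back to "
          "estimated segments (see the Voice Activity Detection panel).")
```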
@@ -4260,4 +4423,4 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
    """)

# Launch the app
-demo.launch()
+demo.launch(share=True)
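`share=True` asks Gradio to open a temporary public `*.gradio.live` tunnel alongside the local server; when the app runs on Hugging Face Spaces it is already publicly hosted, so the flag mainly matters for local runs.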