root committed on
Commit 5d5eb0f · 1 Parent(s): 651b0cd
Files changed (1)
  1. app.py +197 -34
app.py CHANGED
@@ -1656,9 +1656,36 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
        print(f"DEBUG: templates is not a list, it's {type(templates)}")
        # If it's not a list, create a single-item list
        if templates is not None:
-            templates = [templates]
+            if isinstance(templates, str):
+                # A bare template string becomes a single-item list
+                templates = [templates]
+            elif isinstance(templates, dict):
+                # If it's a dict, extract the relevant information
+                if "templates" in templates:
+                    templates = templates["templates"]
+                    if not isinstance(templates, list):
+                        templates = [templates]
+                else:
+                    # Wrap the dict itself in a single-element list
+                    templates = [templates]
+            else:
+                templates = [templates]
        else:
            templates = []
+
+    # Ensure all templates are strings or properly formatted dicts
+    for i, template in enumerate(templates[:]):
+        if isinstance(template, dict):
+            if "syllable_template" not in template and "text" in template:
+                # Fall back to the text field when syllable_template is missing
+                template["syllable_template"] = template["text"]
+        elif not isinstance(template, str):
+            # Convert non-string, non-dict templates to strings if possible
+            try:
+                templates[i] = str(template)
+            except Exception:
+                # Remove this template if it can't be converted
+                templates.pop(i)

    # Split lyrics into lines
    lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
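The normalization above accepts a bare string, a dict (optionally carrying a `templates` key), a list, or `None`. A minimal standalone sketch of the same branching, using a hypothetical `normalize_templates` helper that is not part of app.py:

```python
# Hypothetical helper mirroring the branching above, for illustration only.
def normalize_templates(templates):
    if isinstance(templates, list):
        return templates
    if templates is None:
        return []
    if isinstance(templates, dict) and "templates" in templates:
        inner = templates["templates"]
        return inner if isinstance(inner, list) else [inner]
    return [templates]

print(normalize_templates("4-3-4"))               # ['4-3-4']
print(normalize_templates({"templates": "4-3"}))  # ['4-3']
print(normalize_templates(None))                  # []
```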
@@ -1689,18 +1716,32 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No

        # Extract the template string from different possible formats
        template_str = None
-        if isinstance(template, dict) and "syllable_template" in template:
-            template_str = template["syllable_template"]
+        if isinstance(template, dict):
+            # Try the various keys that might contain template information
+            for key in ["syllable_template", "template", "text", "pattern"]:
+                if key in template and template[key] is not None:
+                    template_str = template[key]
+                    break
        elif isinstance(template, str):
            template_str = template
        else:
-            print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
-            continue
+            # Try to convert the template to a string
+            try:
+                template_str = str(template)
+                print(f"DEBUG: Converted template {i+1} from {type(template)} to string")
+            except Exception:
+                print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
+                continue

        if not isinstance(template_str, str):
            print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
            continue

+        # Safety check for empty strings
+        if not template_str.strip():
+            print(f"DEBUG: Skipping empty template {i+1}")
+            continue
+
        # Handle multiple phrases in template - process ALL phrases, not just the first
        template_phrases = [template_str]
        if "|" in template_str:
@@ -3148,28 +3189,93 @@ def detect_voice_activity(audio_file):
    try:
        print("Detecting voice activity in audio...")
        # Get HF_TOKEN from environment or set your token here
        hf_token = os.environ.get("HF_TOKEN", None)
+
+        if not hf_token:
+            print("Warning: No Hugging Face token provided. Voice activity detection requires authentication.")
+            print("To use voice activity detection:")
+            print("1. Create an account at https://huggingface.co")
+            print("2. Generate a token at https://huggingface.co/settings/tokens")
+            print("3. Accept the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
+            print("4. Set the HF_TOKEN environment variable or provide it directly in the code")
+
+            # Create fallback segments based on the audio duration:
+            # roughly one segment every 5 seconds
+            y, sr = load_audio(audio_file, SAMPLE_RATE)
+            duration = extract_audio_duration(y, sr)
+
+            # Create segments of 4-5 seconds each, with small gaps between them
+            estimated_segments = []
+            segment_duration = 4.5
+            gap_duration = 1.0
+
+            current_pos = 0.0
+            while current_pos < duration:
+                segment_end = min(current_pos + segment_duration, duration)
+                estimated_segments.append({
+                    "start": current_pos,
+                    "end": segment_end,
+                    "duration": segment_end - current_pos
+                })
+                current_pos = segment_end + gap_duration
+                if current_pos >= duration:
+                    break
+
+            print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
+            return estimated_segments

        # Initialize the voice activity detection pipeline
-        vad_pipeline = Pipeline.from_pretrained(
-            "pyannote/voice-activity-detection",
-            use_auth_token=hf_token
-        )
-
-        # Process the audio file
-        output = vad_pipeline(audio_file)
-
-        # Extract voice segments
-        voice_segments = []
-        for speech in output.get_timeline().support():
-            voice_segments.append({
-                "start": speech.start,
-                "end": speech.end,
-                "duration": speech.end - speech.start
-            })
-
-        print(f"Detected {len(voice_segments)} voice segments")
-        return voice_segments
+        try:
+            vad_pipeline = Pipeline.from_pretrained(
+                "pyannote/voice-activity-detection",
+                use_auth_token=hf_token
+            )
+
+            # Process the audio file
+            output = vad_pipeline(audio_file)
+
+            # Extract voice segments
+            voice_segments = []
+            for speech in output.get_timeline().support():
+                voice_segments.append({
+                    "start": speech.start,
+                    "end": speech.end,
+                    "duration": speech.end - speech.start
+                })
+
+            print(f"Detected {len(voice_segments)} voice segments")
+            return voice_segments
+
+        except Exception as auth_error:
+            print(f"Authentication error with pyannote models: {str(auth_error)}")
+            print("Make sure you have:")
+            print("1. Created a Hugging Face account")
+            print("2. Generated a token at https://huggingface.co/settings/tokens")
+            print("3. Accepted the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
+
+            # Create fallback segments as above
+            y, sr = load_audio(audio_file, SAMPLE_RATE)
+            duration = extract_audio_duration(y, sr)
+
+            # Create segments of 4-5 seconds each with small gaps
+            estimated_segments = []
+            segment_duration = 4.5
+            gap_duration = 1.0
+
+            current_pos = 0.0
+            while current_pos < duration:
+                segment_end = min(current_pos + segment_duration, duration)
+                estimated_segments.append({
+                    "start": current_pos,
+                    "end": segment_end,
+                    "duration": segment_end - current_pos
+                })
+                current_pos = segment_end + gap_duration
+                if current_pos >= duration:
+                    break
+
+            print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
+            return estimated_segments

    except Exception as e:
        print(f"Error detecting voice activity: {str(e)}")
@@ -3551,8 +3657,14 @@ def format_complete_beat_timeline(audio_file, lyrics=None):

    # Helper function to convert numpy values to floats - FIXED
    def ensure_float(value):
-        if isinstance(value, np.ndarray) or isinstance(value, np.number):
+        if isinstance(value, np.ndarray):
+            if value.size == 1:
+                return float(value.item())
+            return float(value[0]) if value.size > 0 else 0.0
+        elif isinstance(value, np.number):
            return float(value)
+        elif value is None:
+            return 0.0
        return value

    # Format the timeline with enhanced scientific headers
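To see what the revised helper returns for the numpy values that previously broke `float()` conversion, a quick check (values are illustrative; the function body is copied from the hunk above):

```python
import numpy as np

def ensure_float(value):  # copy of the helper above, for demonstration only
    if isinstance(value, np.ndarray):
        if value.size == 1:
            return float(value.item())
        return float(value[0]) if value.size > 0 else 0.0
    elif isinstance(value, np.number):
        return float(value)
    elif value is None:
        return 0.0
    return value

print(ensure_float(np.array([120.5])))  # 120.5 (single-element array)
print(ensure_float(np.float64(95.0)))   # 95.0  (numpy scalar)
print(ensure_float(None))               # 0.0   (None guard)
```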
@@ -4117,10 +4229,38 @@ def display_results(audio_file, lyrics_requirements=None):
    emotion_text = "No emotion analysis available."
    try:
        emotion_results = music_analyzer.analyze_music(audio_file)
-        emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n"
-                        f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n"
-                        f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n"
-                        f"Primary Theme: {emotion_results['summary']['primary_theme']}")
+
+        # Safe formatting helper to handle any value type
+        def safe_format(value, format_spec=None):
+            if value is None:
+                return "N/A"
+            try:
+                if isinstance(value, (int, float)):
+                    if format_spec:
+                        return format(value, format_spec)
+                    return str(value)
+                if isinstance(value, np.ndarray):
+                    if value.size == 1:
+                        val = value.item()
+                        if format_spec:
+                            return format(val, format_spec)
+                        return str(val)
+                    return str(value[0]) if value.size > 0 else "N/A"
+                return str(value)
+            except Exception:
+                return "N/A"
+
+        # Get summary values safely
+        tempo = emotion_results.get('summary', {}).get('tempo', 0)
+        key = emotion_results.get('summary', {}).get('key', 'Unknown')
+        mode = emotion_results.get('summary', {}).get('mode', '')
+        primary_emotion = emotion_results.get('summary', {}).get('primary_emotion', 'Unknown')
+        primary_theme = emotion_results.get('summary', {}).get('primary_theme', 'Unknown')
+
+        emotion_text = (f"Tempo: {safe_format(tempo, '.1f')} BPM\n"
+                        f"Key: {key} {mode}\n"
+                        f"Primary Emotion: {primary_emotion}\n"
+                        f"Primary Theme: {primary_theme}")

        # Keep basic beat analysis without section information
        y, sr = load_audio(audio_file, SAMPLE_RATE)
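Assuming `safe_format` were hoisted out of `display_results` to module level, its behavior on the value types it guards against would look like this (values are illustrative):

```python
import numpy as np

# Assumes safe_format from the hunk above is in scope at module level.
print(safe_format(np.array([123.456]), '.1f'))  # '123.5' (unwraps the array)
print(safe_format(None))                        # 'N/A'
print(safe_format("C# minor"))                  # 'C# minor'
```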
@@ -4128,9 +4268,15 @@ def display_results(audio_file, lyrics_requirements=None):

        # Add beat analysis info
        emotion_text += f"\n\nBeat Analysis:\n"
-        emotion_text += f"- Tempo: {beats_info.get('tempo', 0):.1f} BPM\n"
-        emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n"
-        emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n"
+
+        # Get beat info values safely
+        tempo = beats_info.get('tempo', 0)
+        time_sig = beats_info.get('time_signature', 4)
+        beat_count = beats_info.get('beat_count', 0)
+
+        emotion_text += f"- Tempo: {safe_format(tempo, '.1f')} BPM\n"
+        emotion_text += f"- Time Signature: {time_sig}/4\n"
+        emotion_text += f"- Total Beats: {beat_count}\n"

        # Add voice activity segments if available
        if voice_segments:
@@ -4189,6 +4335,23 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:

            For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
            """)
+
+            # Add voice detection info box
+            with gr.Accordion("Voice Activity Detection", open=True):
+                gr.Markdown("""
+                ### Voice Detection Authentication Required
+
+                This app uses pyannote/voice-activity-detection to identify vocal segments in music.
+
+                **Important:** This model requires Hugging Face authentication:
+
+                1. Create an account at [huggingface.co](https://huggingface.co)
+                2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+                3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
+                4. Set the HF_TOKEN environment variable
+
+                Without authentication, the app will use estimated segments based on audio duration.
+                """)

        with gr.Column(scale=2):
            # Use tabs for better organization of outputs
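Since the accordion tells users to set `HF_TOKEN`, a startup check along these lines could surface the problem before the first analysis runs. This is a sketch, not part of the commit:

```python
import os

# Hedged sketch: warn early if the token described in the accordion is missing.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; voice activity detection will fall back to "
          "estimated segments (see the Voice Activity Detection panel).")
```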
@@ -4260,4 +4423,4 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
    """)

# Launch the app
-demo.launch()
+demo.launch(share=True)
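`share=True` asks Gradio to open a temporary public `*.gradio.live` tunnel alongside the local server; when the app runs on Hugging Face Spaces it is already publicly hosted, so the flag mainly matters for local runs.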