root committed on
Commit
c95399f
·
1 Parent(s): 5d5eb0f
Files changed (1) hide show
  1. app.py +61 -14
app.py CHANGED
@@ -24,7 +24,12 @@ from utils import (
24
  )
25
  from emotionanalysis import MusicAnalyzer
26
  import librosa
27
- from pyannote.audio import Pipeline
 
 
 
 
 
28
  import tempfile
29
  import os
30
  import soundfile as sf
@@ -3196,8 +3201,13 @@ def detect_voice_activity(audio_file):
3196
  print("To use voice activity detection:")
3197
  print("1. Create an account at https://huggingface.co")
3198
  print("2. Generate a token at https://huggingface.co/settings/tokens")
3199
- print("3. Accept the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
3200
- print("4. Set HF_TOKEN environment variable or provide it directly in the code")
 
 
 
 
 
3201
 
3202
  # Create fallback segments based on audio duration
3203
  # This creates segments approximately every 5 seconds
@@ -3224,8 +3234,36 @@ def detect_voice_activity(audio_file):
3224
  print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
3225
  return estimated_segments
3226
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3227
  # Initialize the voice activity detection pipeline
3228
  try:
 
3229
  vad_pipeline = Pipeline.from_pretrained(
3230
  "pyannote/voice-activity-detection",
3231
  use_auth_token=hf_token
@@ -4339,18 +4377,27 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
4339
  # Add voice detection info box
4340
  with gr.Accordion("Voice Activity Detection", open=True):
4341
  gr.Markdown("""
4342
- ### Voice Detection Authentication Required
4343
-
4344
- This app uses pyannote/voice-activity-detection to identify vocal segments in music.
4345
-
4346
- **Important:** This model requires Hugging Face authentication:
4347
-
4348
- 1. Create an account at [huggingface.co](https://huggingface.co)
4349
- 2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
4350
- 3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
4351
- 4. Set the HF_TOKEN environment variable
4352
 
4353
- Without authentication, the app will use estimated segments based on audio duration.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4354
  """)
4355
 
4356
  with gr.Column(scale=2):
 
24
  )
25
  from emotionanalysis import MusicAnalyzer
26
  import librosa
27
+ try:
28
+ from pyannote.audio import Pipeline
29
+ PYANNOTE_AVAILABLE = True
30
+ except ImportError:
31
+ print("WARNING: pyannote.audio is not properly installed. Voice detection will use fallback mode.")
32
+ PYANNOTE_AVAILABLE = False
33
  import tempfile
34
  import os
35
  import soundfile as sf
 
3201
  print("To use voice activity detection:")
3202
  print("1. Create an account at https://huggingface.co")
3203
  print("2. Generate a token at https://huggingface.co/settings/tokens")
3204
+ print("3. Accept the terms for pyannote models at:")
3205
+ print(" - https://huggingface.co/pyannote/segmentation")
3206
+ print(" - https://huggingface.co/pyannote/voice-activity-detection")
3207
+ print("4. Set 'pyannote' environment variable with your token:")
3208
+ print(" - Linux/Mac: export pyannote=your_token_here")
3209
+ print(" - Windows: set pyannote=your_token_here")
3210
+ print(" - Hugging Face Spaces: Add a 'pyannote' Secret in Settings")
3211
 
3212
  # Create fallback segments based on audio duration
3213
  # This creates segments approximately every 5 seconds
 
3234
  print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
3235
  return estimated_segments
3236
 
3237
+ # Check if pyannote is available
3238
+ if not PYANNOTE_AVAILABLE:
3239
+ print("pyannote.audio is not available. Using fallback voice detection.")
3240
+ # Create fallback segments based on audio duration
3241
+ y, sr = load_audio(audio_file, SAMPLE_RATE)
3242
+ duration = extract_audio_duration(y, sr)
3243
+
3244
+ # Create segments of 4-5 seconds each, with small gaps between them
3245
+ estimated_segments = []
3246
+ segment_duration = 4.5
3247
+ gap_duration = 1.0
3248
+
3249
+ current_pos = 0.0
3250
+ while current_pos < duration:
3251
+ segment_end = min(current_pos + segment_duration, duration)
3252
+ estimated_segments.append({
3253
+ "start": current_pos,
3254
+ "end": segment_end,
3255
+ "duration": segment_end - current_pos
3256
+ })
3257
+ current_pos = segment_end + gap_duration
3258
+ if current_pos >= duration:
3259
+ break
3260
+
3261
+ print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
3262
+ return estimated_segments
3263
+
3264
  # Initialize the voice activity detection pipeline
3265
  try:
3266
+ print(f"Attempting to load pyannote/voice-activity-detection with auth token: {'[PROVIDED]' if hf_token else '[MISSING]'}")
3267
  vad_pipeline = Pipeline.from_pretrained(
3268
  "pyannote/voice-activity-detection",
3269
  use_auth_token=hf_token
 
4377
  # Add voice detection info box
4378
  with gr.Accordion("Voice Activity Detection", open=True):
4379
  gr.Markdown("""
4380
+ ### Voice Detection Authentication Required
 
 
 
 
 
 
 
 
 
4381
 
4382
+ This app uses pyannote/voice-activity-detection to identify vocal segments in music.
4383
+
4384
+ **Important:** This model requires Hugging Face authentication:
4385
+
4386
+ 1. Create an account at [huggingface.co](https://huggingface.co)
4387
+ 2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
4388
+ 3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
4389
+ 4. Set the "pyannote" environment variable with your token:
4390
+ - In Linux/Mac: `export pyannote="your_token_here"`
4391
+ - In Windows: `set pyannote=your_token_here`
4392
+ - In Hugging Face Spaces: Add a "pyannote" Secret in the Settings tab
4393
+
4394
+ Without authentication, the app will use estimated segments based on audio duration.
4395
+
4396
+ **Technical Note:** If you're having trouble with authentication, make sure:
4397
+ 1. The pyannote.audio package is properly installed
4398
+ 2. You've accepted the model terms at [huggingface.co/pyannote/voice-activity-detection](https://huggingface.co/pyannote/voice-activity-detection)
4399
+ 3. The provided token has READ access permission
4400
+ 4. You've added hf.co to your allowed domains if using a scoped token
4401
  """)
4402
 
4403
  with gr.Column(scale=2):