root
committed on
Commit
·
5d5eb0f
1
Parent(s):
651b0cd
ss
Browse files
app.py
CHANGED
@@ -1656,9 +1656,36 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
|
|
1656 |
print(f"DEBUG: templates is not a list, it's {type(templates)}")
|
1657 |
# If it's not a list, create a single-item list
|
1658 |
if templates is not None:
|
1659 |
-
templates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1660 |
else:
|
1661 |
templates = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1662 |
|
1663 |
# Split lyrics into lines
|
1664 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
@@ -1689,18 +1716,32 @@ def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=No
|
|
1689 |
|
1690 |
# Extract the template string from different possible formats
|
1691 |
template_str = None
|
1692 |
-
if isinstance(template, dict)
|
1693 |
-
|
|
|
|
|
|
|
|
|
1694 |
elif isinstance(template, str):
|
1695 |
template_str = template
|
1696 |
else:
|
1697 |
-
|
1698 |
-
|
|
|
|
|
|
|
|
|
|
|
1699 |
|
1700 |
if not isinstance(template_str, str):
|
1701 |
print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
|
1702 |
continue
|
1703 |
|
|
|
|
|
|
|
|
|
|
|
1704 |
# Handle multiple phrases in template - process ALL phrases, not just the first
|
1705 |
template_phrases = [template_str]
|
1706 |
if "|" in template_str:
|
@@ -3148,28 +3189,93 @@ def detect_voice_activity(audio_file):
|
|
3148 |
try:
|
3149 |
print("Detecting voice activity in audio...")
|
3150 |
# Get HF_TOKEN from environment or set your token here
|
3151 |
-
hf_token = os.environ.get("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3152 |
|
3153 |
# Initialize the voice activity detection pipeline
|
3154 |
-
|
3155 |
-
|
3156 |
-
|
3157 |
-
|
3158 |
-
|
3159 |
-
|
3160 |
-
|
3161 |
-
|
3162 |
-
|
3163 |
-
|
3164 |
-
|
3165 |
-
|
3166 |
-
|
3167 |
-
|
3168 |
-
|
3169 |
-
|
3170 |
-
|
3171 |
-
|
3172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3173 |
|
3174 |
except Exception as e:
|
3175 |
print(f"Error detecting voice activity: {str(e)}")
|
@@ -3551,8 +3657,14 @@ def format_complete_beat_timeline(audio_file, lyrics=None):
|
|
3551 |
|
3552 |
# Helper function to convert numpy values to floats - FIXED
|
3553 |
def ensure_float(value):
|
3554 |
-
if isinstance(value, np.ndarray)
|
|
|
|
|
|
|
|
|
3555 |
return float(value)
|
|
|
|
|
3556 |
return value
|
3557 |
|
3558 |
# Format the timeline with enhanced scientific headers
|
@@ -4117,10 +4229,38 @@ def display_results(audio_file, lyrics_requirements=None):
|
|
4117 |
emotion_text = "No emotion analysis available."
|
4118 |
try:
|
4119 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
4120 |
-
|
4121 |
-
|
4122 |
-
|
4123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4124 |
|
4125 |
# Keep basic beat analysis without section information
|
4126 |
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
@@ -4128,9 +4268,15 @@ def display_results(audio_file, lyrics_requirements=None):
|
|
4128 |
|
4129 |
# Add beat analysis info
|
4130 |
emotion_text += f"\n\nBeat Analysis:\n"
|
4131 |
-
|
4132 |
-
|
4133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
4134 |
|
4135 |
# Add voice activity segments if available
|
4136 |
if voice_segments:
|
@@ -4189,6 +4335,23 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
4189 |
|
4190 |
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
|
4191 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4192 |
|
4193 |
with gr.Column(scale=2):
|
4194 |
# Use tabs for better organization of outputs
|
@@ -4260,4 +4423,4 @@ with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
|
|
4260 |
""")
|
4261 |
|
4262 |
# Launch the app
|
4263 |
-
demo.launch()
|
|
|
1656 |
print(f"DEBUG: templates is not a list, it's {type(templates)}")
|
1657 |
# If it's not a list, create a single-item list
|
1658 |
if templates is not None:
|
1659 |
+
if isinstance(templates, str):
|
1660 |
+
# If it's a string, we need to parse it properly
|
1661 |
+
templates = [templates]
|
1662 |
+
elif isinstance(templates, dict):
|
1663 |
+
# If it's a dict, extract relevant information
|
1664 |
+
if "templates" in templates:
|
1665 |
+
templates = templates["templates"]
|
1666 |
+
if not isinstance(templates, list):
|
1667 |
+
templates = [templates]
|
1668 |
+
else:
|
1669 |
+
# Create a single element list with the dict
|
1670 |
+
templates = [templates]
|
1671 |
+
else:
|
1672 |
+
templates = [templates]
|
1673 |
else:
|
1674 |
templates = []
|
1675 |
+
|
1676 |
+
# Ensure all templates are strings or properly formatted dicts
|
1677 |
+
for i, template in enumerate(templates[:]):
|
1678 |
+
if isinstance(template, dict):
|
1679 |
+
if "syllable_template" not in template and "text" in template:
|
1680 |
+
# Try to use text field if syllable_template is missing
|
1681 |
+
template["syllable_template"] = template["text"]
|
1682 |
+
elif not isinstance(template, str):
|
1683 |
+
# Convert non-string, non-dict templates to strings if possible
|
1684 |
+
try:
|
1685 |
+
templates[i] = str(template)
|
1686 |
+
except:
|
1687 |
+
# Remove this template if it can't be converted
|
1688 |
+
templates.pop(i)
|
1689 |
|
1690 |
# Split lyrics into lines
|
1691 |
lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
|
|
|
1716 |
|
1717 |
# Extract the template string from different possible formats
|
1718 |
template_str = None
|
1719 |
+
if isinstance(template, dict):
|
1720 |
+
# Try various keys that might contain template information
|
1721 |
+
for key in ["syllable_template", "template", "text", "pattern"]:
|
1722 |
+
if key in template and template[key] is not None:
|
1723 |
+
template_str = template[key]
|
1724 |
+
break
|
1725 |
elif isinstance(template, str):
|
1726 |
template_str = template
|
1727 |
else:
|
1728 |
+
# Try to convert to string
|
1729 |
+
try:
|
1730 |
+
template_str = str(template)
|
1731 |
+
print(f"DEBUG: Converted template {i+1} from {type(template)} to string")
|
1732 |
+
except:
|
1733 |
+
print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
|
1734 |
+
continue
|
1735 |
|
1736 |
if not isinstance(template_str, str):
|
1737 |
print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
|
1738 |
continue
|
1739 |
|
1740 |
+
# Safety check for empty strings
|
1741 |
+
if not template_str.strip():
|
1742 |
+
print(f"DEBUG: Skipping empty template {i+1}")
|
1743 |
+
continue
|
1744 |
+
|
1745 |
# Handle multiple phrases in template - process ALL phrases, not just the first
|
1746 |
template_phrases = [template_str]
|
1747 |
if "|" in template_str:
|
|
|
3189 |
try:
|
3190 |
print("Detecting voice activity in audio...")
|
3191 |
# Get HF_TOKEN from environment or set your token here
|
3192 |
+
hf_token = os.environ.get("pyannote", None)
|
3193 |
+
|
3194 |
+
if not hf_token:
|
3195 |
+
print("Warning: No Hugging Face token provided. Voice activity detection requires authentication.")
|
3196 |
+
print("To use voice activity detection:")
|
3197 |
+
print("1. Create an account at https://huggingface.co")
|
3198 |
+
print("2. Generate a token at https://huggingface.co/settings/tokens")
|
3199 |
+
print("3. Accept the terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
|
3200 |
+
print("4. Set HF_TOKEN environment variable or provide it directly in the code")
|
3201 |
+
|
3202 |
+
# Create fallback segments based on audio duration
|
3203 |
+
# This creates segments approximately every 5 seconds
|
3204 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
3205 |
+
duration = extract_audio_duration(y, sr)
|
3206 |
+
|
3207 |
+
# Create segments of 4-5 seconds each, with small gaps between them
|
3208 |
+
estimated_segments = []
|
3209 |
+
segment_duration = 4.5
|
3210 |
+
gap_duration = 1.0
|
3211 |
+
|
3212 |
+
current_pos = 0.0
|
3213 |
+
while current_pos < duration:
|
3214 |
+
segment_end = min(current_pos + segment_duration, duration)
|
3215 |
+
estimated_segments.append({
|
3216 |
+
"start": current_pos,
|
3217 |
+
"end": segment_end,
|
3218 |
+
"duration": segment_end - current_pos
|
3219 |
+
})
|
3220 |
+
current_pos = segment_end + gap_duration
|
3221 |
+
if current_pos >= duration:
|
3222 |
+
break
|
3223 |
+
|
3224 |
+
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
3225 |
+
return estimated_segments
|
3226 |
|
3227 |
# Initialize the voice activity detection pipeline
|
3228 |
+
try:
|
3229 |
+
vad_pipeline = Pipeline.from_pretrained(
|
3230 |
+
"pyannote/voice-activity-detection",
|
3231 |
+
use_auth_token=hf_token
|
3232 |
+
)
|
3233 |
+
|
3234 |
+
# Process the audio file
|
3235 |
+
output = vad_pipeline(audio_file)
|
3236 |
+
|
3237 |
+
# Extract voice segments
|
3238 |
+
voice_segments = []
|
3239 |
+
for speech in output.get_timeline().support():
|
3240 |
+
voice_segments.append({
|
3241 |
+
"start": speech.start,
|
3242 |
+
"end": speech.end,
|
3243 |
+
"duration": speech.end - speech.start
|
3244 |
+
})
|
3245 |
+
|
3246 |
+
print(f"Detected {len(voice_segments)} voice segments")
|
3247 |
+
return voice_segments
|
3248 |
+
|
3249 |
+
except Exception as auth_error:
|
3250 |
+
print(f"Authentication error with pyannote models: {str(auth_error)}")
|
3251 |
+
print("Make sure you have:")
|
3252 |
+
print("1. Created a Hugging Face account")
|
3253 |
+
print("2. Generated a token at https://huggingface.co/settings/tokens")
|
3254 |
+
print("3. Accepted terms for pyannote/segmentation at https://huggingface.co/pyannote/segmentation")
|
3255 |
+
|
3256 |
+
# Create fallback segments as above
|
3257 |
+
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
3258 |
+
duration = extract_audio_duration(y, sr)
|
3259 |
+
|
3260 |
+
# Create segments of 4-5 seconds each with small gaps
|
3261 |
+
estimated_segments = []
|
3262 |
+
segment_duration = 4.5
|
3263 |
+
gap_duration = 1.0
|
3264 |
+
|
3265 |
+
current_pos = 0.0
|
3266 |
+
while current_pos < duration:
|
3267 |
+
segment_end = min(current_pos + segment_duration, duration)
|
3268 |
+
estimated_segments.append({
|
3269 |
+
"start": current_pos,
|
3270 |
+
"end": segment_end,
|
3271 |
+
"duration": segment_end - current_pos
|
3272 |
+
})
|
3273 |
+
current_pos = segment_end + gap_duration
|
3274 |
+
if current_pos >= duration:
|
3275 |
+
break
|
3276 |
+
|
3277 |
+
print(f"Created {len(estimated_segments)} estimated voice segments (fallback mode)")
|
3278 |
+
return estimated_segments
|
3279 |
|
3280 |
except Exception as e:
|
3281 |
print(f"Error detecting voice activity: {str(e)}")
|
|
|
3657 |
|
3658 |
# Helper function to convert numpy values to floats - FIXED
|
3659 |
def ensure_float(value):
    """Coerce NumPy values (and ``None``) to a plain Python ``float``.

    Args:
        value: A NumPy array, a NumPy numeric scalar, ``None``, or any
            other object.

    Returns:
        * ``float`` of the first element (in flat order) for a non-empty
          array of any dimensionality;
        * ``0.0`` for an empty array or ``None``;
        * ``float(value)`` for a NumPy numeric scalar;
        * ``value`` unchanged for every other type.
    """
    if isinstance(value, np.ndarray):
        if value.size == 0:
            return 0.0
        # ``flat[0]`` always yields a scalar; ``value[0]`` would yield a
        # sub-array for multi-dimensional input and make ``float()`` raise.
        return float(value.flat[0])
    if isinstance(value, np.number):
        return float(value)
    if value is None:
        return 0.0
    return value
|
3669 |
|
3670 |
# Format the timeline with enhanced scientific headers
|
|
|
4229 |
emotion_text = "No emotion analysis available."
|
4230 |
try:
|
4231 |
emotion_results = music_analyzer.analyze_music(audio_file)
|
4232 |
+
|
4233 |
+
# Safe formatting helper function to handle any value type
|
4234 |
+
def safe_format(value, format_spec=None):
    """Render *value* as display text without ever raising.

    NumPy arrays are reduced to their first element (in flat order) and
    NumPy scalars are unwrapped to native Python values *before*
    formatting, so ``format_spec`` is honoured for ``np.float32``,
    ``np.int64`` and array inputs as well as for plain ``int``/``float``.

    Args:
        value: Any object; ``None`` and empty arrays are treated as
            missing.
        format_spec: Optional format specification (e.g. ``'.1f'``),
            applied only to numeric values.

    Returns:
        The formatted string, or ``"N/A"`` when the value is missing or
        cannot be formatted.
    """
    if value is None:
        return "N/A"
    try:
        if isinstance(value, np.ndarray):
            if value.size == 0:
                return "N/A"
            value = value.flat[0]
        if isinstance(value, np.generic):
            # np.float32 / np.int64 are not int/float subclasses; unwrap
            # them so the numeric branch below applies format_spec.
            value = value.item()
        if format_spec and isinstance(value, (int, float)):
            return format(value, format_spec)
        return str(value)
    except Exception:
        # Best-effort display helper: fall back instead of failing the UI,
        # but let KeyboardInterrupt/SystemExit propagate.
        return "N/A"
|
4252 |
+
|
4253 |
+
# Get summary values safely
|
4254 |
+
tempo = emotion_results.get('summary', {}).get('tempo', 0)
|
4255 |
+
key = emotion_results.get('summary', {}).get('key', 'Unknown')
|
4256 |
+
mode = emotion_results.get('summary', {}).get('mode', '')
|
4257 |
+
primary_emotion = emotion_results.get('summary', {}).get('primary_emotion', 'Unknown')
|
4258 |
+
primary_theme = emotion_results.get('summary', {}).get('primary_theme', 'Unknown')
|
4259 |
+
|
4260 |
+
emotion_text = (f"Tempo: {safe_format(tempo, '.1f')} BPM\n"
|
4261 |
+
f"Key: {key} {mode}\n"
|
4262 |
+
f"Primary Emotion: {primary_emotion}\n"
|
4263 |
+
f"Primary Theme: {primary_theme}")
|
4264 |
|
4265 |
# Keep basic beat analysis without section information
|
4266 |
y, sr = load_audio(audio_file, SAMPLE_RATE)
|
|
|
4268 |
|
4269 |
# Add beat analysis info
|
4270 |
emotion_text += f"\n\nBeat Analysis:\n"
|
4271 |
+
|
4272 |
+
# Get beat info values safely
|
4273 |
+
tempo = beats_info.get('tempo', 0)
|
4274 |
+
time_sig = beats_info.get('time_signature', 4)
|
4275 |
+
beat_count = beats_info.get('beat_count', 0)
|
4276 |
+
|
4277 |
+
emotion_text += f"- Tempo: {safe_format(tempo, '.1f')} BPM\n"
|
4278 |
+
emotion_text += f"- Time Signature: {time_sig}/4\n"
|
4279 |
+
emotion_text += f"- Total Beats: {beat_count}\n"
|
4280 |
|
4281 |
# Add voice activity segments if available
|
4282 |
if voice_segments:
|
|
|
4335 |
|
4336 |
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music.
|
4337 |
""")
|
4338 |
+
|
4339 |
+
# Add voice detection info box
|
4340 |
+
with gr.Accordion("Voice Activity Detection", open=True):
|
4341 |
+
gr.Markdown("""
|
4342 |
+
### Voice Detection Authentication Required
|
4343 |
+
|
4344 |
+
This app uses pyannote/voice-activity-detection to identify vocal segments in music.
|
4345 |
+
|
4346 |
+
**Important:** This model requires Hugging Face authentication:
|
4347 |
+
|
4348 |
+
1. Create an account at [huggingface.co](https://huggingface.co)
|
4349 |
+
2. Generate a token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
|
4350 |
+
3. Accept the terms at [huggingface.co/pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
|
4351 |
+
4. Set the HF_TOKEN environment variable
|
4352 |
+
|
4353 |
+
Without authentication, the app will use estimated segments based on audio duration.
|
4354 |
+
""")
|
4355 |
|
4356 |
with gr.Column(scale=2):
|
4357 |
# Use tabs for better organization of outputs
|
|
|
4423 |
""")
|
4424 |
|
4425 |
# Launch the app
|
4426 |
+
demo.launch(share=True)
|