# Source: fyp_start_space / app.py — root — 0a51dfa
import requests
import gradio as gr
import os
import torch
import json
import time
import tempfile
import shutil
import librosa
from transformers import AutoTokenizer, AutoModelForCausalLM
# Pick the torch device: CUDA when it is available, CPU otherwise.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

# Hugging Face Inference API endpoints: audio classification and text generation.
AUDIO_API_URL = "https://api-inference.huggingface.co/models/MIT/ast-finetuned-audioset-10-10-0.4593"
LYRICS_API_URL = "https://api-inference.huggingface.co/models/gpt2-medium"

# Authorization header shared by every API request; the token is read once,
# at import time, from the HF_TOKEN environment variable.
_hf_token = os.environ.get('HF_TOKEN')
headers = {"Authorization": f"Bearer {_hf_token}"}
def get_audio_duration(audio_path):
    """Return the duration, in seconds, of the audio file at ``audio_path``.

    The lookup is best-effort: any exception raised while measuring the file
    is printed and ``None`` is returned instead of propagating the error.
    """
    try:
        return librosa.get_duration(path=audio_path)
    except Exception as err:
        print(f"Error getting audio duration: {err}")
    return None
def calculate_song_structure(duration):
    """Choose verse/chorus counts and a token budget from the clip length.

    Args:
        duration: Clip length in seconds, or ``None`` when it is unknown.

    Returns:
        A new dict with the keys ``"verses"``, ``"choruses"`` and ``"tokens"``.
    """
    if duration is None:
        # Unknown length: use the default structure.
        return dict(verses=2, choruses=1, tokens=200)

    # (exclusive upper bound in seconds, (verses, choruses, tokens)),
    # checked from the shortest tier upwards.
    tiers = (
        (30, (1, 1, 150)),    # short clips
        (120, (2, 2, 200)),   # medium clips
    )
    layout = (3, 3, 300)      # anything not below the last bound
    for upper_bound, candidate in tiers:
        if duration < upper_bound:
            layout = candidate
            break

    verses, choruses, tokens = layout
    return {"verses": verses, "choruses": choruses, "tokens": tokens}
def create_lyrics_prompt(classification_results, song_structure):
    """Build the text prompt that seeds lyric generation.

    Args:
        classification_results: Sequence of dicts carrying a ``'label'`` key.
            The first entry is used as the main style and the next two as
            secondary elements. Fewer than three entries (including none)
            are accepted.
        song_structure: Mapping with ``'verses'`` and ``'choruses'`` counts.

    Returns:
        The prompt string, ending with a ``[Verse 1]`` marker on its own line.
    """
    labels = [result['label'] for result in classification_results[:3]]

    request = (
        f"Create {song_structure['verses']} verses and "
        f"{song_structure['choruses']} choruses"
    )
    if labels:
        request += f" in {labels[0]} style"
        secondary_elements = labels[1:]
        # Only mention secondary elements when there are some; otherwise the
        # prompt would contain the malformed phrase "with  elements".
        if secondary_elements:
            request += f" with {', '.join(secondary_elements)} elements"

    return f"{request}.\n[Verse 1]"
def _pad_section(section_lines, size=4):
    """Return exactly ``size`` lines: truncate extras, pad with "..." if short."""
    padded = section_lines[:size]
    padded.extend(["..."] * (size - len(padded)))
    return padded


def format_lyrics(generated_text, song_structure):
    """Format generated text into fixed-size verse and chorus sections.

    The text is scanned line by line. A line containing ``[verse`` or
    ``[chorus`` (case-insensitive) opens a new section as long as the quota
    for that kind in ``song_structure`` has not been reached. Every emitted
    section has exactly four lines: extra lines are dropped and short
    sections are padded with ``"..."``. Sections still missing at the end
    are appended as placeholders, alternating verse and chorus.

    Lines are ignored when they appear before the first accepted section
    marker (for example an echoed prompt) or after a marker that exceeded
    its quota, so stray text never ends up under the wrong header.

    Args:
        generated_text: Raw text returned by the generation model.
        song_structure: Mapping with ``'verses'`` and ``'choruses'`` counts.

    Returns:
        The formatted lyrics as one string; each section header is preceded
        by a blank line.
    """
    max_verses = song_structure['verses']
    max_choruses = song_structure['choruses']

    lines = []
    verse_count = 0
    chorus_count = 0
    # None means "not inside an accepted section"; a list collects its lines.
    current_section = None

    for raw_line in generated_text.split('\n'):
        line = raw_line.strip()
        # Skip empty lines and metadata
        if not line or line.startswith(('```', '###')):
            continue

        lowered = line.lower()
        is_verse = '[verse' in lowered
        is_chorus = '[chorus' in lowered

        if is_verse or is_chorus:
            # Close the section that was open, padding it even when empty so
            # a header is never left without content.
            if current_section is not None:
                lines.extend(_pad_section(current_section))
            current_section = None

            if is_verse and verse_count < max_verses:
                verse_count += 1
                lines.append(f"\n[Verse {verse_count}]")
                current_section = []
            elif is_chorus and chorus_count < max_choruses:
                chorus_count += 1
                lines.append(f"\n[Chorus {chorus_count}]")
                current_section = []
            # Otherwise the quota is used up: stay outside any section so the
            # lines that follow are discarded.
        elif (
            current_section is not None
            and len(line.split()) <= 12
            and line[0] not in '.,!?'
        ):
            # Keep only short lines that do not start with punctuation.
            current_section.append(line)

    # Handle last section
    if current_section is not None:
        lines.extend(_pad_section(current_section))

    # Add any missing sections
    while verse_count < max_verses or chorus_count < max_choruses:
        if verse_count < max_verses:
            verse_count += 1
            lines.append(f"\n[Verse {verse_count}]")
            lines.extend(_pad_section([]))
        if chorus_count < max_choruses:
            chorus_count += 1
            lines.append(f"\n[Chorus {chorus_count}]")
            lines.extend(_pad_section([]))

    return "\n".join(lines)
def create_default_lyrics(song_structure):
    """Create placeholder lyrics to return when generation fails.

    All verses are listed first, then all choruses. Each section is a header
    followed by four ``"..."`` lines, the same placeholder shape that
    ``format_lyrics`` uses for sections it has to fill in.

    Args:
        song_structure: Mapping with ``'verses'`` and ``'choruses'`` counts.

    Returns:
        The placeholder lyrics as one string (empty when both counts are 0).
    """
    placeholder_lines = ["..."] * 4
    lyrics = []
    # Add verses
    for i in range(song_structure['verses']):
        lyrics.append(f"\n[Verse {i+1}]")
        lyrics.extend(placeholder_lines)
    # Add choruses
    for i in range(song_structure['choruses']):
        lyrics.append(f"\n[Chorus {i+1}]")
        lyrics.extend(placeholder_lines)
    return "\n".join(lyrics)
def generate_lyrics_with_retry(prompt, song_structure, max_retries=5, initial_wait=2, timeout=30):
    """Generate lyrics through the text-generation API, retrying on failure.

    Each attempt posts ``prompt`` to ``LYRICS_API_URL``. A 200 response is
    formatted with ``format_lyrics`` and returned once it contains the
    requested number of verse and chorus headers. Empty output, non-200
    responses (including 503) and exceptions all lead to another attempt
    after a pause. The pause grows by 1.5x per attempt and is capped at
    10 seconds; there is no pause after the final attempt.

    Args:
        prompt: Text prompt sent as the model input.
        song_structure: Mapping with ``'verses'``, ``'choruses'`` and
            ``'tokens'`` (used as ``max_new_tokens``).
        max_retries: Maximum number of attempts.
        initial_wait: Seconds to wait after the first failed attempt.
        timeout: Per-request timeout in seconds, so an unresponsive server
            cannot block the call forever.

    Returns:
        The formatted lyrics, or the output of ``create_default_lyrics`` when
        every attempt fails.
    """
    wait_time = initial_wait
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": song_structure['tokens'],
            "temperature": 0.8,
            "top_p": 0.9,
            "do_sample": True,
            "return_full_text": True,
            "num_return_sequences": 1,
            "repetition_penalty": 1.1
        }
    }

    for attempt in range(max_retries):
        try:
            print(f"\nAttempt {attempt + 1}: Generating lyrics...")
            response = requests.post(
                LYRICS_API_URL,
                headers=headers,
                json=payload,
                timeout=timeout,
            )
            if response.status_code == 200:
                result = response.json()
                # Handle different response formats
                if isinstance(result, list):
                    # An empty list counts as an empty response.
                    generated_text = result[0].get('generated_text', '') if result else ''
                elif isinstance(result, dict):
                    generated_text = result.get('generated_text', '')
                else:
                    generated_text = str(result)

                if generated_text:
                    # Process the generated text into verses and chorus
                    formatted_lyrics = format_lyrics(generated_text, song_structure)
                    # Verify we have enough content
                    if formatted_lyrics.count('[Verse') >= song_structure['verses'] and \
                            formatted_lyrics.count('[Chorus') >= song_structure['choruses']:
                        return formatted_lyrics
                    print("Not enough sections generated, retrying...")
                else:
                    print("Empty response received, retrying...")
            elif response.status_code == 503:
                print(f"Model loading, waiting {wait_time} seconds...")
            else:
                print(f"Error response: {response.text}")
        except Exception as e:
            # Best-effort boundary: network and decoding errors are reported
            # and treated like any other failed attempt.
            print(f"Exception during generation: {str(e)}")

        # One shared back-off for every failure path; skipped after the last
        # attempt because nothing follows it.
        if attempt < max_retries - 1:
            time.sleep(wait_time)
            wait_time = min(wait_time * 1.5, 10)  # Cap maximum wait time at 10 seconds

    # If we failed to generate after all retries, return a default structure
    return create_default_lyrics(song_structure)
def format_results(classification_results, lyrics, prompt):
    """Render the classification list and the lyrics as one display string.

    Args:
        classification_results: Iterable of dicts with ``'label'`` and
            ``'score'`` keys, listed in display order.
        lyrics: Lyrics text placed under the "Generated Lyrics" divider.
        prompt: Accepted for the caller's convenience; not used in the output.

    Returns:
        The text to display: numbered classification lines, a divider, then
        the lyrics.
    """
    ranked_lines = [
        f"{position}. {entry['label']}: {entry['score']}\n"
        for position, entry in enumerate(classification_results, start=1)
    ]
    classification_text = "Classification Results:\n" + "".join(ranked_lines)

    # Same layout as a multi-line template: a leading newline, the results,
    # the divider surrounded by blank lines, the lyrics, a trailing newline.
    return f"\n{classification_text}\n\n---Generated Lyrics---\n\n{lyrics}\n"
def classify_with_retry(data, max_retries=5, initial_wait=2, timeout=30):
    """Classify audio through the audio API, retrying on failure.

    Each attempt posts ``data`` to ``AUDIO_API_URL``. A 200 response is
    decoded as JSON and returned. A 503 response, any other error status, or
    an exception leads to another attempt after a pause. The pause grows by
    1.5x per attempt and is capped at 10 seconds; there is no pause after the
    final attempt.

    Args:
        data: Raw bytes of the audio file, sent as the request body.
        max_retries: Maximum number of attempts.
        initial_wait: Seconds to wait after the first failed attempt.
        timeout: Per-request timeout in seconds, so an unresponsive server
            cannot block the call forever.

    Returns:
        The decoded JSON payload of the first successful response, or
        ``None`` when every attempt fails.
    """
    wait_time = initial_wait
    for attempt in range(max_retries):
        try:
            print(f"\nAttempt {attempt + 1}: Classifying audio...")
            response = requests.post(
                AUDIO_API_URL,
                headers=headers,
                data=data,
                timeout=timeout,
            )
            if response.status_code == 200:
                return response.json()
            if response.status_code == 503:
                print(f"Model loading, waiting {wait_time} seconds...")
            else:
                print(f"Error response: {response.text}")
        except Exception as e:
            # Best-effort boundary: network and decoding errors are reported
            # and treated like any other failed attempt.
            print(f"Exception during classification: {str(e)}")

        # One shared back-off for every failure path; skipped after the last
        # attempt because nothing follows it.
        if attempt < max_retries - 1:
            time.sleep(wait_time)
            wait_time = min(wait_time * 1.5, 10)
    return None
def classify_and_generate(audio_file):
    """Classify the audio and generate matching lyrics.

    Args:
        audio_file: Path of the uploaded audio file, a tuple whose first item
            is that path, or ``None`` when nothing was uploaded.

    Returns:
        The text to display: the formatted classification results and lyrics
        on success, otherwise a message describing what went wrong. Errors
        are reported in the returned text rather than raised.
    """
    if audio_file is None:
        return "Please upload an audio file."
    try:
        token = os.environ.get('HF_TOKEN')
        if not token:
            return "Error: HF_TOKEN environment variable is not set. Please set your Hugging Face API token."

        # Get audio duration and calculate structure
        audio_path = audio_file[0] if isinstance(audio_file, tuple) else audio_file
        duration = get_audio_duration(audio_path)
        song_structure = calculate_song_structure(duration)
        # The duration is None when it could not be measured; formatting None
        # with ".2f" raises TypeError, so render it separately.
        duration_text = "unknown" if duration is None else f"{duration:.2f}s"
        print(f"Audio duration: {duration_text}, Structure: {song_structure}")

        # Only the bytes are sent to the API, so read the upload directly
        # instead of copying it to a temporary file that would need cleanup.
        with open(audio_path, "rb") as f:
            data = f.read()

        print("Sending request to Audio Classification API...")
        classification_results = classify_with_retry(data)
        if classification_results is None:
            return "Error: Failed to classify audio after multiple retries. Please try again."
        # An empty or non-list payload cannot be turned into a prompt.
        if not isinstance(classification_results, list) or not classification_results:
            return "Error: The audio classification API returned no usable results. Please try again."

        # Format classification results
        formatted_results = [
            {
                'label': result['label'],
                'score': f"{result['score']*100:.2f}%"
            }
            for result in classification_results
        ]

        # Generate lyrics based on classification with retry logic
        print("Generating lyrics based on classification...")
        prompt = create_lyrics_prompt(formatted_results, song_structure)
        lyrics = generate_lyrics_with_retry(prompt, song_structure)

        # Format and return results
        return format_results(formatted_results, lyrics, prompt)
    except Exception as e:
        # Top-level UI boundary: show the failure in the results box instead
        # of letting the exception escape.
        import traceback
        error_details = traceback.format_exc()
        return f"Error processing request: {str(e)}\nDetails:\n{error_details}"
# Build the Gradio UI: one audio upload as input, one text box as output.
_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
_results_output = gr.Textbox(
    label="Results",
    lines=15,
    placeholder="Upload an audio file to see classification results and generated lyrics...",
)

iface = gr.Interface(
    fn=classify_and_generate,
    inputs=_audio_input,
    outputs=_results_output,
    title="Music Genre Classifier + Lyric Generator",
    description="Upload an audio file to classify its genre and generate matching lyrics using AI.",
    examples=[],
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    _launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    iface.launch(**_launch_options)