import os
import io
import gradio as gr
import torch
import numpy as np
import re
import pronouncing
import functools
from transformers import (
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    AutoTokenizer,
    pipeline,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from huggingface_hub import login
from utils import (
    load_audio,
    extract_audio_duration,
    extract_mfcc_features,
    format_genre_results,
    ensure_cuda_availability
)
from emotionanalysis import MusicAnalyzer
import librosa
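
# utils and emotionanalysis are local modules that ship with this app: judging from
# how they are used below, utils wraps the librosa-based loading/feature helpers,
# and emotionanalysis provides the MusicAnalyzer for tempo/key/emotion/theme.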

# Log in to the Hugging Face Hub when a token is provided (needed for gated models)
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Model identifiers and the analysis sample rate
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/Qwen3-32B"
SAMPLE_RATE = 22050
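
# librosa's default 22.05 kHz rate is used for the musical analysis, while the
# genre classifier expects 16 kHz input, so process_audio() resamples to
# 16000 Hz before classification.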

CUDA_AVAILABLE = ensure_cuda_availability()

# Load the genre classifier; on failure the app still runs and reports "Unknown" genre
print("Loading genre classification model...")
try:
    genre_feature_extractor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
    genre_model = AutoModelForAudioClassification.from_pretrained(
        GENRE_MODEL_NAME,
        device_map="auto" if CUDA_AVAILABLE else None
    )

    def get_genre_model():
        return genre_model, genre_feature_extractor
except Exception as e:
    print(f"Error loading genre model: {str(e)}")
    genre_model = None
    genre_feature_extractor = None

# Load the Qwen LLM with 4-bit NF4 quantization to reduce GPU memory use
print("Loading Qwen LLM model with 4-bit quantization...")
try:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )
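
    # Rough memory math for this setup (an estimate, not a measured figure): 32B
    # parameters at ~0.5 bytes each come to roughly 16 GB of weights, before the
    # KV cache and activations, so a 24 GB card is tight and 40 GB+ is comfortable.
    # NF4 with double quantization trades a small accuracy hit for that footprint.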

    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    llm_model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
        use_cache=True
    )
except Exception as e:
    print(f"Error loading LLM model: {str(e)}")
    llm_tokenizer = None
    llm_model = None

# Analyzer for tempo, key, emotion, and theme
music_analyzer = MusicAnalyzer()


def process_audio(audio_file):
    """Analyze an uploaded audio file and generate matching lyrics.

    Returns (analysis summary, lyrics, tempo, time signature, emotion, theme,
    primary genre); on error the summary holds the message and the rest are None.
    """
    if audio_file is None:
        return "No audio file provided", None, None, None, None, None, None

    try:
        # Load the audio and run the full musical analysis
        y, sr = load_audio(audio_file, sr=SAMPLE_RATE)
        duration = extract_audio_duration(y, sr)
        music_analysis = music_analyzer.analyze_music(audio_file)

        tempo = music_analysis["rhythm_analysis"]["tempo"]
        time_signature = music_analysis["rhythm_analysis"]["estimated_time_signature"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]

        if genre_model is not None and genre_feature_extractor is not None:
            # The genre model expects 16 kHz input, so resample before classifying
            y_16k = librosa.resample(y, orig_sr=sr, target_sr=16000)
            inputs = genre_feature_extractor(
                y_16k,
                sampling_rate=16000,
                return_tensors="pt"
            ).to(genre_model.device)

            with torch.no_grad():
                outputs = genre_model(**inputs)
                logits = outputs.logits
                probs = torch.nn.functional.softmax(logits, dim=-1)

            # Keep the five most likely genres as (label, probability) pairs
            values, indices = torch.topk(probs[0], k=5)
            top_genres = [
                (genre_model.config.id2label[idx.item()], val.item())
                for val, idx in zip(values, indices)
            ]
        else:
            # Genre model failed to load; fall back to a placeholder
            top_genres = [("Unknown", 1.0)]

        genre_results_text = format_genre_results(top_genres)
        primary_genre = top_genres[0][0]

        lyrics = generate_lyrics(music_analysis, primary_genre, duration)

        analysis_summary = f"""
### Music Analysis Results

**Duration:** {duration:.2f} seconds
**Tempo:** {tempo:.1f} BPM
**Time Signature:** {time_signature}
**Key:** {music_analysis["tonal_analysis"]["key"]} {music_analysis["tonal_analysis"]["mode"]}
**Primary Emotion:** {emotion}
**Primary Theme:** {theme}
**Top Genre:** {primary_genre}

{genre_results_text}
"""

        return analysis_summary, lyrics, tempo, time_signature, emotion, theme, primary_genre

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, None, None, None, None, None, None
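
# A minimal smoke test, assuming "example_song.mp3" (a hypothetical path) is a
# readable audio file:
#
#     summary, lyrics, *_ = process_audio("example_song.mp3")
#     print(summary)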


def generate_lyrics(music_analysis, genre, duration):
    """Ask the LLM for lyrics matching the analysis; returns cleaned lyrics or an error string."""
    try:
        tempo = music_analysis["rhythm_analysis"]["tempo"]
        key = music_analysis["tonal_analysis"]["key"]
        mode = music_analysis["tonal_analysis"]["mode"]
        emotion = music_analysis["emotion_analysis"]["primary_emotion"]
        theme = music_analysis["theme_analysis"]["primary_theme"]

        if llm_model is None or llm_tokenizer is None:
            return "Error: LLM model not properly loaded"

        prompt = f"""Write lyrics for a {genre} song with these specifications:
- Key: {key} {mode}
- Tempo: {tempo} BPM
- Emotion: {emotion}
- Theme: {theme}
- Duration: {duration:.1f} seconds
- Time signature: {music_analysis["rhythm_analysis"]["estimated_time_signature"]}

CRITICAL INSTRUCTIONS:
- The lyrics should be in English
- Write ONLY the raw lyrics with no structural labels
- DO NOT include any thinking, reasoning, or explanations
- DO NOT include <think> tags or thinking processes
- DO NOT include [verse], [chorus], [bridge], or any other section markers
- DO NOT number the verses or lines
- DO NOT use bullet points
- Format as simple line-by-line lyrics only
- Make sure the lyrics match the specified duration and tempo
- Keep lyrics concise enough to fit the duration when sung at the given tempo
"""

        messages = [
            {"role": "user", "content": prompt}
        ]

        # Render the chat template into the plain prompt string the model expects
        text = llm_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device)

        # Moderate temperature plus nucleus sampling and a repetition penalty
        # keeps the output varied without looping on phrases
        generated_ids = llm_model.generate(
            **model_inputs,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=llm_tokenizer.eos_token_id
        )

        # Decode only the newly generated tokens, skipping the prompt
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
        lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
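
        # Qwen3 models can emit a <think>...</think> reasoning block before the
        # answer; the passes below strip it, along with section labels and meta
        # commentary, so only raw lyric lines remain.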
        lyrics = re.sub(r'<think>.*?</think>', '', lyrics, flags=re.DOTALL)
        lyrics = re.sub(r'\[thinking\].*?\[/thinking\]', '', lyrics, flags=re.DOTALL)

        # Drop any remaining line that opens with a bracketed tag
        lyrics = re.sub(r'^\[.*?\].*$', '', lyrics, flags=re.MULTILINE)

        # Common "reasoning" lead-ins the model sometimes prepends
        thinking_prefixes = [
            r'^(Here are|Here is|These are|This is|Let me|I will|I\'ll).*?:\s*',
            r'^Okay, let\'s.*$',
            r'^First, I need to.*$',
            r'^Let me brainstorm.*$',
            r'^I\'ll write.*$',
            r'^Let\'s create.*$',
            r'^For a.*song.*$',
            r'^Now I\'ll.*$',
            r'^Taking into account.*$',
            r'^Given the specifications.*$',
            r'^Based on the.*$',
            r'^Considering the.*$'
        ]

        for pattern in thinking_prefixes:
            lyrics = re.sub(pattern, '', lyrics, flags=re.MULTILINE | re.IGNORECASE)

        # Remove a leading "Title:" line if present
        lyrics = re.sub(r'^Title:.*?$', '', lyrics, flags=re.MULTILINE).strip()

        # Strip section headers, bare ("Chorus:") or bracketed ("[Verse 2]")
        lyrics = re.sub(r'^\s*(Verse|Chorus|Bridge|Pre.?Chorus|Intro|Outro|Refrain|Hook|Breakdown)(\s*\d*|\s*[A-Z])?:?\s*$', '', lyrics, flags=re.MULTILINE | re.IGNORECASE)
        lyrics = re.sub(r'\[(Verse|Chorus|Bridge|Pre.?Chorus|Intro|Outro|Refrain|Hook|Breakdown)(\s*\d*|\s*[A-Z])?\]', '', lyrics, flags=re.IGNORECASE)

        # Drop lines that discuss the writing process; note this is aggressive and
        # can also remove legitimate lyric lines containing words like "think"
        lyrics = re.sub(r'^.*?(think|brainstorm|consider|syllable|count|rhyme|scheme|tempo|calculate|bpm).*$', '', lyrics, flags=re.MULTILINE | re.IGNORECASE)

        # Collapse leftover blank lines
        lyrics = re.sub(r'^\s*\n', '', lyrics)
        lyrics = re.sub(r'\n\s*\n\s*\n+', '\n\n', lyrics)
        lyrics = lyrics.strip()

        # If the output still opens with narration, drop the first paragraph
        if re.match(r'.*?(I need to|Let me|Okay|Hmm|I will|I\'ll|First|Let\'s|Now).*', lyrics[:100], re.IGNORECASE):
            parts = lyrics.split('\n\n')
            if len(parts) > 1:
                lyrics = '\n\n'.join(parts[1:])

        return lyrics

    except Exception as e:
        error_msg = f"Error generating lyrics: {str(e)}"
        print(error_msg)
        return error_msg


def create_interface():
    """Build the Gradio UI: audio input on the left, analysis and lyrics tabs on the right."""
    with gr.Blocks(title="Music Analysis & Lyrics Generator") as demo:
        gr.Markdown("# Music Analysis & Lyrics Generator")
        gr.Markdown("Upload a music file or record audio to analyze it and generate matching lyrics")

        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                analyze_btn = gr.Button("Analyze and Generate Lyrics", variant="primary")

            with gr.Column(scale=2):
                with gr.Tab("Analysis"):
                    analysis_output = gr.Textbox(label="Music Analysis Results", lines=10)

                    with gr.Row():
                        tempo_output = gr.Number(label="Tempo (BPM)")
                        time_sig_output = gr.Textbox(label="Time Signature")
                        emotion_output = gr.Textbox(label="Primary Emotion")
                        theme_output = gr.Textbox(label="Primary Theme")
                        genre_output = gr.Textbox(label="Primary Genre")

                with gr.Tab("Generated Lyrics"):
                    lyrics_output = gr.Textbox(label="Generated Lyrics", lines=20)

        # Wire the button to the full analysis + generation pipeline
        analyze_btn.click(
            fn=process_audio,
            inputs=[audio_input],
            outputs=[analysis_output, lyrics_output, tempo_output, time_sig_output,
                     emotion_output, theme_output, genre_output]
        )

        gr.Markdown("""
## How it works
1. Upload or record a music file
2. The system analyzes tempo, beats, time signature, and other musical features
3. It detects the emotion, theme, and genre of the music
4. Using this information, it generates lyrics that match the style and length of your music
""")

    return demo


demo = create_interface()

if __name__ == "__main__":
    # Run directly: start the Gradio server
    demo.launch()
else:
    # Imported by a host (e.g., a Hugging Face Space); expose the app object
    app = demo
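
# Dependency sketch (an assumption; pin versions for your own environment):
#   pip install gradio torch transformers accelerate bitsandbytes librosa pronouncing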