import os import io import gradio as gr import torch import numpy as np from transformers import ( AutoModelForAudioClassification, AutoFeatureExtractor, AutoTokenizer, pipeline, AutoModelForCausalLM, BitsAndBytesConfig ) from huggingface_hub import login from utils import ( load_audio, extract_audio_duration, extract_mfcc_features, calculate_lyrics_length, format_genre_results, ensure_cuda_availability, preprocess_audio_for_model ) # Login to Hugging Face Hub if token is provided if "HF_TOKEN" in os.environ: login(token=os.environ["HF_TOKEN"]) # Constants GENRE_MODEL_NAME = "dima806/music_genres_classification" MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593" LLM_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" SAMPLE_RATE = 22050 # Standard sample rate for audio processing # Check CUDA availability (for informational purposes) CUDA_AVAILABLE = ensure_cuda_availability() # Create music detection pipeline print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}") try: music_detector = pipeline( "audio-classification", model=MUSIC_DETECTION_MODEL, device=0 if CUDA_AVAILABLE else -1 ) print("Successfully loaded music detection pipeline") except Exception as e: print(f"Error creating music detection pipeline: {str(e)}") # Fallback to manual loading try: music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL) music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL) print("Successfully loaded music detection model and feature extractor") except Exception as e2: print(f"Error loading music detection model components: {str(e2)}") raise RuntimeError(f"Could not load music detection model: {str(e2)}") # Create genre classification pipeline print(f"Loading audio classification model: {GENRE_MODEL_NAME}") try: genre_classifier = pipeline( "audio-classification", model=GENRE_MODEL_NAME, device=0 if CUDA_AVAILABLE else -1 ) print("Successfully loaded audio classification pipeline") except Exception as e: print(f"Error creating pipeline: {str(e)}") # Fallback to manual loading try: genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME) genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME) print("Successfully loaded audio classification model and feature extractor") except Exception as e2: print(f"Error loading model components: {str(e2)}") raise RuntimeError(f"Could not load genre classification model: {str(e2)}") # Load LLM with appropriate quantization for T4 GPU bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, ) llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME) llm_model = AutoModelForCausalLM.from_pretrained( LLM_MODEL_NAME, device_map="auto", quantization_config=bnb_config, torch_dtype=torch.float16, ) # Create LLM pipeline llm_pipeline = pipeline( "text-generation", model=llm_model, tokenizer=llm_tokenizer, max_new_tokens=512, ) def extract_audio_features(audio_file): """Extract audio features from an audio file.""" # Load the audio file using utility function y, sr = load_audio(audio_file, SAMPLE_RATE) # Get audio duration in seconds duration = extract_audio_duration(y, sr) # Extract MFCCs for genre classification (may not be needed with the pipeline) mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20) return { "features": mfccs_mean, "duration": duration, "waveform": y, "sample_rate": sr, "path": audio_file # Keep path for the pipeline } def classify_genre(audio_data): """Classify the genre of the audio using the loaded model.""" try: # First attempt: Try using the pipeline if available if 'genre_classifier' in globals(): results = genre_classifier(audio_data["path"]) # Transform pipeline results to our expected format top_genres = [(result["label"], result["score"]) for result in results[:3]] return top_genres # Second attempt: Use manually loaded model components elif 'genre_processor' in globals() and 'genre_model' in globals(): # Process audio input with feature extractor inputs = genre_processor( audio_data["waveform"], sampling_rate=audio_data["sample_rate"], return_tensors="pt" ) with torch.no_grad(): outputs = genre_model(**inputs) predictions = outputs.logits.softmax(dim=-1) # Get the top 3 genres values, indices = torch.topk(predictions, 3) # Map indices to genre labels genre_labels = genre_model.config.id2label top_genres = [] for i, (value, index) in enumerate(zip(values[0], indices[0])): genre = genre_labels[index.item()] confidence = value.item() top_genres.append((genre, confidence)) return top_genres else: raise ValueError("No genre classification model available") except Exception as e: print(f"Error in genre classification: {str(e)}") # Fallback: return a default genre if everything fails return [("rock", 1.0)] def generate_lyrics(genre, duration): """Generate lyrics based on the genre and with appropriate length.""" # Calculate appropriate lyrics length based on audio duration lines_count = calculate_lyrics_length(duration) # Calculate approximate number of verses and chorus if lines_count <= 6: # Very short song - one verse and chorus verse_lines = 2 chorus_lines = 2 elif lines_count <= 10: # Medium song - two verses and chorus verse_lines = 3 chorus_lines = 2 else: # Longer song - two verses, chorus, and bridge verse_lines = 3 chorus_lines = 2 # Create prompt for the LLM prompt = f""" You are a talented songwriter who specializes in {genre} music. Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. The lyrics should: - Perfectly capture the essence and style of {genre} music - Be approximately {lines_count} lines long - Have a coherent theme and flow - Follow this structure: * Verse: {verse_lines} lines * Chorus: {chorus_lines} lines * {f'Bridge: 2 lines' if lines_count > 10 else ''} - Be completely original - Match the song duration of {duration:.1f} seconds - Keep each line concise and impactful Your lyrics: """ # Generate lyrics using the LLM response = llm_pipeline( prompt, do_sample=True, temperature=0.7, top_p=0.9, repetition_penalty=1.1, return_full_text=False ) # Extract and clean generated lyrics lyrics = response[0]["generated_text"].strip() # Add section labels if they're not present if "Verse" not in lyrics and "Chorus" not in lyrics: lines = lyrics.split('\n') formatted_lyrics = [] current_section = "Verse" for i, line in enumerate(lines): if i == 0: formatted_lyrics.append("[Verse]") elif i == verse_lines: formatted_lyrics.append("\n[Chorus]") elif i == verse_lines + chorus_lines and lines_count > 10: formatted_lyrics.append("\n[Bridge]") formatted_lyrics.append(line) lyrics = '\n'.join(formatted_lyrics) return lyrics def detect_music(audio_data): """Detect if the audio is music using the MIT AST model.""" try: # First attempt: Try using the pipeline if available if 'music_detector' in globals(): results = music_detector(audio_data["path"]) # Look for music-related classes in the results music_confidence = 0.0 for result in results: label = result["label"].lower() if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): music_confidence = max(music_confidence, result["score"]) return music_confidence >= 0.5 # Second attempt: Use manually loaded model components elif 'music_processor' in globals() and 'music_model' in globals(): # Process audio input with feature extractor inputs = music_processor( audio_data["waveform"], sampling_rate=audio_data["sample_rate"], return_tensors="pt" ) with torch.no_grad(): outputs = music_model(**inputs) predictions = outputs.logits.softmax(dim=-1) # Get the top predictions values, indices = torch.topk(predictions, 5) # Map indices to labels labels = music_model.config.id2label # Check for music-related classes music_confidence = 0.0 for i, (value, index) in enumerate(zip(values[0], indices[0])): label = labels[index.item()].lower() if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): music_confidence = max(music_confidence, value.item()) return music_confidence >= 0.5 else: raise ValueError("No music detection model available") except Exception as e: print(f"Error in music detection: {str(e)}") return False def process_audio(audio_file): """Main function to process audio file, classify genre, and generate lyrics.""" if audio_file is None: return "Please upload an audio file.", None try: # Extract audio features audio_data = extract_audio_features(audio_file) # First check if it's music is_music = detect_music(audio_data) if not is_music: return "The uploaded audio does not appear to be music. Please upload a music file.", None # Classify genre top_genres = classify_genre(audio_data) # Format genre results using utility function genre_results = format_genre_results(top_genres) # Generate lyrics based on top genre primary_genre, _ = top_genres[0] lyrics = generate_lyrics(primary_genre, audio_data["duration"]) return genre_results, lyrics except Exception as e: return f"Error processing audio: {str(e)}", None # Create Gradio interface with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: gr.Markdown("# Music Genre Classifier & Lyrics Generator") gr.Markdown("Upload a music file to classify its genre and generate matching lyrics.") with gr.Row(): with gr.Column(): audio_input = gr.Audio(label="Upload Music", type="filepath") submit_btn = gr.Button("Analyze & Generate") with gr.Column(): genre_output = gr.Textbox(label="Detected Genres", lines=5) lyrics_output = gr.Textbox(label="Generated Lyrics", lines=15) submit_btn.click( fn=process_audio, inputs=[audio_input], outputs=[genre_output, lyrics_output] ) gr.Markdown("### How it works") gr.Markdown(""" 1. Upload an audio file of your choice 2. The system will classify the genre using the dima806/music_genres_classification model 3. Based on the detected genre, it will generate appropriate lyrics using Llama-3.1-8B-Instruct 4. The lyrics length is automatically adjusted based on your audio duration """) # Launch the app demo.launch()