|
import os |
|
import io |
|
import gradio as gr |
|
import torch |
|
import numpy as np |
|
import re |
|
import pronouncing |
|
from transformers import ( |
|
AutoModelForAudioClassification, |
|
AutoFeatureExtractor, |
|
AutoTokenizer, |
|
pipeline, |
|
AutoModelForCausalLM, |
|
BitsAndBytesConfig |
|
) |
|
from huggingface_hub import login |
|
from utils import ( |
|
load_audio, |
|
extract_audio_duration, |
|
extract_mfcc_features, |
|
calculate_lyrics_length, |
|
format_genre_results, |
|
ensure_cuda_availability, |
|
preprocess_audio_for_model |
|
) |
|
from emotionanalysis import MusicAnalyzer |
|
import librosa |
|
|
|
|
|
# Log in to the Hugging Face Hub when an access token is exported as HF_TOKEN.
if os.environ.get("HF_TOKEN") is not None:
    login(token=os.environ["HF_TOKEN"])

# Checkpoint identifiers for the three models loaded below.
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# Sample rate handed to load_audio() when reading an uploaded file.
SAMPLE_RATE = 22050

# Outcome of the project's CUDA probe; the pipelines below use it to pick
# device 0 (truthy) or -1 (falsy).
CUDA_AVAILABLE = ensure_cuda_availability()
|
|
|
|
|
# Load the music-detection model. The high-level pipeline is tried first; if
# that fails, the feature extractor and model are loaded separately so that
# detect_music() can fall back to them.
print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}")
try:
    music_detector = pipeline(
        "audio-classification",
        model=MUSIC_DETECTION_MODEL,
        device=0 if CUDA_AVAILABLE else -1,
    )
    print("Successfully loaded music detection pipeline")
except Exception as e:
    print(f"Error creating music detection pipeline: {str(e)}")

    try:
        music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL)
        music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL)
        print("Successfully loaded music detection model and feature extractor")
    except Exception as e2:
        print(f"Error loading music detection model components: {str(e2)}")
        # Chain explicitly so the traceback names the loader failure as the
        # direct cause of the startup error.
        raise RuntimeError(f"Could not load music detection model: {str(e2)}") from e2
|
|
|
|
|
# Load the genre classifier. The high-level pipeline is tried first; if that
# fails, the feature extractor and model are loaded separately so that
# classify_genre() can fall back to them.
print(f"Loading audio classification model: {GENRE_MODEL_NAME}")
try:
    genre_classifier = pipeline(
        "audio-classification",
        model=GENRE_MODEL_NAME,
        device=0 if CUDA_AVAILABLE else -1,
    )
    print("Successfully loaded audio classification pipeline")
except Exception as e:
    print(f"Error creating pipeline: {str(e)}")

    try:
        genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
        genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME)
        print("Successfully loaded audio classification model and feature extractor")
    except Exception as e2:
        print(f"Error loading model components: {str(e2)}")
        # Chain explicitly so the traceback names the loader failure as the
        # direct cause of the startup error.
        raise RuntimeError(f"Could not load genre classification model: {str(e2)}") from e2
|
|
|
|
|
# Tokenizer for the lyric-writing LLM (independent of the quantization setup).
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)

# 4-bit NF4 quantization settings with float16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Quantized causal LM; device placement is delegated to device_map="auto".
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Text-generation pipeline used by generate_lyrics(); each call may emit up to
# 512 new tokens.
llm_pipeline = pipeline(
    "text-generation",
    tokenizer=llm_tokenizer,
    model=llm_model,
    max_new_tokens=512,
)

# Project-local analyzer providing emotion/theme/tempo/key analysis.
music_analyzer = MusicAnalyzer()
|
|
|
|
|
def count_syllables(text):
    """Return the total syllable count of the alphabetic words in *text*.

    Each lower-cased word is looked up with ``pronouncing.phones_for_word``;
    when pronunciations exist, the first one is counted with
    ``pronouncing.syllable_count``. Words without a pronunciation are
    estimated from their spelling.
    """
    vowel_set = "aeiouy"

    def estimate_from_spelling(word):
        # One syllable per maximal run of vowels (the word is already
        # lower-case ASCII letters at this point).
        estimate = len(re.findall(r'[aeiouy]+', word))
        # A trailing "e" is treated as silent ...
        if word.endswith('e'):
            estimate -= 1
        # ... unless it is a consonant + "le" ending, which is added back.
        if word.endswith('le') and len(word) > 2 and word[-3] not in vowel_set:
            estimate += 1
        # Every word contributes at least one syllable.
        return 1 if estimate == 0 else estimate

    total = 0
    for word in re.findall(r'\b[a-zA-Z]+\b', text.lower()):
        phones = pronouncing.phones_for_word(word)
        if phones:
            total += pronouncing.syllable_count(phones[0])
        else:
            total += estimate_from_spelling(word)
    return total
|
|
|
def extract_audio_features(audio_file):
    """Load *audio_file* and return its basic features.

    Returns:
        dict with keys ``features`` (result of ``extract_mfcc_features`` with
        ``n_mfcc=20``), ``duration``, ``waveform``, ``sample_rate`` and
        ``path`` (the *audio_file* argument unchanged).

    Raises:
        ValueError: if loading or feature extraction fails for any reason.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        y, sr = load_audio(audio_file, SAMPLE_RATE)

        # load_audio signals failure by returning None for either value.
        if y is None or sr is None:
            raise ValueError("Failed to load audio data")

        duration = extract_audio_duration(y, sr)
        mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20)

        return {
            "features": mfccs_mean,
            "duration": duration,
            "waveform": y,
            "sample_rate": sr,
            "path": audio_file,
        }
    except Exception as e:
        print(f"Error extracting audio features: {str(e)}")
        # Chain explicitly so the original failure stays attached.
        raise ValueError(f"Failed to extract audio features: {str(e)}") from e
|
|
|
def classify_genre(audio_data):
    """Return up to three ``(genre, confidence)`` pairs for *audio_data*.

    Uses the module-level ``genre_classifier`` pipeline when it exists,
    otherwise the ``genre_processor``/``genre_model`` pair. Any failure is
    printed and answered with the fixed fallback ``[("rock", 1.0)]``.
    """
    try:
        module_names = globals()

        # Preferred path: the high-level pipeline works from the file path.
        if 'genre_classifier' in module_names:
            predictions = genre_classifier(audio_data["path"])
            return [(item["label"], item["score"]) for item in predictions[:3]]

        if 'genre_processor' not in module_names or 'genre_model' not in module_names:
            raise ValueError("No genre classification model available")

        # Fallback path: run the bare model on the decoded waveform.
        model_inputs = genre_processor(
            audio_data["waveform"],
            sampling_rate=audio_data["sample_rate"],
            return_tensors="pt",
        )
        with torch.no_grad():
            logits = genre_model(**model_inputs).logits
            probabilities = logits.softmax(dim=-1)

        top_scores, top_ids = torch.topk(probabilities, 3)
        id_to_label = genre_model.config.id2label
        return [
            (id_to_label[label_id.item()], score.item())
            for score, label_id in zip(top_scores[0], top_ids[0])
        ]

    except Exception as e:
        print(f"Error in genre classification: {str(e)}")
        return [("rock", 1.0)]
|
|
|
def detect_music(audio_data):
    """Decide whether *audio_data* is music.

    Returns ``(is_music, results)``, where ``is_music`` is True when any label
    containing "music", "song", "singing" or "instrument" scores at least 0.2.
    ``results`` is the list of label/score dicts that was inspected. Any
    failure is printed and answered with ``(False, [])``.
    """
    music_terms = ("music", "song", "singing", "instrument")

    def is_music_label(label):
        return any(term in label for term in music_terms)

    try:
        module_names = globals()

        # Preferred path: the high-level pipeline works from the file path.
        if 'music_detector' in module_names:
            results = music_detector(audio_data["path"])
            best = 0.0
            for entry in results:
                if is_music_label(entry["label"].lower()):
                    best = max(best, entry["score"])
            return best >= 0.2, results

        if 'music_processor' not in module_names or 'music_model' not in module_names:
            raise ValueError("No music detection model available")

        # Fallback path: run the bare model and inspect its top five labels.
        model_inputs = music_processor(
            audio_data["waveform"],
            sampling_rate=audio_data["sample_rate"],
            return_tensors="pt",
        )
        with torch.no_grad():
            logits = music_model(**model_inputs).logits
            probabilities = logits.softmax(dim=-1)

        top_scores, top_ids = torch.topk(probabilities, 5)
        id_to_label = music_model.config.id2label

        best = 0.0
        results = []
        for score_tensor, id_tensor in zip(top_scores[0], top_ids[0]):
            label = id_to_label[id_tensor.item()].lower()
            score = score_tensor.item()
            results.append({"label": label, "score": score})
            if is_music_label(label):
                best = max(best, score)

        return best >= 0.2, results

    except Exception as e:
        print(f"Error in music detection: {str(e)}")
        return False, []
|
|
|
|
|
def detect_beats(y, sr):
    """Detect beats in waveform *y* and derive a simple rhythmic map.

    Args:
        y: audio time series passed to librosa.
        sr: sampling rate of *y*.

    Returns:
        dict with keys ``tempo`` (plain float, BPM), ``beat_frames``,
        ``beat_times``, ``beat_count``, ``beat_strengths`` (one value per
        beat), ``intervals`` (gaps between consecutive beats),
        ``time_signature`` (3 or 4, heuristic) and ``phrases`` (lists of beat
        indices).
    """
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # beat_track may hand the tempo back as a NumPy array instead of a scalar
    # (see the librosa docs). Callers format it with ":.1f", which fails on
    # arrays, so normalise to a plain float here.
    tempo = float(np.asarray(tempo).ravel()[0])

    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # Onset strength sampled at each beat frame approximates beat emphasis.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    beat_strengths = [onset_env[frame] for frame in beat_frames if frame < len(onset_env)]

    if beat_strengths:
        # Beats beyond the onset envelope get the average strength so that
        # strengths and beat times have the same length.
        avg_strength = sum(beat_strengths) / len(beat_strengths)
        missing = len(beat_times) - len(beat_strengths)
        if missing > 0:
            beat_strengths.extend([avg_strength] * missing)
    else:
        beat_strengths = [1.0] * len(beat_times)

    # Gap between each beat and the one before it.
    intervals = [later - earlier for earlier, later in zip(beat_times, beat_times[1:])]

    # Heuristic meter guess: compare each even-indexed beat with its successor;
    # if most pairs are strongly front-weighted, call it triple meter.
    time_signature = 4
    if len(beat_strengths) > 8:
        strength_pattern = [
            beat_strengths[i] / (beat_strengths[i + 1] + 0.0001)
            for i in range(0, len(beat_strengths) - 1, 2)
        ]
        if strength_pattern:
            three_pattern = sum(1 for r in strength_pattern if r > 1.2) / len(strength_pattern)
            if three_pattern > 0.6:
                time_signature = 3

    # The average gap is loop-invariant; compute it once instead of per beat.
    avg_gap = sum(intervals) / len(intervals) if intervals else None

    # Group beats into phrases: a phrase ends before a clearly stronger beat
    # or before a clearly longer gap, once it holds at least two beats.
    phrases = []
    current_phrase = []
    last_index = len(beat_times) - 1

    for i in range(len(beat_times)):
        current_phrase.append(i)

        if i >= last_index:
            continue

        is_stronger_next = (
            i < len(beat_strengths) - 1
            and beat_strengths[i + 1] > beat_strengths[i] * 1.2
        )
        is_longer_gap = avg_gap is not None and intervals[i] > avg_gap * 1.3

        if (is_stronger_next or is_longer_gap) and len(current_phrase) >= 2:
            phrases.append(current_phrase)
            current_phrase = []

    # Whatever is left forms the final phrase.
    if current_phrase:
        phrases.append(current_phrase)

    return {
        "tempo": tempo,
        "beat_frames": beat_frames,
        "beat_times": beat_times,
        "beat_count": len(beat_times),
        "beat_strengths": beat_strengths,
        "intervals": intervals,
        "time_signature": time_signature,
        "phrases": phrases,
    }
|
|
|
def _zscore(values):
    """Standardise *values* to zero mean and unit variance (all zeros if constant)."""
    centered = values - np.mean(values)
    spread = np.std(values)
    if spread == 0:
        # A constant feature carries no boundary information; avoid 0/0 -> NaN.
        return np.zeros_like(centered, dtype=float)
    return centered / spread


def detect_sections(y, sr):
    """Split the audio into three segments and give each a heuristic label.

    Args:
        y: audio time series passed to librosa.
        sr: sampling rate of *y*.

    Returns:
        list of dicts with keys ``type`` ("intro", "verse", "chorus",
        "bridge" or "outro"), ``start``, ``end`` and ``duration`` (seconds).
        The labels are positional guesses, not the result of musical analysis.
    """
    # Per-frame spectral contrast and chroma, each averaged over its bands.
    S = np.abs(librosa.stft(y))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)

    contrast_avg = np.mean(contrast, axis=0)
    chroma_avg = np.mean(chroma, axis=0)

    # Defensive: trim to the shorter curve so the sum below cannot fail on a
    # frame-count mismatch between the two extractors.
    n_frames = min(len(contrast_avg), len(chroma_avg))
    combined = _zscore(contrast_avg[:n_frames]) + _zscore(chroma_avg[:n_frames])

    # agglomerative() returns only the LEFT boundary frame of each segment, so
    # the end of the final segment must be appended explicitly; otherwise the
    # last segment is silently dropped.
    bounds = librosa.segment.agglomerative(combined, 3)
    bounds = np.append(bounds, n_frames)

    bound_times = librosa.frames_to_time(bounds, sr=sr)

    sections = []
    last_index = len(bound_times) - 2
    for i, (start, end) in enumerate(zip(bound_times[:-1], bound_times[1:])):
        duration = end - start

        if i == 0:
            section_type = "intro"
        elif i == last_index:
            section_type = "outro"
        elif i % 2 == 1:
            section_type = "chorus"
        else:
            section_type = "verse"

        # Short interior segments are relabelled as bridges.
        if 0 < i < last_index and duration < 20:
            section_type = "bridge"

        sections.append({
            "type": section_type,
            "start": start,
            "end": end,
            "duration": duration,
        })

    return sections
|
|
|
|
|
def create_flexible_syllable_templates(beats_info):
    """Turn beat information into a stress-coded syllable template string.

    *beats_info* must provide ``beat_times`` and may provide
    ``beat_strengths`` (defaults to 1.0 per beat), ``phrases`` (lists of beat
    indices; defaults to groups of four beats, dropping a trailing single
    beat) and ``tempo`` (defaults to 120).

    Every beat becomes ``<mark><count>``: the mark is ``S``/``m``/``w``
    according to its strength relative to the strongest beat of the phrase,
    and the count is the number of syllables for that beat, which depends on
    the tempo. Beats are joined with ``-`` and phrases with ``|``.
    """
    beat_times = beats_info["beat_times"]
    beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
    tempo = beats_info.get("tempo", 120)

    phrases = beats_info.get("phrases", [])
    if not phrases:
        # No phrasing supplied: chunk the beats four at a time and ignore a
        # leftover chunk that holds fewer than two beats.
        beat_total = len(beat_times)
        phrases = [
            list(range(first, min(first + 4, beat_total)))
            for first in range(0, beat_total, 4)
            if min(first + 4, beat_total) - first >= 2
        ]

    def stress_mark(level):
        if level > 0.7:
            return "S"
        if level > 0.4:
            return "m"
        return "w"

    rendered = []
    for phrase in phrases:
        # Strengths of the beats in this phrase; indices past the end of the
        # strength list are ignored, and an empty result means "all equal".
        strengths = [beat_strengths[idx] for idx in phrase if idx < len(beat_strengths)]
        if not strengths:
            strengths = [1.0] * len(phrase)

        # Scale relative to the strongest beat of the phrase.
        if strengths and max(strengths) > 0:
            peak = max(strengths)
            levels = [value / peak for value in strengths]
        else:
            levels = [1.0] * len(strengths)

        # Faster music leaves room for fewer syllables per beat.
        if tempo > 120:
            per_mark = {"S": 1, "m": 1, "w": 1}
        elif tempo > 90:
            per_mark = {"S": 2, "m": 1, "w": 1}
        else:
            per_mark = {"S": 2, "m": 2, "w": 1}

        marks = [stress_mark(level) for level in levels]
        rendered.append("-".join(f"{mark}{per_mark[mark]}" for mark in marks))

    return "|".join(rendered)
|
|
|
|
|
def _describe_beat(beat):
    """Render one beat token, e.g. ``S2`` -> ``STRONG(2)``; other tokens pass through."""
    for prefix, label in (("S", "STRONG"), ("m", "medium"), ("w", "weak")):
        if beat.startswith(prefix):
            return f"{label}({beat[1:]})"
    return beat


def _describe_phrase(phrase):
    """Render a ``-``-separated phrase template as beats joined by arrows."""
    return " → ".join(_describe_beat(beat) for beat in phrase.split("-"))


def _is_stress_phrase(template):
    """Return True when every beat of *template* looks like ``S2``, ``m1`` or ``w1``."""
    return all(re.fullmatch(r"[Smw]\d+", beat) for beat in template.split("-"))


def format_syllable_templates_for_prompt(syllable_templates):
    """Convert syllable templates into human-readable prompt instructions.

    Accepted inputs:

    * a string of ``|``-separated phrases (``"S2-w1|m1-S1"``) or a single
      stress-coded phrase (``"S2-w1"``): one ``Line N: STRONG(2) → weak(1)``
      line per phrase;
    * a list whose items are template strings or dicts holding a
      ``"syllable_template"`` entry: stress-coded items are rendered as
      above, anything else as ``Line N: <template> syllables``; items of
      other types are skipped but still counted in the numbering;
    * any other truthy value: ``str(value)``.

    Falsy input yields ``""``.
    """
    if not syllable_templates:
        return ""

    if isinstance(syllable_templates, str):
        # A single stress-coded phrase has no "|" but must still be expanded;
        # otherwise the raw "S2-w1" codes leak into the prompt.
        if "|" in syllable_templates or _is_stress_phrase(syllable_templates):
            return "\n".join(
                f"Line {number}: {_describe_phrase(phrase)}"
                for number, phrase in enumerate(syllable_templates.split("|"), start=1)
            )
        return syllable_templates

    if isinstance(syllable_templates, list):
        formatted_lines = []
        for number, template in enumerate(syllable_templates, start=1):
            if isinstance(template, dict) and "syllable_template" in template:
                template = template["syllable_template"]
            elif not isinstance(template, str):
                continue

            if isinstance(template, str) and _is_stress_phrase(template):
                formatted_lines.append(f"Line {number}: {_describe_phrase(template)}")
            else:
                formatted_lines.append(f"Line {number}: {template} syllables")
        return "\n".join(formatted_lines)

    return str(syllable_templates)
|
|
|
|
|
def _template_beat_counts(template_str):
    """Return the per-beat syllable counts of one phrase template.

    Stress-coded beats (``S2``, ``m1``, ``w1``) contribute the number after
    the marker, and an unreadable beat counts as 1. A purely numeric template
    (``"4-4-3"``) is parsed as integers. Anything else yields ``[]``.
    """
    beats = template_str.split("-")
    has_marker = any(marker in template_str for marker in ("S", "m", "w"))
    # A lone beat such as "S2" contains no "-" but is still stress-coded.
    if ("-" in template_str and has_marker) or re.fullmatch(r"[Smw]\d*", template_str):
        counts = []
        for beat in beats:
            digits = beat[1:] if beat.startswith(("S", "m", "w")) else beat
            try:
                counts.append(int(digits))
            except ValueError:
                counts.append(1)
        return counts

    try:
        return [int(beat) for beat in beats]
    except ValueError:
        return []


def verify_flexible_syllable_counts(lyrics, templates):
    """Compare generated lyric lines with their syllable templates.

    The N-th non-empty lyric line is paired with ``templates[N]``; lines
    consisting only of a bracketed label such as ``[Verse]`` are not sung and
    are skipped. Templates may be strings or dicts with a
    ``"syllable_template"`` entry, and only the first ``|``-separated phrase
    of a template is used.

    A note is recorded when a line's syllable count (from
    ``count_syllables``) differs from the template total by more than two.
    For templates with strong beats, a generic reminder to check stress
    alignment is added whenever the line holds at least as many
    dictionary-known words as the template has strong beats. That reminder
    does not itself test whether stresses line up.

    Returns:
        *lyrics* unchanged if no notes were recorded, otherwise *lyrics* with
        the notes and fixing hints appended.
    """
    lines = [line.strip() for line in lyrics.split("\n") if line.strip()]
    # Section labels are not sung; do not let them consume a template.
    sung_lines = [line for line in lines if not re.fullmatch(r"\[[^\[\]]*\]", line)]

    verification_notes = []

    for i, (line, template) in enumerate(zip(sung_lines, templates)):
        if isinstance(template, dict) and "syllable_template" in template:
            template_str = template["syllable_template"]
        elif isinstance(template, str):
            template_str = template
        else:
            continue

        # Dict entries may carry a non-string template (e.g. a bare count).
        if not isinstance(template_str, str):
            template_str = str(template_str)

        # Multi-phrase templates are judged on their first phrase only.
        template_str = template_str.split("|")[0]

        expected_counts = _template_beat_counts(template_str)
        total_expected = sum(expected_counts)

        actual_count = count_syllables(line)

        # Allow a tolerance of two syllables before flagging the line.
        if total_expected > 0 and abs(actual_count - total_expected) > 2:
            verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}")

        words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
        strong_beats = sum(1 for beat in template_str.split("-") if beat.startswith("S"))
        if words and expected_counts and strong_beats:
            known_words = sum(1 for word in words if pronouncing.phones_for_word(word))
            if known_words >= strong_beats:
                verification_notes.append(f" → Check stress alignment on words with strong beats")

    if verification_notes:
        lyrics += "\n\n[Note: Potential rhythm mismatches in these lines:]\n"
        lyrics += "\n".join(verification_notes)
        lyrics += "\n\n[To fix mismatches:]\n"
        lyrics += "1. Make sure stressed syllables fall on STRONG beats\n"
        lyrics += "2. Adjust syllable counts to match the template\n"
        lyrics += "3. Try using words with naturally aligned stress patterns"

    return lyrics
|
|
|
|
|
def _coerce_tempo(value, default):
    """Return *value* as a plain float, unwrapping NumPy scalars/arrays; *default* on failure."""
    try:
        return float(np.asarray(value).ravel()[0])
    except (TypeError, ValueError, IndexError):
        return default


def _line_plan_for_total(total_lines):
    """Return ``(verse_lines, chorus_lines, bridge_lines)`` for a song of *total_lines* lines."""
    if total_lines <= 6:
        return 2, 2, 0
    if total_lines <= 10:
        return 3, 2, 0
    return 3, 2, 2


def _build_syllable_guidance(song_structure):
    """Return ``(guidance_text, templates_for_verification)`` derived from *song_structure*.

    Both values are empty when *song_structure* carries no usable rhythm data.
    """
    guidance = ""
    templates = []
    if not song_structure:
        return guidance, templates

    if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
        flexible = song_structure["flexible_structure"]
        if "segments" in flexible and flexible["segments"]:
            beats = flexible["beats"]
            beat_times = beats["beat_times"]
            beat_strengths = beats.get("beat_strengths", [])

            enhanced_templates = []
            # Only the first 15 segments are turned into line templates.
            for segment in flexible["segments"][:15]:
                segment_beats = [
                    j for j, beat_time in enumerate(beat_times)
                    if segment["start"] <= beat_time < segment["end"]
                ]

                segment_beats_info = {
                    "beat_times": [beat_times[j] for j in segment_beats],
                    "tempo": beats.get("tempo", 120),
                }
                if beat_strengths:
                    segment_beats_info["beat_strengths"] = [
                        beat_strengths[j] for j in segment_beats
                        if j < len(beat_strengths)
                    ]

                # The lists above are sliced to this segment, so the phrase
                # must use segment-local indices. Global indices would fall
                # outside the sliced strengths and mark every beat as strong.
                segment_beats_info["phrases"] = [list(range(len(segment_beats)))]

                enhanced_template = create_flexible_syllable_templates(segment_beats_info)
                enhanced_templates.append(enhanced_template)
                templates.append(enhanced_template)

            guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n"
            guidance += "Match each line exactly to this rhythm pattern (STRONG beats need stressed syllables):\n\n"
            guidance += format_syllable_templates_for_prompt(enhanced_templates)
            guidance += "\n\nWhere:\n"
            guidance += "- STRONG(n): Place a STRESSED syllable here, followed by (n-1) unstressed syllables\n"
            guidance += "- medium(n): Place a medium-stressed or unstressed syllable here, followed by (n-1) unstressed syllables\n"
            guidance += "- weak(n): Place unstressed syllables here\n"
            guidance += "- →: Indicates flow from one beat to the next within a line\n"

    elif "syllables" in song_structure and song_structure["syllables"]:
        guidance = "RHYTHM PATTERN INSTRUCTIONS:\n"
        guidance += "Follow these syllable patterns for each section:\n\n"

        for section in song_structure["syllables"]:
            if "syllable_template" in section:
                beats = song_structure["beats"]
                all_times = beats["beat_times"]

                section_beats_info = {
                    "beat_times": [
                        beat for beat in all_times
                        if section["start"] <= beat < section["end"]
                    ],
                    "tempo": beats.get("tempo", 120),
                }
                if "beat_strengths" in beats:
                    section_beats_info["beat_strengths"] = [
                        strength for i, strength in enumerate(beats["beat_strengths"])
                        if i < len(all_times)
                        and section["start"] <= all_times[i] < section["end"]
                    ]

                section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))]

                enhanced_template = create_flexible_syllable_templates(section_beats_info)

                guidance += f"[{section['type'].capitalize()}]:\n"
                guidance += format_syllable_templates_for_prompt(enhanced_template) + "\n\n"
                templates.append(section)
            elif "syllable_count" in section:
                guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n"

    return guidance, templates


def _build_lyrics_prompt(genre, duration, tempo, key, mode, primary_emotion,
                         primary_theme, syllable_guidance, line_plan, use_sections):
    """Assemble the LLM prompt; *line_plan* is ``(total, verse, chorus, bridge)`` line counts."""
    principles = [
        "CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT:",
        "1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern)",
        "2. Natural word stress patterns must match the beat strength (strong words on strong beats)",
        "3. Line breaks should occur at phrase endings for natural breathing",
        "4. Consonant clusters should be avoided on fast notes and strong beats",
        "5. Open vowels (a, e, o) work better for sustained notes and syllables",
    ]
    analysis = [
        f"- Tempo: {tempo:.1f} BPM",
        f"- Key: {key} {mode}",
        f"- Primary emotion: {primary_emotion}",
        f"- Primary theme: {primary_theme}",
    ]

    if use_sections:
        total_lines, verse_lines, chorus_lines, bridge_lines = line_plan
        structure = [
            f"* Verse: {verse_lines} lines",
            f"* Chorus: {chorus_lines} lines",
        ]
        # Only list a bridge when one is planned; an empty "* " bullet would
        # otherwise end up in the prompt.
        if bridge_lines > 0:
            structure.append(f"* Bridge: {bridge_lines} lines")

        prompt_lines = [
            "",
            f"You are a talented songwriter who specializes in {genre} music.",
            f"Write original {genre} song lyrics for a song that is {duration:.1f} seconds long.",
            "",
            "Music analysis has detected the following qualities in the music:",
            *analysis,
            "",
            syllable_guidance,
            "",
            *principles,
            "",
            "The lyrics should:",
            f"- Perfectly capture the essence and style of {genre} music",
            f"- Express the {primary_emotion} emotion and {primary_theme} theme",
            f"- Be approximately {total_lines} lines long",
            "- Follow this structure:",
            *structure,
            "- Be completely original",
            f"- Match the song duration of {duration:.1f} seconds",
            "",
            "Your lyrics:",
            "",
        ]
    else:
        prompt_lines = [
            "",
            f"You are a talented songwriter who specializes in {genre} music.",
            f"Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long.",
            "",
            "Music analysis has detected the following qualities:",
            *analysis,
            "",
            syllable_guidance,
            "",
            *principles,
            "",
            "For perfect alignment examples:",
            "- \"FEEL the RHY-thm in your SOUL\" – stressed syllables on strong beats",
            "- \"to-DAY we DANCE a-LONG\" – natural speech stress matches musical stress",
            "- \"WAIT-ing FOR the SUN to RISE\" – syllable emphasis aligns with beat emphasis",
            "",
            "The lyrics should:",
            f"- Perfectly capture the essence and style of {genre} music",
            f"- Express the {primary_emotion} emotion and {primary_theme} theme",
            "- Be completely original",
            "- Maintain a consistent theme throughout",
            f"- Match the audio segment duration of {duration:.1f} seconds",
            "",
            "DON'T include any section labels like [Verse] or [Chorus] unless specifically instructed.",
            "Instead, write lyrics that flow naturally and match the music's rhythm precisely.",
            "",
            "Your lyrics:",
            "",
        ]

    return "\n".join(prompt_lines)


def generate_lyrics(genre, duration, emotion_results, song_structure=None):
    """Generate lyrics for *genre* that follow the analysed emotion and rhythm.

    Args:
        genre: genre label used in the prompt.
        duration: track length in seconds.
        emotion_results: analysis dict; must contain ``emotion_analysis``,
            ``theme_analysis`` and ``tonal_analysis`` entries, and may contain
            ``rhythm_analysis["tempo"]``.
        song_structure: optional structure dict. ``flexible_structure`` or
            ``syllables`` provide per-line rhythm templates; ``beats`` provides
            tempo and time signature for the line plan.

    Returns:
        The generated lyrics. Verification notes are appended when templates
        were available, and ``[Verse]``/``[Chorus]``/``[Bridge]`` labels are
        inserted when the sectioned prompt was used and the model produced no
        "Verse"/"Chorus" text itself.
    """
    primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"]
    primary_theme = emotion_results["theme_analysis"]["primary_theme"]

    try:
        tempo = _coerce_tempo(emotion_results["rhythm_analysis"]["tempo"], 0.0)
    except (KeyError, ValueError, TypeError):
        tempo = 0.0

    key = emotion_results["tonal_analysis"]["key"]
    mode = emotion_results["tonal_analysis"]["mode"]

    syllable_guidance, templates_for_verification = _build_syllable_guidance(song_structure)

    # Generic advice when the structure analysis produced no templates.
    if not syllable_guidance:
        syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n"
        syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n"
        syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n"
        syllable_guidance += "3. Use appropriate syllable counts based on tempo:\n"
        syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n"
        syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n"
        syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n"

    # Worked examples are appended to whichever guidance was chosen.
    syllable_guidance += "".join((
        "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n",
        "Pattern: STRONG(1) → weak(1) → medium(1) → weak(1)\n",
        "Lyric: 'HEAR the MU-sic PLAY'\n",
        " ↑ ↑ ↑ ↑\n",
        " S w m w <- BEAT TYPE\n\n",
        "Pattern: STRONG(2) → weak(1) → STRONG(1) → weak(2)\n",
        "Lyric: 'DANC-ing TO the RHYTHM of LOVE'\n",
        " ↑ ↑ ↑ ↑ ↑ ↑\n",
        " S S w S w w <- BEAT TYPE\n\n",
        "Pattern: STRONG(1) → medium(2) → STRONG(1) → weak(1)\n",
        "Lyric: 'TIME keeps FLOW-ing ON and ON'\n",
        " ↑ ↑ ↑ ↑ ↑ ↑\n",
        " S m m S w w <- BEAT TYPE\n\n",
    ))

    # With more than four rhythm segments the free-flowing (unsectioned)
    # prompt is used instead of the verse/chorus one.
    use_sections = True
    if song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]:
        if "segments" in song_structure["flexible_structure"]:
            if len(song_structure["flexible_structure"]["segments"]) > 4:
                use_sections = False

    # Work out how many lines to ask for and how to split them.
    try:
        if song_structure and "beats" in song_structure:
            beats_info = song_structure["beats"]
            # The beat tracker may report tempo as an array; the prompt formats
            # it with ":.1f", so make sure it is a plain float.
            tempo = _coerce_tempo(beats_info.get("tempo", 120), 120.0)
            time_signature = beats_info.get("time_signature", 4)
            lines_structure = calculate_lyrics_length(duration, tempo, time_signature)

            if isinstance(lines_structure, dict):
                total_lines = lines_structure["lines_count"]
                verse_lines = 0
                chorus_lines = 0
                bridge_lines = 0
                for section in lines_structure["sections"]:
                    if section["type"] == "verse":
                        verse_lines = section["lines"]
                    elif section["type"] == "chorus":
                        chorus_lines = section["lines"]
                    elif section["type"] == "bridge":
                        bridge_lines = section["lines"]
            else:
                total_lines = lines_structure
                verse_lines, chorus_lines, bridge_lines = _line_plan_for_total(total_lines)
        else:
            # No beat data: roughly one line per ten seconds, at least four.
            total_lines = max(4, int(duration / 10))
            verse_lines, chorus_lines, bridge_lines = _line_plan_for_total(total_lines)
    except Exception as e:
        print(f"Error calculating lyrics length: {str(e)}")
        total_lines = max(4, int(duration / 10))
        verse_lines = 3
        chorus_lines = 2
        bridge_lines = 0

    prompt = _build_lyrics_prompt(
        genre, duration, tempo, key, mode, primary_emotion, primary_theme,
        syllable_guidance, (total_lines, verse_lines, chorus_lines, bridge_lines),
        use_sections,
    )

    response = llm_pipeline(
        prompt,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        return_full_text=False,
    )

    lyrics = response[0]["generated_text"].strip()

    if templates_for_verification:
        lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification)

    # Insert section labels when the model did not provide any.
    if use_sections and "Verse" not in lyrics and "Chorus" not in lyrics:
        formatted_lyrics = []
        line_count = 0
        for line in lyrics.split('\n'):
            if not line.strip():
                formatted_lyrics.append(line)
                continue

            if line_count == 0:
                formatted_lyrics.append("[Verse]")
            elif line_count == verse_lines:
                formatted_lyrics.append("\n[Chorus]")
            elif line_count == verse_lines + chorus_lines and bridge_lines > 0:
                formatted_lyrics.append("\n[Bridge]")

            formatted_lyrics.append(line)
            line_count += 1

        lyrics = '\n'.join(formatted_lyrics)

    return lyrics
|
|
|
def process_audio(audio_file):
    """Run the full pipeline on *audio_file*.

    Returns:
        ``(genre_results, lyrics, ast_results)``. On failure the first element
        is a message, ``lyrics`` is ``None`` and ``ast_results`` holds
        whatever detection results are available (``None`` when no file was
        given, ``[]`` when detection itself failed).
    """
    if audio_file is None:
        return "Please upload an audio file.", None, None

    try:
        audio_data = extract_audio_features(audio_file)

        try:
            is_music, ast_results = detect_music(audio_data)
        except Exception as e:
            print(f"Error in music detection: {str(e)}")
            # ast_results was never assigned on this path; return an empty
            # list instead of referencing an unbound local.
            return f"Error in music detection: {str(e)}", None, []

        if not is_music:
            return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results

        try:
            top_genres = classify_genre(audio_data)
            genre_results = format_genre_results(top_genres)
        except Exception as e:
            print(f"Error in genre classification: {str(e)}")
            return f"Error in genre classification: {str(e)}", None, ast_results

        try:
            emotion_results = music_analyzer.analyze_music(audio_file)
        except Exception as e:
            print(f"Error in emotion analysis: {str(e)}")
            # Placeholder analysis so that lyric generation can still run.
            emotion_results = {
                "emotion_analysis": {"primary_emotion": "Unknown"},
                "theme_analysis": {"primary_theme": "Unknown"},
                "rhythm_analysis": {"tempo": 0},
                "tonal_analysis": {"key": "Unknown", "mode": ""},
                "summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"}
            }

        try:
            # NOTE(review): calculate_detailed_song_structure is neither defined
            # nor imported in the visible part of this file; if it is missing,
            # the NameError is caught below and lyrics are generated without
            # structure data. Confirm where it is supposed to come from.
            song_structure = calculate_detailed_song_structure(audio_data)
        except Exception as e:
            print(f"Error analyzing song structure: {str(e)}")
            song_structure = None

        try:
            primary_genre, _ = top_genres[0]
            lyrics = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, song_structure)
        except Exception as e:
            print(f"Error generating lyrics: {str(e)}")
            lyrics = f"Error generating lyrics: {str(e)}"

        return genre_results, lyrics, ast_results

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, None, []
|
|
|
|
|
# Gradio front end: one upload widget, one button and four text outputs.
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo:
    gr.Markdown("# Music Genre Classifier & Lyrics Generator")
    gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate matching lyrics.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Music", type="filepath")
            submit_btn = gr.Button("Analyze & Generate")

        with gr.Column():
            genre_output = gr.Textbox(label="Detected Genres", lines=5)
            emotion_output = gr.Textbox(label="Emotion Analysis", lines=5)
            ast_output = gr.Textbox(label="Audio Classification Results (AST)", lines=5)
            lyrics_output = gr.Textbox(label="Generated Lyrics", lines=15)

    def display_results(audio_file):
        """Run the pipeline and return the four strings shown in the UI.

        The tuple order matches the click handler's outputs:
        genres, emotion analysis, AST classification, lyrics.
        """
        if audio_file is None:
            return (
                "Please upload an audio file.",
                "No emotion analysis available.",
                "No audio classification available.",
                None,
            )

        try:
            genre_results, lyrics, ast_results = process_audio(audio_file)

            # process_audio reports failures as a message starting with "Error".
            if isinstance(genre_results, str) and genre_results.startswith("Error"):
                return genre_results, "Error in emotion analysis", "Error in audio classification", None

            try:
                summary = music_analyzer.analyze_music(audio_file)['summary']
                # Text is collected piece by piece so that a failure while
                # describing the structure keeps everything gathered so far.
                pieces = [
                    f"Tempo: {summary['tempo']:.1f} BPM\n",
                    f"Key: {summary['key']} {summary['mode']}\n",
                    f"Primary Emotion: {summary['primary_emotion']}\n",
                    f"Primary Theme: {summary['primary_theme']}",
                ]

                try:
                    audio_data = extract_audio_features(audio_file)
                    song_structure = calculate_detailed_song_structure(audio_data)

                    pieces.append("\n\nSong Structure:\n")
                    for section in song_structure["syllables"]:
                        pieces.append(f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s ")
                        pieces.append(f"({section['duration']:.1f}s, {section['beat_count']} beats, ")
                        if "syllable_template" in section:
                            pieces.append(f"template: {section['syllable_template']})\n")
                        else:
                            pieces.append(f"~{section['syllable_count']} syllables)\n")

                    if "flexible_structure" in song_structure and song_structure["flexible_structure"]:
                        flexible = song_structure["flexible_structure"]
                        if "segments" in flexible and flexible["segments"]:
                            pieces.append("\nDetailed Rhythm Analysis:\n")
                            # Show at most five segments and summarise the rest.
                            for number, segment in enumerate(flexible["segments"][:5], start=1):
                                pieces.append(f"- Segment {number}: {segment['start']:.1f}s to {segment['end']:.1f}s, ")
                                pieces.append(f"pattern: {segment['syllable_template']}\n")

                            if len(flexible["segments"]) > 5:
                                pieces.append(f" (+ {len(flexible['segments']) - 5} more segments)\n")

                except Exception as e:
                    print(f"Error displaying song structure: {str(e)}")

                emotion_text = "".join(pieces)

            except Exception as e:
                print(f"Error in emotion analysis: {str(e)}")
                emotion_text = f"Error in emotion analysis: {str(e)}"

            if ast_results and isinstance(ast_results, list):
                ast_text = "Audio Classification Results (AST Model):\n" + "".join(
                    f"{result['label']}: {result['score']*100:.2f}%\n"
                    for result in ast_results[:5]
                )
            else:
                ast_text = "No valid audio classification results available."

            return genre_results, emotion_text, ast_text, lyrics

        except Exception as e:
            error_msg = f"Error: {str(e)}"
            print(error_msg)
            return error_msg, "Error in emotion analysis", "Error in audio classification", None

    submit_btn.click(
        fn=display_results,
        inputs=[audio_input],
        outputs=[genre_output, emotion_output, ast_output, lyrics_output],
    )

    gr.Markdown("### How it works")
    gr.Markdown("""
    1. Upload an audio file of your choice
    2. The system will classify the genre using the dima806/music_genres_classification model
    3. The system will analyze the musical emotion and theme using advanced audio processing
    4. The system will identify the song structure, beats, and timing patterns
    5. The system will create syllable templates that precisely match the rhythm of the music
    6. Based on the detected genre, emotion, and syllable templates, it will generate lyrics that align perfectly with the beats
    7. The system verifies syllable counts to ensure the generated lyrics can be sung naturally with the music
    """)

# Start the web server for the interface defined above.
demo.launch()