|
import os |
|
import io |
|
import gradio as gr |
|
import torch |
|
import numpy as np |
|
import re |
|
import pronouncing |
|
import functools |
|
from transformers import ( |
|
AutoModelForAudioClassification, |
|
AutoFeatureExtractor, |
|
AutoTokenizer, |
|
pipeline, |
|
AutoModelForCausalLM, |
|
BitsAndBytesConfig |
|
) |
|
from huggingface_hub import login |
|
from utils import ( |
|
load_audio, |
|
extract_audio_duration, |
|
extract_mfcc_features, |
|
format_genre_results, |
|
ensure_cuda_availability |
|
) |
|
from emotionanalysis import MusicAnalyzer |
|
import librosa |
|
from pyannote.audio import Pipeline |
|
import tempfile |
|
|
import soundfile as sf |
|
import warnings |
|
import json |
|
import math |
|
from collections import defaultdict |
|
import matplotlib.pyplot as plt |
|
from gradio_client import Client |
|
from transformers import pipeline as hf_pipeline |
|
|
|
|
|
if "HF_TOKEN" in os.environ: |
|
login(token=os.environ["HF_TOKEN"]) |
|
|
|
|
|
GENRE_MODEL_NAME = "dima806/music_genres_classification" |
|
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593" |
|
LLM_MODEL_NAME = "Qwen/Qwen3-32B" |
|
SAMPLE_RATE = 22050 |
|
|
|
|
|
CUDA_AVAILABLE = ensure_cuda_availability() |
|
|
|
|
|
print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}") |
|
try: |
|
music_detector = pipeline( |
|
"audio-classification", |
|
model=MUSIC_DETECTION_MODEL, |
|
device=0 if CUDA_AVAILABLE else -1 |
|
) |
|
print("Successfully loaded music detection pipeline") |
|
except Exception as e: |
|
print(f"Error creating music detection pipeline: {str(e)}") |
|
|
|
try: |
|
music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL) |
|
music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL) |
|
print("Successfully loaded music detection model and feature extractor") |
|
except Exception as e2: |
|
print(f"Error loading music detection model components: {str(e2)}") |
|
raise RuntimeError(f"Could not load music detection model: {str(e2)}") |
|
|
|
|
|
print(f"Loading audio classification model: {GENRE_MODEL_NAME}") |
|
try: |
|
genre_classifier = pipeline( |
|
"audio-classification", |
|
model=GENRE_MODEL_NAME, |
|
device=0 if CUDA_AVAILABLE else -1 |
|
) |
|
print("Successfully loaded audio classification pipeline") |
|
except Exception as e: |
|
print(f"Error creating pipeline: {str(e)}") |
|
|
|
try: |
|
genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME) |
|
genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME) |
|
print("Successfully loaded audio classification model and feature extractor") |
|
except Exception as e2: |
|
print(f"Error loading model components: {str(e2)}") |
|
raise RuntimeError(f"Could not load genre classification model: {str(e2)}") |
|
|
|
|
|
bnb_config = BitsAndBytesConfig( |
|
load_in_4bit=True, |
|
bnb_4bit_quant_type="nf4", |
|
bnb_4bit_compute_dtype=torch.float16, |
|
) |
|
|
|
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME) |
|
llm_model = AutoModelForCausalLM.from_pretrained( |
|
LLM_MODEL_NAME, |
|
device_map="auto", |
|
quantization_config=bnb_config, |
|
torch_dtype=torch.float16, |
|
) |
|
|
|
|
|
llm_pipeline = pipeline( |
|
"text-generation", |
|
model=llm_model, |
|
tokenizer=llm_tokenizer, |
|
max_new_tokens=512, |
|
) |
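
# A minimal usage sketch for the generation pipeline above (illustrative only;
# assumes the tokenizer is chat-tuned so apply_chat_template is available):
#   prompt = llm_tokenizer.apply_chat_template(
#       [{"role": "user", "content": "Write one line of lyrics about rain."}],
#       tokenize=False, add_generation_prompt=True)
#   print(llm_pipeline(prompt)[0]["generated_text"])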
|
|
|
|
|
music_analyzer = MusicAnalyzer() |
|
|
|
|
|
@functools.lru_cache(maxsize=512) |
|
def cached_phones_for_word(word): |
|
"""Get word pronunciations with caching for better performance.""" |
|
return pronouncing.phones_for_word(word) |
|
|
|
@functools.lru_cache(maxsize=512) |
|
def count_syllables_for_word(word): |
|
"""Count syllables in a single word with caching for performance.""" |
|
|
|
pronunciations = cached_phones_for_word(word.lower()) |
|
if pronunciations: |
|
return pronouncing.syllable_count(pronunciations[0]) |
|
|
|
|
|
vowels = "aeiouy" |
|
word = word.lower() |
|
count = 0 |
|
prev_is_vowel = False |
|
|
|
for char in word: |
|
is_vowel = char in vowels |
|
if is_vowel and not prev_is_vowel: |
|
count += 1 |
|
prev_is_vowel = is_vowel |
|
|
|
|
|
if word.endswith('e') and not word.endswith('le'): |
|
count -= 1 |
|
if word.endswith('le') and len(word) > 2 and word[-3] not in vowels: |
|
count += 1 |
|
if count == 0: |
|
count = 1 |
|
|
|
return count |
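
# Examples (counts come from the CMU dictionary via `pronouncing`, with the
# vowel-group fallback used for words the dictionary does not know):
#   count_syllables_for_word("beautiful")  # -> 3
#   count_syllables_for_word("strength")   # -> 1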
|
|
|
@functools.lru_cache(maxsize=512) |
|
def get_word_stress(word): |
|
"""Get the stress pattern for a word with improved fallback handling.""" |
|
pronunciations = cached_phones_for_word(word.lower()) |
|
if pronunciations: |
|
return pronouncing.stresses(pronunciations[0]) |
|
|
|
|
|
syllables = count_syllables_for_word(word) |
|
|
|
|
|
if syllables == 1: |
|
return "1" |
|
elif syllables == 2: |
|
|
|
|
|
second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"] |
|
if any(word.endswith(ending) for ending in second_syllable_stress): |
|
return "01" |
|
else: |
|
return "10" |
|
elif syllables == 3: |
|
|
|
if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]): |
|
return "100" |
|
elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]): |
|
return "010" |
|
else: |
|
return "100" |
|
else: |
|
|
|
return "1" + "0" * (syllables - 1) |
|
|
|
|
|
def count_syllables(text): |
|
"""Count syllables in a given text using the pronouncing library.""" |
|
words = re.findall(r'\b[a-zA-Z]+\b', text.lower()) |
|
syllable_count = 0 |
|
|
|
for word in words: |
|
syllable_count += count_syllables_for_word(word) |
|
|
|
return syllable_count |
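
# Example:
#   count_syllables("Hello world, how are you?")  # -> 6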
|
|
|
def extract_audio_features(audio_file): |
|
"""Extract audio features from an audio file.""" |
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
if y is None or sr is None: |
|
raise ValueError("Failed to load audio data") |
|
|
|
|
|
duration = extract_audio_duration(y, sr) |
|
|
|
|
|
mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20) |
|
|
|
return { |
|
"features": mfccs_mean, |
|
"duration": duration, |
|
"waveform": y, |
|
"sample_rate": sr, |
|
"path": audio_file |
|
} |
|
except Exception as e: |
|
print(f"Error extracting audio features: {str(e)}") |
|
raise ValueError(f"Failed to extract audio features: {str(e)}") |
|
|
|
def classify_genre(audio_data): |
|
"""Classify the genre of the audio using the loaded model.""" |
|
try: |
|
|
|
if 'genre_classifier' in globals(): |
|
results = genre_classifier(audio_data["path"]) |
|
|
|
top_genres = [(result["label"], result["score"]) for result in results[:3]] |
|
return top_genres |
|
|
|
|
|
elif 'genre_processor' in globals() and 'genre_model' in globals(): |
|
|
|
inputs = genre_processor( |
|
audio_data["waveform"], |
|
sampling_rate=audio_data["sample_rate"], |
|
return_tensors="pt" |
|
) |
|
|
|
with torch.no_grad(): |
|
outputs = genre_model(**inputs) |
|
predictions = outputs.logits.softmax(dim=-1) |
|
|
|
|
|
values, indices = torch.topk(predictions, 3) |
|
|
|
|
|
genre_labels = genre_model.config.id2label |
|
|
|
top_genres = [] |
|
for i, (value, index) in enumerate(zip(values[0], indices[0])): |
|
genre = genre_labels[index.item()] |
|
confidence = value.item() |
|
top_genres.append((genre, confidence)) |
|
|
|
return top_genres |
|
|
|
else: |
|
raise ValueError("No genre classification model available") |
|
|
|
except Exception as e: |
|
print(f"Error in genre classification: {str(e)}") |
|
|
|
return [("rock", 1.0)] |
|
|
|
def detect_music(audio_data): |
|
"""Detect if the audio is music using the MIT AST model.""" |
|
try: |
|
|
|
if 'music_detector' in globals(): |
|
results = music_detector(audio_data["path"]) |
|
|
|
music_confidence = 0.0 |
|
for result in results: |
|
label = result["label"].lower() |
|
if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): |
|
music_confidence = max(music_confidence, result["score"]) |
|
return music_confidence >= 0.2, results |
|
|
|
|
|
elif 'music_processor' in globals() and 'music_model' in globals(): |
|
|
|
inputs = music_processor( |
|
audio_data["waveform"], |
|
sampling_rate=audio_data["sample_rate"], |
|
return_tensors="pt" |
|
) |
|
|
|
with torch.no_grad(): |
|
outputs = music_model(**inputs) |
|
predictions = outputs.logits.softmax(dim=-1) |
|
|
|
|
|
values, indices = torch.topk(predictions, 5) |
|
|
|
|
|
labels = music_model.config.id2label |
|
|
|
|
|
music_confidence = 0.0 |
|
results = [] |
|
|
|
for i, (value, index) in enumerate(zip(values[0], indices[0])): |
|
label = labels[index.item()].lower() |
|
score = value.item() |
|
results.append({"label": label, "score": score}) |
|
|
|
if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]): |
|
music_confidence = max(music_confidence, score) |
|
|
|
return music_confidence >= 0.2, results |
|
|
|
else: |
|
raise ValueError("No music detection model available") |
|
|
|
except Exception as e: |
|
print(f"Error in music detection: {str(e)}") |
|
return False, [] |
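
# Usage sketch:
#   is_music, labels = detect_music(audio)
#   print(is_music, labels[:3])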
|
|
|
def detect_beats(y, sr): |
|
"""Enhanced beat detection with adaptive threshold analysis, improved time signature detection and scientific confidence metrics.""" |
|
|
|
|
|
    # Guard against an all-zero (silent) signal; only nudge truly silent input
    # so the waveform itself is left undistorted
    if not np.any(y):
        y = y + 1e-10
|
|
|
|
|
y_harmonic, y_percussive = librosa.effects.hpss(y) |
|
|
|
|
|
onset_env_full = librosa.onset.onset_strength(y=y, sr=sr) |
|
onset_env_perc = librosa.onset.onset_strength(y=y_percussive, sr=sr) |
|
|
|
|
|
onset_env_full = np.maximum(onset_env_full, 1e-6) |
|
onset_env_perc = np.maximum(onset_env_perc, 1e-6) |
|
|
|
|
|
combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7 |
|
|
|
|
|
tempo_candidates = [] |
|
beat_candidates = [] |
|
consistency_metrics = [] |
|
|
|
|
|
tempo1, beats1 = librosa.beat.beat_track( |
|
onset_envelope=combined_onset, |
|
sr=sr, |
|
tightness=100 |
|
) |
|
tempo_candidates.append(tempo1) |
|
beat_candidates.append(beats1) |
|
|
|
|
|
ac = librosa.autocorrelate(combined_onset) |
|
    # Beat period expressed in onset-envelope frames:
    # (envelope frames per second) * (seconds per beat)
    frames_per_second = len(combined_onset) / librosa.get_duration(y=y, sr=sr)
    estimated_period = int(frames_per_second * 60.0 / tempo1)
|
if estimated_period < len(ac) and estimated_period > 0: |
|
|
|
local_ac = ac[max(0, estimated_period-5):min(len(ac), estimated_period+6)] |
|
if np.max(local_ac) > 0: |
|
tempo1_confidence = ac[estimated_period] / np.max(local_ac) |
|
else: |
|
tempo1_confidence = 0.5 |
|
else: |
|
tempo1_confidence = 0.5 |
|
consistency_metrics.append(tempo1_confidence) |
|
|
|
|
|
tempo2, beats2 = librosa.beat.beat_track( |
|
onset_envelope=combined_onset, |
|
sr=sr, |
|
tightness=100, |
|
start_bpm=60 |
|
) |
|
tempo_candidates.append(tempo2) |
|
beat_candidates.append(beats2) |
|
|
|
|
|
    estimated_period2 = int(frames_per_second * 60.0 / tempo2)
|
if estimated_period2 < len(ac) and estimated_period2 > 0: |
|
local_ac2 = ac[max(0, estimated_period2-5):min(len(ac), estimated_period2+6)] |
|
if np.max(local_ac2) > 0: |
|
tempo2_confidence = ac[estimated_period2] / np.max(local_ac2) |
|
else: |
|
tempo2_confidence = 0.5 |
|
else: |
|
tempo2_confidence = 0.5 |
|
consistency_metrics.append(tempo2_confidence) |
|
|
|
|
|
try: |
|
tempo3, beats3 = librosa.beat.beat_track( |
|
onset_envelope=combined_onset, |
|
sr=sr, |
|
tightness=300, |
|
trim=False |
|
) |
|
tempo_candidates.append(tempo3) |
|
beat_candidates.append(beats3) |
|
|
|
|
|
if len(beats3) > 1: |
|
beat_times3 = librosa.frames_to_time(beats3, sr=sr) |
|
intervals3 = np.diff(beat_times3) |
|
tempo3_consistency = 1.0 / (1.0 + np.std(intervals3)/np.mean(intervals3)) if np.mean(intervals3) > 0 else 0.5 |
|
else: |
|
tempo3_consistency = 0.5 |
|
consistency_metrics.append(tempo3_consistency) |
|
except Exception: |
|
|
|
pass |
|
|
|
|
|
beat_consistency = [] |
|
for i, beats in enumerate(beat_candidates): |
|
if len(beats) <= 1: |
|
beat_consistency.append(0) |
|
continue |
|
|
|
times = librosa.frames_to_time(beats, sr=sr) |
|
intervals = np.diff(times) |
|
|
|
|
|
if np.mean(intervals) > 0: |
|
|
|
cv = np.std(intervals)/np.mean(intervals) |
|
|
|
|
|
duration = librosa.get_duration(y=y, sr=sr) |
|
expected_beats = duration * tempo_candidates[i] / 60 |
|
beats_ratio = min(len(beats) / expected_beats, expected_beats / len(beats)) if expected_beats > 0 else 0.5 |
|
|
|
|
|
consistency = (0.7 * (1.0 / (1.0 + cv))) + (0.3 * consistency_metrics[i]) + (0.2 * beats_ratio) |
|
beat_consistency.append(consistency) |
|
else: |
|
beat_consistency.append(0) |
|
|
|
|
|
if beat_consistency: |
|
best_idx = np.argmax(beat_consistency) |
|
        best_confidence = min(100.0, beat_consistency[best_idx] * 100)  # weighted score can slightly exceed 1, so cap at 100%
|
else: |
|
best_idx = 0 |
|
best_confidence = 50.0 |
|
|
|
tempo = tempo_candidates[best_idx] |
|
beat_frames = beat_candidates[best_idx] |
|
|
|
|
|
beat_entropy = 0.0 |
|
if len(beat_frames) > 2: |
|
times = librosa.frames_to_time(beat_frames, sr=sr) |
|
intervals = np.diff(times) |
|
|
|
|
|
if len(intervals) > 0 and np.std(intervals) > 0: |
|
quantized = np.round(intervals / np.min(intervals)) |
|
|
|
unique, counts = np.unique(quantized, return_counts=True) |
|
probs = counts / np.sum(counts) |
|
|
|
beat_entropy = -np.sum(probs * np.log2(probs)) |
|
|
|
|
|
beat_times = librosa.frames_to_time(beat_frames, sr=sr) |
|
|
|
|
|
beat_strengths = [] |
|
if len(beat_frames) > 0: |
|
|
|
valid_frames = [frame for frame in beat_frames if frame < len(combined_onset)] |
|
if valid_frames: |
|
|
|
raw_strengths = combined_onset[valid_frames] |
|
|
|
|
|
if np.max(raw_strengths) > 0: |
|
normalized_strengths = raw_strengths / np.max(raw_strengths) |
|
else: |
|
normalized_strengths = np.ones_like(raw_strengths) |
|
|
|
beat_strengths = normalized_strengths.tolist() |
|
|
|
|
|
if len(beat_times) > len(beat_strengths): |
|
missing_count = len(beat_times) - len(beat_strengths) |
|
|
|
if beat_strengths: |
|
last_strength = beat_strengths[-1] |
|
decay_factor = 0.9 |
|
beat_strengths.extend([last_strength * (decay_factor ** (i+1)) |
|
for i in range(missing_count)]) |
|
else: |
|
beat_strengths = [1.0] * len(beat_times) |
|
else: |
|
beat_strengths = [1.0] * len(beat_times) |
|
else: |
|
beat_strengths = [1.0] * len(beat_times) |
|
|
|
|
|
intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else [] |
|
|
|
|
|
|
|
time_signature = 4 |
|
time_sig_confidence = 70.0 |
|
|
|
if len(beat_strengths) > 8: |
|
|
|
if len(beat_strengths) > 4: |
|
|
|
norm_strengths = np.array(beat_strengths) |
|
if np.max(norm_strengths) > 0: |
|
norm_strengths = norm_strengths / np.max(norm_strengths) |
|
|
|
|
|
ac = librosa.autocorrelate(norm_strengths, max_size=len(norm_strengths)//2) |
|
|
|
|
|
if len(ac) > 3: |
|
|
|
peaks = librosa.util.peak_pick(ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1) |
|
                peaks = peaks + 1  # shift indices to compensate for slicing ac[1:]
|
|
|
if len(peaks) > 0: |
|
|
|
peak_idx = peaks[0] |
|
N = peak_idx |
|
|
|
|
|
if peak_idx < len(ac): |
|
peak_height = ac[peak_idx] |
|
local_prominence = peak_height / np.mean(ac[max(0, peak_idx-2):min(len(ac), peak_idx+3)]) |
|
time_sig_confidence = min(95, 60 + 35 * local_prominence) |
|
|
|
|
|
if N == 2: |
|
time_signature = 2 |
|
time_sig_confidence += 5 |
|
elif N == 3: |
|
time_signature = 3 |
|
time_sig_confidence += 5 |
|
elif 4 <= N <= 5: |
|
time_signature = N |
|
elif N == 6: |
|
|
|
|
|
group_3_count = 0 |
|
for i in range(0, len(beat_strengths) - 6, 3): |
|
if i + 2 < len(beat_strengths): |
|
if beat_strengths[i] > beat_strengths[i+1] and beat_strengths[i] > beat_strengths[i+2]: |
|
group_3_count += 1 |
|
|
|
group_2_count = 0 |
|
for i in range(0, len(beat_strengths) - 4, 2): |
|
if i + 1 < len(beat_strengths): |
|
if beat_strengths[i] > beat_strengths[i+1]: |
|
group_2_count += 1 |
|
|
|
|
|
time_signature = 3 if group_3_count > group_2_count else 6 |
|
elif N == 8: |
|
time_signature = 4 |
|
elif N == 5 or N == 7: |
|
time_signature = N |
|
|
|
|
|
phrases = [] |
|
current_phrase = [] |
|
|
|
if len(beat_times) > 0: |
|
|
|
if len(beat_strengths) > 4: |
|
|
|
strong_threshold = np.percentile(beat_strengths, 75) |
|
|
|
if intervals: |
|
mean_interval = np.mean(intervals) |
|
std_interval = np.std(intervals) |
|
|
|
significant_gap = mean_interval + (1.5 * std_interval) if std_interval > 0 else mean_interval * 1.3 |
|
else: |
|
significant_gap = 0 |
|
else: |
|
|
|
strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0 |
|
significant_gap = 0 |
|
|
|
|
|
for i in range(len(beat_times)): |
|
current_phrase.append(i) |
|
|
|
|
|
if i < len(beat_times) - 1: |
|
|
|
is_stronger_next = False |
|
if i < len(beat_strengths) - 1: |
|
is_stronger_next = beat_strengths[i+1] > strong_threshold and beat_strengths[i+1] > beat_strengths[i] * 1.1 |
|
|
|
|
|
is_longer_gap = False |
|
if i < len(beat_times) - 1 and intervals and i < len(intervals): |
|
is_longer_gap = intervals[i] > significant_gap |
|
|
|
|
|
is_measure_boundary = (i + 1) % time_signature == 0 and i > 0 |
|
|
|
|
|
is_energy_dip = False |
|
if i < len(beat_strengths) - 1: |
|
onset_ratio = beat_strengths[i+1] / max(beat_strengths[i], 0.001) |
|
is_energy_dip = onset_ratio < 0.6 |
|
|
|
|
|
phrase_boundary_score = ( |
|
(1.5 if is_stronger_next else 0) + |
|
(2.0 if is_longer_gap else 0) + |
|
(1.0 if is_measure_boundary else 0) + |
|
(0.5 if is_energy_dip else 0) |
|
) |
|
|
|
if (phrase_boundary_score >= 1.5 and len(current_phrase) >= 2) or \ |
|
(is_measure_boundary and len(current_phrase) >= time_signature): |
|
phrases.append(current_phrase) |
|
current_phrase = [] |
|
|
|
|
|
if current_phrase and len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
|
|
|
|
if not phrases and len(beat_times) >= 2: |
|
|
|
for i in range(0, len(beat_times), time_signature): |
|
end = min(i + time_signature, len(beat_times)) |
|
if end - i >= 2: |
|
phrases.append(list(range(i, end))) |
|
|
|
|
|
beat_periodicity = np.mean(intervals) if intervals else (60 / tempo) |
|
|
|
|
|
return { |
|
"tempo": tempo, |
|
"tempo_confidence": best_confidence, |
|
"time_signature": time_signature, |
|
"time_sig_confidence": time_sig_confidence, |
|
"beat_frames": beat_frames, |
|
"beat_times": beat_times, |
|
"beat_count": len(beat_times), |
|
"beat_strengths": beat_strengths, |
|
"intervals": intervals, |
|
"phrases": phrases, |
|
"beat_periodicity": beat_periodicity, |
|
"beat_entropy": beat_entropy |
|
} |
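
# Usage sketch (assumes a local audio file; illustrative only):
#   y, sr = librosa.load("song.wav", sr=SAMPLE_RATE)
#   beats = detect_beats(y, sr)
#   print(beats["tempo"], beats["time_signature"], beats["beat_count"])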
|
|
|
def detect_beats_and_subbeats(y, sr, subdivision=4): |
|
""" |
|
Detect main beats and interpolate subbeats between consecutive beats. |
|
|
|
Parameters: |
|
y: Audio time series |
|
sr: Sample rate |
|
subdivision: Number of subdivisions between beats (default: 4 for quarter beats) |
|
|
|
Returns: |
|
Dictionary containing beat times, subbeat times, and tempo information |
|
""" |
|
|
|
try: |
|
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr) |
|
beat_times = librosa.frames_to_time(beat_frames, sr=sr) |
|
|
|
|
|
        # Newer librosa releases may return tempo as a NumPy scalar or array
        if isinstance(tempo, (np.ndarray, np.number)):
|
tempo = float(tempo) |
|
|
|
|
|
if isinstance(beat_times, np.ndarray): |
|
beat_times = [float(t) for t in beat_times] |
|
except Exception as e: |
|
print(f"Error in beat detection: {e}") |
|
|
|
tempo = 120.0 |
|
beat_times = [] |
|
|
|
|
|
subbeat_times = [] |
|
|
|
|
|
if not beat_times or len(beat_times) < 2: |
|
return { |
|
"tempo": float(tempo) if tempo is not None else 120.0, |
|
"beat_times": beat_times, |
|
"subbeat_times": [] |
|
} |
|
|
|
for i in range(len(beat_times) - 1): |
|
|
|
try: |
|
current_beat = float(beat_times[i]) |
|
next_beat = float(beat_times[i + 1]) |
|
except (IndexError, ValueError, TypeError): |
|
continue |
|
|
|
|
|
interval = (next_beat - current_beat) / subdivision |
|
|
|
|
|
subbeat_times.append({ |
|
"time": float(current_beat), |
|
"type": "main", |
|
"strength": 1.0, |
|
"beat_index": i |
|
}) |
|
|
|
|
|
for j in range(1, subdivision): |
|
subbeat_time = current_beat + j * interval |
|
|
|
|
|
if j == subdivision // 2 and subdivision == 4: |
|
strength = 0.8 |
|
else: |
|
strength = 0.5 |
|
|
|
subbeat_times.append({ |
|
"time": float(subbeat_time), |
|
"type": "sub", |
|
"strength": float(strength), |
|
"beat_index": i, |
|
"subbeat_index": j |
|
}) |
|
|
|
|
|
if beat_times: |
|
try: |
|
subbeat_times.append({ |
|
"time": float(beat_times[-1]), |
|
"type": "main", |
|
"strength": 1.0, |
|
"beat_index": len(beat_times) - 1 |
|
}) |
|
except (ValueError, TypeError): |
|
|
|
pass |
|
|
|
return { |
|
"tempo": float(tempo) if tempo is not None else 120.0, |
|
"beat_times": beat_times, |
|
"subbeat_times": subbeat_times |
|
} |
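
# Example: with subdivision=4, three "sub" entries are interpolated between each
# pair of main beats (sketch; y and sr as loaded above):
#   grid = detect_beats_and_subbeats(y, sr, subdivision=4)
#   print(grid["tempo"], len(grid["beat_times"]), len(grid["subbeat_times"]))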
|
|
|
def map_beats_to_seconds(subbeat_times, duration, fps=1.0): |
|
""" |
|
Map beats and subbeats to second-level intervals. |
|
|
|
Parameters: |
|
subbeat_times: List of dictionaries containing beat and subbeat information |
|
duration: Total duration of the audio in seconds |
|
fps: Frames per second (default: 1.0 for one-second intervals) |
|
|
|
Returns: |
|
List of dictionaries, each containing beats within a time window |
|
""" |
|
|
|
if not isinstance(subbeat_times, list): |
|
print("Warning: subbeat_times is not a list") |
|
subbeat_times = [] |
|
|
|
try: |
|
duration = float(duration) |
|
except (ValueError, TypeError): |
|
print("Warning: duration is not convertible to float, defaulting to 30") |
|
duration = 30.0 |
|
|
|
|
|
num_windows = int(duration * fps) + 1 |
|
|
|
|
|
time_windows = [] |
|
|
|
for i in range(num_windows): |
|
|
|
start_time = i / fps |
|
end_time = (i + 1) / fps |
|
|
|
|
|
window_beats = [] |
|
|
|
for beat in subbeat_times: |
|
|
|
if not isinstance(beat, dict): |
|
continue |
|
|
|
|
|
try: |
|
beat_time = float(beat.get("time", 0)) |
|
except (ValueError, TypeError): |
|
continue |
|
|
|
if start_time <= beat_time < end_time: |
|
|
|
beat_type = beat.get("type", "sub") |
|
if not isinstance(beat_type, str): |
|
beat_type = "sub" |
|
|
|
|
|
try: |
|
strength = float(beat.get("strength", 0.5)) |
|
except (ValueError, TypeError): |
|
strength = 0.5 |
|
|
|
|
|
window_beats.append({ |
|
"time": beat_time, |
|
"type": beat_type, |
|
"strength": strength, |
|
"relative_pos": (beat_time - start_time) / (1/fps) |
|
}) |
|
|
|
|
|
time_windows.append({ |
|
"second": i, |
|
"start": start_time, |
|
"end": end_time, |
|
"beats": window_beats |
|
}) |
|
|
|
return time_windows |
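
# Usage sketch, continuing from the grid above:
#   sec_map = map_beats_to_seconds(grid["subbeat_times"], duration=extract_audio_duration(y, sr))
#   print(sec_map[0])  # {"second": 0, "start": 0.0, "end": 1.0, "beats": [...]}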
|
|
|
def create_second_level_templates(sec_map, tempo, genre=None): |
|
""" |
|
Create syllable templates for each second-level window. |
|
|
|
Parameters: |
|
sec_map: List of second-level time windows with beat information |
|
tempo: Tempo in BPM |
|
genre: Optional genre for genre-specific adjustments |
|
|
|
Returns: |
|
List of template strings, one for each second |
|
""" |
|
|
|
def tempo_to_syllable_base(tempo): |
|
"""Continuous function mapping tempo to syllable base count""" |
|
|
|
if tempo > 180: |
|
return 1.0 |
|
elif tempo > 140: |
|
return 1.0 + (180 - tempo) * 0.02 |
|
elif tempo > 100: |
|
return 1.8 + (140 - tempo) * 0.01 |
|
elif tempo > 70: |
|
return 2.2 + (100 - tempo) * 0.02 |
|
else: |
|
return 2.8 + max(0, (70 - tempo) * 0.04) |
|
|
|
|
|
base_syllables = tempo_to_syllable_base(tempo) |
|
|
|
|
|
genre_factor = 1.0 |
|
if genre: |
|
genre_lower = genre.lower() |
|
if any(term in genre_lower for term in ["rap", "hip hop", "hip-hop"]): |
|
genre_factor = 1.4 |
|
elif any(term in genre_lower for term in ["folk", "country", "ballad"]): |
|
genre_factor = 0.8 |
|
|
|
|
|
templates = [] |
|
|
|
for window in sec_map: |
|
beats = window["beats"] |
|
|
|
|
|
if not beats: |
|
templates.append("w(0.5):1") |
|
continue |
|
|
|
|
|
beat_patterns = [] |
|
|
|
for beat in beats: |
|
|
|
if not isinstance(beat, dict): |
|
continue |
|
|
|
|
|
if "type" not in beat or not isinstance(beat["type"], str): |
|
beat_type = "w" |
|
else: |
|
beat_type = "S" if beat["type"] == "main" else "m" if beat.get("strength", 0) >= 0.7 else "w" |
|
|
|
|
|
try: |
|
strength = float(beat.get("strength", 0.5)) |
|
except (ValueError, TypeError): |
|
strength = 0.5 |
|
|
|
|
|
if beat_type == "S": |
|
syllable_factor = 1.2 |
|
elif beat_type == "m": |
|
syllable_factor = 1.0 |
|
else: |
|
syllable_factor = 0.8 |
|
|
|
|
|
syllable_count = base_syllables * syllable_factor * genre_factor |
|
|
|
|
|
syllable_count = round(syllable_count * 2) / 2 |
|
|
|
|
|
syllable_count = max(0.5, min(4, syllable_count)) |
|
|
|
|
|
strength_pct = round(strength * 100) / 100 |
|
beat_patterns.append(f"{beat_type}({strength_pct}):{syllable_count}") |
|
|
|
|
|
if not beat_patterns: |
|
templates.append("w(0.5):1") |
|
else: |
|
second_template = "-".join(beat_patterns) |
|
templates.append(second_template) |
|
|
|
return templates |
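
# Usage sketch: one template string per second, e.g. "S(1.0):2-w(0.5):1":
#   templates = create_second_level_templates(sec_map, grid["tempo"], genre="rock")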
|
|
|
def detect_sections(y, sr): |
|
""" |
|
Detect musical segments without classifying them by type (verse, chorus, etc.). |
|
|
|
Parameters: |
|
y: Audio time series |
|
sr: Sample rate |
|
|
|
Returns: |
|
A list of section dictionaries with start time, end time, and duration |
|
""" |
|
|
|
|
|
hop_length = 512 |
|
|
|
|
|
S = np.abs(librosa.stft(y, hop_length=hop_length)) |
|
contrast = librosa.feature.spectral_contrast(S=S, sr=sr) |
|
|
|
|
|
chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length) |
|
|
|
|
|
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length) |
|
|
|
|
|
rms = librosa.feature.rms(y=y, hop_length=hop_length) |
|
|
|
|
|
y_harmonic, y_percussive = librosa.effects.hpss(y) |
|
|
|
|
|
|
|
duration = librosa.get_duration(y=y, sr=sr) |
|
|
|
|
|
|
|
feature_stack = np.vstack([ |
|
librosa.util.normalize(contrast), |
|
librosa.util.normalize(chroma), |
|
librosa.util.normalize(mfcc), |
|
librosa.util.normalize(rms) |
|
]) |
|
|
|
|
|
feature_matrix = feature_stack.T |
|
|
|
|
|
|
|
from sklearn.decomposition import PCA |
|
|
|
|
|
n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1]) |
|
|
|
if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0: |
|
try: |
|
pca = PCA(n_components=n_components) |
|
reduced_features = pca.fit_transform(feature_matrix) |
|
except Exception as e: |
|
print(f"PCA failed, falling back to original features: {e}") |
|
|
|
reduced_features = feature_matrix |
|
else: |
|
|
|
reduced_features = feature_matrix |
|
|
|
|
|
|
|
|
|
|
|
min_segments = max(2, int(duration / 60)) |
|
max_segments = min(10, int(duration / 20)) |
|
|
|
|
|
min_segments = max(2, min(min_segments, 4)) |
|
max_segments = max(min_segments + 1, min(max_segments, 8)) |
|
|
|
|
|
best_segments = min_segments |
|
best_score = -1 |
|
|
|
from sklearn.metrics import silhouette_score |
|
from sklearn.cluster import AgglomerativeClustering |
|
|
|
|
|
if reduced_features.shape[0] > max_segments: |
|
for n_segments in range(min_segments, max_segments + 1): |
|
try: |
|
|
|
clustering = AgglomerativeClustering(n_clusters=n_segments) |
|
labels = clustering.fit_predict(reduced_features) |
|
|
|
|
|
if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1: |
|
score = silhouette_score(reduced_features, labels) |
|
|
|
if score > best_score: |
|
best_score = score |
|
best_segments = n_segments |
|
except Exception as e: |
|
print(f"Clustering with {n_segments} segments failed: {e}") |
|
continue |
|
|
|
|
|
n_segments = best_segments |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
clustering = AgglomerativeClustering(n_clusters=n_segments) |
|
labels = clustering.fit_predict(reduced_features) |
|
|
|
|
|
boundaries = [0] |
|
|
|
for i in range(1, len(labels)): |
|
if labels[i] != labels[i-1]: |
|
boundaries.append(i) |
|
|
|
boundaries.append(len(labels)) |
|
|
|
|
|
bounds_frames = np.array(boundaries) |
|
|
|
except Exception as e: |
|
print(f"Final clustering failed: {e}") |
|
|
|
bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments) |
|
|
|
|
|
|
|
bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length) |
|
|
|
|
|
sections = [] |
|
|
|
for i in range(len(bounds_times) - 1): |
|
start = bounds_times[i] |
|
end = bounds_times[i+1] |
|
duration = end - start |
|
|
|
|
|
if duration < 4 and i > 0 and i < len(bounds_times) - 2: |
|
continue |
|
|
|
|
|
sections.append({ |
|
"type": "segment", |
|
"start": start, |
|
"end": end, |
|
"duration": duration |
|
}) |
|
|
|
|
|
sections = [s for s in sections if s["duration"] >= 5] |
|
|
|
return sections |
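
# Usage sketch: each detected section is a plain time span:
#   for s in detect_sections(y, sr):
#       print(f"{s['start']:.1f}s - {s['end']:.1f}s ({s['duration']:.1f}s)")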
|
|
|
def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'): |
|
""" |
|
Create enhanced syllable templates based on beat patterns with improved musical intelligence. |
|
|
|
Parameters: |
|
beats_info: Dictionary containing beat analysis data |
|
genre: Optional genre to influence template creation |
|
phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation |
|
|
|
Returns: |
|
String of syllable templates with embedded strength values and flexible timing |
|
""" |
|
import numpy as np |
|
from sklearn.cluster import KMeans |
|
|
|
|
|
|
|
if isinstance(beats_info, dict): |
|
processed_beats_info = {} |
|
for k, v in beats_info.items(): |
|
if isinstance(v, np.ndarray): |
|
if v.size == 1: |
|
processed_beats_info[k] = float(v.item()) |
|
else: |
|
processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v] |
|
elif isinstance(v, np.number): |
|
processed_beats_info[k] = float(v) |
|
elif isinstance(v, list): |
|
processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v] |
|
else: |
|
processed_beats_info[k] = v |
|
beats_info = processed_beats_info |
|
|
|
|
|
beat_times = beats_info.get("beat_times", []) |
|
beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times)) |
|
tempo = beats_info.get("tempo", 120) |
|
time_signature = beats_info.get("time_signature", 4) |
|
|
|
|
|
if len(beat_times) < 2: |
|
return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1" |
|
|
|
|
|
|
|
if len(beat_strengths) >= 6: |
|
|
|
X = np.array(beat_strengths).reshape(-1, 1) |
|
|
|
|
|
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X) |
|
|
|
|
|
centroids = sorted([float(c[0]) for c in kmeans.cluster_centers_]) |
|
|
|
|
|
if len(centroids) >= 3: |
|
medium_threshold = (centroids[0] + centroids[1]) / 2 |
|
strong_threshold = (centroids[1] + centroids[2]) / 2 |
|
else: |
|
|
|
medium_threshold = np.percentile(beat_strengths, 33) |
|
strong_threshold = np.percentile(beat_strengths, 66) |
|
else: |
|
|
|
medium_threshold = np.percentile(beat_strengths, 33) |
|
strong_threshold = np.percentile(beat_strengths, 66) |
|
|
|
|
|
|
|
phrases = beats_info.get("phrases", []) |
|
|
|
if phrase_mode == 'auto' or not phrases: |
|
|
|
phrases = [] |
|
current_phrase = [] |
|
|
|
for i in range(len(beat_times)): |
|
current_phrase.append(i) |
|
|
|
|
|
if (i + 1) % time_signature == 0 or i == len(beat_times) - 1: |
|
if len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
current_phrase = [] |
|
|
|
|
|
if current_phrase and len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
|
|
|
|
|
|
def tempo_to_syllable_base(tempo): |
|
"""Continuous function mapping tempo to syllable base count with scientific curve""" |
|
|
|
|
|
if tempo < 40: |
|
return 1.8 |
|
elif tempo > 200: |
|
return 0.7 |
|
else: |
|
|
|
L = 2.0 |
|
k = 0.04 |
|
x0 = 120 |
|
return L / (1 + np.exp(k * (tempo - x0))) |
|
|
|
|
|
|
|
syllable_templates = [] |
|
|
|
for phrase in phrases: |
|
|
|
if not phrase: |
|
continue |
|
|
|
|
|
phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)] |
|
if not phrase_strengths: |
|
phrase_strengths = [1.0] * len(phrase) |
|
|
|
|
|
stress_pattern = [] |
|
for i, strength in enumerate(phrase_strengths): |
|
|
|
metrical_position = i % time_signature |
|
|
|
|
|
|
|
|
|
if metrical_position == 0: |
|
position_boost = 0.18 |
|
elif time_signature == 4 and metrical_position == 2: |
|
position_boost = 0.1 |
|
elif time_signature == 3 and metrical_position == 1: |
|
position_boost = 0.05 |
|
else: |
|
position_boost = 0 |
|
|
|
effective_strength = strength + position_boost |
|
|
|
if effective_strength >= strong_threshold: |
|
stress_pattern.append(("S", effective_strength)) |
|
elif effective_strength >= medium_threshold: |
|
stress_pattern.append(("m", effective_strength)) |
|
else: |
|
stress_pattern.append(("w", effective_strength)) |
|
|
|
|
|
|
|
detailed_template = [] |
|
|
|
|
|
phrase_duration = 0 |
|
if phrase and len(phrase) > 1 and len(beat_times) > 0: |
|
|
|
first_idx = phrase[0] |
|
last_idx = phrase[-1] |
|
|
|
|
|
if first_idx < len(beat_times) and last_idx < len(beat_times): |
|
phrase_duration = beat_times[last_idx] - beat_times[first_idx] |
|
|
|
|
|
|
|
max_reasonable_syllables = 100 |
|
if phrase_duration > 0: |
|
|
|
if tempo < 80: |
|
syllable_rate = 3.0 |
|
elif tempo < 120: |
|
syllable_rate = 3.5 |
|
else: |
|
syllable_rate = 4.0 |
|
|
|
|
|
max_reasonable_syllables = max(2, int(phrase_duration * syllable_rate)) |
|
|
|
for i, (stress_type, strength) in enumerate(stress_pattern): |
|
|
|
base_syllables = tempo_to_syllable_base(tempo) |
|
|
|
|
|
metrical_position = i % time_signature |
|
position_factor = 1.2 if metrical_position == 0 else 1.0 |
|
|
|
|
|
if stress_type == "S": |
|
syllable_factor = 1.2 * position_factor |
|
elif stress_type == "m": |
|
syllable_factor = 1.0 * position_factor |
|
else: |
|
syllable_factor = 0.8 |
|
|
|
|
|
genre_factor = 1.0 |
|
if genre: |
|
genre = genre.lower() |
|
if "rap" in genre or "hip" in genre: |
|
genre_factor = 1.5 |
|
elif "folk" in genre or "country" in genre or "ballad" in genre: |
|
genre_factor = 0.7 |
|
elif "metal" in genre or "rock" in genre: |
|
genre_factor = 1.1 |
|
elif "jazz" in genre: |
|
genre_factor = 1.2 |
|
elif "classical" in genre: |
|
genre_factor = 0.9 |
|
|
|
|
|
raw_count = base_syllables * syllable_factor * genre_factor |
|
|
|
|
|
|
|
rounded_count = round(raw_count * 4) / 4 |
|
|
|
|
|
syllable_count = max(0.5, min(4, rounded_count)) |
|
|
|
|
|
|
|
strength_pct = round(strength * 100) / 100 |
|
detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}") |
|
|
|
|
|
total_expected_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template]) |
|
|
|
|
|
if total_expected_syllables > max_reasonable_syllables and max_reasonable_syllables > 0: |
|
scale_factor = max_reasonable_syllables / total_expected_syllables |
|
adjusted_template = [] |
|
|
|
|
|
if phrase_duration < 0.8 and phrase_duration > 0: |
|
|
|
scale_factor *= 0.8 |
|
|
|
for beat in detailed_template: |
|
if ':' in beat: |
|
beat_type_part = beat.split(':')[0] |
|
syllable_count = float(beat.split(':')[1]) |
|
|
|
new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4) |
|
|
|
|
|
if phrase_duration < 0.6 and phrase_duration > 0: |
|
if beat_type_part.startswith("S"): |
|
new_count = min(new_count, 1.0) |
|
else: |
|
new_count = min(new_count, 0.5) |
|
|
|
adjusted_template.append(f"{beat_type_part}:{new_count}") |
|
else: |
|
adjusted_template.append(beat) |
|
|
|
detailed_template = adjusted_template |
|
|
|
|
|
if len(detailed_template) > 0: |
|
total_syllables = sum([float(beat.split(':')[1]) for beat in detailed_template if ':' in beat]) |
|
if phrase_duration > 0 and (total_syllables / phrase_duration) > 5.0: |
|
|
|
target_syllables = phrase_duration * 4.0 |
|
scale_factor = target_syllables / total_syllables |
|
adjusted_template = [] |
|
|
|
for beat in detailed_template: |
|
if ':' in beat: |
|
beat_type_part = beat.split(':')[0] |
|
syllable_count = float(beat.split(':')[1]) |
|
|
|
new_count = max(0.5, round(syllable_count * scale_factor * 4) / 4) |
|
adjusted_template.append(f"{beat_type_part}:{new_count}") |
|
else: |
|
adjusted_template.append(beat) |
|
|
|
detailed_template = adjusted_template |
|
|
|
|
|
phrase_template = "-".join(detailed_template) |
|
syllable_templates.append(phrase_template) |
|
|
|
|
|
|
|
if not syllable_templates: |
|
|
|
if time_signature == 3: |
|
syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"] |
|
elif time_signature == 2: |
|
syllable_templates = ["S(0.95):1.5-w(0.4):1"] |
|
else: |
|
syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"] |
|
|
|
|
|
return "|".join(syllable_templates) |
|
|
|
def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_wrap=10, |
|
structured_output=False, beat_types=None): |
|
""" |
|
Convert technical syllable templates into clear, human-readable instructions with |
|
enhanced flexibility and customization options. |
|
|
|
Parameters: |
|
syllable_templates: String or list of templates |
|
arrow: Symbol to use between beats (default: "→") |
|
line_wrap: Number of beats before automatic line wrapping (0 = no wrapping) |
|
structured_output: If True, return structured data instead of text |
|
beat_types: Custom mapping for beat types (default: None, uses standard mapping) |
|
|
|
Returns: |
|
Human-readable instructions or structured data depending on parameters |
|
""" |
|
if not syllable_templates: |
|
return {} if structured_output else "" |
|
|
|
|
|
default_beat_types = { |
|
"S": {"name": "STRONG", "description": "stressed syllable"}, |
|
"m": {"name": "medium", "description": "medium-stressed syllable"}, |
|
"w": {"name": "weak", "description": "unstressed syllable"}, |
|
"X": {"name": "EXTRA", "description": "extra strong syllable"}, |
|
"L": {"name": "legato", "description": "connected/tied syllable"} |
|
} |
|
|
|
|
|
beat_types = beat_types or default_beat_types |
|
|
|
|
|
structured_data = {"lines": [], "explanations": []} if structured_output else None |
|
|
|
|
|
is_enhanced_format = False |
|
|
|
|
|
if isinstance(syllable_templates, str): |
|
|
|
if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates |
|
for bt in beat_types.keys()): |
|
is_enhanced_format = True |
|
|
|
elif "|" in syllable_templates: |
|
is_enhanced_format = True |
|
|
|
|
|
output = [] |
|
|
|
if is_enhanced_format: |
|
|
|
phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates] |
|
|
|
|
|
for i, phrase in enumerate(phrases): |
|
|
|
has_swing = "(swing)" in phrase |
|
if has_swing: |
|
phrase = phrase.replace("(swing)", "") |
|
|
|
beats = phrase.split("-") |
|
            beat_instructions = []
            parsed_beats = []  # parsed beat metadata, kept for structured output
|
|
|
|
|
for j, beat in enumerate(beats): |
|
|
|
beat_info = {"original": beat, "type": None, "count": None, "strength": None} |
|
|
|
|
|
if "(" in beat and ")" in beat and ":" in beat: |
|
parts = beat.split(":") |
|
beat_type = parts[0].split("(")[0] |
|
strength = parts[0].split("(")[1].rstrip(")") |
|
count = parts[1] |
|
|
|
beat_info["type"] = beat_type |
|
beat_info["count"] = count |
|
beat_info["strength"] = strength |
|
|
|
|
|
elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1: |
|
beat_type = beat[0] |
|
count = beat[1:] |
|
|
|
beat_info["type"] = beat_type |
|
beat_info["count"] = count |
|
|
|
|
|
else: |
|
beat_instructions.append(beat) |
|
continue |
|
|
|
|
|
if beat_info["type"] in beat_types: |
|
type_name = beat_types[beat_info["type"]]["name"] |
|
if beat_info["strength"]: |
|
beat_instructions.append(f"{type_name}({beat_info['count']}) [{beat_info['strength']}]") |
|
else: |
|
beat_instructions.append(f"{type_name}({beat_info['count']})") |
|
                else:
                    # Unknown beat type: pass it through unchanged
                    beat_instructions.append(beat)

                parsed_beats.append(beat_info)
|
|
|
|
|
if line_wrap > 0 and len(beat_instructions) > line_wrap: |
|
wrapped_instructions = [] |
|
for k in range(0, len(beat_instructions), line_wrap): |
|
section = beat_instructions[k:k+line_wrap] |
|
wrapped_instructions.append(f"{arrow} ".join(section)) |
|
line_desc = f"\n {arrow} ".join(wrapped_instructions) |
|
else: |
|
line_desc = f" {arrow} ".join(beat_instructions) |
|
|
|
|
|
if has_swing: |
|
line_desc += " [with swing feel]" |
|
|
|
|
|
line_output = f"Line {i+1}: {line_desc}" |
|
output.append(line_output) |
|
|
|
            if structured_output:
                structured_data["lines"].append({
                    "line_number": i+1,
                    "beats": [{"original": b["original"],
                               "type": b.get("type"),
                               "count": b.get("count"),
                               "strength": b.get("strength")}
                              for b in parsed_beats],
                    "has_swing": has_swing
                })
|
|
|
|
|
explanation = [ |
|
"\n📝 UNDERSTANDING THE NOTATION:" |
|
] |
|
|
|
|
|
used_beat_types = set() |
|
for phrase in phrases: |
|
for beat in phrase.split("-"): |
|
for bt in beat_types.keys(): |
|
if beat.startswith(bt): |
|
used_beat_types.add(bt) |
|
|
|
for bt in used_beat_types: |
|
if bt in beat_types: |
|
name = beat_types[bt]["name"] |
|
desc = beat_types[bt]["description"] |
|
explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables") |
|
|
|
explanation.extend([ |
|
f"- {arrow}: Indicates flow from one beat to the next", |
|
"- [0.xx]: Beat strength value (higher = more emphasis needed)" |
|
]) |
|
|
|
output.extend(explanation) |
|
|
|
if structured_output: |
|
structured_data["explanations"] = explanation |
|
|
|
|
|
has_half_syllables = any((".5" in beat) for phrase in phrases for beat in phrase.split("-")) |
|
if has_half_syllables: |
|
half_syllable_examples = [ |
|
"\n🎵 HALF-SYLLABLE EXAMPLES:", |
|
"- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable", |
|
" Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick", |
|
"- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables", |
|
" Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick" |
|
] |
|
output.extend(half_syllable_examples) |
|
|
|
if structured_output: |
|
structured_data["half_syllable_examples"] = half_syllable_examples |
|
|
|
|
|
if any("swing" in phrase for phrase in phrases): |
|
swing_guide = [ |
|
"\n🎶 SWING RHYTHM GUIDE:", |
|
"- In swing, syllables should be unevenly timed (long-short pattern)", |
|
"- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay" |
|
] |
|
output.extend(swing_guide) |
|
|
|
if structured_output: |
|
structured_data["swing_guide"] = swing_guide |
|
|
|
|
|
else: |
|
formatted_lines = [] |
|
|
|
if isinstance(syllable_templates, list): |
|
for i, template in enumerate(syllable_templates): |
|
if isinstance(template, dict) and "syllable_template" in template: |
|
line = f"Line {i+1}: {template['syllable_template']} syllables" |
|
formatted_lines.append(line) |
|
|
|
if structured_output: |
|
structured_data["lines"].append({ |
|
"line_number": i+1, |
|
"syllable_count": template["syllable_template"] |
|
}) |
|
elif isinstance(template, str): |
|
line = f"Line {i+1}: {template} syllables" |
|
formatted_lines.append(line) |
|
|
|
if structured_output: |
|
structured_data["lines"].append({ |
|
"line_number": i+1, |
|
"syllable_count": template |
|
}) |
|
|
|
output = formatted_lines |
|
else: |
|
output = [str(syllable_templates)] |
|
|
|
if structured_output: |
|
structured_data["raw_content"] = str(syllable_templates) |
|
|
|
|
|
application_tips = [ |
|
"\n💡 APPLICATION TIPS:", |
|
"1. Strong beats need naturally stressed syllables (like the START of \"RE-mem-ber\")", |
|
"2. Place important words on strong beats for natural emphasis", |
|
"3. Vowel sounds work best for sustained or emphasized syllables", |
|
"4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats" |
|
] |
|
output.extend(application_tips) |
|
|
|
if structured_output: |
|
structured_data["application_tips"] = application_tips |
|
return structured_data |
|
|
|
return "\n".join(output) |
|
|
|
def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=None): |
|
""" |
|
Enhanced verification of syllable counts and stress patterns with precise alignment analysis |
|
for both phrase-level and second-level templates. |
|
""" |
|
import re |
|
import pronouncing |
|
import numpy as np |
|
import functools |
|
from itertools import chain |
|
|
|
print(f"DEBUG: In verify_flexible_syllable_counts, type of lyrics={type(lyrics)}") |
|
print(f"DEBUG: Type of templates={type(templates)}") |
|
|
|
|
|
if not isinstance(lyrics, str): |
|
print(f"DEBUG: lyrics is not a string, it's {type(lyrics)}") |
|
|
|
try: |
|
lyrics = str(lyrics) |
|
except Exception as e: |
|
print(f"DEBUG: Cannot convert lyrics to string: {str(e)}") |
|
return "Error: Cannot process non-string lyrics" |
|
|
|
|
|
if not isinstance(templates, list): |
|
print(f"DEBUG: templates is not a list, it's {type(templates)}") |
|
|
|
if templates is not None: |
|
templates = [templates] |
|
else: |
|
templates = [] |
|
|
|
|
|
lines = [line.strip() for line in lyrics.split("\n") if line.strip()] |
|
|
|
|
|
filtered_lines = [] |
|
for line in lines: |
|
|
|
if line.startswith('**') or line.startswith('[Note:') or 'alignment:' in line.lower(): |
|
continue |
|
filtered_lines.append(line) |
|
|
|
lines = filtered_lines |
|
|
|
|
|
verification_notes = [] |
|
detailed_analysis = [] |
|
stress_misalignments = [] |
|
total_mismatch_count = 0 |
|
|
|
|
|
for i, line in enumerate(lines): |
|
if i >= len(templates): |
|
break |
|
|
|
template = templates[i] |
|
print(f"DEBUG: Processing template {i+1}, type={type(template)}") |
|
|
|
|
|
template_str = None |
|
if isinstance(template, dict) and "syllable_template" in template: |
|
template_str = template["syllable_template"] |
|
elif isinstance(template, str): |
|
template_str = template |
|
else: |
|
print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template") |
|
continue |
|
|
|
if not isinstance(template_str, str): |
|
print(f"DEBUG: template_str is not a string, it's {type(template_str)}") |
|
continue |
|
|
|
|
|
template_phrases = [template_str] |
|
if "|" in template_str: |
|
template_phrases = template_str.split("|") |
|
|
|
|
|
best_match_diff = float('inf') |
|
best_match_phrase = None |
|
best_phrase_beats = None |
|
actual_count = count_syllables(line) |
|
|
|
for phrase_idx, phrase in enumerate(template_phrases): |
|
|
|
beats_info = [] |
|
total_expected = 0 |
|
|
|
|
|
if "-" in phrase: |
|
beat_templates = phrase.split("-") |
|
|
|
|
|
for beat in beat_templates: |
|
beat_info = {"original": beat, "type": None, "count": 1, "strength": None} |
|
|
|
|
|
if "(" in beat and ")" in beat and ":" in beat: |
|
parts = beat.split(":") |
|
beat_type = parts[0].split("(")[0] |
|
try: |
|
strength = float(parts[0].split("(")[1].rstrip(")")) |
|
except ValueError: |
|
strength = 1.0 |
|
|
|
|
|
try: |
|
count = float(parts[1]) |
|
|
|
if count == int(count): |
|
count = int(count) |
|
except ValueError: |
|
count = 1 |
|
|
|
beat_info.update({ |
|
"type": beat_type, |
|
"count": count, |
|
"strength": strength |
|
}) |
|
|
|
|
|
elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]): |
|
beat_type = beat[0] |
|
|
|
|
|
try: |
|
count_str = beat[1:] |
|
count = float(count_str) |
|
if count == int(count): |
|
count = int(count) |
|
except ValueError: |
|
count = 1 |
|
|
|
beat_info.update({ |
|
"type": beat_type, |
|
"count": count |
|
}) |
|
|
|
|
|
else: |
|
try: |
|
count = float(beat) |
|
if count == int(count): |
|
count = int(count) |
|
beat_info["count"] = count |
|
except ValueError: |
|
pass |
|
|
|
beats_info.append(beat_info) |
|
total_expected += beat_info["count"] |
|
|
|
|
|
phrase_diff = abs(actual_count - total_expected) |
|
|
|
|
|
expected_ratio = 0.15 if total_expected > 10 else 0.25 |
|
phrase_threshold = max(1, round(total_expected * expected_ratio)) |
|
|
|
|
|
if phrase_diff < best_match_diff: |
|
best_match_diff = phrase_diff |
|
best_match_phrase = phrase |
|
best_phrase_beats = beats_info |
|
|
|
|
|
else: |
|
try: |
|
total_expected = float(phrase) |
|
phrase_diff = abs(actual_count - total_expected) |
|
if phrase_diff < best_match_diff: |
|
best_match_diff = phrase_diff |
|
best_match_phrase = phrase |
|
best_phrase_beats = [{"count": total_expected}] |
|
except ValueError: |
|
pass |
|
|
|
|
|
if best_match_phrase and best_phrase_beats: |
|
total_expected = sum(beat["count"] for beat in best_phrase_beats) |
|
|
|
|
|
expected_ratio = 0.15 if total_expected > 10 else 0.25 |
|
threshold = max(1, round(total_expected * expected_ratio)) |
|
|
|
|
|
if total_expected > 0 and best_match_diff > threshold: |
|
verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}") |
|
total_mismatch_count += 1 |
|
|
|
|
|
words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) |
|
|
|
|
|
word_analysis = [] |
|
cumulative_syllables = 0 |
|
|
|
for word in words: |
|
syllable_count = count_syllables_for_word(word) |
|
|
|
|
|
stress_pattern = get_word_stress(word) |
|
|
|
word_analysis.append({ |
|
"word": word, |
|
"syllables": syllable_count, |
|
"stress_pattern": stress_pattern, |
|
"position": cumulative_syllables |
|
}) |
|
|
|
cumulative_syllables += syllable_count |
|
|
|
|
|
if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b): |
|
|
|
strong_positions = [] |
|
current_pos = 0 |
|
|
|
for beat in best_phrase_beats: |
|
if beat.get("type") == "S": |
|
|
|
strong_positions.append(current_pos) |
|
current_pos += beat.get("count", 1) |
|
|
|
|
|
alignment_issues = [] |
|
aligned_stress_count = 0 |
|
total_stress_positions = len(strong_positions) |
|
|
|
for pos in strong_positions: |
|
|
|
misaligned_word = None |
|
|
|
for word_info in word_analysis: |
|
word_start = word_info["position"] |
|
word_end = word_start + word_info["syllables"] |
|
|
|
if word_start <= pos < word_end: |
|
|
|
syllable_in_word = pos - word_start |
|
|
|
|
|
stress = word_info["stress_pattern"] |
|
|
|
|
|
if stress and syllable_in_word < len(stress): |
|
if stress[syllable_in_word] == '1': |
|
|
|
aligned_stress_count += 1 |
|
else: |
|
|
|
misaligned_word = word_info["word"] |
|
alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)") |
|
stress_misalignments.append({ |
|
"line": i+1, |
|
"word": word_info["word"], |
|
"position": pos, |
|
"suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word) |
|
}) |
|
break |
|
|
|
|
|
alignment_percentage = 0 |
|
if total_stress_positions > 0: |
|
alignment_percentage = (aligned_stress_count / total_stress_positions) * 100 |
|
|
|
|
|
verification_notes.append(f" → Stress alignment: {alignment_percentage:.1f}% ({aligned_stress_count}/{total_stress_positions})") |
|
|
|
if alignment_issues: |
|
verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}") |
|
|
|
|
|
alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis) |
|
if alignment_map: |
|
detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}") |
|
else: |
|
|
|
verification_notes.append(f"Line {i+1}: Unable to find matching template pattern") |
|
|
|
|
|
if second_level_templates: |
|
verification_notes.append("\n=== SECOND-LEVEL VERIFICATION ===\n") |
|
|
|
|
|
for i, template in enumerate(second_level_templates): |
|
if i >= len(lines): |
|
break |
|
|
|
line = lines[i] |
|
|
|
|
|
if line.startswith('[') and ']' in line: |
|
continue |
|
|
|
actual_count = count_syllables(line) |
|
|
|
|
|
total_expected = 0 |
|
beat_patterns = [] |
|
|
|
|
|
if isinstance(template, str) and "-" in template: |
|
for beat in template.split("-"): |
|
if ":" in beat: |
|
try: |
|
count_part = beat.split(":")[1] |
|
count = float(count_part) |
|
total_expected += count |
|
|
|
|
|
beat_type = beat.split("(")[0] if "(" in beat else beat[0] |
|
beat_patterns.append((beat_type, count)) |
|
except (IndexError, ValueError): |
|
pass |
|
|
|
|
|
if total_expected > 0: |
|
|
|
expected_ratio = 0.2 |
|
threshold = max(0.5, round(total_expected * expected_ratio)) |
|
|
|
difference = abs(actual_count - total_expected) |
|
|
|
if difference > threshold: |
|
verification_notes.append(f"Second {i+1}: Expected {total_expected} syllables, got {actual_count}") |
|
total_mismatch_count += 1 |
|
|
|
|
|
words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) |
|
word_analysis = [] |
|
cumulative_syllables = 0 |
|
|
|
for word in words: |
|
syllable_count = count_syllables_for_word(word) |
|
stress_pattern = get_word_stress(word) |
|
|
|
word_analysis.append({ |
|
"word": word, |
|
"syllables": syllable_count, |
|
"stress_pattern": stress_pattern, |
|
"position": cumulative_syllables |
|
}) |
|
|
|
cumulative_syllables += syllable_count |
|
|
|
|
|
if beat_patterns: |
|
strong_positions = [] |
|
current_pos = 0 |
|
|
|
for beat_type, count in beat_patterns: |
|
if beat_type == "S": |
|
strong_positions.append(current_pos) |
|
current_pos += count |
|
|
|
|
|
for pos in strong_positions: |
|
for word_info in word_analysis: |
|
word_start = word_info["position"] |
|
word_end = word_start + word_info["syllables"] |
|
|
|
if word_start <= pos < word_end: |
|
|
|
syllable_in_word = int(pos - word_start) |
|
stress = word_info["stress_pattern"] |
|
|
|
if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': |
|
verification_notes.append(f" → In second {i+1}, '{word_info['word']}' has unstressed syllable on strong beat") |
|
break |
|
|
|
|
|
if verification_notes: |
|
lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n" |
|
lyrics += "\n".join(verification_notes) |
|
|
|
if detailed_analysis: |
|
lyrics += "\n\n[Detailed Alignment Analysis:]\n" |
|
lyrics += "\n\n".join(detailed_analysis) |
|
|
|
lyrics += "\n\n[How to fix rhythm mismatches:]\n" |
|
lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n" |
|
lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n" |
|
lyrics += "3. Try using words where natural stress aligns with musical rhythm\n" |
|
|
|
|
|
if stress_misalignments: |
|
lyrics += "\n[Specific word replacement suggestions:]\n" |
|
for issue in stress_misalignments[:5]: |
|
if issue["suggestion"]: |
|
lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n" |
|
|
|
return lyrics |
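
# Usage sketch (assumes templates is a list with one phrase template per lyric
# line; illustrative only):
#   annotated = verify_flexible_syllable_counts(lyrics_text, template_str.split("|"))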
|
|
|
def generate_alignment_visualization(line, beats_info, word_analysis): |
|
"""Generate a visual representation of syllable alignment with beats.""" |
|
if not beats_info or not word_analysis: |
|
return None |
|
|
|
|
|
syllable_breakdown = [] |
|
syllable_stresses = [] |
|
|
|
for word_info in word_analysis: |
|
word = word_info["word"] |
|
syllables = word_info["syllables"] |
|
stress = word_info["stress_pattern"] or "" |
|
|
|
|
|
while len(stress) < syllables: |
|
stress += "0" |
|
|
|
|
|
parts = naive_syllable_split(word, syllables) |
|
|
|
for i, part in enumerate(parts): |
|
syllable_breakdown.append(part) |
|
if i < len(stress): |
|
syllable_stresses.append(stress[i]) |
|
else: |
|
syllable_stresses.append("0") |
|
|
|
|
|
beat_types = [] |
|
current_pos = 0 |
|
|
|
for beat in beats_info: |
|
beat_type = beat.get("type", "-") |
|
count = beat.get("count", 1) |
|
|
|
|
|
if isinstance(count, int): |
|
beat_types.extend([beat_type] * count) |
|
else: |
|
|
|
whole_part = int(count) |
|
frac_part = count - whole_part |
|
|
|
if whole_part > 0: |
|
beat_types.extend([beat_type] * whole_part) |
|
|
|
if frac_part > 0: |
|
beat_types.append(f"{beat_type}½") |
|
|
|
|
|
while len(beat_types) < len(syllable_breakdown): |
|
beat_types.append("-") |
|
|
|
|
|
beat_types = beat_types[:len(syllable_breakdown)] |
|
|
|
|
|
result = [] |
|
|
|
|
|
syllable_display = [] |
|
for i, syllable in enumerate(syllable_breakdown): |
|
if i < len(syllable_stresses) and syllable_stresses[i] == "1": |
|
syllable_display.append(syllable.upper()) |
|
else: |
|
syllable_display.append(syllable.lower()) |
|
|
|
result.append(" - ".join(syllable_display)) |
|
|
|
|
|
beat_indicators = [] |
|
for i, (syllable, beat_type) in enumerate(zip(syllable_stresses, beat_types)): |
|
if beat_type == "S" or beat_type.startswith("S"): |
|
if syllable == "1": |
|
beat_indicators.append("↑") |
|
else: |
|
beat_indicators.append("❌") |
|
elif beat_type == "m" or beat_type.startswith("m"): |
|
beat_indicators.append("•") |
|
elif beat_type == "w" or beat_type.startswith("w"): |
|
beat_indicators.append("·") |
|
else: |
|
beat_indicators.append(" ") |
|
|
|
result.append(" ".join(beat_indicators)) |
|
|
|
|
|
result.append(" - ".join(beat_types)) |
|
|
|
return "\n".join(result) |
|
|
|
@functools.lru_cache(maxsize=256) |
|
def naive_syllable_split(word, syllable_count): |
|
"""Naively split a word into the specified number of syllables, with caching for performance.""" |
|
if syllable_count <= 1: |
|
return [word] |
|
|
|
|
|
vowels = "aeiouy" |
|
consonants = "bcdfghjklmnpqrstvwxz" |
|
|
|
|
|
    # Find candidate split points at vowel/consonant boundaries
    splits = []
    for i in range(1, len(word) - 1):
        if word[i] in consonants and word[i-1] in vowels:
            splits.append(i)
        elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants:
            splits.append(i+1)

    # Deduplicate: both branches above can propose the same position,
    # which would otherwise produce empty syllables
    splits = sorted(set(splits))

    # If there are still too few split points, add arbitrary ones,
    # bailing out once every position is taken (avoids an infinite loop)
    while len(splits) < syllable_count - 1:
        added = False
        for i in range(1, len(word)):
            if i not in splits:
                splits.append(i)
                added = True
                break
        if not added:
            break

    splits.sort()
    splits = splits[:syllable_count - 1]
|
|
|
|
|
result = [] |
|
prev = 0 |
|
for pos in splits: |
|
result.append(word[prev:pos]) |
|
prev = pos |
|
|
|
result.append(word[prev:]) |
|
return result |
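|
# Example (illustrative): naive_syllable_split("music", 2) -> ["mu", "sic"]. |
|
# The split points are heuristic (vowel/consonant boundaries), not |
|
# dictionary-based, so results are approximate for irregular words. |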
|
|
|
def get_stress_aligned_alternatives(word, position_to_stress): |
|
"""Suggest alternative words with proper stress at the required position.""" |
|
|
|
|
|
syllable_count = count_syllables_for_word(word) |
|
|
|
|
|
if syllable_count == 2: |
|
if position_to_stress == 0: |
|
first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing", |
|
"heart-beat", "sun-light", "moon-light", "star-light"] |
|
return ", ".join(first_stress[:3]) |
|
else: |
|
second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE", |
|
"a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"] |
|
return ", ".join(second_stress[:3]) |
|
elif syllable_count == 3: |
|
if position_to_stress == 0: |
|
return "MEM-o-ry, WON-der-ful, BEAU-ti-ful" |
|
elif position_to_stress == 1: |
|
return "a-MAZE-ing, to-GE-ther, for-EV-er" |
|
else: |
|
return "un-der-STAND, o-ver-COME, ne-ver-MORE" |
|
|
|
|
|
return f"a word with stress on syllable {position_to_stress + 1}" |
|
|
|
def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyrics_requirements=None): |
|
""" |
|
Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment. |
|
|
|
This version builds syllable templates from the beat analysis, formats them into |
|
the LLM prompt, verifies the generated lyrics against those templates, and runs |
|
an optional refinement pass when misalignments are found. |
|
|
|
Parameters: |
|
genre: Musical genre of the audio |
|
duration: Duration of the audio in seconds |
|
emotion_results: Dictionary containing emotional analysis results |
|
song_structure: Optional dictionary containing song structure analysis |
|
lyrics_requirements: Optional user-provided requirements for the lyrics |
|
|
|
Returns: |
|
Generated lyrics aligned with the rhythm patterns of the music |
|
""" |
|
|
|
def is_safe_dict_access(obj, key): |
|
"""Safe dictionary key access with type checking""" |
|
if not isinstance(obj, dict): |
|
print(f"WARNING: Attempted to access key '{key}' on non-dictionary object of type {type(obj)}") |
|
return False |
|
return key in obj |
|
|
|
|
|
if not isinstance(emotion_results, dict): |
|
emotion_results = { |
|
"emotion_analysis": {"primary_emotion": "Unknown"}, |
|
"theme_analysis": {"primary_theme": "Unknown"}, |
|
"rhythm_analysis": {"tempo": 0}, |
|
"tonal_analysis": {"key": "Unknown", "mode": ""}, |
|
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} |
|
} |
|
|
|
|
|
if song_structure is not None and not isinstance(song_structure, dict): |
|
print(f"WARNING: song_structure is not a dict, it's {type(song_structure)}") |
|
song_structure = None |
|
|
|
print(f"DEBUG: Starting generate_lyrics with genre={genre}, duration={duration}") |
|
print(f"DEBUG: Type of song_structure={type(song_structure)}") |
|
print(f"DEBUG: Type of emotion_results={type(emotion_results)}") |
|
|
|
|
|
def safe_dict_get(d, key, default=None): |
|
"""Safely get a value from a dictionary, handling non-dictionary objects.""" |
|
if not isinstance(d, dict): |
|
print(f"WARNING: Attempted to access key '{key}' in non-dictionary object of type {type(d)}") |
|
return default |
|
return d.get(key, default) |
|
|
|
|
|
primary_emotion = safe_dict_get(safe_dict_get(emotion_results, "emotion_analysis", {}), "primary_emotion", "Unknown") |
|
primary_theme = safe_dict_get(safe_dict_get(emotion_results, "theme_analysis", {}), "primary_theme", "Unknown") |
|
|
|
|
|
try: |
|
tempo = float(safe_dict_get(safe_dict_get(emotion_results, "rhythm_analysis", {}), "tempo", 0.0)) |
|
except (ValueError, TypeError): |
|
tempo = 0.0 |
|
|
|
key = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "key", "Unknown") |
|
mode = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "mode", "") |
|
|
|
|
|
syllable_guidance = "" |
|
templates_for_verification = [] |
|
|
|
|
|
structure_visualization = "=== MUSIC-LYRICS STRUCTURE MATCHING ===\n\n" |
|
structure_visualization += f"Song Duration: {duration:.1f} seconds\n" |
|
structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n" |
|
|
|
|
|
if song_structure and is_safe_dict_access(song_structure, "second_level") and is_safe_dict_access(song_structure.get("second_level", {}), "templates"): |
|
print(f"DEBUG: Using second-level templates") |
|
second_level_templates = song_structure.get("second_level", {}).get("templates", []) |
|
|
|
|
|
second_level_guidance = "\nSECOND-BY-SECOND RHYTHM INSTRUCTIONS:\n" |
|
second_level_guidance += "Each line below corresponds to ONE SECOND of audio. Follow these rhythm patterns EXACTLY:\n\n" |
|
|
|
|
|
formatted_second_templates = [] |
|
for i, template in enumerate(second_level_templates): |
|
if i < min(60, len(second_level_templates)): |
|
formatted_template = format_syllable_templates_for_prompt(template, arrow="→", line_wrap=0) |
|
formatted_second_templates.append(f"Second {i+1}: {formatted_template}") |
|
|
|
second_level_guidance += "\n".join(formatted_second_templates) |
|
|
|
|
|
second_level_guidance += "\n\nCRITICAL: Create ONE LINE of lyrics for EACH SECOND, following the exact rhythm pattern." |
|
second_level_guidance += "\nIf a second has no beats, use it for a breath or pause in the lyrics." |
|
second_level_guidance += "\nThe first line of your lyrics MUST match Second 1, the second line matches Second 2, and so on." |
|
|
|
|
|
syllable_guidance = second_level_guidance |
|
|
|
|
|
templates_for_verification = second_level_templates |
|
|
|
elif song_structure: |
|
print(f"DEBUG: Checking flexible structure") |
|
|
|
if is_safe_dict_access(song_structure, "flexible_structure"): |
|
print(f"DEBUG: Using flexible structure") |
|
flexible = song_structure.get("flexible_structure", {}) |
|
if is_safe_dict_access(flexible, "segments") and len(flexible.get("segments", [])) > 0: |
|
print(f"DEBUG: Found segments in flexible structure") |
|
|
|
segments = flexible.get("segments", []) |
|
|
|
|
|
structure_visualization += f"Total segments: {len(segments)}\n" |
|
structure_visualization += "Each segment represents one musical phrase for which you should write ONE line of lyrics.\n\n" |
|
|
|
|
|
enhanced_templates = [] |
|
|
|
for i, segment in enumerate(segments): |
|
if i < 30: |
|
|
|
segment_start = segment["start"] |
|
segment_end = segment["end"] |
|
|
|
|
|
structure_visualization += f"Segment {i+1}: {segment_start:.1f}s - {segment_end:.1f}s (duration: {segment_end-segment_start:.1f}s)\n" |
|
|
|
|
|
segment_beats = [] |
|
|
|
|
|
print(f"DEBUG: Checking beat_times in flexible structure") |
|
if is_safe_dict_access(flexible, "beats") and is_safe_dict_access(flexible.get("beats", {}), "beat_times"): |
|
beat_times = flexible.get("beats", {}).get("beat_times", []) |
|
if isinstance(beat_times, list): |
|
beat_strengths = flexible.get("beats", {}).get("beat_strengths", []) |
|
|
|
for j, beat_time in enumerate(beat_times): |
|
if segment_start <= beat_time < segment_end: |
|
|
|
segment_beats.append(j) |
|
|
|
|
|
segment_beats_info = { |
|
"beat_times": [beat_times[j] for j in segment_beats if j < len(beat_times)], |
|
"tempo": flexible.get("beats", {}).get("tempo", 120) |
|
} |
|
|
|
if beat_strengths and isinstance(beat_strengths, list): |
|
segment_beats_info["beat_strengths"] = [ |
|
beat_strengths[j] for j in segment_beats |
|
if j < len(beat_strengths) |
|
] |
|
|
|
|
|
segment_beats_info["phrases"] = [segment_beats] |
|
|
|
|
|
print(f"DEBUG: Creating flexible syllable template for segment {i+1}") |
|
enhanced_template = create_flexible_syllable_templates( |
|
segment_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if i == 0 else 'default' |
|
) |
|
enhanced_templates.append(enhanced_template) |
|
templates_for_verification.append(enhanced_template) |
|
|
|
|
|
structure_visualization += f" Template: {enhanced_template}\n" |
|
else: |
|
print(f"DEBUG: beat_times is not a list, it's {type(beat_times)}") |
|
else: |
|
print(f"DEBUG: beats or beat_times not found in flexible structure") |
|
|
|
continue |
|
|
|
|
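# NOTE: pattern_groups below clusters segments that share the same rhythm |
|
# pattern; it is computed for grouping purposes but is not consumed |
|
# elsewhere in this function. |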
|
pattern_groups = {} |
|
|
|
for i, template in enumerate(enhanced_templates): |
|
|
|
simple_pattern = template.replace("(", "").replace(")", "").replace(":", "") |
|
|
|
|
|
found_match = False |
|
for group, patterns in pattern_groups.items(): |
|
if any(simple_pattern == p.replace("(", "").replace(")", "").replace(":", "") for p in patterns): |
|
pattern_groups[group].append(template) |
|
found_match = True |
|
break |
|
|
|
if not found_match: |
|
|
|
group_name = f"Group_{len(pattern_groups) + 1}" |
|
pattern_groups[group_name] = [template] |
|
|
|
|
|
syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n" |
|
syllable_guidance += "Each line of lyrics MUST match exactly with one musical phrase/segment.\n" |
|
syllable_guidance += "Follow these rhythm patterns for each line (STRONG beats need stressed syllables):\n\n" |
|
|
|
|
|
formatted_templates = [] |
|
for i, template in enumerate(enhanced_templates): |
|
formatted_templates.append(format_syllable_templates_for_prompt([template], arrow="→", line_wrap=8)) |
|
|
|
syllable_guidance += "\n".join(formatted_templates) |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
# Derive section line counts from the segment templates (working |
|
# assumption: one lyric line per detected segment). |
|
total_lines = len(enhanced_templates) |
|
verse_lines = total_lines // 2 |
|
chorus_lines = total_lines // 3 |
|
bridge_lines = 0 |
|
|
|
|
|
elif song_structure and is_safe_dict_access(song_structure, "syllables") and song_structure.get("syllables"): |
|
syllable_guidance = "RHYTHM PATTERN INSTRUCTIONS:\n" |
|
syllable_guidance += "Follow these syllable patterns for each section. Each line should match ONE phrase:\n\n" |
|
|
|
|
|
section_counts = {"verse": 0, "chorus": 0, "bridge": 0, "intro": 0, "outro": 0} |
|
|
|
for section in song_structure.get("syllables", []): |
|
if not isinstance(section, dict): |
|
continue |
|
|
|
section_type = section.get("type", "verse") |
|
section_counts[section_type] = section_counts.get(section_type, 0) + 1 |
|
|
|
if is_safe_dict_access(section, "syllable_template"): |
|
|
|
if is_safe_dict_access(song_structure, "beats") and is_safe_dict_access(song_structure.get("beats", {}), "beat_times"): |
|
section_beats_info = { |
|
"beat_times": [beat for beat in song_structure.get("beats", {}).get("beat_times", []) |
|
if section.get("start", 0) <= beat < section.get("end", 0)], |
|
"tempo": song_structure.get("beats", {}).get("tempo", 120) |
|
} |
|
|
|
if is_safe_dict_access(song_structure.get("beats", {}), "beat_strengths"): |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(song_structure.get("beats", {}).get("beat_strengths", [])) |
|
if i < len(song_structure.get("beats", {}).get("beat_times", [])) and |
|
section.get("start", 0) <= song_structure.get("beats", {}).get("beat_times", [])[i] < section.get("end", 0) |
|
] |
|
|
|
|
|
section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] |
|
|
|
|
|
section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] |
|
|
|
|
|
enhanced_template = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if section['type'] == 'verse' else 'default' |
|
) |
|
|
|
syllable_guidance += f"[{section['type'].capitalize()}]:\n" |
|
syllable_guidance += format_syllable_templates_for_prompt( |
|
enhanced_template, |
|
arrow="→", |
|
line_wrap=6 |
|
) + "\n\n" |
|
templates_for_verification.append(section) |
|
elif "syllable_count" in section: |
|
syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n" |
|
|
|
|
|
structure_visualization += "Using traditional section-based structure:\n" |
|
for section_type, count in section_counts.items(): |
|
if count > 0: |
|
structure_visualization += f"{section_type.capitalize()}: {count} sections\n" |
|
|
|
|
|
verse_lines = max(2, section_counts.get("verse", 0) * 4) |
|
chorus_lines = max(2, section_counts.get("chorus", 0) * 4) |
|
bridge_lines = max(0, section_counts.get("bridge", 0) * 2) |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
if not syllable_guidance: |
|
syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n" |
|
syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n" |
|
syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n" |
|
syllable_guidance += "3. Use appropriate syllable counts based on tempo:\n" |
|
syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n" |
|
syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n" |
|
syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n" |
|
|
|
|
|
structure_visualization += "Using estimated structure (no detailed analysis available):\n" |
|
|
|
|
|
estimated_lines = max(8, int(duration / 10)) |
|
structure_visualization += f"Estimated total lines: {estimated_lines}\n" |
|
|
|
|
|
verse_lines = estimated_lines // 2 |
|
chorus_lines = estimated_lines // 3 |
|
bridge_lines = estimated_lines // 6 if estimated_lines > 12 else 0 |
|
|
|
|
|
use_sections = True |
|
|
|
|
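# Notation used in the examples below, matching the templates elsewhere in |
|
# this file: each token is <beat>(<strength>):<syllables>, where S/m/w mark |
|
# strong/medium/weak beats, strength is 0-1, and the trailing number is how |
|
# many syllables that beat should carry. |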
|
syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n" |
|
syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n" |
|
syllable_guidance += "Lyric: 'HEAR the MU-sic PLAY'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S w m w <- BEAT TYPE\n\n" |
|
|
|
syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n" |
|
syllable_guidance += "Lyric: 'DANC-ing TO the RHYTHM of LOVE'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S S w S w w <- BEAT TYPE\n\n" |
|
|
|
syllable_guidance += "Pattern: S(0.92):1 → m(0.65):2 → S(0.88):1 → w(0.35):1\n" |
|
syllable_guidance += "Lyric: 'TIME keeps FLOW-ing ON and ON'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S m m S w w <- BEAT TYPE\n\n" |
|
|
|
|
|
genre_guidance = "" |
|
if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n" |
|
genre_guidance += "- Use more syllables per beat for rapid-fire sections\n" |
|
genre_guidance += "- Create internal rhymes within lines, not just at line endings\n" |
|
genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n" |
|
elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n" |
|
genre_guidance += "- Use repetitive phrases that build and release tension\n" |
|
genre_guidance += "- Match syllables precisely to the beat grid\n" |
|
genre_guidance += "- Use short, percussive words on strong beats\n" |
|
elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n" |
|
genre_guidance += "- Use powerful, emotive words on downbeats\n" |
|
genre_guidance += "- Create contrast between verse and chorus energy levels\n" |
|
genre_guidance += "- Emphasize hooks with simple, memorable phrases\n" |
|
elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n" |
|
genre_guidance += "- Focus on storytelling with clear narrative flow\n" |
|
genre_guidance += "- Use natural speech patterns that flow conversationally\n" |
|
genre_guidance += "- Place important words at the start of phrases\n" |
|
|
|
|
|
syllable_guidance += genre_guidance |
|
|
|
|
|
syllable_guidance_text = syllable_guidance |
|
|
|
|
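# Choose the prompt mode: per-second templates take priority, then |
|
# phrase-level segments; otherwise fall back to section-based structure. |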
|
use_sections = True |
|
use_second_level = False |
|
|
|
if song_structure and "second_level" in song_structure and song_structure["second_level"]: |
|
use_second_level = True |
|
|
|
if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: |
|
templates = song_structure["second_level"]["templates"] |
|
if isinstance(templates, list) and len(templates) > 0: |
|
use_sections = False |
|
elif song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
|
|
if "segments" in song_structure["flexible_structure"]: |
|
segments = song_structure["flexible_structure"]["segments"] |
|
if len(segments) > 4: |
|
use_sections = False |
|
|
|
|
|
if use_second_level: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. |
|
|
|
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. |
|
|
|
Music analysis has detected the following qualities: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Be completely original |
|
- Maintain a consistent theme throughout |
|
- Match the audio segment duration of {duration:.1f} seconds |
|
|
|
|
""" |
|
|
|
|
|
if lyrics_requirements and lyrics_requirements.strip(): |
|
content += f""" |
|
USER REQUIREMENTS: |
|
{lyrics_requirements.strip()} |
|
|
|
The lyrics MUST incorporate these user requirements while still following the rhythm patterns. |
|
""" |
|
|
|
content += """ |
|
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. |
|
|
|
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
elif use_sections: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. |
|
|
|
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. |
|
|
|
Music analysis has detected the following qualities in the music: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Follow the structure patterns provided above |
|
- Be completely original |
|
- Match the song duration of {duration:.1f} seconds |
|
""" |
|
|
|
|
|
if lyrics_requirements and lyrics_requirements.strip(): |
|
content += f""" |
|
USER REQUIREMENTS: |
|
{lyrics_requirements.strip()} |
|
|
|
The lyrics MUST incorporate these user requirements while still following the rhythm patterns. |
|
""" |
|
|
|
content += """ |
|
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
else: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. |
|
|
|
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. |
|
|
|
Music analysis has detected the following qualities: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Be completely original |
|
- Maintain a consistent theme throughout |
|
- Match the audio segment duration of {duration:.1f} seconds |
|
""" |
|
|
|
|
|
if lyrics_requirements and lyrics_requirements.strip(): |
|
content += f""" |
|
USER REQUIREMENTS: |
|
{lyrics_requirements.strip()} |
|
|
|
The lyrics MUST incorporate these user requirements while still following the rhythm patterns. |
|
""" |
|
|
|
content += """ |
|
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above. |
|
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. |
|
|
|
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
|
|
|
|
messages = [ |
|
{"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns EXACTLY. Be extremely concise - use only the EXACT number of syllables specified for each line. For short phrases (1 second or less), use just 2-3 MAXIMUM syllables. Include lyrics for EVERY musical section - do not leave any section empty. Use one-syllable words whenever possible for better singability. Avoid complex vocabulary. For all beat patterns, use fewer syllables than you think you need. Start with the lyrics immediately without any explanation or thinking."}, |
|
{"role": "user", "content": content} |
|
] |
|
|
|
|
|
text = llm_tokenizer.apply_chat_template( |
|
messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
|
|
model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
generation_params = { |
|
"do_sample": True, |
|
"temperature": 0.5, |
|
"top_p": 0.85, |
|
"top_k": 50, |
|
"repetition_penalty": 1.2, |
|
"max_new_tokens": 2048, |
|
"num_return_sequences": 1 |
|
} |
|
|
|
|
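# Best-effort guard: a standard transformers GenerationConfig does not expose |
|
# a stopping_criteria attribute (and plain strings are not StoppingCriteria |
|
# objects), so on typical installs this block is skipped. |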
|
if hasattr(llm_model.generation_config, "stopping_criteria"): |
|
thinking_stops = ["Let me think", "First, I need to", "Let's analyze", "I'll approach this", "Step 1:", "To start,"] |
|
for stop in thinking_stops: |
|
if stop not in llm_model.generation_config.stopping_criteria: |
|
llm_model.generation_config.stopping_criteria.append(stop) |
|
|
|
|
|
generated_ids = llm_model.generate( |
|
**model_inputs, |
|
**generation_params |
|
) |
|
|
|
|
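# Slice off the prompt tokens so only the newly generated text is decoded. |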
|
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() |
|
|
|
|
|
lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
|
|
if "<thinking>" in lyrics and "</thinking>" in lyrics: |
|
lyrics = lyrics.split("</thinking>")[1].strip() |
|
|
|
|
|
|
|
|
|
if lyrics: |
|
|
|
cleaned_lines = [] |
|
for line in lyrics.split('\n'): |
|
if not line.strip().startswith('**') and 'alignment:' not in line.lower(): |
|
cleaned_lines.append(line) |
|
lyrics = '\n'.join(cleaned_lines) |
|
|
|
|
|
max_reasonable_line_length = 80 |
|
final_lines = [] |
|
for line in lyrics.split('\n'): |
|
if len(line) <= max_reasonable_line_length or '[' in line or ']' in line: |
|
final_lines.append(line) |
|
lyrics = '\n'.join(final_lines) |
|
thinking_markers = [ |
|
"<think>", "</think>", |
|
"[thinking]", "[/thinking]", |
|
"I'll think step by step:", |
|
"First, I need to understand", |
|
"Let me think about", |
|
"Let's tackle this query", |
|
"Okay, let's tackle this query", |
|
"First, I need to understand the requirements", |
|
"Looking at the rhythm patterns" |
|
] |
|
|
|
|
|
for marker in thinking_markers: |
|
if marker in lyrics: |
|
parts = lyrics.split(marker) |
|
if len(parts) > 1: |
|
lyrics = parts[-1].strip() |
|
|
|
|
|
analytical_patterns = [ |
|
"Let me analyze", |
|
"I need to understand", |
|
"The tempo is", |
|
"First, let's look at", |
|
"Wait, maybe", |
|
"Considering the emotional tone", |
|
"Starting with the first line", |
|
"Let me check the examples" |
|
] |
|
|
|
|
|
for pattern in analytical_patterns: |
|
if lyrics.startswith(pattern): |
|
|
|
lyrics_markers = [ |
|
"\n\n[Verse", |
|
"\n\n[Chorus", |
|
"\n\nVerse", |
|
"\n\nChorus", |
|
"\n\n[Verse 1]", |
|
"\n\n[Intro]" |
|
] |
|
|
|
for marker in lyrics_markers: |
|
if marker in lyrics: |
|
lyrics = lyrics[lyrics.index(marker):].strip() |
|
break |
|
|
|
|
|
|
|
if len(lyrics.split()) > 100 and "\n\n" in lyrics: |
|
paragraphs = lyrics.split("\n\n") |
|
for i, paragraph in enumerate(paragraphs): |
|
|
|
if any(marker in paragraph for marker in ["[Verse", "[Chorus", "Verse 1", "Chorus:"]): |
|
lyrics = "\n\n".join(paragraphs[i:]) |
|
break |
|
|
|
|
|
lines = lyrics.split('\n') |
|
clean_lines = [] |
|
lyrics_started = False |
|
|
|
for line in lines: |
|
|
|
if not lyrics_started: |
|
if (line.strip().startswith('[') and ']' in line) or not any(thinking in line.lower() for thinking in ["i think", "let me", "maybe", "perhaps", "alternatively", "checking"]): |
|
lyrics_started = True |
|
|
|
if lyrics_started: |
|
clean_lines.append(line) |
|
|
|
|
|
if clean_lines: |
|
lyrics = '\n'.join(clean_lines) |
|
|
|
|
|
second_level_verification = None |
|
if song_structure and "second_level" in song_structure and song_structure["second_level"]: |
|
if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: |
|
second_level_verification = song_structure["second_level"]["templates"] |
|
if not isinstance(second_level_verification, list): |
|
second_level_verification = None |
|
|
|
|
|
if song_structure and "second_level" in song_structure and song_structure["second_level"]: |
|
if "templates" in song_structure["second_level"] and isinstance(song_structure["second_level"]["templates"], list): |
|
|
|
if lyrics: |
|
lines = [line.strip() for line in lyrics.split('\n') if line.strip()] |
|
|
|
|
|
second_count = len(song_structure["second_level"]["templates"]) |
|
if 0 < len(lines) < second_count: |
|
|
|
distributed_lines = [] |
|
for i in range(second_count): |
|
distributed_lines.append(lines[i % len(lines)]) |
|
|
|
|
|
lyrics = '\n'.join(distributed_lines) |
|
|
|
|
|
if templates_for_verification: |
|
|
|
|
|
if isinstance(templates_for_verification, list): |
|
safe_templates = [] |
|
for template in templates_for_verification: |
|
if isinstance(template, dict): |
|
processed_template = {} |
|
for k, v in template.items(): |
|
if isinstance(v, np.ndarray): |
|
if v.size == 1: |
|
processed_template[k] = float(v.item()) |
|
else: |
|
processed_template[k] = [float(x) if isinstance(x, np.number) else x for x in v] |
|
elif isinstance(v, np.number): |
|
processed_template[k] = float(v) |
|
else: |
|
processed_template[k] = v |
|
safe_templates.append(processed_template) |
|
else: |
|
safe_templates.append(template) |
|
else: |
|
safe_templates = templates_for_verification |
|
|
|
|
|
try: |
|
print(f"DEBUG: Calling verify_flexible_syllable_counts") |
|
print(f"DEBUG: Type of lyrics: {type(lyrics)}") |
|
print(f"DEBUG: Type of safe_templates: {type(safe_templates)}") |
|
print(f"DEBUG: Type of second_level_verification: {type(second_level_verification)}") |
|
|
|
verified_lyrics = verify_flexible_syllable_counts(lyrics, safe_templates, second_level_verification) |
|
print(f"DEBUG: Type of verified_lyrics: {type(verified_lyrics)}") |
|
|
|
except Exception as e: |
|
print(f"ERROR in verify_flexible_syllable_counts: {str(e)}") |
|
|
|
return { |
|
"lyrics": lyrics if isinstance(lyrics, str) else str(lyrics), |
|
"rhythm_analysis": f"Error in rhythm analysis: {str(e)}", |
|
"syllable_analysis": "Error performing syllable analysis", |
|
"prompt_template": "Error generating prompt template" |
|
} |
|
|
|
if isinstance(verified_lyrics, str) and "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics: |
|
|
|
original_lyrics = lyrics.split("[Note:")[0].strip() if isinstance(lyrics, str) else str(lyrics) |
|
|
|
|
|
analysis = verified_lyrics.split("[Note:")[1] if "[Note:" in verified_lyrics else "" |
|
|
|
|
|
if "stress misalignments" in analysis and len(templates_for_verification) > 0: |
|
|
|
refinement_prompt = f""" |
|
You need to fix rhythm issues in these lyrics. Here's the analysis of the problems: |
|
|
|
{analysis} |
|
|
|
Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme. |
|
Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats. |
|
|
|
Original lyrics: |
|
{original_lyrics} |
|
|
|
Improved lyrics with fixed rhythm: |
|
""" |
|
|
|
refinement_messages = [ |
|
{"role": "user", "content": refinement_prompt} |
|
] |
|
|
|
|
|
refinement_text = llm_tokenizer.apply_chat_template( |
|
refinement_messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
try: |
|
|
|
refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
refinement_params = { |
|
"do_sample": True, |
|
"temperature": 0.4, |
|
"top_p": 0.9, |
|
"repetition_penalty": 1.3, |
|
"max_new_tokens": 1024 |
|
} |
|
|
|
refined_ids = llm_model.generate( |
|
**refinement_inputs, |
|
**refinement_params |
|
) |
|
|
|
|
|
refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist() |
|
refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
try: |
|
refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, safe_templates, second_level_verification) |
|
|
|
|
|
if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics: |
|
lyrics = refined_lyrics |
|
elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"): |
|
lyrics = refined_verified_lyrics |
|
else: |
|
lyrics = verified_lyrics |
|
except Exception as e: |
|
print(f"Error in refined lyrics verification: {str(e)}") |
|
lyrics = verified_lyrics |
|
except Exception as e: |
|
print(f"Error in lyrics refinement: {str(e)}") |
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
|
|
|
|
if "[RHYTHM_ANALYSIS_SECTION]" in lyrics: |
|
|
|
parts = lyrics.split("[RHYTHM_ANALYSIS_SECTION]") |
|
clean_lyrics = parts[0].strip() |
|
rhythm_analysis = parts[1].strip() |
|
|
|
|
|
lyrics = clean_lyrics + "\n\n[Note: Rhythm Analysis]\n" + rhythm_analysis |
|
|
|
|
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
|
|
pass |
|
else: |
|
|
|
lyrics = lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected. All syllables align well with the beat pattern." |
|
|
|
|
|
if isinstance(lyrics, str): |
|
|
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] |
|
else: |
|
clean_lyrics = lyrics |
|
rhythm_analysis = "No rhythm analysis available" |
|
|
|
|
|
syllable_analysis = "=== SYLLABLE ANALYSIS ===\n\n" |
|
if templates_for_verification: |
|
syllable_analysis += "Template Analysis:\n" |
|
for i, template in enumerate(templates_for_verification): |
|
if i < min(len(templates_for_verification), 30): |
|
syllable_analysis += f"Line {i+1}:\n" |
|
if isinstance(template, dict): |
|
if "syllable_template" in template: |
|
syllable_analysis += f" Template: {template['syllable_template']}\n" |
|
if "syllable_count" in template: |
|
syllable_analysis += f" Expected syllables: {template['syllable_count']}\n" |
|
elif isinstance(template, str): |
|
syllable_analysis += f" Template: {template}\n" |
|
syllable_analysis += "\n" |
|
|
|
if len(templates_for_verification) > 30: |
|
syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n" |
|
|
|
|
|
if second_level_verification: |
|
syllable_analysis += "\nSecond-Level Template Analysis:\n" |
|
for i, template in enumerate(second_level_verification): |
|
if i < min(len(second_level_verification), 30): |
|
syllable_analysis += f"Second {i+1}: {template}\n" |
|
|
|
if len(second_level_verification) > 30: |
|
syllable_analysis += f"... and {len(second_level_verification) - 30} more seconds\n" |
|
|
|
|
|
syllable_analysis += "\n" + structure_visualization |
|
|
|
|
|
prompt_template = "=== PROMPT TEMPLATE ===\n\n" |
|
prompt_template += "Genre: " + genre + "\n" |
|
prompt_template += f"Duration: {duration:.1f} seconds\n" |
|
prompt_template += f"Tempo: {tempo:.1f} BPM\n" |
|
prompt_template += f"Key: {key} {mode}\n" |
|
prompt_template += f"Primary Emotion: {primary_emotion}\n" |
|
prompt_template += f"Primary Theme: {primary_theme}\n\n" |
|
prompt_template += "Syllable Guidance:\n" + syllable_guidance_text |
|
|
|
|
|
return { |
|
"lyrics": clean_lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template |
|
} |
|
|
|
return { |
|
"lyrics": lyrics, |
|
"rhythm_analysis": "No rhythm analysis available", |
|
"syllable_analysis": "No syllable analysis available", |
|
"prompt_template": "No prompt template available" |
|
} |
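|
# Usage sketch (hypothetical values; requires the LLM loaded above): |
|
# result = generate_lyrics("rock", 45.0, emotion_results, song_structure) |
|
# Every return path yields a dict with the keys "lyrics", "rhythm_analysis", |
|
# "syllable_analysis" and "prompt_template". |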
|
|
|
def detect_voice_activity(audio_file): |
|
""" |
|
Detect segments with voice/singing in audio using pyannote/voice-activity-detection |
|
|
|
Args: |
|
audio_file: Path to audio file |
|
|
|
Returns: |
|
List of dictionaries with start and end times of voice segments |
|
""" |
|
try: |
|
print("Detecting voice activity in audio...") |
|
|
|
hf_token = os.environ.get("HF_TOKEN", None) |
|
|
|
|
|
vad_pipeline = Pipeline.from_pretrained( |
|
"pyannote/voice-activity-detection", |
|
use_auth_token=hf_token |
|
) |
|
|
|
|
|
output = vad_pipeline(audio_file) |
|
|
|
|
|
voice_segments = [] |
|
for speech in output.get_timeline().support(): |
|
voice_segments.append({ |
|
"start": speech.start, |
|
"end": speech.end, |
|
"duration": speech.end - speech.start |
|
}) |
|
|
|
print(f"Detected {len(voice_segments)} voice segments") |
|
return voice_segments |
|
|
|
except Exception as e: |
|
print(f"Error detecting voice activity: {str(e)}") |
|
|
|
return [] |
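|
# Usage sketch (hypothetical path; needs an HF_TOKEN with access to the |
|
# gated pyannote model): |
|
# segments = detect_voice_activity("song.mp3") |
|
# -> [{"start": 12.3, "end": 15.8, "duration": 3.5}, ...], or [] on failure. |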
|
|
|
def process_audio(audio_file, lyrics_requirements=None): |
|
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis.""" |
|
if audio_file is None: |
|
return "Please upload an audio file.", None, None |
|
|
|
try: |
|
print("Step 1/6: Extracting audio features...") |
|
|
|
audio_data = extract_audio_features(audio_file) |
|
|
|
print("Step 2/6: Verifying audio contains music...") |
|
|
|
try: |
|
is_music, ast_results = detect_music(audio_data) |
|
except Exception as e: |
|
print(f"Error in music detection: {str(e)}") |
|
return f"Error in music detection: {str(e)}", None, ast_results |
|
|
|
if not is_music: |
|
return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results |
|
|
|
print("Step 3/6: Detecting voice activity segments...") |
|
|
|
voice_segments = detect_voice_activity(audio_file) |
|
|
|
print("Step 4/6: Classifying music genre...") |
|
|
|
try: |
|
top_genres = classify_genre(audio_data) |
|
|
|
genre_results = format_genre_results(top_genres) |
|
if not isinstance(top_genres, list) or len(top_genres) == 0: |
|
|
|
top_genres = [("rock", 1.0)] |
|
except Exception as e: |
|
print(f"Error in genre classification: {str(e)}") |
|
top_genres = [("rock", 1.0)] |
|
return f"Error in genre classification: {str(e)}", None, ast_results |
|
|
|
|
|
ast_results = ast_results if ast_results else [] |
|
song_structure = None |
|
emotion_results = { |
|
"emotion_analysis": {"primary_emotion": "Unknown"}, |
|
"theme_analysis": {"primary_theme": "Unknown"}, |
|
"rhythm_analysis": {"tempo": 0}, |
|
"tonal_analysis": {"key": "Unknown", "mode": ""}, |
|
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} |
|
} |
|
|
|
print("Step 5/6: Analyzing music emotions, themes, and structure...") |
|
|
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
|
|
|
|
|
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
|
|
beats_info = detect_beats(y, sr) |
|
sections_info = detect_sections(y, sr) |
|
|
|
|
|
segments = [] |
|
|
|
|
|
if voice_segments and len(voice_segments) > 0: |
|
segments = voice_segments |
|
print(f"Using {len(segments)} voice segments for lyrics generation") |
|
|
|
elif sections_info and len(sections_info) > 1: |
|
min_segment_duration = 1.5 |
|
|
|
for section in sections_info: |
|
section_start = section["start"] |
|
section_end = section["end"] |
|
section_duration = section["duration"] |
|
|
|
|
|
if section_duration < min_segment_duration * 1.5: |
|
segments.append({ |
|
"start": section_start, |
|
"end": section_end, |
|
"duration": section_duration |
|
}) |
|
else: |
|
|
|
|
|
ideal_segment_duration = 3.0 |
|
segment_count = max(1, int(section_duration / ideal_segment_duration)) |
|
|
|
|
|
segment_duration = section_duration / segment_count |
|
for i in range(segment_count): |
|
segment_start = section_start + i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end, |
|
"duration": segment_duration |
|
}) |
|
|
|
elif beats_info and len(beats_info["beat_times"]) > 4: |
|
beats = beats_info["beat_times"] |
|
time_signature = beats_info.get("time_signature", 4) |
|
|
|
|
|
measure_size = time_signature |
|
for i in range(0, len(beats), measure_size): |
|
if i + 1 < len(beats): |
|
measure_start = beats[i] |
|
|
|
if i + measure_size < len(beats): |
|
measure_end = beats[i + measure_size] |
|
else: |
|
|
|
if i > 0: |
|
beat_interval = beats[i] - beats[i-1] |
|
measure_end = beats[-1] + (beat_interval * (measure_size - (len(beats) - i))) |
|
else: |
|
measure_end = audio_data["duration"] |
|
|
|
segments.append({ |
|
"start": measure_start, |
|
"end": measure_end |
|
}) |
|
|
|
else: |
|
|
|
segment_duration = 3.0 |
|
total_segments = max(4, int(audio_data["duration"] / segment_duration)) |
|
segment_duration = audio_data["duration"] / total_segments |
|
|
|
for i in range(total_segments): |
|
segment_start = i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end |
|
}) |
|
|
|
|
|
flexible_structure = { |
|
"beats": beats_info, |
|
"segments": segments |
|
} |
|
|
|
|
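# song_structure schema assembled below: |
|
#   beats: global beat analysis from detect_beats |
|
#   sections: section boundaries from detect_sections |
|
#   flexible_structure: {"beats": ..., "segments": [...]} for phrase templates |
|
#   syllables: per-section syllable/template info (filled in below) |
|
#   second_level: {"sec_map", "templates"}, added later when the |
|
#   second-level analysis succeeds |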
|
song_structure = { |
|
"beats": beats_info, |
|
"sections": sections_info, |
|
"flexible_structure": flexible_structure, |
|
"syllables": [] |
|
} |
|
|
|
|
|
for section in sections_info: |
|
|
|
section_has_voice = False |
|
for voice_segment in voice_segments: |
|
|
|
if (section["start"] <= voice_segment["end"] and |
|
section["end"] >= voice_segment["start"]): |
|
section_has_voice = True |
|
break |
|
|
|
|
|
section_beats_info = { |
|
"beat_times": [beat for beat in beats_info["beat_times"] |
|
if section["start"] <= beat < section["end"]], |
|
"tempo": beats_info.get("tempo", 120) |
|
} |
|
if "beat_strengths" in beats_info: |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(beats_info["beat_strengths"]) |
|
if i < len(beats_info["beat_times"]) and |
|
section["start"] <= beats_info["beat_times"][i] < section["end"] |
|
] |
|
|
|
|
|
|
|
syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) if section_has_voice else 0 |
|
|
|
section_info = { |
|
"type": section["type"], |
|
"start": section["start"], |
|
"end": section["end"], |
|
"duration": section["duration"], |
|
"has_voice": section_has_voice, |
|
"syllable_count": syllable_count, |
|
"beat_count": len(section_beats_info["beat_times"]) |
|
} |
|
|
|
|
|
if len(section_beats_info["beat_times"]) >= 2 and section_has_voice: |
|
|
|
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): |
|
genre_name = top_genres[0][0] |
|
else: |
|
genre_name = "unknown" |
|
|
|
section_info["syllable_template"] = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=genre_name |
|
) |
|
|
|
song_structure["syllables"].append(section_info) |
|
|
|
|
|
try: |
|
|
|
subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) |
|
|
|
|
|
sec_map = map_beats_to_seconds( |
|
subbeat_info["subbeat_times"], |
|
audio_data["duration"] |
|
) |
|
|
|
|
|
|
|
genre_name = "unknown" |
|
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): |
|
genre_name = top_genres[0][0] |
|
|
|
second_level_templates = create_second_level_templates( |
|
sec_map, |
|
subbeat_info["tempo"], |
|
genre_name |
|
) |
|
|
|
|
|
song_structure["second_level"] = { |
|
"sec_map": sec_map, |
|
"templates": second_level_templates |
|
} |
|
|
|
except Exception as e: |
|
print(f"Error in second-level beat analysis: {str(e)}") |
|
|
|
|
|
except Exception as e: |
|
print(f"Error analyzing song structure: {str(e)}") |
|
|
|
|
|
print("Step 6/6: Generating rhythmically aligned lyrics...") |
|
|
|
try: |
|
|
|
primary_genre = "unknown" |
|
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): |
|
primary_genre, _ = top_genres[0] |
|
|
|
|
|
sanitized_song_structure = None |
|
if song_structure: |
|
sanitized_song_structure = {} |
|
|
|
|
|
if "beats" in song_structure and isinstance(song_structure["beats"], dict): |
|
sanitized_song_structure["beats"] = song_structure["beats"] |
|
|
|
|
|
if "sections" in song_structure and isinstance(song_structure["sections"], list): |
|
sanitized_song_structure["sections"] = song_structure["sections"] |
|
|
|
|
|
if "flexible_structure" in song_structure and isinstance(song_structure["flexible_structure"], dict): |
|
flex_struct = song_structure["flexible_structure"] |
|
sanitized_flex = {} |
|
|
|
|
|
if "segments" in flex_struct and isinstance(flex_struct["segments"], list): |
|
sanitized_flex["segments"] = flex_struct["segments"] |
|
|
|
|
|
if "beats" in flex_struct and isinstance(flex_struct["beats"], dict): |
|
sanitized_flex["beats"] = flex_struct["beats"] |
|
|
|
sanitized_song_structure["flexible_structure"] = sanitized_flex |
|
|
|
|
|
if "syllables" in song_structure and isinstance(song_structure["syllables"], list): |
|
sanitized_song_structure["syllables"] = song_structure["syllables"] |
|
|
|
|
|
if "second_level" in song_structure and isinstance(song_structure["second_level"], dict): |
|
second_level = song_structure["second_level"] |
|
sanitized_second = {} |
|
|
|
if "templates" in second_level and isinstance(second_level["templates"], list): |
|
sanitized_second["templates"] = second_level["templates"] |
|
|
|
if "sec_map" in second_level and isinstance(second_level["sec_map"], list): |
|
sanitized_second["sec_map"] = second_level["sec_map"] |
|
|
|
sanitized_song_structure["second_level"] = sanitized_second |
|
|
|
try: |
|
print("Calling generate_lyrics function...") |
|
|
|
lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, |
|
sanitized_song_structure, lyrics_requirements) |
|
print(f"Type of lyrics_result: {type(lyrics_result)}") |
|
|
|
|
|
if isinstance(lyrics_result, dict) and "lyrics" in lyrics_result: |
|
lyrics = lyrics_result.get("lyrics", "No lyrics generated") |
|
rhythm_analysis = lyrics_result.get("rhythm_analysis", "No rhythm analysis available") |
|
syllable_analysis = lyrics_result.get("syllable_analysis", "No syllable analysis available") |
|
prompt_template = lyrics_result.get("prompt_template", "No prompt template available") |
|
else: |
|
|
|
lyrics = str(lyrics_result) if lyrics_result is not None else "No lyrics generated" |
|
rhythm_analysis = "No detailed rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
except Exception as inner_e: |
|
print(f"Inner error in lyrics generation: {str(inner_e)}") |
|
|
|
lyrics = f"Error generating lyrics: {str(inner_e)}" |
|
rhythm_analysis = "Error in rhythm analysis" |
|
syllable_analysis = "Error in syllable analysis" |
|
prompt_template = "Error in prompt template generation" |
|
|
|
except Exception as e: |
|
print(f"Outer error in lyrics generation: {str(e)}") |
|
lyrics = f"Error generating lyrics: {str(e)}" |
|
rhythm_analysis = "No rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
results = { |
|
"genre_results": genre_results, |
|
"lyrics": lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template, |
|
"ast_results": ast_results, |
|
"voice_segments": voice_segments |
|
} |
|
|
|
return results |
|
|
|
except Exception as e: |
|
error_msg = f"Error processing audio: {str(e)}" |
|
print(error_msg) |
|
return error_msg, None, [] |
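|
# Usage sketch (hypothetical file path): |
|
# results = process_audio("track.mp3", lyrics_requirements="a song about rain") |
|
# if isinstance(results, dict): |
|
#     print(results["genre_results"], results["lyrics"]) |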
|
|
|
def format_complete_beat_timeline(audio_file, lyrics=None): |
|
"""Creates a complete formatted timeline showing all beat timings and their syllable patterns without truncation""" |
|
if audio_file is None: |
|
return "Please upload an audio file to see beat timeline." |
|
|
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
|
|
beats_info = detect_beats(y, sr) |
|
|
|
|
|
try: |
|
voice_segments = detect_voice_activity(audio_file) |
|
except Exception as e: |
|
print(f"Error detecting voice segments: {str(e)}") |
|
voice_segments = [] |
|
|
|
|
|
def ensure_float(value): |
|
if isinstance(value, np.ndarray) or isinstance(value, np.number): |
|
return float(value) |
|
return value |
|
|
|
|
|
timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n" |
|
|
|
tempo = ensure_float(beats_info['tempo']) |
|
tempo_confidence = ensure_float(beats_info.get('tempo_confidence', 90.0)) |
|
time_sig_confidence = ensure_float(beats_info.get('time_sig_confidence', 85.0)) |
|
beat_periodicity = ensure_float(beats_info.get('beat_periodicity', 60 / tempo)) |
|
|
|
timeline += f"Tempo: {tempo:.1f} BPM (±{tempo_confidence:.1f}%)\n" |
|
timeline += f"Time Signature: {beats_info['time_signature']}/4 (Confidence: {time_sig_confidence:.1f}%)\n" |
|
timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n" |
|
timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n" |
|
timeline += f"Total Beats: {beats_info['beat_count']}\n" |
|
|
|
|
|
if voice_segments: |
|
timeline += f"\nVoice Activity Segments: {len(voice_segments)}\n" |
|
for i, segment in enumerate(voice_segments[:5]): |
|
timeline += f" Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n" |
|
if len(voice_segments) > 5: |
|
timeline += f" ... and {len(voice_segments) - 5} more segments\n" |
|
|
|
|
|
if tempo < 60: |
|
tempo_class = "Largo (very slow)" |
|
elif tempo < 76: |
|
tempo_class = "Adagio (slow)" |
|
elif tempo < 108: |
|
tempo_class = "Andante (walking pace)" |
|
elif tempo < 132: |
|
tempo_class = "Moderato (moderate)" |
|
elif tempo < 168: |
|
tempo_class = "Allegro (fast)" |
|
else: |
|
tempo_class = "Presto (very fast)" |
|
|
|
timeline += f"Tempo Classification: {tempo_class}\n\n" |
|
|
|
|
|
timeline += "| Beat # | Time (s) | Beat Strength | Syllable Pattern |\n" |
|
timeline += "|--------|----------|--------------|------------------|\n" |
|
|
|
|
|
for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])): |
|
|
|
time = ensure_float(time) |
|
strength = ensure_float(strength) |
|
|
|
|
|
in_voice_segment = False |
|
for segment in voice_segments: |
|
if segment['start'] <= time <= segment['end']: |
|
in_voice_segment = True |
|
break |
|
|
|
|
|
metrical_position = i % beats_info['time_signature'] |
|
|
|
if metrical_position == 0: |
|
beat_type = "STRONG" |
|
syllable_value = 1.5 |
|
elif metrical_position == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 2: |
|
|
|
beat_type = "MEDIUM" if strength < 0.8 else "STRONG" |
|
syllable_value = 1.0 if strength < 0.8 else 1.5 |
|
else: |
|
|
|
if strength >= 0.8: |
|
beat_type = "STRONG" |
|
syllable_value = 1.5 |
|
elif strength >= 0.5: |
|
beat_type = "MEDIUM" |
|
syllable_value = 1.0 |
|
else: |
|
beat_type = "WEAK" |
|
syllable_value = 1.0 |
|
|
|
|
|
if in_voice_segment: |
|
beat_type = f"{beat_type} (VOICE)" |
|
|
|
|
|
if beat_type == "STRONG": |
|
pattern = "S" |
|
elif beat_type == "MEDIUM": |
|
pattern = "m" |
|
else: |
|
pattern = "w" |
|
|
|
|
|
timeline += f"| {i+1:<6} | {time:.2f}s | {beat_type:<12} | {pattern}:{syllable_value} |\n" |
|
|
|
|
|
|
|
|
|
timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n" |
|
timeline += "Each character represents 0.5 seconds. Beats are marked as:\n" |
|
timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" |
|
|
|
|
|
if 'beat_times' in beats_info and len(beats_info['beat_times']) > 0: |
|
|
|
max_beat_time = max([ensure_float(t) for t in beats_info['beat_times']]) |
|
total_duration = max_beat_time + 2 |
|
else: |
|
total_duration = 30 |
|
|
|
time_markers = "" |
|
for i in range(0, int(total_duration) + 1, 5): |
|
time_markers += f"{i:<5}" |
|
timeline += time_markers + " (seconds)\n" |
|
|
|
|
|
ruler = "" |
|
for i in range(0, int(total_duration) + 1): |
|
if i % 5 == 0: |
|
ruler += "+" |
|
else: |
|
ruler += "-" |
|
ruler += "-" * 9 |
|
timeline += ruler + "\n" |
|
|
|
|
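# One slot per 0.5 s of audio; each beat is stamped into slot int(time * 2). |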
|
beat_line = ["·"] * int(total_duration * 2) |
|
|
|
for i, time in enumerate(beats_info['beat_times']): |
|
if i >= len(beats_info['beat_strengths']): |
|
break |
|
|
|
|
|
time_val = ensure_float(time) |
|
|
|
|
|
pos = int(time_val * 2) |
|
if pos >= len(beat_line): |
|
continue |
|
|
|
|
|
strength = beats_info['beat_strengths'][i] |
|
|
|
strength = ensure_float(strength) |
|
|
|
if i % beats_info['time_signature'] == 0: |
|
beat_line[pos] = "S" |
|
elif strength >= 0.8: |
|
beat_line[pos] = "S" |
|
elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3: |
|
beat_line[pos] = "m" |
|
elif strength >= 0.5: |
|
beat_line[pos] = "m" |
|
else: |
|
beat_line[pos] = "w" |
|
|
|
|
|
beat_visualization = "" |
|
for i in range(0, len(beat_line), 10): |
|
beat_visualization += "".join(beat_line[i:i+10]) |
|
if i + 10 < len(beat_line): |
|
beat_visualization += " " |
|
timeline += beat_visualization + "\n\n" |
|
|
|
|
|
timeline += "=== MEASURE MARKERS ===\n\n" |
|
|
|
|
|
measure_starts = [] |
|
for i, time in enumerate(beats_info['beat_times']): |
|
if i % beats_info['time_signature'] == 0: |
|
|
|
time_val = ensure_float(time) |
|
measure_starts.append((i // beats_info['time_signature'] + 1, time_val)) |
|
|
|
|
|
if measure_starts: |
|
timeline += "| Measure # | Start Time | Duration |\n" |
|
timeline += "|-----------|------------|----------|\n" |
|
|
|
for i in range(len(measure_starts)): |
|
measure_num, start_time = measure_starts[i] |
|
|
|
|
|
if i < len(measure_starts) - 1: |
|
end_time = measure_starts[i+1][1] |
|
elif 'beat_times' in beats_info and len(beats_info['beat_times']) > 0: |
|
|
|
last_beat = beats_info['beat_times'][-1] |
|
end_time = ensure_float(last_beat) |
|
else: |
|
end_time = start_time + 2.0 |
|
|
|
duration = end_time - start_time |
|
|
|
timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n" |
|
|
|
|
|
|
|
|
|
if 'phrases' in beats_info and beats_info['phrases']: |
|
timeline += "\n=== MUSICAL PHRASES ===\n\n" |
|
for i, phrase in enumerate(beats_info['phrases']): |
|
|
|
if not phrase: |
|
continue |
|
|
|
|
|
if len(beats_info['beat_times']) == 0: |
|
continue |
|
|
|
start_beat = min(phrase[0], len(beats_info['beat_times'])-1) |
|
end_beat = min(phrase[-1], len(beats_info['beat_times'])-1) |
|
|
|
|
|
phrase_start = ensure_float(beats_info['beat_times'][start_beat]) |
|
phrase_end = ensure_float(beats_info['beat_times'][end_beat]) |
|
|
|
timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n" |
|
|
|
|
|
phrase_beats = { |
|
"beat_times": [ensure_float(beats_info['beat_times'][j]) |
|
for j in phrase if j < len(beats_info['beat_times'])], |
|
"beat_strengths": [ensure_float(beats_info['beat_strengths'][j]) |
|
for j in phrase if j < len(beats_info['beat_strengths'])], |
|
"tempo": ensure_float(beats_info['tempo']), |
|
"time_signature": beats_info['time_signature'], |
|
"phrases": [list(range(len(phrase)))] |
|
} |
|
|
|
template = create_flexible_syllable_templates(phrase_beats) |
|
timeline += f" Syllable Template: {template}\n" |
|
|
|
|
|
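# Draw the phrase on its own 2-chars-per-second grid, bracketed by '[' and ']' |
|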
if phrase_start < total_duration and phrase_end < total_duration: |
|
|
|
phrase_visualization = ["·"] * int(total_duration * 2) |
|
|
|
|
|
start_pos = int(phrase_start * 2) |
|
end_pos = int(phrase_end * 2) |
|
|
|
if start_pos < len(phrase_visualization): |
|
phrase_visualization[start_pos] = "[" |
|
|
|
if end_pos < len(phrase_visualization): |
|
phrase_visualization[end_pos] = "]" |
|
|
|
|
|
for j in phrase: |
|
if j < len(beats_info['beat_times']): |
|
beat_time = ensure_float(beats_info['beat_times'][j]) |
|
beat_pos = int(beat_time * 2) |
|
|
|
if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos: |
|
|
|
if j % beats_info['time_signature'] == 0: |
|
phrase_visualization[beat_pos] = "S" |
|
elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2: |
|
phrase_visualization[beat_pos] = "m" |
|
else: |
|
phrase_visualization[beat_pos] = "w" |
|
|
|
|
|
phrase_visual = "" |
|
for k in range(0, len(phrase_visualization), 10): |
|
phrase_visual += "".join(phrase_visualization[k:k+10]) |
|
if k + 10 < len(phrase_visualization): |
|
phrase_visual += " " |
|
|
|
timeline += f" Timeline: {phrase_visual}\n\n" |
|
|
|
|
|
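# Second-level analysis: detect beats with quarter-beat subdivisions, map them onto one-second windows, |
# and derive a per-second syllable template |
|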
try: |
|
|
|
subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) |
|
duration = librosa.get_duration(y=y, sr=sr) |
|
|
|
|
|
sec_map = map_beats_to_seconds(subbeat_info["subbeat_times"], duration) |
|
|
|
|
|
templates = create_second_level_templates(sec_map, subbeat_info["tempo"]) |
|
|
|
|
|
timeline += "\n=== SECOND-LEVEL SCRIPT ===\n\n" |
|
timeline += "Each line below represents ONE SECOND of audio with matching lyric content.\n" |
|
timeline += "| Second | Beat Pattern | Lyric Content |\n" |
|
timeline += "|--------|-------------|---------------|\n" |
|
|
|
|
|
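# Strip any appended rhythm-analysis notes so only actual lyric lines are tabulated |
|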
clean_lyrics = lyrics |
|
if isinstance(lyrics, str): |
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
|
|
|
|
lines = clean_lyrics.strip().split('\n') if clean_lyrics else [] |
|
|
|
for i, template in enumerate(templates): |
|
|
|
lyric = lines[i] if i < len(lines) else "" |
|
if lyric.startswith('[') and ']' in lyric: |
|
lyric = "" |
|
|
|
|
|
timeline += f"| {i+1:<6} | {template:<30} | {lyric[:40]} |\n" |
|
|
|
|
|
timeline += "\n=== SECOND-LEVEL VISUALIZATION ===\n\n" |
|
timeline += "Each row represents ONE SECOND. Beat types:\n" |
|
timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" |
|
|
|
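# One row per second: a 20-slot grid showing where main and subdivided beats fall within that second |
|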
for i, window in enumerate(sec_map): |
|
beats = window["beats"] |
|
|
|
|
|
beat_viz = ["·"] * 20 |
|
|
|
for beat in beats: |
|
|
|
pos = int(beat["relative_pos"] * 19) |
|
if 0 <= pos < len(beat_viz): |
|
|
|
if beat["type"] == "main": |
|
beat_viz[pos] = "S" |
|
elif beat["strength"] >= 0.7: |
|
beat_viz[pos] = "m" |
|
else: |
|
beat_viz[pos] = "w" |
|
|
|
|
|
lyric = lines[i] if i < len(lines) else "" |
|
if lyric.startswith('[') and ']' in lyric: |
|
lyric = "" |
|
|
|
|
|
viz_line = f"Second {i+1:2d}: [" + "".join(beat_viz) + "]" |
|
if lyric: |
|
viz_line += f" → {lyric[:40]}" |
|
|
|
timeline += viz_line + "\n" |
|
|
|
except Exception as e: |
|
timeline += f"\n[Error generating second-level analysis: {str(e)}]" |
|
|
|
|
|
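# Line-by-line alignment report: syllable counts, matched phrase timing, and stress-vs-beat comparison |
|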
if lyrics and isinstance(lyrics, str): |
|
timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n" |
|
|
|
if "[Note:" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
else: |
|
clean_lyrics = lyrics |
|
|
|
lines = clean_lyrics.strip().split('\n') |
|
|
|
|
|
for i, line in enumerate(lines): |
|
if not line.strip() or line.startswith('['): |
|
continue |
|
|
|
timeline += f"Line: \"{line}\"\n" |
|
|
|
|
|
syllable_count = count_syllables(line) |
|
timeline += f" Syllables: {syllable_count}\n" |
|
|
|
|
|
|
|
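# Choose a phrase for this lyric line: direct index match first, then a section-based fallback heuristic |
|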
matching_phrase = None |
|
if 'phrases' in beats_info and beats_info['phrases']: |
|
|
|
if i < len(beats_info['phrases']) and beats_info['phrases'][i]: |
|
matching_phrase = beats_info['phrases'][i] |
|
else: |
|
|
|
|
|
if len(beats_info['phrases']) > 0: |
|
section_size = max(1, len(beats_info['phrases']) // 4) |
|
section_index = min(i // section_size, 3) |
|
section_start = section_index * section_size |
|
section_end = min(section_start + section_size, len(beats_info['phrases'])) |
|
|
|
|
|
candidate_phrases = [phrase for j, phrase in enumerate(beats_info['phrases']) |
|
if section_start <= j < section_end and phrase] |
|
|
|
if candidate_phrases: |
|
matching_phrase = candidate_phrases[min(i % section_size, len(candidate_phrases)-1)] |
|
elif beats_info['phrases']: |
|
|
|
phrase_index = i % len(beats_info['phrases']) |
|
if beats_info['phrases'][phrase_index]: |
|
matching_phrase = beats_info['phrases'][phrase_index] |
|
|
|
|
|
if matching_phrase and len(beats_info['beat_times']) > 0: |
|
start_beat = min(matching_phrase[0], len(beats_info['beat_times'])-1) |
|
end_beat = min(matching_phrase[-1], len(beats_info['beat_times'])-1) |
|
|
|
start_time = ensure_float(beats_info['beat_times'][start_beat]) |
|
end_time = ensure_float(beats_info['beat_times'][end_beat]) |
|
|
|
timeline += f" Timing: {start_time:.2f}s - {end_time:.2f}s\n" |
|
|
|
|
|
timeline += " Alignment: " |
|
|
|
|
|
phrase_duration = end_time - start_time |
|
syllable_viz = [] |
|
|
|
|
|
for j, beat_idx in enumerate(matching_phrase): |
|
if beat_idx < len(beats_info['beat_times']): |
|
beat_time = ensure_float(beats_info['beat_times'][beat_idx]) |
|
|
|
|
|
if phrase_duration > 0.001: |
|
|
|
|
|
normalized_pos = (beat_time - start_time) / phrase_duration |
|
|
|
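# Slight ease-out warp: positions past the phrase midpoint are stretched by up to 5%, |
# e.g. normalized_pos = 0.8 → 0.8 * (1.0 + 0.1 * (0.8 - 0.5)) = 0.824 before clamping |
|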
curved_pos = min(1.0, normalized_pos * (1.0 + 0.1 * (normalized_pos - 0.5))) |
|
relative_pos = int(curved_pos * syllable_count) |
|
else: |
|
relative_pos = j |
|
|
|
|
|
while len(syllable_viz) <= relative_pos: |
|
syllable_viz.append("·") |
|
|
|
|
|
metrical_pos = beat_idx % beats_info['time_signature'] |
|
beat_strength = beats_info['beat_strengths'][beat_idx] if beat_idx < len(beats_info['beat_strengths']) else 0 |
|
|
|
if metrical_pos == 0 or beat_strength >= 0.8: |
|
syllable_viz[relative_pos] = "S" |
|
elif metrical_pos == beats_info['time_signature'] // 2 or beat_strength >= 0.5: |
|
syllable_viz[relative_pos] = "m" |
|
else: |
|
syllable_viz[relative_pos] = "w" |
|
|
|
|
|
while len(syllable_viz) < syllable_count: |
|
syllable_viz.append("·") |
|
|
|
|
|
syllable_viz = syllable_viz[:syllable_count] |
|
|
|
|
|
timeline += "".join(syllable_viz) + "\n" |
|
|
|
|
|
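# Word-level stress pattern: 'S' where get_word_stress marks a stressed syllable, '_' otherwise |
|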
words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) |
|
if words: |
|
word_stresses = [] |
|
cumulative_syllables = 0 |
|
|
|
for word in words: |
|
syllable_count_word = count_syllables_for_word(word) |
|
stress_pattern = get_word_stress(word) |
|
|
|
|
|
while len(stress_pattern) < syllable_count_word: |
|
stress_pattern += "0" |
|
|
|
for j in range(syllable_count_word): |
|
stress_char = "S" if j < len(stress_pattern) and stress_pattern[j] == "1" else "_" |
|
word_stresses.append(stress_char) |
|
|
|
cumulative_syllables += syllable_count_word |
|
|
|
|
|
timeline += " Word stress: " + "".join(word_stresses) + "\n" |
|
|
|
|
|
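# Score the line: a pair matches when a stressed syllable lands on a strong beat, or an unstressed one does not |
|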
alignment_score = 0 |
|
alignment_issues = [] |
|
|
|
for j, (stress, beat) in enumerate(zip(word_stresses, syllable_viz)): |
|
if (stress == "S" and beat == "S") or (stress != "S" and beat != "S"): |
|
alignment_score += 1 |
|
elif stress == "S" and beat != "S": |
|
alignment_issues.append(f"Syllable {j+1} has stress but weak beat") |
|
elif stress != "S" and beat == "S": |
|
alignment_issues.append(f"Syllable {j+1} has no stress but strong beat") |
|
|
|
compared = min(len(word_stresses), len(syllable_viz)) |
|
if compared: |
|
# zip() above truncated to the shorter sequence, so normalize by the pairs actually compared |
|
alignment_percent = (alignment_score / compared) * 100 |
|
timeline += f" Stress alignment: {alignment_percent:.1f}% match\n" |
|
|
|
if alignment_issues and len(alignment_issues) <= 3: |
|
timeline += " Issues: " + "; ".join(alignment_issues) + "\n" |
|
else: |
|
timeline += " No matching phrase found for alignment\n" |
|
|
|
timeline += "\n" |
|
|
|
return timeline |
|
|
|
except Exception as e: |
|
print(f"Error generating complete beat timeline: {str(e)}") |
|
return f"Error generating complete beat timeline: {str(e)}" |
|
|
|
def display_results(audio_file, lyrics_requirements=None): |
|
"""Process audio file and return formatted results for display in the UI.""" |
|
|
|
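# Fallback values for the five UI outputs (genre, emotion, classification, lyrics, timeline) |
|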
error_response = ("Please upload an audio file.", |
|
"No emotion analysis available.", |
|
"No audio classification available.", |
|
"No lyrics generated.", |
|
"No beat timeline available.") |
|
|
|
if audio_file is None: |
|
return error_response |
|
|
|
try: |
|
|
|
results = process_audio(audio_file, lyrics_requirements) |
|
|
|
|
|
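# process_audio may return an error string, a tuple whose first element is an error string, |
# a dict of results, or a plain (genre, lyrics, ast) tuple — handle each shape below |
|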
if isinstance(results, str) and "Error" in results: |
|
return results, *error_response[1:] |
|
elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]: |
|
return results[0], *error_response[1:] |
|
|
|
|
|
if isinstance(results, dict): |
|
|
|
genre_results = results.get("genre_results", "Genre classification failed") |
|
lyrics = results.get("lyrics", "Lyrics generation failed") |
|
ast_results = results.get("ast_results", []) |
|
voice_segments = results.get("voice_segments", []) |
|
else: |
|
|
|
genre_results, lyrics, ast_results = results |
|
|
|
try: |
|
voice_segments = detect_voice_activity(audio_file) |
|
except Exception as e: |
|
print(f"Error detecting voice segments: {str(e)}") |
|
voice_segments = [] |
|
|
|
|
|
clean_lyrics = lyrics |
|
if isinstance(lyrics, str): |
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
|
|
|
|
beat_timeline = format_complete_beat_timeline(audio_file, clean_lyrics) |
|
|
|
|
|
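# Emotion/structure summary; beat statistics and voice-activity segments are appended when available |
|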
emotion_text = "No emotion analysis available." |
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n" |
|
f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n" |
|
f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n" |
|
f"Primary Theme: {emotion_results['summary']['primary_theme']}") |
|
|
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
beats_info = detect_beats(y, sr) |
|
|
|
|
|
emotion_text += f"\n\nBeat Analysis:\n" |
|
emotion_text += f"- Tempo: {beats_info.get('tempo', 0):.1f} BPM\n" |
|
emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n" |
|
emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n" |
|
|
|
|
|
if voice_segments: |
|
emotion_text += f"\n\nVoice Activity Segments ({len(voice_segments)}):\n" |
|
for i, segment in enumerate(voice_segments[:10]): |
|
emotion_text += f"- Segment {i+1}: {segment['start']:.2f}s - {segment['end']:.2f}s ({segment['duration']:.2f}s)\n" |
|
if len(voice_segments) > 10: |
|
emotion_text += f"... and {len(voice_segments) - 10} more segments\n" |
|
|
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
|
|
|
|
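# Top audio classification labels with confidence percentages (first five results) |
|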
ast_text = "No valid audio classification results available." |
|
if ast_results and isinstance(ast_results, list): |
|
ast_text = "Audio Classification Results:\n" |
|
for result in ast_results[:5]: |
|
ast_text += f"{result['label']}: {result['score']*100:.2f}%\n" |
|
|
|
|
|
return genre_results, emotion_text, ast_text, clean_lyrics, beat_timeline |
|
|
|
except Exception as e: |
|
error_msg = f"Error: {str(e)}" |
|
print(error_msg) |
|
return error_msg, *error_response[1:] |
|
|
|
|
|
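# Gradio UI: upload and optional lyric requirements on the left, tabbed results on the right |
|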
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: |
|
gr.Markdown("# Music Genre Classifier & Lyrics Generator") |
|
gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
audio_input = gr.Audio(label="Upload Music", type="filepath") |
|
|
|
|
|
lyrics_requirements_input = gr.Textbox( |
|
label="Lyrics Requirements (optional)", |
|
placeholder="Enter specific themes, topics, words, or styles you want in the lyrics", |
|
lines=3 |
|
) |
|
|
|
submit_btn = gr.Button("Analyze & Generate", variant="primary") |
|
|
|
|
|
with gr.Accordion("About Music Genres", open=False): |
|
gr.Markdown(""" |
|
The system recognizes various music genres including: |
|
- Pop, Rock, Hip-Hop, R&B |
|
- Electronic, Dance, Techno, House |
|
- Jazz, Blues, Classical |
|
- Folk, Country, Acoustic |
|
- Metal, Punk, Alternative |
|
- And many others! |
|
|
|
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music. |
|
""") |
|
|
|
with gr.Column(scale=2): |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Analysis Results"): |
|
genre_output = gr.Textbox(label="Detected Genres", lines=4) |
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8) |
|
with gr.Column(): |
|
ast_output = gr.Textbox(label="Audio Classification", lines=8) |
|
|
|
with gr.TabItem("Generated Lyrics"): |
|
lyrics_output = gr.Textbox(label="Lyrics", lines=18) |
|
|
|
with gr.TabItem("Beat & Syllable Timeline"): |
|
beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40) |
|
|
|
|
|
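# Wire the button to the pipeline; outputs map one-to-one onto the five result widgets |
|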
submit_btn.click( |
|
fn=display_results, |
|
inputs=[audio_input, lyrics_requirements_input], |
|
outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output] |
|
) |
|
|
|
|
|
with gr.Accordion("How it works", open=False): |
|
gr.Markdown(""" |
|
## Advanced Lyrics Generation Process |
|
|
|
1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models. |
|
|
|
2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio. |
|
|
|
3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music. |
|
|
|
4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying: |
|
- Strong and weak beats |
|
- Natural phrase boundaries |
|
- Time signature and tempo variations |
|
- Beat subdivisions (half and quarter beats) |
|
|
|
5. **Second-Level Alignment**: The system maps beats and subbeats to each second of audio, creating precise templates for perfect alignment. |
|
|
|
6. **Syllable Template Creation**: For each second of audio, the system generates precise syllable templates that reflect: |
|
- Beat stress patterns (strong, medium, weak) |
|
- Appropriate syllable counts based on tempo |
|
- Genre-specific rhythmic qualities |
|
- Half-beat and quarter-beat subdivisions |
|
|
|
7. **Lyrics Generation**: Using the detected genre, emotion, rhythm patterns, and your custom requirements, a large language model generates lyrics that: |
|
- Match the emotional quality of the music |
|
- Follow the precise syllable templates for each second |
|
- Align stressed syllables with strong beats |
|
- Maintain genre-appropriate style and themes |
|
- Incorporate your specific requirements and preferences |
|
|
|
8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing: |
|
- Syllable count accuracy |
|
- Stress alignment with strong beats |
|
- Word stress patterns |
|
- Second-by-second alignment precision |
|
|
|
9. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment. |
|
|
|
This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it. |
|
""") |
|
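# Illustrative sketch (not executed): the kind of stress-vs-beat comparison performed in |
# step 8 of the process described above. The strings are hypothetical, for documentation only: |
# |
#   stresses = "S_S_"   # word stress per syllable, from the pronouncing dictionary |
#   beats    = "SwSw"   # beat type per syllable slot, from the beat grid |
#   matches  = sum((s == "S") == (b == "S") for s, b in zip(stresses, beats)) |
#   # matches == 4 → 100% stress alignment for this line |
|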
|
|
|
|
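# Launch the Gradio app (in a Hugging Face Space this runs at import time) |
|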
demo.launch() |