import os
import io
import re
import functools

import gradio as gr
import torch
import numpy as np
import librosa
import pronouncing
from transformers import (
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    AutoTokenizer,
    pipeline,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from huggingface_hub import login

from utils import (
    load_audio,
    extract_audio_duration,
    extract_mfcc_features,
    format_genre_results,
    ensure_cuda_availability
)
from emotionanalysis import MusicAnalyzer

# Authenticate with the Hugging Face Hub when a token is provided.
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Model names and audio constants.
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/Qwen3-32B"
SAMPLE_RATE = 22050

CUDA_AVAILABLE = ensure_cuda_availability()

# Load the music detection model, preferring the high-level pipeline and
# falling back to the raw model plus feature extractor.
print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}")
try:
    music_detector = pipeline(
        "audio-classification",
        model=MUSIC_DETECTION_MODEL,
        device=0 if CUDA_AVAILABLE else -1
    )
    print("Successfully loaded music detection pipeline")
except Exception as e:
    print(f"Error creating music detection pipeline: {str(e)}")
    try:
        music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL)
        music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL)
        print("Successfully loaded music detection model and feature extractor")
    except Exception as e2:
        print(f"Error loading music detection model components: {str(e2)}")
        raise RuntimeError(f"Could not load music detection model: {str(e2)}")

# Load the genre classification model with the same pipeline-first strategy.
print(f"Loading audio classification model: {GENRE_MODEL_NAME}")
try:
    genre_classifier = pipeline(
        "audio-classification",
        model=GENRE_MODEL_NAME,
        device=0 if CUDA_AVAILABLE else -1
    )
    print("Successfully loaded audio classification pipeline")
except Exception as e:
    print(f"Error creating pipeline: {str(e)}")
    try:
        genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
        genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME)
        print("Successfully loaded audio classification model and feature extractor")
    except Exception as e2:
        print(f"Error loading model components: {str(e2)}")
        raise RuntimeError(f"Could not load genre classification model: {str(e2)}")

# Load the LLM in 4-bit NF4 quantization to reduce GPU memory usage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)

llm_pipeline = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    max_new_tokens=512,
)

music_analyzer = MusicAnalyzer()

@functools.lru_cache(maxsize=512)
def cached_phones_for_word(word):
    """Get word pronunciations with caching for better performance."""
    return pronouncing.phones_for_word(word)


@functools.lru_cache(maxsize=512)
def count_syllables_for_word(word):
    """Count syllables in a single word with caching for performance."""
    pronunciations = cached_phones_for_word(word.lower())
    if pronunciations:
        return pronouncing.syllable_count(pronunciations[0])

    # Fallback: heuristic vowel-group counting for out-of-dictionary words.
    vowels = "aeiouy"
    word = word.lower()
    count = 0
    prev_is_vowel = False

    for char in word:
        is_vowel = char in vowels
        if is_vowel and not prev_is_vowel:
            count += 1
        prev_is_vowel = is_vowel

    # Adjust for final silent 'e' and syllabic '-le' endings.
    if word.endswith('e') and not word.endswith('le'):
        count -= 1
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        count += 1
    if count == 0:
        count = 1

    return count
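
# Illustrative results for the counter above (dictionary words come from
# CMUdict via pronouncing; the heuristic only handles out-of-vocabulary
# words — a sketch, not a test suite):
#   count_syllables_for_word("hello") -> 2
#   count_syllables_for_word("table") -> 2   (the syllabic '-le' rule)
#   count_syllables_for_word("love")  -> 1   (final silent 'e')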

@functools.lru_cache(maxsize=512)
def get_word_stress(word):
    """Get the stress pattern for a word with improved fallback handling."""
    pronunciations = cached_phones_for_word(word.lower())
    if pronunciations:
        return pronouncing.stresses(pronunciations[0])

    # Fallback: guess a stress pattern from syllable count and common endings.
    syllables = count_syllables_for_word(word)

    if syllables == 1:
        return "1"
    elif syllables == 2:
        # Suffixes this heuristic treats as attracting second-syllable stress.
        second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"]
        if any(word.endswith(ending) for ending in second_syllable_stress):
            return "01"
        else:
            return "10"
    elif syllables == 3:
        if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]):
            return "100"
        elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]):
            return "010"
        else:
            return "100"
    else:
        # Default: stress the first syllable.
        return "1" + "0" * (syllables - 1)
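
# Stress strings use CMUdict digits: '1' primary, '2' secondary, '0' unstressed.
# Illustrative values (dictionary words use real CMUdict stresses; others fall
# back to the suffix heuristic above):
#   get_word_stress("believe") -> "01"   (stress on the second syllable)
#   get_word_stress("wonder")  -> "10"   (stress on the first syllable)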

def count_syllables(text):
    """Count syllables in a given text using the pronouncing library."""
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    syllable_count = 0

    for word in words:
        syllable_count += count_syllables_for_word(word)

    return syllable_count
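
# Example: count_syllables("hello world") -> 3 (2 + 1); punctuation and other
# non-alphabetic tokens are ignored by the regex above.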

def extract_audio_features(audio_file):
    """Extract audio features from an audio file."""
    try:
        # Load the audio at the target sample rate.
        y, sr = load_audio(audio_file, SAMPLE_RATE)

        if y is None or sr is None:
            raise ValueError("Failed to load audio data")

        duration = extract_audio_duration(y, sr)
        mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20)

        return {
            "features": mfccs_mean,
            "duration": duration,
            "waveform": y,
            "sample_rate": sr,
            "path": audio_file
        }
    except Exception as e:
        print(f"Error extracting audio features: {str(e)}")
        raise ValueError(f"Failed to extract audio features: {str(e)}")

def classify_genre(audio_data):
    """Classify the genre of the audio using the loaded model."""
    try:
        # Prefer the pipeline if it loaded successfully.
        if 'genre_classifier' in globals():
            results = genre_classifier(audio_data["path"])
            top_genres = [(result["label"], result["score"]) for result in results[:3]]
            return top_genres

        # Fall back to the raw model and feature extractor.
        elif 'genre_processor' in globals() and 'genre_model' in globals():
            inputs = genre_processor(
                audio_data["waveform"],
                sampling_rate=audio_data["sample_rate"],
                return_tensors="pt"
            )

            with torch.no_grad():
                outputs = genre_model(**inputs)
                predictions = outputs.logits.softmax(dim=-1)

            values, indices = torch.topk(predictions, 3)

            genre_labels = genre_model.config.id2label

            top_genres = []
            for i, (value, index) in enumerate(zip(values[0], indices[0])):
                genre = genre_labels[index.item()]
                confidence = value.item()
                top_genres.append((genre, confidence))

            return top_genres

        else:
            raise ValueError("No genre classification model available")

    except Exception as e:
        print(f"Error in genre classification: {str(e)}")
        # Fall back to a default guess so downstream steps can continue.
        return [("rock", 1.0)]

def detect_music(audio_data):
    """Detect if the audio is music using the MIT AST model."""
    try:
        # Prefer the pipeline if it loaded successfully.
        if 'music_detector' in globals():
            results = music_detector(audio_data["path"])

            # Take the highest score among music-related labels.
            music_confidence = 0.0
            for result in results:
                label = result["label"].lower()
                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
                    music_confidence = max(music_confidence, result["score"])
            return music_confidence >= 0.2, results

        # Fall back to the raw model and feature extractor.
        elif 'music_processor' in globals() and 'music_model' in globals():
            inputs = music_processor(
                audio_data["waveform"],
                sampling_rate=audio_data["sample_rate"],
                return_tensors="pt"
            )

            with torch.no_grad():
                outputs = music_model(**inputs)
                predictions = outputs.logits.softmax(dim=-1)

            values, indices = torch.topk(predictions, 5)

            labels = music_model.config.id2label

            music_confidence = 0.0
            results = []

            for i, (value, index) in enumerate(zip(values[0], indices[0])):
                label = labels[index.item()].lower()
                score = value.item()
                results.append({"label": label, "score": score})

                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
                    music_confidence = max(music_confidence, score)

            return music_confidence >= 0.2, results

        else:
            raise ValueError("No music detection model available")

    except Exception as e:
        print(f"Error in music detection: {str(e)}")
        return False, []
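
# Minimal end-to-end sketch tying the three helpers above together. This
# helper is illustrative only (it assumes a local "song.wav" and is never
# called anywhere in this module):
def _demo_music_classification(path="song.wav"):
    audio = extract_audio_features(path)
    is_music, _ = detect_music(audio)
    print("music detected" if is_music else "no music detected")
    if is_music:
        for genre, score in classify_genre(audio):
            print(f"  {genre}: {score:.2f}")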

def detect_beats(y, sr):
    """Enhanced beat detection with adaptive threshold analysis, improved time
    signature detection, and scientific confidence metrics."""
    # Guard against degenerate (all-zero) input without rectifying the
    # waveform; clipping the raw signal to positive values would destroy it.
    if not np.any(y):
        y = np.full_like(y, 1e-10)

    # Separate harmonic and percussive components; percussion carries most beat energy.
    y_harmonic, y_percussive = librosa.effects.hpss(y)

    # Onset envelopes from the full mix and from percussion alone.
    onset_env_full = librosa.onset.onset_strength(y=y, sr=sr)
    onset_env_perc = librosa.onset.onset_strength(y=y_percussive, sr=sr)

    # Floor the envelopes to avoid zeros in later ratios.
    onset_env_full = np.maximum(onset_env_full, 1e-6)
    onset_env_perc = np.maximum(onset_env_perc, 1e-6)

    # Weighted combination favoring the percussive envelope.
    combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7

    tempo_candidates = []
    beat_candidates = []
    consistency_metrics = []

    # Candidate 1: standard beat tracking on the combined envelope.
    tempo1, beats1 = librosa.beat.beat_track(
        onset_envelope=combined_onset,
        sr=sr,
        tightness=100
    )
    tempo_candidates.append(tempo1)
    beat_candidates.append(beats1)

    # Confidence from autocorrelation of the onset envelope at the beat period.
    # The envelope has (len / duration) frames per second, so one beat period
    # spans (60 / tempo) * frames_per_second envelope frames.
    ac = librosa.autocorrelate(combined_onset)
    frames_per_second = len(combined_onset) / librosa.get_duration(y=y, sr=sr)
    estimated_period = int((60.0 / max(float(tempo1), 1e-6)) * frames_per_second)
    if 0 < estimated_period < len(ac):
        local_ac = ac[max(0, estimated_period - 5):min(len(ac), estimated_period + 6)]
        if np.max(local_ac) > 0:
            tempo1_confidence = ac[estimated_period] / np.max(local_ac)
        else:
            tempo1_confidence = 0.5
    else:
        tempo1_confidence = 0.5
    consistency_metrics.append(tempo1_confidence)

    # Candidate 2: biased toward slower tempi to catch half-time feels.
    tempo2, beats2 = librosa.beat.beat_track(
        onset_envelope=combined_onset,
        sr=sr,
        tightness=100,
        start_bpm=60
    )
    tempo_candidates.append(tempo2)
    beat_candidates.append(beats2)

    estimated_period2 = int((60.0 / max(float(tempo2), 1e-6)) * frames_per_second)
    if 0 < estimated_period2 < len(ac):
        local_ac2 = ac[max(0, estimated_period2 - 5):min(len(ac), estimated_period2 + 6)]
        if np.max(local_ac2) > 0:
            tempo2_confidence = ac[estimated_period2] / np.max(local_ac2)
        else:
            tempo2_confidence = 0.5
    else:
        tempo2_confidence = 0.5
    consistency_metrics.append(tempo2_confidence)

    # Candidate 3: tighter tracking without trimming; skipped entirely on failure.
    try:
        tempo3, beats3 = librosa.beat.beat_track(
            onset_envelope=combined_onset,
            sr=sr,
            tightness=300,
            trim=False
        )
        tempo_candidates.append(tempo3)
        beat_candidates.append(beats3)

        if len(beats3) > 1:
            beat_times3 = librosa.frames_to_time(beats3, sr=sr)
            intervals3 = np.diff(beat_times3)
            tempo3_consistency = 1.0 / (1.0 + np.std(intervals3) / np.mean(intervals3)) if np.mean(intervals3) > 0 else 0.5
        else:
            tempo3_consistency = 0.5
        consistency_metrics.append(tempo3_consistency)
    except Exception:
        pass

    # Score each candidate by interval regularity, autocorrelation confidence,
    # and agreement between detected and expected beat counts.
    beat_consistency = []
    for i, beats in enumerate(beat_candidates):
        if len(beats) <= 1:
            beat_consistency.append(0)
            continue

        times = librosa.frames_to_time(beats, sr=sr)
        intervals = np.diff(times)

        if np.mean(intervals) > 0:
            # Coefficient of variation of inter-beat intervals (lower = steadier).
            cv = np.std(intervals) / np.mean(intervals)

            duration = librosa.get_duration(y=y, sr=sr)
            expected_beats = duration * tempo_candidates[i] / 60
            beats_ratio = min(len(beats) / expected_beats, expected_beats / len(beats)) if expected_beats > 0 else 0.5

            consistency = (0.7 * (1.0 / (1.0 + cv))) + (0.3 * consistency_metrics[i]) + (0.2 * beats_ratio)
            beat_consistency.append(consistency)
        else:
            beat_consistency.append(0)

    if beat_consistency:
        best_idx = np.argmax(beat_consistency)
        best_confidence = beat_consistency[best_idx] * 100
    else:
        best_idx = 0
        best_confidence = 50.0

    tempo = tempo_candidates[best_idx]
    beat_frames = beat_candidates[best_idx]

    # Entropy of quantized inter-beat intervals: low entropy = regular rhythm.
    beat_entropy = 0.0
    if len(beat_frames) > 2:
        times = librosa.frames_to_time(beat_frames, sr=sr)
        intervals = np.diff(times)

        if len(intervals) > 0 and np.std(intervals) > 0:
            quantized = np.round(intervals / np.min(intervals))

            unique, counts = np.unique(quantized, return_counts=True)
            probs = counts / np.sum(counts)

            beat_entropy = -np.sum(probs * np.log2(probs))

    beat_times = librosa.frames_to_time(beat_frames, sr=sr)

    # Per-beat strengths from the combined onset envelope, normalized to [0, 1].
    beat_strengths = []
    if len(beat_frames) > 0:
        valid_frames = [frame for frame in beat_frames if frame < len(combined_onset)]
        if valid_frames:
            raw_strengths = combined_onset[valid_frames]

            if np.max(raw_strengths) > 0:
                normalized_strengths = raw_strengths / np.max(raw_strengths)
            else:
                normalized_strengths = np.ones_like(raw_strengths)

            beat_strengths = normalized_strengths.tolist()

            # Pad any beats that fell outside the envelope with decaying strengths.
            if len(beat_times) > len(beat_strengths):
                missing_count = len(beat_times) - len(beat_strengths)

                if beat_strengths:
                    last_strength = beat_strengths[-1]
                    decay_factor = 0.9
                    beat_strengths.extend([last_strength * (decay_factor ** (i + 1))
                                           for i in range(missing_count)])
                else:
                    beat_strengths = [1.0] * len(beat_times)
        else:
            beat_strengths = [1.0] * len(beat_times)
    else:
        beat_strengths = [1.0] * len(beat_times)

    intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else []

    # ---- Time signature estimation from beat-strength periodicity ----
    time_signature = 4
    time_sig_confidence = 70.0

    if len(beat_strengths) > 8:
        norm_strengths = np.array(beat_strengths)
        if np.max(norm_strengths) > 0:
            norm_strengths = norm_strengths / np.max(norm_strengths)

        # Autocorrelate the strength sequence to find the bar-level period.
        ac = librosa.autocorrelate(norm_strengths, max_size=len(norm_strengths) // 2)

        if len(ac) > 3:
            # Pick peaks past lag 0 (offset by 1 because we slice ac[1:]).
            peaks = librosa.util.peak_pick(ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1)
            peaks = peaks + 1

            if len(peaks) > 0:
                peak_idx = peaks[0]
                N = peak_idx

                # Confidence from the peak's prominence over its neighborhood.
                if peak_idx < len(ac):
                    peak_height = ac[peak_idx]
                    local_prominence = peak_height / np.mean(ac[max(0, peak_idx - 2):min(len(ac), peak_idx + 3)])
                    time_sig_confidence = min(95, 60 + 35 * local_prominence)

                if N == 2:
                    time_signature = 2
                    time_sig_confidence += 5
                elif N == 3:
                    time_signature = 3
                    time_sig_confidence += 5
                elif 4 <= N <= 5:
                    time_signature = N
                elif N == 6:
                    # Disambiguate a period of 6: accents in groups of three
                    # suggest 3/4, otherwise treat it as compound 6.
                    group_3_count = 0
                    for i in range(0, len(beat_strengths) - 6, 3):
                        if i + 2 < len(beat_strengths):
                            if beat_strengths[i] > beat_strengths[i + 1] and beat_strengths[i] > beat_strengths[i + 2]:
                                group_3_count += 1

                    group_2_count = 0
                    for i in range(0, len(beat_strengths) - 4, 2):
                        if i + 1 < len(beat_strengths):
                            if beat_strengths[i] > beat_strengths[i + 1]:
                                group_2_count += 1

                    time_signature = 3 if group_3_count > group_2_count else 6
                elif N == 8:
                    time_signature = 4
                elif N == 5 or N == 7:
                    time_signature = N

    # ---- Phrase segmentation based on accents, gaps, and measure boundaries ----
    phrases = []
    current_phrase = []

    if len(beat_times) > 0:
        if len(beat_strengths) > 4:
            # A "strong" beat is in the top quartile of strengths.
            strong_threshold = np.percentile(beat_strengths, 75)

            if intervals:
                mean_interval = np.mean(intervals)
                std_interval = np.std(intervals)
                # A gap well beyond the mean interval suggests a phrase boundary.
                significant_gap = mean_interval + (1.5 * std_interval) if std_interval > 0 else mean_interval * 1.3
            else:
                significant_gap = 0
        else:
            strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0
            significant_gap = 0

        for i in range(len(beat_times)):
            current_phrase.append(i)

            if i < len(beat_times) - 1:
                # Cue 1: the next beat is markedly stronger.
                is_stronger_next = False
                if i < len(beat_strengths) - 1:
                    is_stronger_next = beat_strengths[i + 1] > strong_threshold and beat_strengths[i + 1] > beat_strengths[i] * 1.1

                # Cue 2: an unusually long gap before the next beat.
                is_longer_gap = False
                if i < len(beat_times) - 1 and intervals and i < len(intervals):
                    is_longer_gap = intervals[i] > significant_gap

                # Cue 3: a measure boundary according to the time signature.
                is_measure_boundary = (i + 1) % time_signature == 0 and i > 0

                # Cue 4: a clear drop in onset energy.
                is_energy_dip = False
                if i < len(beat_strengths) - 1:
                    onset_ratio = beat_strengths[i + 1] / max(beat_strengths[i], 0.001)
                    is_energy_dip = onset_ratio < 0.6

                # Weighted vote over the boundary cues.
                phrase_boundary_score = (
                    (1.5 if is_stronger_next else 0) +
                    (2.0 if is_longer_gap else 0) +
                    (1.0 if is_measure_boundary else 0) +
                    (0.5 if is_energy_dip else 0)
                )

                if (phrase_boundary_score >= 1.5 and len(current_phrase) >= 2) or \
                   (is_measure_boundary and len(current_phrase) >= time_signature):
                    phrases.append(current_phrase)
                    current_phrase = []

        # Keep any trailing phrase of at least two beats.
        if current_phrase and len(current_phrase) >= 2:
            phrases.append(current_phrase)

    # Fallback: chop beats into measure-sized phrases.
    if not phrases and len(beat_times) >= 2:
        for i in range(0, len(beat_times), time_signature):
            end = min(i + time_signature, len(beat_times))
            if end - i >= 2:
                phrases.append(list(range(i, end)))

    beat_periodicity = np.mean(intervals) if intervals else (60 / tempo)

    return {
        "tempo": tempo,
        "tempo_confidence": best_confidence,
        "time_signature": time_signature,
        "time_sig_confidence": time_sig_confidence,
        "beat_frames": beat_frames,
        "beat_times": beat_times,
        "beat_count": len(beat_times),
        "beat_strengths": beat_strengths,
        "intervals": intervals,
        "phrases": phrases,
        "beat_periodicity": beat_periodicity,
        "beat_entropy": beat_entropy
    }
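
# Minimal usage sketch for detect_beats (illustrative helper, never called;
# assumes a local "song.wav"):
def _demo_detect_beats(path="song.wav"):
    y, sr = librosa.load(path, sr=SAMPLE_RATE)
    info = detect_beats(y, sr)
    print(f"tempo={float(info['tempo']):.1f} BPM "
          f"({info['tempo_confidence']:.0f}% conf), "
          f"time sig={info['time_signature']}, beats={info['beat_count']}, "
          f"phrases={len(info['phrases'])}, entropy={info['beat_entropy']:.2f}")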

def detect_beats_and_subbeats(y, sr, subdivision=4):
    """
    Detect main beats and interpolate subbeats between consecutive beats.

    Parameters:
        y: Audio time series
        sr: Sample rate
        subdivision: Number of subdivisions between beats (default: 4 for quarter beats)

    Returns:
        Dictionary containing beat times, subbeat times, and tempo information
    """
    try:
        tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
        beat_times = librosa.frames_to_time(beat_frames, sr=sr)

        # Newer librosa versions may return tempo as a NumPy scalar or array.
        if isinstance(tempo, np.ndarray) or isinstance(tempo, np.number):
            tempo = float(tempo)

        if isinstance(beat_times, np.ndarray):
            beat_times = [float(t) for t in beat_times]
    except Exception as e:
        print(f"Error in beat detection: {e}")
        tempo = 120.0
        beat_times = []

    subbeat_times = []

    # With fewer than two beats there is nothing to interpolate between.
    if not beat_times or len(beat_times) < 2:
        return {
            "tempo": float(tempo) if tempo is not None else 120.0,
            "beat_times": beat_times,
            "subbeat_times": []
        }

    for i in range(len(beat_times) - 1):
        try:
            current_beat = float(beat_times[i])
            next_beat = float(beat_times[i + 1])
        except (IndexError, ValueError, TypeError):
            continue

        interval = (next_beat - current_beat) / subdivision

        # The main beat itself.
        subbeat_times.append({
            "time": float(current_beat),
            "type": "main",
            "strength": 1.0,
            "beat_index": i
        })

        # Interpolated subbeats; the midpoint of a 4-way split gets extra weight.
        for j in range(1, subdivision):
            subbeat_time = current_beat + j * interval

            if j == subdivision // 2 and subdivision == 4:
                strength = 0.8
            else:
                strength = 0.5

            subbeat_times.append({
                "time": float(subbeat_time),
                "type": "sub",
                "strength": float(strength),
                "beat_index": i,
                "subbeat_index": j
            })

    # Append the final main beat, which the pairwise loop above skips.
    if beat_times:
        try:
            subbeat_times.append({
                "time": float(beat_times[-1]),
                "type": "main",
                "strength": 1.0,
                "beat_index": len(beat_times) - 1
            })
        except (ValueError, TypeError):
            pass

    return {
        "tempo": float(tempo) if tempo is not None else 120.0,
        "beat_times": beat_times,
        "subbeat_times": subbeat_times
    }
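
# Shape of the result (values illustrative): with subdivision=4, beats at 0.0s
# and 1.0s produce "main" entries at 0.0s and 1.0s plus "sub" entries at 0.25s
# and 0.75s (strength 0.5) and at 0.5s (strength 0.8, the midpoint slot).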

def map_beats_to_seconds(subbeat_times, duration, fps=1.0):
    """
    Map beats and subbeats to second-level intervals.

    Parameters:
        subbeat_times: List of dictionaries containing beat and subbeat information
        duration: Total duration of the audio in seconds
        fps: Frames per second (default: 1.0 for one-second intervals)

    Returns:
        List of dictionaries, each containing beats within a time window
    """
    if not isinstance(subbeat_times, list):
        print("Warning: subbeat_times is not a list")
        subbeat_times = []

    try:
        duration = float(duration)
    except (ValueError, TypeError):
        print("Warning: duration is not convertible to float, defaulting to 30")
        duration = 30.0

    num_windows = int(duration * fps) + 1

    time_windows = []

    for i in range(num_windows):
        start_time = i / fps
        end_time = (i + 1) / fps

        # Collect the beats that fall inside this window.
        window_beats = []

        for beat in subbeat_times:
            if not isinstance(beat, dict):
                continue

            try:
                beat_time = float(beat.get("time", 0))
            except (ValueError, TypeError):
                continue

            if start_time <= beat_time < end_time:
                beat_type = beat.get("type", "sub")
                if not isinstance(beat_type, str):
                    beat_type = "sub"

                try:
                    strength = float(beat.get("strength", 0.5))
                except (ValueError, TypeError):
                    strength = 0.5

                window_beats.append({
                    "time": beat_time,
                    "type": beat_type,
                    "strength": strength,
                    "relative_pos": (beat_time - start_time) / (1 / fps)
                })

        time_windows.append({
            "second": i,
            "start": start_time,
            "end": end_time,
            "beats": window_beats
        })

    return time_windows
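
# Example: with fps=1.0 and duration=3.2 this yields 4 windows covering
# [0,1), [1,2), [2,3), [3,4); each beat dict lands in exactly one window with
# a relative_pos in [0,1) inside it.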

def create_second_level_templates(sec_map, tempo, genre=None):
    """
    Create syllable templates for each second-level window.

    Parameters:
        sec_map: List of second-level time windows with beat information
        tempo: Tempo in BPM
        genre: Optional genre for genre-specific adjustments

    Returns:
        List of template strings, one for each second
    """
    def tempo_to_syllable_base(tempo):
        """Continuous function mapping tempo to syllable base count."""
        # Piecewise-linear mapping: faster tempo leaves room for fewer syllables.
        if tempo > 180:
            return 1.0
        elif tempo > 140:
            return 1.0 + (180 - tempo) * 0.02
        elif tempo > 100:
            return 1.8 + (140 - tempo) * 0.01
        elif tempo > 70:
            return 2.2 + (100 - tempo) * 0.02
        else:
            return 2.8 + max(0, (70 - tempo) * 0.04)

    base_syllables = tempo_to_syllable_base(tempo)

    # Genre adjustment: denser lyrics for rap, sparser for folk-like genres.
    genre_factor = 1.0
    if genre:
        genre_lower = genre.lower()
        if any(term in genre_lower for term in ["rap", "hip hop", "hip-hop"]):
            genre_factor = 1.4
        elif any(term in genre_lower for term in ["folk", "country", "ballad"]):
            genre_factor = 0.8

    templates = []

    for window in sec_map:
        beats = window["beats"]

        # A window without beats becomes a single weak placeholder.
        if not beats:
            templates.append("w(0.5):1")
            continue

        beat_patterns = []

        for beat in beats:
            if not isinstance(beat, dict):
                continue

            # Map the beat to a stress class: main beats are strong, strong
            # subbeats are medium, the rest are weak.
            if "type" not in beat or not isinstance(beat["type"], str):
                beat_type = "w"
            else:
                beat_type = "S" if beat["type"] == "main" else "m" if beat.get("strength", 0) >= 0.7 else "w"

            try:
                strength = float(beat.get("strength", 0.5))
            except (ValueError, TypeError):
                strength = 0.5

            if beat_type == "S":
                syllable_factor = 1.2
            elif beat_type == "m":
                syllable_factor = 1.0
            else:
                syllable_factor = 0.8

            syllable_count = base_syllables * syllable_factor * genre_factor

            # Round to half-syllable resolution and clamp to a usable range.
            syllable_count = round(syllable_count * 2) / 2
            syllable_count = max(0.5, min(4, syllable_count))

            strength_pct = round(strength * 100) / 100
            beat_patterns.append(f"{beat_type}({strength_pct}):{syllable_count}")

        if not beat_patterns:
            templates.append("w(0.5):1")
        else:
            second_template = "-".join(beat_patterns)
            templates.append(second_template)

    return templates
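
# Template notation (one string per second), e.g. "S(1.0):2-w(0.5):1" reads:
# a strong beat of strength 1.0 carrying 2 syllables, then a weak beat of
# strength 0.5 carrying 1 syllable. Counts are half-syllable steps in [0.5, 4].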

def detect_sections(y, sr):
    """
    Detect musical segments without classifying them by type (verse, chorus, etc.).

    Parameters:
        y: Audio time series
        sr: Sample rate

    Returns:
        A list of section dictionaries with start time, end time, and duration
    """
    hop_length = 512

    # Spectral contrast highlights timbral changes between sections.
    S = np.abs(librosa.stft(y, hop_length=hop_length))
    contrast = librosa.feature.spectral_contrast(S=S, sr=sr)

    # Chroma captures harmonic content.
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length)

    # MFCCs capture timbre.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length)

    # RMS energy distinguishes loud and quiet passages.
    rms = librosa.feature.rms(y=y, hop_length=hop_length)

    y_harmonic, y_percussive = librosa.effects.hpss(y)

    duration = librosa.get_duration(y=y, sr=sr)

    # Stack the normalized features into one matrix (features x frames).
    feature_stack = np.vstack([
        librosa.util.normalize(contrast),
        librosa.util.normalize(chroma),
        librosa.util.normalize(mfcc),
        librosa.util.normalize(rms)
    ])

    # Transpose to frames x features for clustering.
    feature_matrix = feature_stack.T

    from sklearn.decomposition import PCA

    n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1])

    if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0:
        try:
            pca = PCA(n_components=n_components)
            reduced_features = pca.fit_transform(feature_matrix)
        except Exception as e:
            print(f"PCA failed, falling back to original features: {e}")
            reduced_features = feature_matrix
    else:
        reduced_features = feature_matrix

    # Pick a segment-count range proportional to duration, within sane bounds.
    min_segments = max(2, int(duration / 60))
    max_segments = min(10, int(duration / 20))

    min_segments = max(2, min(min_segments, 4))
    max_segments = max(min_segments + 1, min(max_segments, 8))

    # Search for the segment count with the best silhouette score.
    best_segments = min_segments
    best_score = -1

    from sklearn.metrics import silhouette_score
    from sklearn.cluster import AgglomerativeClustering

    if reduced_features.shape[0] > max_segments:
        for n_segments in range(min_segments, max_segments + 1):
            try:
                clustering = AgglomerativeClustering(n_clusters=n_segments)
                labels = clustering.fit_predict(reduced_features)

                if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1:
                    score = silhouette_score(reduced_features, labels)

                    if score > best_score:
                        best_score = score
                        best_segments = n_segments
            except Exception as e:
                print(f"Clustering with {n_segments} segments failed: {e}")
                continue

    n_segments = best_segments

    # Final clustering pass; boundaries fall where the cluster label changes.
    try:
        clustering = AgglomerativeClustering(n_clusters=n_segments)
        labels = clustering.fit_predict(reduced_features)

        boundaries = [0]

        for i in range(1, len(labels)):
            if labels[i] != labels[i-1]:
                boundaries.append(i)

        boundaries.append(len(labels))

        bounds_frames = np.array(boundaries)

    except Exception as e:
        print(f"Final clustering failed: {e}")
        # Fall back to librosa's built-in agglomerative segmentation.
        bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments)

    bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length)

    sections = []

    for i in range(len(bounds_times) - 1):
        start = bounds_times[i]
        end = bounds_times[i+1]
        duration = end - start

        # Drop very short interior segments.
        if duration < 4 and i > 0 and i < len(bounds_times) - 2:
            continue

        sections.append({
            "type": "segment",
            "start": start,
            "end": end,
            "duration": duration
        })

    # Keep only segments long enough to carry a lyrical phrase.
    sections = [s for s in sections if s["duration"] >= 5]

    return sections
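
# Minimal usage sketch for detect_sections (illustrative helper, never called;
# assumes a local "song.wav"):
def _demo_detect_sections(path="song.wav"):
    y, sr = librosa.load(path, sr=SAMPLE_RATE)
    for s in detect_sections(y, sr):
        print(f"{s['start']:6.1f}s - {s['end']:6.1f}s  ({s['duration']:.1f}s)")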

def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'):
    """
    Create enhanced syllable templates based on beat patterns with improved musical intelligence.

    Parameters:
        beats_info: Dictionary containing beat analysis data
        genre: Optional genre to influence template creation
        phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation

    Returns:
        String of syllable templates with embedded strength values and flexible timing
    """
    import numpy as np
    from sklearn.cluster import KMeans

    # Convert any NumPy values to plain Python types for safe downstream use.
    if isinstance(beats_info, dict):
        processed_beats_info = {}
        for k, v in beats_info.items():
            if isinstance(v, np.ndarray):
                if v.size == 1:
                    processed_beats_info[k] = float(v.item())
                else:
                    processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
            elif isinstance(v, np.number):
                processed_beats_info[k] = float(v)
            elif isinstance(v, list):
                processed_beats_info[k] = [float(x) if isinstance(x, np.number) else x for x in v]
            else:
                processed_beats_info[k] = v
        beats_info = processed_beats_info

    beat_times = beats_info.get("beat_times", [])
    beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times))
    tempo = beats_info.get("tempo", 120)
    time_signature = beats_info.get("time_signature", 4)

    # Too few beats to build a meaningful template; return a generic one.
    if len(beat_times) < 2:
        return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1"

    # Derive stress thresholds by clustering the beat strengths into
    # weak/medium/strong groups when there is enough data.
    if len(beat_strengths) >= 6:
        X = np.array(beat_strengths).reshape(-1, 1)

        kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)

        centroids = sorted([float(c[0]) for c in kmeans.cluster_centers_])

        if len(centroids) >= 3:
            medium_threshold = (centroids[0] + centroids[1]) / 2
            strong_threshold = (centroids[1] + centroids[2]) / 2
        else:
            # Fall back to percentile thresholds.
            medium_threshold = np.percentile(beat_strengths, 33)
            strong_threshold = np.percentile(beat_strengths, 66)
    else:
        medium_threshold = np.percentile(beat_strengths, 33)
        strong_threshold = np.percentile(beat_strengths, 66)

    phrases = beats_info.get("phrases", [])

    # Recompute phrases from measure boundaries when requested or missing.
    if phrase_mode == 'auto' or not phrases:
        phrases = []
        current_phrase = []

        for i in range(len(beat_times)):
            current_phrase.append(i)

            if (i + 1) % time_signature == 0 or i == len(beat_times) - 1:
                if len(current_phrase) >= 2:
                    phrases.append(current_phrase)
                current_phrase = []

        if current_phrase and len(current_phrase) >= 2:
            phrases.append(current_phrase)

    def tempo_to_syllable_base(tempo):
        """Continuous function mapping tempo to syllable base count with scientific curve."""
        # Clamp the extremes and use a logistic curve in between.
        if tempo < 40:
            return 3.5
        elif tempo > 200:
            return 0.8
        else:
            L = 3.5
            k = 0.04
            x0 = 120
            return L / (1 + np.exp(k * (tempo - x0)))

    syllable_templates = []

    for phrase in phrases:
        if not phrase:
            continue

        phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)]
        if not phrase_strengths:
            phrase_strengths = [1.0] * len(phrase)

        # Classify each beat as strong/medium/weak, boosting metrically
        # important positions (downbeats, and the backbeat in 4/4).
        stress_pattern = []
        for i, strength in enumerate(phrase_strengths):
            metrical_position = i % time_signature

            if metrical_position == 0:
                position_boost = 0.18
            elif time_signature == 4 and metrical_position == 2:
                position_boost = 0.1
            elif time_signature == 3 and metrical_position == 1:
                position_boost = 0.05
            else:
                position_boost = 0

            effective_strength = strength + position_boost

            if effective_strength >= strong_threshold:
                stress_pattern.append(("S", effective_strength))
            elif effective_strength >= medium_threshold:
                stress_pattern.append(("m", effective_strength))
            else:
                stress_pattern.append(("w", effective_strength))

        # Assign a syllable count to each beat from tempo, stress, and genre.
        detailed_template = []

        for i, (stress_type, strength) in enumerate(stress_pattern):
            base_syllables = tempo_to_syllable_base(tempo)

            metrical_position = i % time_signature
            position_factor = 1.2 if metrical_position == 0 else 1.0

            if stress_type == "S":
                syllable_factor = 1.2 * position_factor
            elif stress_type == "m":
                syllable_factor = 1.0 * position_factor
            else:
                syllable_factor = 0.8

            genre_factor = 1.0
            if genre:
                genre = genre.lower()
                if "rap" in genre or "hip" in genre:
                    genre_factor = 1.5
                elif "folk" in genre or "country" in genre or "ballad" in genre:
                    genre_factor = 0.7
                elif "metal" in genre or "rock" in genre:
                    genre_factor = 1.1
                elif "jazz" in genre:
                    genre_factor = 1.2
                elif "classical" in genre:
                    genre_factor = 0.9

            raw_count = base_syllables * syllable_factor * genre_factor

            # Round to quarter-syllable resolution and clamp to a usable range.
            rounded_count = round(raw_count * 4) / 4
            syllable_count = max(0.5, min(4, rounded_count))

            strength_pct = round(strength * 100) / 100
            detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}")

        phrase_template = "-".join(detailed_template)
        syllable_templates.append(phrase_template)

    # Fallback templates keyed to the time signature.
    if not syllable_templates:
        if time_signature == 3:
            syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"]
        elif time_signature == 2:
            syllable_templates = ["S(0.95):1.5-w(0.4):1"]
        else:
            syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"]

    return "|".join(syllable_templates)
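
# Output format: phrases joined by "|", beats within a phrase by "-", e.g.
# "S(0.95):2-w(0.4):1|m(0.7):1.5-w(0.35):1". Unlike the second-level
# templates, counts here use quarter-syllable resolution, still clamped to
# [0.5, 4].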

def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_wrap=10,
                                         structured_output=False, beat_types=None):
    """
    Convert technical syllable templates into clear, human-readable instructions with
    enhanced flexibility and customization options.

    Parameters:
        syllable_templates: String or list of templates
        arrow: Symbol to use between beats (default: "→")
        line_wrap: Number of beats before automatic line wrapping (0 = no wrapping)
        structured_output: If True, return structured data instead of text
        beat_types: Custom mapping for beat types (default: None, uses standard mapping)

    Returns:
        Human-readable instructions or structured data depending on parameters
    """
    if not syllable_templates:
        return {} if structured_output else ""

    # Standard beat-type vocabulary used by the template notation.
    default_beat_types = {
        "S": {"name": "STRONG", "description": "stressed syllable"},
        "m": {"name": "medium", "description": "medium-stressed syllable"},
        "w": {"name": "weak", "description": "unstressed syllable"},
        "X": {"name": "EXTRA", "description": "extra strong syllable"},
        "L": {"name": "legato", "description": "connected/tied syllable"}
    }

    beat_types = beat_types or default_beat_types

    structured_data = {"lines": [], "explanations": []} if structured_output else None

    # Detect whether we were given the enhanced "S(0.9):2-w(0.4):1|..." notation.
    is_enhanced_format = False

    if isinstance(syllable_templates, str):
        if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates
               for bt in beat_types.keys()):
            is_enhanced_format = True
        elif "|" in syllable_templates:
            is_enhanced_format = True

    output = []

    if is_enhanced_format:
        phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates]

        for i, phrase in enumerate(phrases):
            # A "(swing)" marker applies to the whole phrase.
            has_swing = "(swing)" in phrase
            if has_swing:
                phrase = phrase.replace("(swing)", "")

            beats = phrase.split("-")
            beat_instructions = []
            parsed_beats = []  # Parsed beat dicts, kept for structured output.

            for j, beat in enumerate(beats):
                beat_info = {"original": beat, "type": None, "count": None, "strength": None}

                # Full format: "S(0.95):2".
                if "(" in beat and ")" in beat and ":" in beat:
                    parts = beat.split(":")
                    beat_type = parts[0].split("(")[0]
                    strength = parts[0].split("(")[1].rstrip(")")
                    count = parts[1]

                    beat_info["type"] = beat_type
                    beat_info["count"] = count
                    beat_info["strength"] = strength

                # Compact format: "S2".
                elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1:
                    beat_type = beat[0]
                    count = beat[1:]

                    beat_info["type"] = beat_type
                    beat_info["count"] = count

                # Anything else is passed through verbatim.
                else:
                    beat_instructions.append(beat)
                    parsed_beats.append(beat_info)
                    continue

                parsed_beats.append(beat_info)

                if beat_info["type"] in beat_types:
                    type_name = beat_types[beat_info["type"]]["name"]
                    if beat_info["strength"]:
                        beat_instructions.append(f"{type_name}({beat_info['count']}) [{beat_info['strength']}]")
                    else:
                        beat_instructions.append(f"{type_name}({beat_info['count']})")
                else:
                    beat_instructions.append(beat)

            # Wrap long beat sequences for readability.
            if line_wrap > 0 and len(beat_instructions) > line_wrap:
                wrapped_instructions = []
                for k in range(0, len(beat_instructions), line_wrap):
                    section = beat_instructions[k:k+line_wrap]
                    wrapped_instructions.append(f"{arrow} ".join(section))
                line_desc = f"\n {arrow} ".join(wrapped_instructions)
            else:
                line_desc = f" {arrow} ".join(beat_instructions)

            if has_swing:
                line_desc += " [with swing feel]"

            line_output = f"Line {i+1}: {line_desc}"
            output.append(line_output)

            if structured_output:
                structured_data["lines"].append({
                    "line_number": i+1,
                    "beats": [{"original": b["original"],
                               "type": b.get("type"),
                               "count": b.get("count"),
                               "strength": b.get("strength")}
                              for b in parsed_beats],
                    "has_swing": has_swing
                })

        explanation = [
            "\n📝 UNDERSTANDING THE NOTATION:"
        ]

        # Only explain the beat types that actually occur in the templates.
        used_beat_types = set()
        for phrase in phrases:
            for beat in phrase.split("-"):
                for bt in beat_types.keys():
                    if beat.startswith(bt):
                        used_beat_types.add(bt)

        for bt in used_beat_types:
            if bt in beat_types:
                name = beat_types[bt]["name"]
                desc = beat_types[bt]["description"]
                explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables")

        explanation.extend([
            f"- {arrow}: Indicates flow from one beat to the next",
            "- [0.xx]: Beat strength value (higher = more emphasis needed)"
        ])

        output.extend(explanation)

        if structured_output:
            structured_data["explanations"] = explanation

        # Add examples when half-syllable counts are in play.
        has_half_syllables = any((".5" in beat) for phrase in phrases for beat in phrase.split("-"))
        if has_half_syllables:
            half_syllable_examples = [
                "\n🎵 HALF-SYLLABLE EXAMPLES:",
                "- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable",
                "  Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick",
                "- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables",
                "  Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick"
            ]
            output.extend(half_syllable_examples)

            if structured_output:
                structured_data["half_syllable_examples"] = half_syllable_examples

        if any("swing" in phrase for phrase in phrases):
            swing_guide = [
                "\n🎶 SWING RHYTHM GUIDE:",
                "- In swing, syllables should be unevenly timed (long-short pattern)",
                "- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay"
            ]
            output.extend(swing_guide)

            if structured_output:
                structured_data["swing_guide"] = swing_guide

    # Simple format: plain syllable counts per line.
    else:
        formatted_lines = []

        if isinstance(syllable_templates, list):
            for i, template in enumerate(syllable_templates):
                if isinstance(template, dict) and "syllable_template" in template:
                    line = f"Line {i+1}: {template['syllable_template']} syllables"
                    formatted_lines.append(line)

                    if structured_output:
                        structured_data["lines"].append({
                            "line_number": i+1,
                            "syllable_count": template["syllable_template"]
                        })
                elif isinstance(template, str):
                    line = f"Line {i+1}: {template} syllables"
                    formatted_lines.append(line)

                    if structured_output:
                        structured_data["lines"].append({
                            "line_number": i+1,
                            "syllable_count": template
                        })

            output = formatted_lines
        else:
            output = [str(syllable_templates)]

            if structured_output:
                structured_data["raw_content"] = str(syllable_templates)

    application_tips = [
        "\n💡 APPLICATION TIPS:",
        "1. Strong beats need naturally stressed syllables (like the MEM in \"re-MEM-ber\")",
        "2. Place important words on strong beats for natural emphasis",
        "3. Vowel sounds work best for sustained or emphasized syllables",
        "4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats"
    ]
    output.extend(application_tips)

    if structured_output:
        structured_data["application_tips"] = application_tips
        return structured_data

    return "\n".join(output)
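
# Example: format_syllable_templates_for_prompt("S(0.9):2-w(0.4):1") renders
# roughly as "Line 1: STRONG(2) [0.9] → weak(1) [0.4]", followed by the
# notation explanation and application tips (exact text depends on which beat
# types appear in the template).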

def verify_flexible_syllable_counts(lyrics, templates, second_level_templates=None):
    """
    Enhanced verification of syllable counts and stress patterns with precise alignment analysis
    for both phrase-level and second-level templates.
    """
    import re
    import pronouncing
    import numpy as np
    import functools
    from itertools import chain

    print(f"DEBUG: In verify_flexible_syllable_counts, type of lyrics={type(lyrics)}")
    print(f"DEBUG: Type of templates={type(templates)}")

    # Coerce inputs into the expected types.
    if not isinstance(lyrics, str):
        print(f"DEBUG: lyrics is not a string, it's {type(lyrics)}")
        try:
            lyrics = str(lyrics)
        except Exception as e:
            print(f"DEBUG: Cannot convert lyrics to string: {str(e)}")
            return "Error: Cannot process non-string lyrics"

    if not isinstance(templates, list):
        print(f"DEBUG: templates is not a list, it's {type(templates)}")
        if templates is not None:
            templates = [templates]
        else:
            templates = []

    lines = [line.strip() for line in lyrics.split("\n") if line.strip()]

    verification_notes = []
    detailed_analysis = []
    stress_misalignments = []
    total_mismatch_count = 0

    # ---- Phrase-level verification ----
    for i, line in enumerate(lines):
        if i >= len(templates):
            break

        template = templates[i]
        print(f"DEBUG: Processing template {i+1}, type={type(template)}")

        # Accept either raw strings or {"syllable_template": ...} dicts.
        template_str = None
        if isinstance(template, dict) and "syllable_template" in template:
            template_str = template["syllable_template"]
        elif isinstance(template, str):
            template_str = template
        else:
            print(f"DEBUG: Skipping template {i+1}, not a string or dict with syllable_template")
            continue

        if not isinstance(template_str, str):
            print(f"DEBUG: template_str is not a string, it's {type(template_str)}")
            continue

        template_phrases = [template_str]
        if "|" in template_str:
            template_phrases = template_str.split("|")

        # Find the phrase whose expected syllable total best matches the line.
        best_match_diff = float('inf')
        best_match_phrase = None
        best_phrase_beats = None
        actual_count = count_syllables(line)

        for phrase_idx, phrase in enumerate(template_phrases):
            beats_info = []
            total_expected = 0

            if "-" in phrase:
                beat_templates = phrase.split("-")

                for beat in beat_templates:
                    beat_info = {"original": beat, "type": None, "count": 1, "strength": None}

                    # Full format: "S(0.95):2".
                    if "(" in beat and ")" in beat and ":" in beat:
                        parts = beat.split(":")
                        beat_type = parts[0].split("(")[0]
                        try:
                            strength = float(parts[0].split("(")[1].rstrip(")"))
                        except ValueError:
                            strength = 1.0

                        try:
                            count = float(parts[1])
                            # Normalize whole numbers to ints.
                            if count == int(count):
                                count = int(count)
                        except ValueError:
                            count = 1

                        beat_info.update({
                            "type": beat_type,
                            "count": count,
                            "strength": strength
                        })

                    # Compact format: "S2".
                    elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]):
                        beat_type = beat[0]

                        try:
                            count_str = beat[1:]
                            count = float(count_str)
                            if count == int(count):
                                count = int(count)
                        except ValueError:
                            count = 1

                        beat_info.update({
                            "type": beat_type,
                            "count": count
                        })

                    # Bare numbers are treated as syllable counts.
                    else:
                        try:
                            count = float(beat)
                            if count == int(count):
                                count = int(count)
                            beat_info["count"] = count
                        except ValueError:
                            pass

                    beats_info.append(beat_info)
                    total_expected += beat_info["count"]

                phrase_diff = abs(actual_count - total_expected)

                # Tolerance scales with the expected count.
                expected_ratio = 0.15 if total_expected > 10 else 0.25
                phrase_threshold = max(1, round(total_expected * expected_ratio))

                if phrase_diff < best_match_diff:
                    best_match_diff = phrase_diff
                    best_match_phrase = phrase
                    best_phrase_beats = beats_info

            # A phrase may also be a bare numeric syllable count.
            else:
                try:
                    total_expected = float(phrase)
                    phrase_diff = abs(actual_count - total_expected)
                    if phrase_diff < best_match_diff:
                        best_match_diff = phrase_diff
                        best_match_phrase = phrase
                        best_phrase_beats = [{"count": total_expected}]
                except ValueError:
                    pass

        if best_match_phrase and best_phrase_beats:
            total_expected = sum(beat["count"] for beat in best_phrase_beats)

            expected_ratio = 0.15 if total_expected > 10 else 0.25
            threshold = max(1, round(total_expected * expected_ratio))

            if total_expected > 0 and best_match_diff > threshold:
                verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}")
                total_mismatch_count += 1

                words = re.findall(r'\b[a-zA-Z]+\b', line.lower())

                # Build a per-word syllable/stress map for the line.
                word_analysis = []
                cumulative_syllables = 0

                for word in words:
                    syllable_count = count_syllables_for_word(word)

                    stress_pattern = get_word_stress(word)

                    word_analysis.append({
                        "word": word,
                        "syllables": syllable_count,
                        "stress_pattern": stress_pattern,
                        "position": cumulative_syllables
                    })

                    cumulative_syllables += syllable_count

                # Check that strong beats land on stressed syllables.
                if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b):
                    strong_positions = []
                    current_pos = 0

                    for beat in best_phrase_beats:
                        if beat.get("type") == "S":
                            strong_positions.append(current_pos)
                        current_pos += beat.get("count", 1)

                    alignment_issues = []

                    for pos in strong_positions:
                        misaligned_word = None

                        for word_info in word_analysis:
                            word_start = word_info["position"]
                            word_end = word_start + word_info["syllables"]

                            if word_start <= pos < word_end:
                                syllable_in_word = pos - word_start

                                stress = word_info["stress_pattern"]

                                if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
                                    misaligned_word = word_info["word"]
                                    alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)")
                                    stress_misalignments.append({
                                        "line": i+1,
                                        "word": word_info["word"],
                                        "position": pos,
                                        "suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word)
                                    })
                                break

                    if alignment_issues:
                        verification_notes.append(f"  → Stress misalignments: {', '.join(alignment_issues)}")

                alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis)
                if alignment_map:
                    detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}")
        else:
            verification_notes.append(f"Line {i+1}: Unable to find matching template pattern")

    # ---- Second-level (per-second) verification ----
    if second_level_templates:
        verification_notes.append("\n=== SECOND-LEVEL VERIFICATION ===\n")

        for i, template in enumerate(second_level_templates):
            if i >= len(lines):
                break

            line = lines[i]

            # Skip section headers like "[Chorus]".
            if line.startswith('[') and ']' in line:
                continue

            actual_count = count_syllables(line)

            total_expected = 0
            beat_patterns = []

            if isinstance(template, str) and "-" in template:
                for beat in template.split("-"):
                    if ":" in beat:
                        try:
                            count_part = beat.split(":")[1]
                            count = float(count_part)
                            total_expected += count

                            beat_type = beat.split("(")[0] if "(" in beat else beat[0]
                            beat_patterns.append((beat_type, count))
                        except (IndexError, ValueError):
                            pass

            if total_expected > 0:
                expected_ratio = 0.2
                threshold = max(0.5, round(total_expected * expected_ratio))

                difference = abs(actual_count - total_expected)

                if difference > threshold:
                    verification_notes.append(f"Second {i+1}: Expected {total_expected} syllables, got {actual_count}")
                    total_mismatch_count += 1

                    words = re.findall(r'\b[a-zA-Z]+\b', line.lower())
                    word_analysis = []
                    cumulative_syllables = 0

                    for word in words:
                        syllable_count = count_syllables_for_word(word)
                        stress_pattern = get_word_stress(word)

                        word_analysis.append({
                            "word": word,
                            "syllables": syllable_count,
                            "stress_pattern": stress_pattern,
                            "position": cumulative_syllables
                        })

                        cumulative_syllables += syllable_count

                    if beat_patterns:
                        strong_positions = []
                        current_pos = 0

                        for beat_type, count in beat_patterns:
                            if beat_type == "S":
                                strong_positions.append(current_pos)
                            current_pos += count

                        for pos in strong_positions:
                            for word_info in word_analysis:
                                word_start = word_info["position"]
                                word_end = word_start + word_info["syllables"]

                                if word_start <= pos < word_end:
                                    syllable_in_word = int(pos - word_start)
                                    stress = word_info["stress_pattern"]

                                    if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1':
                                        verification_notes.append(f"  → In second {i+1}, '{word_info['word']}' has unstressed syllable on strong beat")
                                    break

    # Append the verification report to the lyrics themselves.
    if verification_notes:
        lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n"
        lyrics += "\n".join(verification_notes)

        if detailed_analysis:
            lyrics += "\n\n[Detailed Alignment Analysis:]\n"
            lyrics += "\n\n".join(detailed_analysis)

        lyrics += "\n\n[How to fix rhythm mismatches:]\n"
        lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n"
        lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n"
        lyrics += "3. Try using words where natural stress aligns with musical rhythm\n"

        if stress_misalignments:
            lyrics += "\n[Specific word replacement suggestions:]\n"
            for issue in stress_misalignments[:5]:
                if issue["suggestion"]:
                    lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n"

    return lyrics
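
# Behavior sketch: a line whose syllable count differs from its template total
# by more than the tolerance (25% of the expected count, or 15% for templates
# above ten syllables, minimum 1) is flagged; the notes, alignment analyses,
# and replacement suggestions built above are appended to the returned lyrics.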

def generate_alignment_visualization(line, beats_info, word_analysis):
    """Generate a visual representation of syllable alignment with beats."""
    if not beats_info or not word_analysis:
        return None

    # Break the line into syllables, tracking the stress of each.
    syllable_breakdown = []
    syllable_stresses = []

    for word_info in word_analysis:
        word = word_info["word"]
        syllables = word_info["syllables"]
        stress = word_info["stress_pattern"] or ""

        # Pad the stress pattern out to the syllable count.
        while len(stress) < syllables:
            stress += "0"

        parts = naive_syllable_split(word, syllables)

        for i, part in enumerate(parts):
            syllable_breakdown.append(part)
            if i < len(stress):
                syllable_stresses.append(stress[i])
            else:
                syllable_stresses.append("0")

    # Expand the beat templates into one beat type per expected syllable.
    beat_types = []

    for beat in beats_info:
        beat_type = beat.get("type", "-")
        count = beat.get("count", 1)

        if isinstance(count, int):
            beat_types.extend([beat_type] * count)
        else:
            # Fractional counts get a marked half-syllable slot.
            whole_part = int(count)
            frac_part = count - whole_part

            if whole_part > 0:
                beat_types.extend([beat_type] * whole_part)

            if frac_part > 0:
                beat_types.append(f"{beat_type}½")

    # Pad or trim the beat row to the syllable row's length.
    while len(beat_types) < len(syllable_breakdown):
        beat_types.append("-")

    beat_types = beat_types[:len(syllable_breakdown)]

    result = []

    # Row 1: syllables, with stressed ones capitalized.
    syllable_display = []
    for i, syllable in enumerate(syllable_breakdown):
        if i < len(syllable_stresses) and syllable_stresses[i] == "1":
            syllable_display.append(syllable.upper())
        else:
            syllable_display.append(syllable.lower())

    result.append(" - ".join(syllable_display))

    # Row 2: match markers (↑ = stressed syllable on strong beat, ❌ = mismatch).
    beat_indicators = []
    for stress_char, beat_type in zip(syllable_stresses, beat_types):
        if beat_type == "S" or beat_type.startswith("S"):
            if stress_char == "1":
                beat_indicators.append("↑")
            else:
                beat_indicators.append("❌")
        elif beat_type == "m" or beat_type.startswith("m"):
            beat_indicators.append("•")
        elif beat_type == "w" or beat_type.startswith("w"):
            beat_indicators.append("·")
        else:
            beat_indicators.append(" ")

    result.append(" ".join(beat_indicators))

    # Row 3: the beat types themselves.
    result.append(" - ".join(beat_types))

    return "\n".join(result)
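
# Example output (approximate) for "hold the line" against S-w-S, with the
# columns loosely aligned:
#   HOLD - the - LINE
#   ↑ · ↑
#   S - w - S
# "↑" marks a stressed syllable on a strong beat; "❌" would mark a mismatch.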

@functools.lru_cache(maxsize=256)
def naive_syllable_split(word, syllable_count):
    """Naively split a word into the specified number of syllables, with caching for performance."""
    if syllable_count <= 1:
        return [word]

    vowels = "aeiouy"
    consonants = "bcdfghjklmnpqrstvwxz"

    # Candidate split points at vowel/consonant boundaries.
    splits = []
    for i in range(1, len(word) - 1):
        if word[i] in consonants and word[i-1] in vowels:
            splits.append(i)
        elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants:
            splits.append(i+1)

    # Top up with arbitrary split points if the heuristic found too few;
    # guard against an infinite loop when the word is too short to split further.
    while len(splits) < syllable_count - 1:
        added = False
        for i in range(1, len(word)):
            if i not in splits:
                splits.append(i)
                added = True
                break
        if not added:
            break

    splits.sort()
    splits = splits[:syllable_count - 1]

    # Slice the word at the chosen points.
    result = []
    prev = 0
    for pos in splits:
        result.append(word[prev:pos])
        prev = pos

    result.append(word[prev:])
    return result
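
# Example: naive_syllable_split("water", 2) -> ["wa", "ter"]. The split is
# orthographic, not phonetic, and is only used for display in the alignment
# visualization above.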

def get_stress_aligned_alternatives(word, position_to_stress):
    """Suggest alternative words with proper stress at the required position."""
    syllable_count = count_syllables_for_word(word)

    # Small curated lists keyed by syllable count and stressed position.
    if syllable_count == 2:
        if position_to_stress == 0:
            first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing",
                            "heart-beat", "sun-light", "moon-light", "star-light"]
            return ", ".join(first_stress[:3])
        else:
            second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE",
                             "a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"]
            return ", ".join(second_stress[:3])
    elif syllable_count == 3:
        if position_to_stress == 0:
            return "MEM-o-ry, WON-der-ful, BEAU-ti-ful"
        elif position_to_stress == 1:
            return "a-MAZE-ing, to-GE-ther, for-EV-er"
        else:
            return "un-der-STAND, o-ver-COME, ne-ver-MORE"

    # Generic fallback for other word lengths.
    return f"a word with stress on syllable {position_to_stress + 1}"
|
|
|
def generate_lyrics(genre, duration, emotion_results, song_structure=None, lyrics_requirements=None): |
|
""" |
|
Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment. |
|
|
|
This improved version uses advanced template creation, better formatting, and verification with |
|
potential refinement for lyrics that perfectly match the musical rhythm patterns. |
|
|
|
Parameters: |
|
genre: Musical genre of the audio |
|
duration: Duration of the audio in seconds |
|
emotion_results: Dictionary containing emotional analysis results |
|
song_structure: Optional dictionary containing song structure analysis |
|
lyrics_requirements: Optional user-provided requirements for the lyrics |
|
|
|
Returns: |
|
Generated lyrics aligned with the rhythm patterns of the music |
|
""" |
|
|
|
def is_safe_dict_access(obj, key): |
|
"""Safe dictionary key access with type checking""" |
|
if not isinstance(obj, dict): |
|
print(f"WARNING: Attempted to access key '{key}' on non-dictionary object of type {type(obj)}") |
|
return False |
|
return key in obj |
|
|
|
|
|
if not isinstance(emotion_results, dict): |
|
emotion_results = { |
|
"emotion_analysis": {"primary_emotion": "Unknown"}, |
|
"theme_analysis": {"primary_theme": "Unknown"}, |
|
"rhythm_analysis": {"tempo": 0}, |
|
"tonal_analysis": {"key": "Unknown", "mode": ""}, |
|
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} |
|
} |
|
|
|
|
|
if song_structure is not None and not isinstance(song_structure, dict): |
|
print(f"WARNING: song_structure is not a dict, it's {type(song_structure)}") |
|
song_structure = None |
|
|
|
print(f"DEBUG: Starting generate_lyrics with genre={genre}, duration={duration}") |
|
print(f"DEBUG: Type of song_structure={type(song_structure)}") |
|
print(f"DEBUG: Type of emotion_results={type(emotion_results)}") |
|
|
|
|
|
def safe_dict_get(d, key, default=None): |
|
"""Safely get a value from a dictionary, handling non-dictionary objects.""" |
|
if not isinstance(d, dict): |
|
print(f"WARNING: Attempted to access key '{key}' in non-dictionary object of type {type(d)}") |
|
return default |
|
return d.get(key, default) |
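
# e.g. safe_dict_get({"x": {"y": 1}}, "x", {}) -> {"y": 1}
#      safe_dict_get(None, "x", 0)             -> 0 (and prints a warning)
# The nested lookups below chain two calls so a missing outer dict
# degrades to the default instead of raising.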
|
|
|
|
|
primary_emotion = safe_dict_get(safe_dict_get(emotion_results, "emotion_analysis", {}), "primary_emotion", "Unknown") |
|
primary_theme = safe_dict_get(safe_dict_get(emotion_results, "theme_analysis", {}), "primary_theme", "Unknown") |
|
|
|
|
|
try: |
|
tempo = float(safe_dict_get(safe_dict_get(emotion_results, "rhythm_analysis", {}), "tempo", 0.0)) |
|
except (ValueError, TypeError): |
|
tempo = 0.0 |
|
|
|
key = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "key", "Unknown") |
|
mode = safe_dict_get(safe_dict_get(emotion_results, "tonal_analysis", {}), "mode", "") |
|
|
|
|
|
syllable_guidance = "" |
|
templates_for_verification = [] |
|
|
|
|
|
structure_visualization = "=== MUSIC-LYRICS STRUCTURE MATCHING ===\n\n" |
|
structure_visualization += f"Song Duration: {duration:.1f} seconds\n" |
|
structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n" |
|
|
|
|
|
if song_structure and is_safe_dict_access(song_structure, "second_level") and is_safe_dict_access(song_structure.get("second_level", {}), "templates"): |
|
print(f"DEBUG: Using second-level templates") |
|
second_level_templates = song_structure.get("second_level", {}).get("templates", []) |
|
|
|
|
|
second_level_guidance = "\nSECOND-BY-SECOND RHYTHM INSTRUCTIONS:\n" |
|
second_level_guidance += "Each line below corresponds to ONE SECOND of audio. Follow these rhythm patterns EXACTLY:\n\n" |
|
|
|
|
|
formatted_second_templates = [] |
|
for i, template in enumerate(second_level_templates): |
|
if i < min(60, len(second_level_templates)): |
|
formatted_template = format_syllable_templates_for_prompt(template, arrow="→", line_wrap=0) |
|
formatted_second_templates.append(f"Second {i+1}: {formatted_template}") |
|
|
|
second_level_guidance += "\n".join(formatted_second_templates) |
|
|
|
|
|
second_level_guidance += "\n\nCRITICAL: Create ONE LINE of lyrics for EACH SECOND, following the exact rhythm pattern." |
|
second_level_guidance += "\nIf a second has no beats, use it for a breath or pause in the lyrics." |
|
second_level_guidance += "\nThe first line of your lyrics MUST match Second 1, the second line matches Second 2, and so on." |
|
|
|
|
|
syllable_guidance = second_level_guidance |
|
|
|
|
|
templates_for_verification = second_level_templates |
|
|
|
elif song_structure: |
|
print(f"DEBUG: Checking flexible structure") |
|
|
|
if is_safe_dict_access(song_structure, "flexible_structure"): |
|
print(f"DEBUG: Using flexible structure") |
|
flexible = song_structure.get("flexible_structure", {}) |
|
if is_safe_dict_access(flexible, "segments") and len(flexible.get("segments", [])) > 0: |
|
print(f"DEBUG: Found segments in flexible structure") |
|
|
|
segments = flexible.get("segments", []) |
|
|
|
|
|
structure_visualization += f"Total segments: {len(segments)}\n" |
|
structure_visualization += "Each segment represents one musical phrase for which you should write ONE line of lyrics.\n\n" |
|
|
|
|
|
enhanced_templates = [] |
|
|
|
for i, segment in enumerate(segments): |
|
if i < 30: |
|
|
|
segment_start = segment["start"] |
|
segment_end = segment["end"] |
|
|
|
|
|
structure_visualization += f"Segment {i+1}: {segment_start:.1f}s - {segment_end:.1f}s (duration: {segment_end-segment_start:.1f}s)\n" |
|
|
|
|
|
segment_beats = [] |
|
|
|
|
|
print(f"DEBUG: Checking beat_times in flexible structure") |
|
if is_safe_dict_access(flexible, "beats") and is_safe_dict_access(flexible.get("beats", {}), "beat_times"): |
|
beat_times = flexible.get("beats", {}).get("beat_times", []) |
|
if isinstance(beat_times, list): |
|
beat_strengths = flexible.get("beats", {}).get("beat_strengths", []) |
|
|
|
for j, beat_time in enumerate(beat_times): |
|
if segment_start <= beat_time < segment_end: |
|
|
|
segment_beats.append(j) |
|
|
|
|
|
segment_beats_info = { |
|
"beat_times": [beat_times[j] for j in segment_beats if j < len(beat_times)], |
|
"tempo": flexible.get("beats", {}).get("tempo", 120) |
|
} |
|
|
|
if beat_strengths and isinstance(beat_strengths, list): |
|
segment_beats_info["beat_strengths"] = [ |
|
beat_strengths[j] for j in segment_beats |
|
if j < len(beat_strengths) |
|
] |
|
|
|
|
|
segment_beats_info["phrases"] = [segment_beats] |
|
|
|
|
|
print(f"DEBUG: Creating flexible syllable template for segment {i+1}") |
|
enhanced_template = create_flexible_syllable_templates( |
|
segment_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if i == 0 else 'default' |
|
) |
|
enhanced_templates.append(enhanced_template) |
|
templates_for_verification.append(enhanced_template) |
|
|
|
|
|
structure_visualization += f" Template: {enhanced_template}\n" |
|
else: |
|
print(f"DEBUG: beat_times is not a list, it's {type(beat_times)}") |
|
else: |
|
print(f"DEBUG: beats or beat_times not found in flexible structure") |
|
|
|
continue |
|
|
|
|
|
pattern_groups = {} |
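
# Group templates that share the same rhythm pattern (ignoring punctuation) so
# repeated patterns could be reported once; note the groups are not currently
# used when building the guidance below.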
|
|
|
for i, template in enumerate(enhanced_templates): |
|
|
|
simple_pattern = template.replace("(", "").replace(")", "").replace(":", "") |
|
|
|
|
|
found_match = False |
|
for group, patterns in pattern_groups.items(): |
|
if any(simple_pattern == p.replace("(", "").replace(")", "").replace(":", "") for p in patterns): |
|
pattern_groups[group].append(template) |
|
found_match = True |
|
break |
|
|
|
if not found_match: |
|
|
|
group_name = f"Group_{len(pattern_groups) + 1}" |
|
pattern_groups[group_name] = [template] |
|
|
|
|
|
syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n" |
|
syllable_guidance += "Each line of lyrics MUST match exactly with one musical phrase/segment.\n" |
|
syllable_guidance += "Follow these rhythm patterns for each line (STRONG beats need stressed syllables):\n\n" |
|
|
|
|
|
formatted_templates = [] |
|
for i, template in enumerate(enhanced_templates): |
|
formatted_templates.append(format_syllable_templates_for_prompt([template], arrow="→", line_wrap=8)) |
|
|
|
syllable_guidance += "\n".join(formatted_templates) |
|
|
|
|
|
use_sections = True

# Derive line-count budgets from the templates (assuming one lyric line per
# template); the zero defaults below avoid an UnboundLocalError in the
# comparisons that follow.
total_lines = max(4, len(enhanced_templates))
verse_lines = 0
chorus_lines = 0
bridge_lines = 0

if verse_lines > 0:
|
verse_lines = min(verse_lines, total_lines // 2) |
|
else: |
|
verse_lines = total_lines // 2 |
|
|
|
if chorus_lines > 0: |
|
chorus_lines = min(chorus_lines, total_lines // 3) |
|
else: |
|
chorus_lines = total_lines // 3 |
|
|
|
if bridge_lines > 0: |
|
bridge_lines = min(bridge_lines, total_lines // 6) |
|
else: |
|
bridge_lines = 0 |
|
|
|
|
|
elif song_structure and is_safe_dict_access(song_structure, "syllables") and song_structure.get("syllables"): |
|
syllable_guidance = "RHYTHM PATTERN INSTRUCTIONS:\n" |
|
syllable_guidance += "Follow these syllable patterns for each section. Each line should match ONE phrase:\n\n" |
|
|
|
|
|
section_counts = {"verse": 0, "chorus": 0, "bridge": 0, "intro": 0, "outro": 0} |
|
|
|
for section in song_structure.get("syllables", []): |
|
if not isinstance(section, dict): |
|
continue |
|
|
|
section_type = section.get("type", "verse") |
|
section_counts[section_type] = section_counts.get(section_type, 0) + 1 |
|
|
|
if is_safe_dict_access(section, "syllable_template"): |
|
|
|
if is_safe_dict_access(song_structure, "beats") and is_safe_dict_access(song_structure.get("beats", {}), "beat_times"): |
|
section_beats_info = { |
|
"beat_times": [beat for beat in song_structure.get("beats", {}).get("beat_times", []) |
|
if section.get("start", 0) <= beat < section.get("end", 0)], |
|
"tempo": song_structure.get("beats", {}).get("tempo", 120) |
|
} |
|
|
|
if is_safe_dict_access(song_structure.get("beats", {}), "beat_strengths"): |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(song_structure.get("beats", {}).get("beat_strengths", [])) |
|
if i < len(song_structure.get("beats", {}).get("beat_times", [])) and |
|
section.get("start", 0) <= song_structure.get("beats", {}).get("beat_times", [])[i] < section.get("end", 0) |
|
] |
|
|
|
|
|
section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] |
|
|
|
|
|
section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] |
|
|
|
|
|
enhanced_template = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if section['type'] == 'verse' else 'default' |
|
) |
|
|
|
syllable_guidance += f"[{section['type'].capitalize()}]:\n" |
|
syllable_guidance += format_syllable_templates_for_prompt( |
|
enhanced_template, |
|
arrow="→", |
|
line_wrap=6 |
|
) + "\n\n" |
|
templates_for_verification.append(section) |
|
elif "syllable_count" in section: |
|
syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n" |
|
|
|
|
|
structure_visualization += "Using traditional section-based structure:\n" |
|
for section_type, count in section_counts.items(): |
|
if count > 0: |
|
structure_visualization += f"{section_type.capitalize()}: {count} sections\n" |
|
|
|
|
|
verse_lines = max(2, section_counts.get("verse", 0) * 4) |
|
chorus_lines = max(2, section_counts.get("chorus", 0) * 4) |
|
bridge_lines = max(0, section_counts.get("bridge", 0) * 2) |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
if not syllable_guidance: |
|
syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n" |
|
syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n" |
|
syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n" |
|
syllable_guidance += "3. Use appropriate syllable counts based on tempo:\n" |
|
syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n" |
|
syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n" |
|
syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n" |
|
|
|
|
|
structure_visualization += "Using estimated structure (no detailed analysis available):\n" |
|
|
|
|
|
estimated_lines = max(8, int(duration / 10)) |
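# Heuristic: roughly one lyric line per 10 seconds of audio, floored at 8 lines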
|
structure_visualization += f"Estimated total lines: {estimated_lines}\n" |
|
|
|
|
|
verse_lines = estimated_lines // 2 |
|
chorus_lines = estimated_lines // 3 |
|
bridge_lines = estimated_lines // 6 if estimated_lines > 12 else 0 |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n" |
|
syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n" |
|
syllable_guidance += "Lyric: 'HEAR the MU-sic PLAY'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S w m w <- BEAT TYPE\n\n" |
|
|
|
syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n" |
|
syllable_guidance += "Lyric: 'DANC-ing TO the RHYTHM of LOVE'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S S w S w w <- BEAT TYPE\n\n" |
|
|
|
syllable_guidance += "Pattern: S(0.92):1 → m(0.65):2 → S(0.88):1 → w(0.35):1\n" |
|
syllable_guidance += "Lyric: 'TIME keeps FLOW-ing ON and ON'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S m m S w w <- BEAT TYPE\n\n" |
|
|
|
|
|
genre_guidance = "" |
|
if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n" |
|
genre_guidance += "- Use more syllables per beat for rapid-fire sections\n" |
|
genre_guidance += "- Create internal rhymes within lines, not just at line endings\n" |
|
genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n" |
|
elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n" |
|
genre_guidance += "- Use repetitive phrases that build and release tension\n" |
|
genre_guidance += "- Match syllables precisely to the beat grid\n" |
|
genre_guidance += "- Use short, percussive words on strong beats\n" |
|
elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n" |
|
genre_guidance += "- Use powerful, emotive words on downbeats\n" |
|
genre_guidance += "- Create contrast between verse and chorus energy levels\n" |
|
genre_guidance += "- Emphasize hooks with simple, memorable phrases\n" |
|
elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n" |
|
genre_guidance += "- Focus on storytelling with clear narrative flow\n" |
|
genre_guidance += "- Use natural speech patterns that flow conversationally\n" |
|
genre_guidance += "- Place important words at the start of phrases\n" |
|
|
|
|
|
syllable_guidance += genre_guidance |
|
|
|
|
|
syllable_guidance_text = syllable_guidance |
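
# Recompute the prompt mode from the final structure; this supersedes the
# per-branch use_sections defaults set above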
|
|
|
|
|
use_sections = True |
|
use_second_level = False |
|
|
|
if song_structure and "second_level" in song_structure and song_structure["second_level"]: |
|
use_second_level = True |
|
|
|
if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: |
|
templates = song_structure["second_level"]["templates"] |
|
if isinstance(templates, list) and len(templates) > 0: |
|
use_sections = False |
|
elif song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
|
|
if "segments" in song_structure["flexible_structure"]: |
|
segments = song_structure["flexible_structure"]["segments"] |
|
if len(segments) > 4: |
|
use_sections = False |
|
|
|
|
|
if use_second_level: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. |
|
|
|
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. |
|
|
|
Music analysis has detected the following qualities: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Be completely original |
|
- Maintain a consistent theme throughout |
|
- Match the audio segment duration of {duration:.1f} seconds |
|
|
|
"""
|
|
|
|
|
if lyrics_requirements and lyrics_requirements.strip(): |
|
content += f""" |
|
USER REQUIREMENTS: |
|
{lyrics_requirements.strip()} |
|
|
|
The lyrics MUST incorporate these user requirements while still following the rhythm patterns. |
|
""" |
|
|
|
content += """ |
|
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. |
|
|
|
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
elif use_sections: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. |
|
|
|
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. |
|
|
|
Music analysis has detected the following qualities in the music: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Follow the structure patterns provided above |
|
- Be completely original |
|
- Match the song duration of {duration:.1f} seconds |
|
""" |
|
|
|
|
|
if lyrics_requirements and lyrics_requirements.strip(): |
|
content += f""" |
|
USER REQUIREMENTS: |
|
{lyrics_requirements.strip()} |
|
|
|
The lyrics MUST incorporate these user requirements while still following the rhythm patterns. |
|
""" |
|
|
|
content += """ |
|
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
else: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. |
|
|
|
IMPORTANT: DO NOT include any thinking process, explanations, or analysis before the lyrics. Start directly with the song lyrics. |
|
|
|
Music analysis has detected the following qualities: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Be completely original |
|
- Maintain a consistent theme throughout |
|
- Match the audio segment duration of {duration:.1f} seconds |
|
""" |
|
|
|
|
|
if lyrics_requirements and lyrics_requirements.strip(): |
|
content += f""" |
|
USER REQUIREMENTS: |
|
{lyrics_requirements.strip()} |
|
|
|
The lyrics MUST incorporate these user requirements while still following the rhythm patterns. |
|
""" |
|
|
|
content += """ |
|
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above. |
|
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. |
|
|
|
IMPORTANT: Start immediately with the lyrics. DO NOT include any thinking process, analysis, or explanation before presenting the lyrics. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
|
|
|
|
messages = [ |
|
{"role": "system", "content": "You are a professional songwriter. Create lyrics that match the specified rhythm patterns exactly. Start with the lyrics immediately without any explanation or thinking. Be concise and direct."}, |
|
{"role": "user", "content": content} |
|
] |
|
|
|
|
|
text = llm_tokenizer.apply_chat_template( |
|
messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
|
|
model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
generation_params = { |
|
"do_sample": True, |
|
"temperature": 0.5, |
|
"top_p": 0.85, |
|
"top_k": 50, |
|
"repetition_penalty": 1.2, |
|
"max_new_tokens": 2048, |
|
"num_return_sequences": 1 |
|
} |
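
# These are sketch values: moderate temperature with nucleus/top-k sampling and a
# repetition penalty, trading lyrical variety against adherence to the rhythm
# constraints in the prompt; tune per model.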
|
|
|
|
|
if hasattr(llm_model.generation_config, "stopping_criteria"): |
|
thinking_stops = ["Let me think", "First, I need to", "Let's analyze", "I'll approach this", "Step 1:", "To start,"] |
|
for stop in thinking_stops: |
|
if stop not in llm_model.generation_config.stopping_criteria: |
|
llm_model.generation_config.stopping_criteria.append(stop) |
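
# Note: in current Transformers, stopping criteria are StoppingCriteria objects
# passed to generate(), so string entries appended to generation_config here are
# a best-effort hint and are typically ignored by the library.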
|
|
|
|
|
generated_ids = llm_model.generate( |
|
**model_inputs, |
|
**generation_params |
|
) |
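
# Strip the prompt tokens so only the newly generated text is decoded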
|
|
|
|
|
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() |
|
|
|
|
|
lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
|
|
if "<thinking>" in lyrics and "</thinking>" in lyrics: |
|
lyrics = lyrics.split("</thinking>")[1].strip() |
|
|
|
|
|
thinking_markers = [ |
|
"<think>", "</think>", |
|
"[thinking]", "[/thinking]", |
|
"I'll think step by step:", |
|
"First, I need to understand", |
|
"Let me think about", |
|
"Let's tackle this query", |
|
"Okay, let's tackle this query", |
|
"First, I need to understand the requirements", |
|
"Looking at the rhythm patterns" |
|
] |
|
|
|
|
|
for marker in thinking_markers: |
|
if marker in lyrics: |
|
parts = lyrics.split(marker) |
|
if len(parts) > 1: |
|
lyrics = parts[-1].strip() |
|
|
|
|
|
analytical_patterns = [ |
|
"Let me analyze", |
|
"I need to understand", |
|
"The tempo is", |
|
"First, let's look at", |
|
"Wait, maybe", |
|
"Considering the emotional tone", |
|
"Starting with the first line", |
|
"Let me check the examples" |
|
] |
|
|
|
|
|
for pattern in analytical_patterns: |
|
if lyrics.startswith(pattern): |
|
|
|
lyrics_markers = [ |
|
"\n\n[Verse", |
|
"\n\n[Chorus", |
|
"\n\nVerse", |
|
"\n\nChorus", |
|
"\n\n[Verse 1]", |
|
"\n\n[Intro]" |
|
] |
|
|
|
for marker in lyrics_markers: |
|
if marker in lyrics: |
|
lyrics = lyrics[lyrics.index(marker):].strip() |
|
break |
|
|
|
|
|
|
|
if len(lyrics.split()) > 100 and "\n\n" in lyrics: |
|
paragraphs = lyrics.split("\n\n") |
|
for i, paragraph in enumerate(paragraphs): |
|
|
|
if any(marker in paragraph for marker in ["[Verse", "[Chorus", "Verse 1", "Chorus:"]): |
|
lyrics = "\n\n".join(paragraphs[i:]) |
|
break |
|
|
|
|
|
lines = lyrics.split('\n') |
|
clean_lines = [] |
|
lyrics_started = False |
|
|
|
for line in lines: |
|
|
|
if not lyrics_started: |
|
if (line.strip().startswith('[') and ']' in line) or not any(thinking in line.lower() for thinking in ["i think", "let me", "maybe", "perhaps", "alternatively", "checking"]): |
|
lyrics_started = True |
|
|
|
if lyrics_started: |
|
clean_lines.append(line) |
|
|
|
|
|
if clean_lines: |
|
lyrics = '\n'.join(clean_lines) |
|
|
|
|
|
second_level_verification = None |
|
if song_structure and "second_level" in song_structure and song_structure["second_level"]: |
|
if isinstance(song_structure["second_level"], dict) and "templates" in song_structure["second_level"]: |
|
second_level_verification = song_structure["second_level"]["templates"] |
|
if not isinstance(second_level_verification, list): |
|
second_level_verification = None |
|
|
|
|
|
if templates_for_verification: |
|
|
|
|
|
if isinstance(templates_for_verification, list): |
|
safe_templates = [] |
|
for template in templates_for_verification: |
|
if isinstance(template, dict): |
|
processed_template = {} |
|
for k, v in template.items(): |
|
if isinstance(v, np.ndarray): |
|
if v.size == 1: |
|
processed_template[k] = float(v.item()) |
|
else: |
|
processed_template[k] = [float(x) if isinstance(x, np.number) else x for x in v] |
|
elif isinstance(v, np.number): |
|
processed_template[k] = float(v) |
|
else: |
|
processed_template[k] = v |
|
safe_templates.append(processed_template) |
|
else: |
|
safe_templates.append(template) |
|
else: |
|
safe_templates = templates_for_verification |
|
|
|
|
|
try: |
|
print(f"DEBUG: Calling verify_flexible_syllable_counts") |
|
print(f"DEBUG: Type of lyrics: {type(lyrics)}") |
|
print(f"DEBUG: Type of safe_templates: {type(safe_templates)}") |
|
print(f"DEBUG: Type of second_level_verification: {type(second_level_verification)}") |
|
|
|
verified_lyrics = verify_flexible_syllable_counts(lyrics, safe_templates, second_level_verification) |
|
print(f"DEBUG: Type of verified_lyrics: {type(verified_lyrics)}") |
|
|
|
except Exception as e: |
|
print(f"ERROR in verify_flexible_syllable_counts: {str(e)}") |
|
|
|
return { |
|
"lyrics": lyrics if isinstance(lyrics, str) else str(lyrics), |
|
"rhythm_analysis": f"Error in rhythm analysis: {str(e)}", |
|
"syllable_analysis": "Error performing syllable analysis", |
|
"prompt_template": "Error generating prompt template" |
|
} |
|
|
|
if isinstance(verified_lyrics, str) and "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics: |
|
|
|
original_lyrics = lyrics.split("[Note:")[0].strip() if isinstance(lyrics, str) else str(lyrics) |
|
|
|
|
|
analysis = verified_lyrics.split("[Note:")[1] if "[Note:" in verified_lyrics else "" |
|
|
|
|
|
if "stress misalignments" in analysis and len(templates_for_verification) > 0: |
|
|
|
refinement_prompt = f""" |
|
You need to fix rhythm issues in these lyrics. Here's the analysis of the problems: |
|
|
|
{analysis} |
|
|
|
Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme. |
|
Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats. |
|
|
|
Original lyrics: |
|
{original_lyrics} |
|
|
|
Improved lyrics with fixed rhythm: |
|
""" |
|
|
|
refinement_messages = [ |
|
{"role": "user", "content": refinement_prompt} |
|
] |
|
|
|
|
|
refinement_text = llm_tokenizer.apply_chat_template( |
|
refinement_messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
try: |
|
|
|
refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
refinement_params = { |
|
"do_sample": True, |
|
"temperature": 0.4, |
|
"top_p": 0.9, |
|
"repetition_penalty": 1.3, |
|
"max_new_tokens": 1024 |
|
} |
|
|
|
refined_ids = llm_model.generate( |
|
**refinement_inputs, |
|
**refinement_params |
|
) |
|
|
|
|
|
refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist() |
|
refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
try: |
|
refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, safe_templates, second_level_verification) |
|
|
|
|
|
if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics: |
|
lyrics = refined_lyrics |
|
elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"): |
|
lyrics = refined_verified_lyrics |
|
else: |
|
lyrics = verified_lyrics |
|
except Exception as e: |
|
print(f"Error in refined lyrics verification: {str(e)}") |
|
lyrics = verified_lyrics |
|
except Exception as e: |
|
print(f"Error in lyrics refinement: {str(e)}") |
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
|
|
|
|
if "[RHYTHM_ANALYSIS_SECTION]" in lyrics: |
|
|
|
parts = lyrics.split("[RHYTHM_ANALYSIS_SECTION]") |
|
clean_lyrics = parts[0].strip() |
|
rhythm_analysis = parts[1].strip() |
|
|
|
|
|
lyrics = clean_lyrics + "\n\n[Note: Rhythm Analysis]\n" + rhythm_analysis |
|
|
|
|
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
|
|
pass |
|
else: |
|
|
|
lyrics = lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected. All syllables align well with the beat pattern." |
|
|
|
|
|
if isinstance(lyrics, str): |
|
|
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] |
|
else: |
|
clean_lyrics = lyrics |
|
rhythm_analysis = "No rhythm analysis available" |
|
|
|
|
|
syllable_analysis = "=== SYLLABLE ANALYSIS ===\n\n" |
|
if templates_for_verification: |
|
syllable_analysis += "Template Analysis:\n" |
|
for i, template in enumerate(templates_for_verification): |
|
if i < min(len(templates_for_verification), 30): |
|
syllable_analysis += f"Line {i+1}:\n" |
|
if isinstance(template, dict): |
|
if "syllable_template" in template: |
|
syllable_analysis += f" Template: {template['syllable_template']}\n" |
|
if "syllable_count" in template: |
|
syllable_analysis += f" Expected syllables: {template['syllable_count']}\n" |
|
elif isinstance(template, str): |
|
syllable_analysis += f" Template: {template}\n" |
|
syllable_analysis += "\n" |
|
|
|
if len(templates_for_verification) > 30: |
|
syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n" |
|
|
|
|
|
if second_level_verification: |
|
syllable_analysis += "\nSecond-Level Template Analysis:\n" |
|
for i, template in enumerate(second_level_verification): |
|
if i < min(len(second_level_verification), 30): |
|
syllable_analysis += f"Second {i+1}: {template}\n" |
|
|
|
if len(second_level_verification) > 30: |
|
syllable_analysis += f"... and {len(second_level_verification) - 30} more seconds\n" |
|
|
|
|
|
syllable_analysis += "\n" + structure_visualization |
|
|
|
|
|
prompt_template = "=== PROMPT TEMPLATE ===\n\n" |
|
prompt_template += "Genre: " + genre + "\n" |
|
prompt_template += f"Duration: {duration:.1f} seconds\n" |
|
prompt_template += f"Tempo: {tempo:.1f} BPM\n" |
|
prompt_template += f"Key: {key} {mode}\n" |
|
prompt_template += f"Primary Emotion: {primary_emotion}\n" |
|
prompt_template += f"Primary Theme: {primary_theme}\n\n" |
|
prompt_template += "Syllable Guidance:\n" + syllable_guidance_text |
|
|
|
|
|
return { |
|
"lyrics": clean_lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template |
|
} |
|
|
|
return { |
|
"lyrics": lyrics, |
|
"rhythm_analysis": "No rhythm analysis available", |
|
"syllable_analysis": "No syllable analysis available", |
|
"prompt_template": "No prompt template available" |
|
} |
|
|
|
def process_audio(audio_file, lyrics_requirements=None): |
|
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis.""" |
|
if audio_file is None: |
|
return "Please upload an audio file.", None, None |
|
|
|
try: |
|
print("Step 1/5: Extracting audio features...") |
|
|
|
audio_data = extract_audio_features(audio_file) |
|
|
|
print("Step 2/5: Verifying audio contains music...") |
|
|
|
try: |
|
is_music, ast_results = detect_music(audio_data) |
|
except Exception as e: |
|
print(f"Error in music detection: {str(e)}") |
|
return f"Error in music detection: {str(e)}", None, ast_results |
|
|
|
if not is_music: |
|
return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results |
|
|
|
print("Step 3/5: Classifying music genre...") |
|
|
|
try: |
|
top_genres = classify_genre(audio_data) |
|
|
|
# Validate before formatting so format_genre_results never sees bad input
if not isinstance(top_genres, list) or len(top_genres) == 0:
top_genres = [("rock", 1.0)]

genre_results = format_genre_results(top_genres)
except Exception as e:
print(f"Error in genre classification: {str(e)}")
return f"Error in genre classification: {str(e)}", None, ast_results
|
|
|
|
|
ast_results = ast_results if ast_results else [] |
|
song_structure = None |
|
emotion_results = { |
|
"emotion_analysis": {"primary_emotion": "Unknown"}, |
|
"theme_analysis": {"primary_theme": "Unknown"}, |
|
"rhythm_analysis": {"tempo": 0}, |
|
"tonal_analysis": {"key": "Unknown", "mode": ""}, |
|
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} |
|
} |
|
|
|
print("Step 4/5: Analyzing music emotions, themes, and structure...") |
|
|
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
|
|
|
|
|
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
|
|
beats_info = detect_beats(y, sr) |
|
sections_info = detect_sections(y, sr) |
|
|
|
|
|
segments = [] |
|
|
|
|
|
|
|
if sections_info and len(sections_info) > 1: |
|
min_segment_duration = 1.5 |
|
|
|
for section in sections_info: |
|
section_start = section["start"] |
|
section_end = section["end"] |
|
section_duration = section["duration"] |
|
|
|
|
|
if section_duration < min_segment_duration * 1.5: |
|
segments.append({ |
|
"start": section_start, |
|
"end": section_end |
|
}) |
|
else: |
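
# Long sections are split into roughly 3-second sub-segments, one lyric line each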
|
|
|
|
|
ideal_segment_duration = 3.0 |
|
segment_count = max(1, int(section_duration / ideal_segment_duration)) |
|
|
|
|
|
segment_duration = section_duration / segment_count |
|
for i in range(segment_count): |
|
segment_start = section_start + i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end |
|
}) |
|
|
|
elif beats_info and len(beats_info["beat_times"]) > 4: |
|
beats = beats_info["beat_times"] |
|
time_signature = beats_info.get("time_signature", 4) |
|
|
|
|
|
measure_size = time_signature |
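# Each measure (time_signature beats) becomes one segment; the final measure's
# end time is extrapolated from the last beat interval when beats run out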
|
for i in range(0, len(beats), measure_size): |
|
if i + 1 < len(beats): |
|
measure_start = beats[i] |
|
|
|
if i + measure_size < len(beats): |
|
measure_end = beats[i + measure_size] |
|
else: |
|
|
|
if i > 0: |
|
beat_interval = beats[i] - beats[i-1] |
|
measure_end = beats[-1] + (beat_interval * (measure_size - (len(beats) - i))) |
|
else: |
|
measure_end = audio_data["duration"] |
|
|
|
segments.append({ |
|
"start": measure_start, |
|
"end": measure_end |
|
}) |
|
|
|
else: |
|
|
|
segment_duration = 3.0 |
|
total_segments = max(4, int(audio_data["duration"] / segment_duration)) |
|
segment_duration = audio_data["duration"] / total_segments |
|
|
|
for i in range(total_segments): |
|
segment_start = i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end |
|
}) |
|
|
|
|
|
flexible_structure = { |
|
"beats": beats_info, |
|
"segments": segments |
|
} |
|
|
|
|
|
song_structure = { |
|
"beats": beats_info, |
|
"sections": sections_info, |
|
"flexible_structure": flexible_structure, |
|
"syllables": [] |
|
} |
|
|
|
|
|
for section in sections_info: |
|
|
|
section_beats_info = { |
|
"beat_times": [beat for beat in beats_info["beat_times"] |
|
if section["start"] <= beat < section["end"]], |
|
"tempo": beats_info.get("tempo", 120) |
|
} |
|
if "beat_strengths" in beats_info: |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(beats_info["beat_strengths"]) |
|
if i < len(beats_info["beat_times"]) and |
|
section["start"] <= beats_info["beat_times"][i] < section["end"] |
|
] |
|
|
|
|
|
syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) |
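# (tempo / 60 = beats per second; the 1.5 factor assumes ~1.5 syllables per beat)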
|
|
|
section_info = { |
|
"type": section["type"], |
|
"start": section["start"], |
|
"end": section["end"], |
|
"duration": section["duration"], |
|
"syllable_count": syllable_count, |
|
"beat_count": len(section_beats_info["beat_times"]) |
|
} |
|
|
|
|
|
if len(section_beats_info["beat_times"]) >= 2: |
|
|
|
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): |
|
genre_name = top_genres[0][0] |
|
else: |
|
genre_name = "unknown" |
|
|
|
section_info["syllable_template"] = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=genre_name |
|
) |
|
|
|
song_structure["syllables"].append(section_info) |
|
|
|
|
|
try: |
|
|
|
subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) |
|
|
|
|
|
sec_map = map_beats_to_seconds( |
|
subbeat_info["subbeat_times"], |
|
audio_data["duration"] |
|
) |
|
|
|
|
|
|
|
genre_name = "unknown" |
|
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): |
|
genre_name = top_genres[0][0] |
|
|
|
second_level_templates = create_second_level_templates( |
|
sec_map, |
|
subbeat_info["tempo"], |
|
genre_name |
|
) |
|
|
|
|
|
song_structure["second_level"] = { |
|
"sec_map": sec_map, |
|
"templates": second_level_templates |
|
} |
|
|
|
except Exception as e: |
|
print(f"Error in second-level beat analysis: {str(e)}") |
|
|
|
|
|
except Exception as e: |
|
print(f"Error analyzing song structure: {str(e)}") |
|
|
|
|
|
print("Step 5/5: Generating rhythmically aligned lyrics...") |
|
|
|
try: |
|
|
|
primary_genre = "unknown" |
|
if isinstance(top_genres, list) and len(top_genres) > 0 and isinstance(top_genres[0], tuple): |
|
primary_genre, _ = top_genres[0] |
|
|
|
|
|
sanitized_song_structure = None |
|
if song_structure: |
|
sanitized_song_structure = {} |
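
# Copy over only well-typed entries so generate_lyrics can trust the shapes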
|
|
|
|
|
if "beats" in song_structure and isinstance(song_structure["beats"], dict): |
|
sanitized_song_structure["beats"] = song_structure["beats"] |
|
|
|
|
|
if "sections" in song_structure and isinstance(song_structure["sections"], list): |
|
sanitized_song_structure["sections"] = song_structure["sections"] |
|
|
|
|
|
if "flexible_structure" in song_structure and isinstance(song_structure["flexible_structure"], dict): |
|
flex_struct = song_structure["flexible_structure"] |
|
sanitized_flex = {} |
|
|
|
|
|
if "segments" in flex_struct and isinstance(flex_struct["segments"], list): |
|
sanitized_flex["segments"] = flex_struct["segments"] |
|
|
|
|
|
if "beats" in flex_struct and isinstance(flex_struct["beats"], dict): |
|
sanitized_flex["beats"] = flex_struct["beats"] |
|
|
|
sanitized_song_structure["flexible_structure"] = sanitized_flex |
|
|
|
|
|
if "syllables" in song_structure and isinstance(song_structure["syllables"], list): |
|
sanitized_song_structure["syllables"] = song_structure["syllables"] |
|
|
|
|
|
if "second_level" in song_structure and isinstance(song_structure["second_level"], dict): |
|
second_level = song_structure["second_level"] |
|
sanitized_second = {} |
|
|
|
if "templates" in second_level and isinstance(second_level["templates"], list): |
|
sanitized_second["templates"] = second_level["templates"] |
|
|
|
if "sec_map" in second_level and isinstance(second_level["sec_map"], list): |
|
sanitized_second["sec_map"] = second_level["sec_map"] |
|
|
|
sanitized_song_structure["second_level"] = sanitized_second |
|
|
|
try: |
|
print("Calling generate_lyrics function...") |
|
|
|
lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, |
|
sanitized_song_structure, lyrics_requirements) |
|
print(f"Type of lyrics_result: {type(lyrics_result)}") |
|
|
|
|
|
if isinstance(lyrics_result, dict) and "lyrics" in lyrics_result:
|
lyrics = lyrics_result.get("lyrics", "No lyrics generated") |
|
rhythm_analysis = lyrics_result.get("rhythm_analysis", "No rhythm analysis available") |
|
syllable_analysis = lyrics_result.get("syllable_analysis", "No syllable analysis available") |
|
prompt_template = lyrics_result.get("prompt_template", "No prompt template available") |
|
else: |
|
|
|
lyrics = str(lyrics_result) if lyrics_result is not None else "No lyrics generated" |
|
rhythm_analysis = "No detailed rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
except Exception as inner_e: |
|
print(f"Inner error in lyrics generation: {str(inner_e)}") |
|
|
|
lyrics = f"Error generating lyrics: {str(inner_e)}" |
|
rhythm_analysis = "Error in rhythm analysis" |
|
syllable_analysis = "Error in syllable analysis" |
|
prompt_template = "Error in prompt template generation" |
|
|
|
except Exception as e: |
|
print(f"Outer error in lyrics generation: {str(e)}") |
|
lyrics = f"Error generating lyrics: {str(e)}" |
|
rhythm_analysis = "No rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
results = { |
|
"genre_results": genre_results, |
|
"lyrics": lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template, |
|
"ast_results": ast_results |
|
} |
|
|
|
return results |
|
|
|
except Exception as e: |
|
error_msg = f"Error processing audio: {str(e)}" |
|
print(error_msg) |
|
return error_msg, None, [] |
|
|
|
def format_complete_beat_timeline(audio_file, lyrics=None): |
|
"""Creates a complete formatted timeline showing all beat timings and their syllable patterns without truncation""" |
|
if audio_file is None: |
|
return "Please upload an audio file to see beat timeline." |
|
|
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
|
|
beats_info = detect_beats(y, sr) |
|
|
|
|
|
def ensure_float(value): |
|
if isinstance(value, np.ndarray) or isinstance(value, np.number): |
|
return float(value) |
|
return value |
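
# e.g. ensure_float(np.float32(1.5)) -> 1.5; plain Python numbers pass through unchanged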
|
|
|
|
|
timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n" |
|
|
|
tempo = ensure_float(beats_info['tempo']) |
|
tempo_confidence = ensure_float(beats_info.get('tempo_confidence', 90.0)) |
|
time_sig_confidence = ensure_float(beats_info.get('time_sig_confidence', 85.0)) |
|
beat_periodicity = ensure_float(beats_info.get('beat_periodicity', 60 / tempo)) |
|
|
|
timeline += f"Tempo: {tempo:.1f} BPM (±{tempo_confidence:.1f}%)\n" |
|
timeline += f"Time Signature: {beats_info['time_signature']}/4 (Confidence: {time_sig_confidence:.1f}%)\n" |
|
timeline += f"Beat Periodicity: {beat_periodicity:.3f}s\n" |
|
timeline += f"Beat Entropy: {beats_info.get('beat_entropy', 'N/A')}\n" |
|
timeline += f"Total Beats: {beats_info['beat_count']}\n" |
|
|
|
|
|
if tempo < 60: |
|
tempo_class = "Largo (very slow)" |
|
elif tempo < 76: |
|
tempo_class = "Adagio (slow)" |
|
elif tempo < 108: |
|
tempo_class = "Andante (walking pace)" |
|
elif tempo < 132: |
|
tempo_class = "Moderato (moderate)" |
|
elif tempo < 168: |
|
tempo_class = "Allegro (fast)" |
|
else: |
|
tempo_class = "Presto (very fast)" |
|
|
|
timeline += f"Tempo Classification: {tempo_class}\n\n" |
|
|
|
|
|
timeline += "| Beat # | Time (s) | Beat Strength | Syllable Pattern |\n" |
|
timeline += "|--------|----------|--------------|------------------|\n" |
|
|
|
|
|
for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])): |
|
|
|
time = ensure_float(time) |
|
strength = ensure_float(strength) |
|
|
|
|
|
metrical_position = i % beats_info['time_signature'] |
|
|
|
if metrical_position == 0: |
|
beat_type = "STRONG" |
|
syllable_value = 1.5 |
|
elif metrical_position == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 2: |
|
|
|
beat_type = "MEDIUM" if strength < 0.8 else "STRONG" |
|
syllable_value = 1.0 if strength < 0.8 else 1.5 |
|
else: |
|
|
|
if strength >= 0.8: |
|
beat_type = "STRONG" |
|
syllable_value = 1.5 |
|
elif strength >= 0.5: |
|
beat_type = "MEDIUM" |
|
syllable_value = 1.0 |
|
else: |
|
beat_type = "WEAK" |
|
syllable_value = 1.0 |
|
|
|
|
|
if beat_type == "STRONG": |
|
pattern = "S" |
|
elif beat_type == "MEDIUM": |
|
pattern = "m" |
|
else: |
|
pattern = "w" |
|
|
|
|
|
timeline += f"| {i+1:<6} | {time:.2f}s | {beat_type:<12} | {pattern}:{syllable_value} |\n" |
|
|
|
|
|
|
|
|
|
timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n" |
|
timeline += "Each character represents 0.5 seconds. Beats are marked as:\n" |
|
timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" |
|
|
|
|
|
if 'beat_times' in beats_info and len(beats_info['beat_times']) > 0: |
|
|
|
max_beat_time = max([ensure_float(t) for t in beats_info['beat_times']]) |
|
total_duration = max_beat_time + 2 |
|
else: |
|
total_duration = 30 |
|
|
|
time_markers = "" |
|
for i in range(0, int(total_duration) + 1, 5): |
|
time_markers += f"{i:<5}" |
|
timeline += time_markers + " (seconds)\n" |
|
|
|
|
|
ruler = "" |
|
for i in range(0, int(total_duration) + 1): |
|
if i % 5 == 0: |
|
ruler += "+" |
|
else: |
|
ruler += "-" |
|
ruler += "-" * 9 |
|
timeline += ruler + "\n" |
|
|
|
|
|
beat_line = ["·"] * int(total_duration * 2) |
|
|
|
for i, time in enumerate(beats_info['beat_times']): |
|
if i >= len(beats_info['beat_strengths']): |
|
break |
|
|
|
|
|
time_val = ensure_float(time) |
|
|
|
|
|
pos = int(time_val * 2) |
|
if pos >= len(beat_line): |
|
continue |
|
|
|
|
|
strength = beats_info['beat_strengths'][i] |
|
|
|
strength = ensure_float(strength) |
|
|
|
if i % beats_info['time_signature'] == 0: |
|
beat_line[pos] = "S" |
|
elif strength >= 0.8: |
|
beat_line[pos] = "S" |
|
elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3: |
|
beat_line[pos] = "m" |
|
elif strength >= 0.5: |
|
beat_line[pos] = "m" |
|
else: |
|
beat_line[pos] = "w" |
|
|
|
|
|
beat_visualization = "" |
|
for i in range(0, len(beat_line), 10): |
|
beat_visualization += "".join(beat_line[i:i+10]) |
|
if i + 10 < len(beat_line): |
|
beat_visualization += " " |
|
timeline += beat_visualization + "\n\n" |
|
|
|
|
|
timeline += "=== MEASURE MARKERS ===\n\n" |
|
|
|
|
|
measure_starts = [] |
|
for i, time in enumerate(beats_info['beat_times']): |
|
if i % beats_info['time_signature'] == 0: |
|
|
|
time_val = ensure_float(time) |
|
measure_starts.append((i // beats_info['time_signature'] + 1, time_val)) |
|
|
|
|
|
if measure_starts: |
|
timeline += "| Measure # | Start Time | Duration |\n" |
|
timeline += "|-----------|------------|----------|\n" |
|
|
|
for i in range(len(measure_starts)): |
|
measure_num, start_time = measure_starts[i] |
|
|
|
|
|
if i < len(measure_starts) - 1: |
|
end_time = measure_starts[i+1][1] |
|
elif 'beat_times' in beats_info and len(beats_info['beat_times']) > 0: |
|
|
|
last_beat = beats_info['beat_times'][-1] |
|
end_time = ensure_float(last_beat) |
|
else: |
|
end_time = start_time + 2.0 |
|
|
|
duration = end_time - start_time |
|
|
|
timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n" |
|
|
|
|
|
|
|
|
|
if 'phrases' in beats_info and beats_info['phrases']: |
|
timeline += "\n=== MUSICAL PHRASES ===\n\n" |
|
for i, phrase in enumerate(beats_info['phrases']): |
|
|
|
if not phrase: |
|
continue |
|
|
|
|
|
if len(beats_info['beat_times']) == 0:
|
continue |
|
|
|
start_beat = min(phrase[0], len(beats_info['beat_times'])-1) |
|
end_beat = min(phrase[-1], len(beats_info['beat_times'])-1) |
|
|
|
|
|
phrase_start = ensure_float(beats_info['beat_times'][start_beat]) |
|
phrase_end = ensure_float(beats_info['beat_times'][end_beat]) |
|
|
|
timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n" |
|
|
|
|
|
phrase_beats = { |
|
"beat_times": [ensure_float(beats_info['beat_times'][j]) |
|
for j in phrase if j < len(beats_info['beat_times'])], |
|
"beat_strengths": [ensure_float(beats_info['beat_strengths'][j]) |
|
for j in phrase if j < len(beats_info['beat_strengths'])], |
|
"tempo": ensure_float(beats_info['tempo']), |
|
"time_signature": beats_info['time_signature'], |
|
"phrases": [list(range(len(phrase)))] |
|
} |
|
|
|
template = create_flexible_syllable_templates(phrase_beats) |
|
timeline += f" Syllable Template: {template}\n" |
|
|
|
|
|
if phrase_start < total_duration and phrase_end < total_duration: |
|
|
|
phrase_visualization = ["·"] * int(total_duration * 2) |
|
|
|
|
|
start_pos = int(phrase_start * 2) |
|
end_pos = int(phrase_end * 2) |
|
|
|
if start_pos < len(phrase_visualization): |
|
phrase_visualization[start_pos] = "[" |
|
|
|
if end_pos < len(phrase_visualization): |
|
phrase_visualization[end_pos] = "]" |
|
|
|
|
|
for j in phrase: |
|
if j < len(beats_info['beat_times']): |
|
beat_time = ensure_float(beats_info['beat_times'][j]) |
|
beat_pos = int(beat_time * 2) |
|
|
|
if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos: |
|
|
|
if j % beats_info['time_signature'] == 0: |
|
phrase_visualization[beat_pos] = "S" |
|
elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2: |
|
phrase_visualization[beat_pos] = "m" |
|
else: |
|
phrase_visualization[beat_pos] = "w" |
|
|
|
|
|
phrase_visual = "" |
|
for k in range(0, len(phrase_visualization), 10): |
|
phrase_visual += "".join(phrase_visualization[k:k+10]) |
|
if k + 10 < len(phrase_visualization): |
|
phrase_visual += " " |
|
|
|
timeline += f" Timeline: {phrase_visual}\n\n" |
|
|
|
|
|
try: |
|
|
|
subbeat_info = detect_beats_and_subbeats(y, sr, subdivision=4) |
|
duration = librosa.get_duration(y=y, sr=sr) |
|
|
|
|
|
sec_map = map_beats_to_seconds(subbeat_info["subbeat_times"], duration) |
|
|
|
|
|
templates = create_second_level_templates(sec_map, subbeat_info["tempo"]) |
|
|
|
|
|
timeline += "\n=== SECOND-LEVEL SCRIPT ===\n\n" |
|
timeline += "Each line below represents ONE SECOND of audio with matching lyric content.\n" |
|
timeline += "| Second | Beat Pattern | Lyric Content |\n" |
|
timeline += "|--------|-------------|---------------|\n" |
|
|
|
|
|
clean_lyrics = lyrics |
|
if isinstance(lyrics, str): |
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
|
|
|
|
lines = clean_lyrics.strip().split('\n') if clean_lyrics else [] |
|
|
|
for i, template in enumerate(templates): |
|
|
|
lyric = lines[i] if i < len(lines) else "" |
|
if lyric.startswith('[') and ']' in lyric: |
|
lyric = "" |
|
|
|
|
|
timeline += f"| {i+1:<6} | {template:<30} | {lyric[:40]} |\n" |
|
|
|
|
|
timeline += "\n=== SECOND-LEVEL VISUALIZATION ===\n\n" |
|
timeline += "Each row represents ONE SECOND. Beat types:\n" |
|
timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" |
|
|
|
for i, window in enumerate(sec_map): |
|
beats = window["beats"] |
|
|
|
|
|
beat_viz = ["·"] * 20 |
|
|
|
for beat in beats: |
|
|
|
pos = int(beat["relative_pos"] * 19) |
|
if 0 <= pos < len(beat_viz): |
|
|
|
if beat["type"] == "main": |
|
beat_viz[pos] = "S" |
|
elif beat["strength"] >= 0.7: |
|
beat_viz[pos] = "m" |
|
else: |
|
beat_viz[pos] = "w" |
|
|
|
|
|
lyric = lines[i] if i < len(lines) else "" |
|
if lyric.startswith('[') and ']' in lyric: |
|
lyric = "" |
|
|
|
|
|
viz_line = f"Second {i+1:2d}: [" + "".join(beat_viz) + "]" |
|
if lyric: |
|
viz_line += f" → {lyric[:40]}" |
|
|
|
timeline += viz_line + "\n" |
|
|
|
except Exception as e: |
|
timeline += f"\n[Error generating second-level analysis: {str(e)}]" |
|
|
|
|
|
if lyrics and isinstance(lyrics, str): |
|
timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n" |
|
|
|
if "[Note:" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
else: |
|
clean_lyrics = lyrics |
|
|
|
lines = clean_lyrics.strip().split('\n') |
|
|
|
|
|
for i, line in enumerate(lines): |
|
if not line.strip() or line.startswith('['): |
|
continue |
|
|
|
timeline += f"Line: \"{line}\"\n" |
|
|
|
|
|
syllable_count = count_syllables(line) |
|
timeline += f" Syllables: {syllable_count}\n" |
|
|
|
|
|
|
|
matching_phrase = None |
|
if 'phrases' in beats_info and beats_info['phrases']: |
|
|
|
if i < len(beats_info['phrases']) and beats_info['phrases'][i]: |
|
matching_phrase = beats_info['phrases'][i] |
|
else: |
|
|
|
|
|
if len(beats_info['phrases']) > 0: |
|
section_size = max(1, len(beats_info['phrases']) // 4) |
|
section_index = min(i // section_size, 3) |
|
section_start = section_index * section_size |
|
section_end = min(section_start + section_size, len(beats_info['phrases'])) |
|
|
|
|
|
candidate_phrases = [phrase for j, phrase in enumerate(beats_info['phrases']) |
|
if section_start <= j < section_end and phrase] |
|
|
|
if candidate_phrases: |
|
matching_phrase = candidate_phrases[min(i % section_size, len(candidate_phrases)-1)] |
|
elif beats_info['phrases']: |
|
|
|
phrase_index = i % len(beats_info['phrases']) |
|
if beats_info['phrases'][phrase_index]: |
|
matching_phrase = beats_info['phrases'][phrase_index] |
|
|
|
|
|
# A non-empty matching_phrase plus available beat times lets us align this line
if matching_phrase and len(beats_info['beat_times']) > 0:
|
start_beat = min(matching_phrase[0], len(beats_info['beat_times'])-1) |
|
end_beat = min(matching_phrase[-1], len(beats_info['beat_times'])-1) |
|
|
|
start_time = ensure_float(beats_info['beat_times'][start_beat]) |
|
end_time = ensure_float(beats_info['beat_times'][end_beat]) |
|
|
|
timeline += f" Timing: {start_time:.2f}s - {end_time:.2f}s\n" |
|
|
|
|
|
timeline += " Alignment: " |
|
|
|
|
|
phrase_duration = end_time - start_time |
|
syllable_viz = [] |
|
|
|
|
|
for j, beat_idx in enumerate(matching_phrase): |
|
if beat_idx < len(beats_info['beat_times']): |
|
beat_time = ensure_float(beats_info['beat_times'][beat_idx]) |
|
|
|
|
|
if phrase_duration > 0.001: |
|
|
|
|
|
normalized_pos = (beat_time - start_time) / phrase_duration |
|
|
|
curved_pos = min(1.0, normalized_pos * (1.0 + 0.1 * (normalized_pos - 0.5))) |
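# Mild nonlinear curve: beats past the phrase midpoint map slightly later along
# the syllable axis, compressing early beats toward the start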
|
relative_pos = int(curved_pos * syllable_count) |
|
else: |
|
relative_pos = j |
|
|
|
|
|
while len(syllable_viz) <= relative_pos: |
|
syllable_viz.append("·") |
|
|
|
|
|
metrical_pos = beat_idx % beats_info['time_signature'] |
|
beat_strength = beats_info['beat_strengths'][beat_idx] if beat_idx < len(beats_info['beat_strengths']) else 0 |
|
|
|
if metrical_pos == 0 or beat_strength >= 0.8: |
|
syllable_viz[relative_pos] = "S" |
|
elif metrical_pos == beats_info['time_signature'] // 2 or beat_strength >= 0.5: |
|
syllable_viz[relative_pos] = "m" |
|
else: |
|
syllable_viz[relative_pos] = "w" |
|
|
|
|
|
while len(syllable_viz) < syllable_count: |
|
syllable_viz.append("·") |
|
|
|
|
|
syllable_viz = syllable_viz[:syllable_count] |
|
|
|
|
|
timeline += "".join(syllable_viz) + "\n" |
|
|
|
|
|
words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) |
|
if words: |
|
word_stresses = [] |
|
cumulative_syllables = 0 |
|
|
|
for word in words: |
|
syllable_count_word = count_syllables_for_word(word) |
|
stress_pattern = get_word_stress(word) |
|
|
|
|
|
while len(stress_pattern) < syllable_count_word: |
|
stress_pattern += "0" |
|
|
|
for j in range(syllable_count_word): |
|
stress_char = "S" if j < len(stress_pattern) and stress_pattern[j] == "1" else "_" |
|
word_stresses.append(stress_char) |
|
|
|
cumulative_syllables += syllable_count_word |
|
|
|
|
|
timeline += " Word stress: " + "".join(word_stresses) + "\n" |
|
|
|
|
|
alignment_score = 0 |
|
alignment_issues = [] |
|
|
|
for j, (stress, beat) in enumerate(zip(word_stresses, syllable_viz)): |
|
if (stress == "S" and beat == "S") or (stress != "S" and beat != "S"): |
|
alignment_score += 1 |
|
elif stress == "S" and beat != "S": |
|
alignment_issues.append(f"Syllable {j+1} has stress but weak beat") |
|
elif stress != "S" and beat == "S": |
|
alignment_issues.append(f"Syllable {j+1} has no stress but strong beat") |
|
|
|
if word_stresses: |
|
alignment_percent = (alignment_score / len(word_stresses)) * 100 |
|
timeline += f" Stress alignment: {alignment_percent:.1f}% match\n" |
|
|
|
if alignment_issues and len(alignment_issues) <= 3: |
|
timeline += " Issues: " + "; ".join(alignment_issues) + "\n" |
|
else: |
|
timeline += " No matching phrase found for alignment\n" |
|
|
|
timeline += "\n" |
|
|
|
return timeline |
|
|
|
except Exception as e: |
|
print(f"Error generating complete beat timeline: {str(e)}") |
|
return f"Error generating complete beat timeline: {str(e)}" |
|
|
|
def display_results(audio_file, lyrics_requirements=None): |
|
"""Process audio file and return formatted results for display in the UI.""" |
|
|
|
error_response = ("Please upload an audio file.", |
|
"No emotion analysis available.", |
|
"No audio classification available.", |
|
"No lyrics generated.", |
|
"No beat timeline available.") |
|
|
|
if audio_file is None: |
|
return error_response |
|
|
|
try: |
|
|
|
results = process_audio(audio_file, lyrics_requirements) |
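# process_audio may return an error string, a tuple led by an error string,
# a results dict, or a (genre, lyrics, ast) tuple; handle each shape below.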
|
|
|
|
|
if isinstance(results, str) and "Error" in results: |
|
return results, *error_response[1:] |
|
elif isinstance(results, tuple) and results and isinstance(results[0], str) and "Error" in results[0]:
|
return results[0], *error_response[1:] |
|
|
|
|
|
if isinstance(results, dict): |
|
|
|
genre_results = results.get("genre_results", "Genre classification failed") |
|
lyrics = results.get("lyrics", "Lyrics generation failed") |
|
ast_results = results.get("ast_results", []) |
|
else: |
|
|
|
genre_results, lyrics, ast_results = results |
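# Strip any appended rhythm-analysis footnotes ("[Note: ...]" sections) so
# only the lyrics themselves are displayed.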
|
|
|
|
|
clean_lyrics = lyrics |
|
if isinstance(lyrics, str): |
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
|
|
|
|
beat_timeline = format_complete_beat_timeline(audio_file, clean_lyrics) |
|
|
|
|
|
emotion_text = "No emotion analysis available." |
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
emotion_text = (f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n" |
|
f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n" |
|
f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n" |
|
f"Primary Theme: {emotion_results['summary']['primary_theme']}") |
|
|
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
beats_info = detect_beats(y, sr) |
|
|
|
|
|
emotion_text += f"\n\nBeat Analysis:\n" |
|
emotion_text += f"- Tempo: {beats_info.get('tempo', 0):.1f} BPM\n" |
|
emotion_text += f"- Time Signature: {beats_info.get('time_signature', 4)}/4\n" |
|
emotion_text += f"- Total Beats: {beats_info.get('beat_count', 0)}\n" |
|
|
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
|
|
|
|
ast_text = "No valid audio classification results available." |
|
if ast_results and isinstance(ast_results, list): |
|
ast_text = "Audio Classification Results:\n" |
|
for result in ast_results[:5]: |
|
ast_text += f"{result['label']}: {result['score']*100:.2f}%\n" |
|
|
|
|
|
return genre_results, emotion_text, ast_text, clean_lyrics, beat_timeline |
|
|
|
except Exception as e: |
|
error_msg = f"Error: {str(e)}" |
|
print(error_msg) |
|
return error_msg, *error_response[1:] |
|
|
|
|
|
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: |
|
gr.Markdown("# Music Genre Classifier & Lyrics Generator") |
|
gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
audio_input = gr.Audio(label="Upload Music", type="filepath") |
|
|
|
|
|
lyrics_requirements_input = gr.Textbox( |
|
label="Lyrics Requirements (optional)", |
|
placeholder="Enter specific themes, topics, words, or styles you want in the lyrics", |
|
lines=3 |
|
) |
|
|
|
submit_btn = gr.Button("Analyze & Generate", variant="primary") |
|
|
|
|
|
with gr.Accordion("About Music Genres", open=False): |
|
gr.Markdown(""" |
|
The system recognizes various music genres including: |
|
- Pop, Rock, Hip-Hop, R&B |
|
- Electronic, Dance, Techno, House |
|
- Jazz, Blues, Classical |
|
- Folk, Country, Acoustic |
|
- Metal, Punk, Alternative |
|
- And many others! |
|
|
|
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music. |
|
""") |
|
|
|
with gr.Column(scale=2): |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Analysis Results"): |
|
genre_output = gr.Textbox(label="Detected Genres", lines=4) |
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8) |
|
with gr.Column(): |
|
ast_output = gr.Textbox(label="Audio Classification", lines=8) |
|
|
|
with gr.TabItem("Generated Lyrics"): |
|
lyrics_output = gr.Textbox(label="Lyrics", lines=18) |
|
|
|
with gr.TabItem("Beat & Syllable Timeline"): |
|
beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=40) |
|
|
|
|
|
submit_btn.click( |
|
fn=display_results, |
|
inputs=[audio_input, lyrics_requirements_input], |
|
outputs=[genre_output, emotion_output, ast_output, lyrics_output, beat_timeline_output] |
|
) |
|
|
|
|
|
with gr.Accordion("How it works", open=False): |
|
gr.Markdown(""" |
|
## Advanced Lyrics Generation Process |
|
|
|
1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models. |
|
|
|
2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio. |
|
|
|
3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music. |
|
|
|
4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying: |
|
- Strong and weak beats |
|
- Natural phrase boundaries |
|
- Time signature and tempo variations |
|
- Beat subdivisions (half and quarter beats) |
|
|
|
5. **Second-by-Second Alignment**: The system maps beats and sub-beats to each second of audio, creating a precise alignment template for every second.
|
|
|
6. **Syllable Template Creation**: For each second of audio, the system generates precise syllable templates that reflect: |
|
- Beat stress patterns (strong, medium, weak) |
|
- Appropriate syllable counts based on tempo |
|
- Genre-specific rhythmic qualities |
|
- Half-beat and quarter-beat subdivisions |
|
|
|
7. **Lyrics Generation**: Using the detected genre, emotion, rhythm patterns, and your custom requirements, a large language model generates lyrics that: |
|
- Match the emotional quality of the music |
|
- Follow the precise syllable templates for each second |
|
- Align stressed syllables with strong beats |
|
- Maintain genre-appropriate style and themes |
|
- Incorporate your specific requirements and preferences |
|
|
|
8. **Rhythm Verification**: The system verifies the generated lyrics, analyzing: |
|
- Syllable count accuracy |
|
- Stress alignment with strong beats |
|
- Word stress patterns |
|
- Second-by-second alignment precision |
|
|
|
9. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment. |
|
|
|
This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it. |
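
For reference, a single entry in the Beat & Syllable Timeline tab follows this pattern (the values below are illustrative, not taken from a real track):

```
Line: "Hold me when the night comes down"
  Syllables: 8
  Timing: 12.34s - 15.67s
  Alignment: S·w·m·wS
  Word stress: S_S_S__S
  Stress alignment: 75.0% match
```

Here "S" marks a strong beat or stressed syllable, "m" a medium beat, "w" a weak beat, "·" an unfilled syllable slot, and "_" an unstressed syllable.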
|
""") |
|
|
|
|
|
demo.launch() |