import os
import io
import gradio as gr
import torch
import numpy as np
import re
import pronouncing
import functools
from transformers import (
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
    AutoTokenizer,
    pipeline,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from huggingface_hub import login
from utils import (
    load_audio,
    extract_audio_duration,
    extract_mfcc_features,
    calculate_lyrics_length,
    format_genre_results,
    ensure_cuda_availability,
    preprocess_audio_for_model
)
from emotionanalysis import MusicAnalyzer
import librosa

# Log in to Hugging Face if a token is provided
if "HF_TOKEN" in os.environ:
    login(token=os.environ["HF_TOKEN"])

# Model identifiers and audio settings
GENRE_MODEL_NAME = "dima806/music_genres_classification"
MUSIC_DETECTION_MODEL = "MIT/ast-finetuned-audioset-10-10-0.4593"
LLM_MODEL_NAME = "Qwen/Qwen3-14B"
SAMPLE_RATE = 22050

# Check CUDA availability once at startup
CUDA_AVAILABLE = ensure_cuda_availability()


print(f"Loading music detection model: {MUSIC_DETECTION_MODEL}")
try:
    music_detector = pipeline(
        "audio-classification",
        model=MUSIC_DETECTION_MODEL,
        device=0 if CUDA_AVAILABLE else -1
    )
    print("Successfully loaded music detection pipeline")
except Exception as e:
    print(f"Error creating music detection pipeline: {str(e)}")

    # Fall back to loading the feature extractor and model directly
    try:
        music_processor = AutoFeatureExtractor.from_pretrained(MUSIC_DETECTION_MODEL)
        music_model = AutoModelForAudioClassification.from_pretrained(MUSIC_DETECTION_MODEL)
        print("Successfully loaded music detection model and feature extractor")
    except Exception as e2:
        print(f"Error loading music detection model components: {str(e2)}")
        raise RuntimeError(f"Could not load music detection model: {str(e2)}")


print(f"Loading audio classification model: {GENRE_MODEL_NAME}")
try:
    genre_classifier = pipeline(
        "audio-classification",
        model=GENRE_MODEL_NAME,
        device=0 if CUDA_AVAILABLE else -1
    )
    print("Successfully loaded audio classification pipeline")
except Exception as e:
    print(f"Error creating pipeline: {str(e)}")

    # Fall back to loading the feature extractor and model directly
    try:
        genre_processor = AutoFeatureExtractor.from_pretrained(GENRE_MODEL_NAME)
        genre_model = AutoModelForAudioClassification.from_pretrained(GENRE_MODEL_NAME)
        print("Successfully loaded audio classification model and feature extractor")
    except Exception as e2:
        print(f"Error loading model components: {str(e2)}")
        raise RuntimeError(f"Could not load genre classification model: {str(e2)}")


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)

llm_pipeline = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    max_new_tokens=512,
)
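
# Example (sketch): invoking llm_pipeline directly. Qwen chat models expect
# a chat-formatted prompt, so apply_chat_template is assumed here; check the
# model card for the exact conventions.
#
#   messages = [{"role": "user", "content": "Write one line of lyrics."}]
#   prompt = llm_tokenizer.apply_chat_template(
#       messages, tokenize=False, add_generation_prompt=True
#   )
#   print(llm_pipeline(prompt)[0]["generated_text"])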

music_analyzer = MusicAnalyzer()


def count_syllables(text):
    """Count syllables in a given text using the pronouncing library."""
    words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    syllable_count = 0

    for word in words:
        pronunciations = pronouncing.phones_for_word(word)
        if pronunciations:
            # Use the first CMUdict pronunciation
            syllable_count += pronouncing.syllable_count(pronunciations[0])
        else:
            # Fallback heuristic: count groups of consecutive vowels
            vowels = "aeiouy"
            count = 0
            prev_is_vowel = False

            for char in word:
                is_vowel = char.lower() in vowels
                if is_vowel and not prev_is_vowel:
                    count += 1
                prev_is_vowel = is_vowel

            # Silent final 'e' removes a syllable; consonant + 'le' restores it
            if word.endswith('e'):
                count -= 1
            if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
                count += 1
            if count == 0:
                count = 1

            syllable_count += count

    return syllable_count
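
# Illustrative checks (actual CMUdict lookups may differ for rare words):
#   count_syllables("hello world") -> 3   (hel-lo + world)
#   count_syllables("little")      -> 2   via the consonant+'le' rule if the
#                                         word ever misses a CMUdict entry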


def extract_audio_features(audio_file):
    """Extract audio features from an audio file."""
    try:
        y, sr = load_audio(audio_file, SAMPLE_RATE)

        if y is None or sr is None:
            raise ValueError("Failed to load audio data")

        duration = extract_audio_duration(y, sr)

        mfccs_mean = extract_mfcc_features(y, sr, n_mfcc=20)

        return {
            "features": mfccs_mean,
            "duration": duration,
            "waveform": y,
            "sample_rate": sr,
            "path": audio_file
        }
    except Exception as e:
        print(f"Error extracting audio features: {str(e)}")
        raise ValueError(f"Failed to extract audio features: {str(e)}")


def classify_genre(audio_data):
    """Classify the genre of the audio using the loaded model."""
    try:
        if 'genre_classifier' in globals():
            results = genre_classifier(audio_data["path"])
            top_genres = [(result["label"], result["score"]) for result in results[:3]]
            return top_genres

        # Fall back to the manually loaded model components
        elif 'genre_processor' in globals() and 'genre_model' in globals():
            inputs = genre_processor(
                audio_data["waveform"],
                sampling_rate=audio_data["sample_rate"],
                return_tensors="pt"
            )

            with torch.no_grad():
                outputs = genre_model(**inputs)
                predictions = outputs.logits.softmax(dim=-1)

            values, indices = torch.topk(predictions, 3)

            genre_labels = genre_model.config.id2label

            top_genres = []
            for value, index in zip(values[0], indices[0]):
                genre = genre_labels[index.item()]
                confidence = value.item()
                top_genres.append((genre, confidence))

            return top_genres

        else:
            raise ValueError("No genre classification model available")

    except Exception as e:
        print(f"Error in genre classification: {str(e)}")
        # Safe default so downstream prompt building still has a genre
        return [("rock", 1.0)]


def detect_music(audio_data):
    """Detect if the audio is music using the MIT AST model."""
    try:
        if 'music_detector' in globals():
            results = music_detector(audio_data["path"])

            music_confidence = 0.0
            for result in results:
                label = result["label"].lower()
                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
                    music_confidence = max(music_confidence, result["score"])

            # Heuristic: >= 0.2 confidence on any music-related AudioSet
            # label counts as music.
            return music_confidence >= 0.2, results

        # Fall back to the manually loaded model components
        elif 'music_processor' in globals() and 'music_model' in globals():
            inputs = music_processor(
                audio_data["waveform"],
                sampling_rate=audio_data["sample_rate"],
                return_tensors="pt"
            )

            with torch.no_grad():
                outputs = music_model(**inputs)
                predictions = outputs.logits.softmax(dim=-1)

            values, indices = torch.topk(predictions, 5)

            labels = music_model.config.id2label

            music_confidence = 0.0
            results = []

            for value, index in zip(values[0], indices[0]):
                label = labels[index.item()].lower()
                score = value.item()
                results.append({"label": label, "score": score})

                if any(music_term in label for music_term in ["music", "song", "singing", "instrument"]):
                    music_confidence = max(music_confidence, score)

            return music_confidence >= 0.2, results

        else:
            raise ValueError("No music detection model available")

    except Exception as e:
        print(f"Error in music detection: {str(e)}")
        return False, []
|
|
|
def detect_beats(y, sr):
    """Enhanced beat detection with adaptive threshold analysis and improved time signature detection."""

    # Guard against an all-zero signal without rectifying the waveform
    # (np.clip(y, 1e-10, None) would overwrite every negative sample).
    if np.max(np.abs(y)) < 1e-10:
        y = np.full_like(y, 1e-10)
|
|
|
|
|
y_harmonic, y_percussive = librosa.effects.hpss(y) |
|
|
|
|
|
onset_env_full = librosa.onset.onset_strength(y=y, sr=sr) |
|
onset_env_perc = librosa.onset.onset_strength(y=y_percussive, sr=sr) |
|
|
|
|
|
onset_env_full = np.maximum(onset_env_full, 1e-6) |
|
onset_env_perc = np.maximum(onset_env_perc, 1e-6) |
|
|
|
|
|
combined_onset = onset_env_full * 0.3 + onset_env_perc * 0.7 |
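
    # Weight percussive onsets more heavily (70/30): they localize beats
    # better, while the full-mix envelope still contributes for material
    # with little percussion.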
|
|
|
|
|
tempo_candidates = [] |
|
beat_candidates = [] |
|
|
|
|
|
tempo1, beats1 = librosa.beat.beat_track( |
|
onset_envelope=combined_onset, |
|
sr=sr, |
|
tightness=100 |
|
) |
|
tempo_candidates.append(tempo1) |
|
beat_candidates.append(beats1) |
|
|
|
|
|
tempo2, beats2 = librosa.beat.beat_track( |
|
onset_envelope=combined_onset, |
|
sr=sr, |
|
tightness=100, |
|
start_bpm=60 |
|
) |
|
tempo_candidates.append(tempo2) |
|
beat_candidates.append(beats2) |
|
|
|
|
|
beat_consistency = [] |
|
for beats in beat_candidates: |
|
if len(beats) <= 1: |
|
beat_consistency.append(0) |
|
continue |
|
|
|
times = librosa.frames_to_time(beats, sr=sr) |
|
intervals = np.diff(times) |
|
|
|
|
|
if np.mean(intervals) > 0: |
|
consistency = 1.0 / (1.0 + np.std(intervals)/np.mean(intervals)) |
|
beat_consistency.append(consistency) |
|
else: |
|
beat_consistency.append(0) |
|
|
|
best_idx = np.argmax(beat_consistency) if beat_consistency else 0 |
|
tempo = tempo_candidates[best_idx] |
|
beat_frames = beat_candidates[best_idx] |
|
|
|
|
|
beat_times = librosa.frames_to_time(beat_frames, sr=sr) |
|
|
|
|
|
beat_strengths = [] |
|
if len(beat_frames) > 0: |
|
|
|
valid_frames = [frame for frame in beat_frames if frame < len(combined_onset)] |
|
if valid_frames: |
|
|
|
beat_strengths = combined_onset[valid_frames].tolist() |
|
|
|
|
|
avg_strength = np.mean(beat_strengths) if beat_strengths else 1.0 |
|
beat_strengths.extend([avg_strength] * (len(beat_times) - len(beat_strengths))) |
|
else: |
|
beat_strengths = [1.0] * len(beat_times) |
|
else: |
|
beat_strengths = [1.0] * len(beat_times) |
|
|
|
|
|
intervals = np.diff(beat_times).tolist() if len(beat_times) > 1 else [] |
|
|
|
|
|
|
|
time_signature = 4 |
|
|
|
if len(beat_strengths) > 8: |
|
|
|
if len(beat_strengths) > 4: |
|
|
|
norm_strengths = np.array(beat_strengths) |
|
if np.max(norm_strengths) > 0: |
|
norm_strengths = norm_strengths / np.max(norm_strengths) |
|
|
|
|
|
ac = librosa.autocorrelate(norm_strengths, max_size=len(norm_strengths)//2) |
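
            # Peaks in the autocorrelation of beat strengths show how often
            # accented beats recur: a first peak at lag 2-3 maps directly to
            # that meter, lag 6 is folded to 3, and lag 8 to 4 (see below).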
|
|
|
|
|
if len(ac) > 3: |
|
|
|
peaks = librosa.util.peak_pick(ac[1:], pre_max=1, post_max=1, pre_avg=1, post_avg=1, delta=0.1, wait=1) |
|
peaks = peaks + 1 |
|
|
|
if len(peaks) > 0: |
|
|
|
N = peaks[0] |
|
|
|
|
|
if 2 <= N <= 3: |
|
time_signature = N |
|
elif N == 6: |
|
time_signature = 3 |
|
elif N == 8: |
|
time_signature = 4 |
|
elif N == 5 or N == 7: |
|
time_signature = N |
|
|
|
|
|
|
|
if len(beat_strengths) > 3: |
|
|
|
strengths_array = np.array(beat_strengths) |
|
mean_strength = np.mean(strengths_array) |
|
std_strength = np.std(strengths_array) |
|
|
|
if std_strength > 0: |
|
z_scores = (strengths_array - mean_strength) / std_strength |
|
|
|
|
|
strong_beat_pattern = [] |
|
for i in range(0, len(z_scores) - 2, 3): |
|
|
|
|
|
if z_scores[i] > 1 and z_scores[i+1] < 0.5 and z_scores[i+2] < 0.5: |
|
strong_beat_pattern.append(1) |
|
else: |
|
strong_beat_pattern.append(0) |
|
|
|
|
|
if strong_beat_pattern and len(strong_beat_pattern) >= 3: |
|
three_pattern_probability = sum(strong_beat_pattern) / len(strong_beat_pattern) |
|
if three_pattern_probability > 0.6: |
|
time_signature = 3 |
|
|
|
|
|
phrases = [] |
|
current_phrase = [] |
|
|
|
if len(beat_times) > 0: |
|
|
|
if len(beat_strengths) > 4: |
|
|
|
strong_threshold = np.percentile(beat_strengths, 75) |
|
|
|
if intervals: |
|
mean_interval = np.mean(intervals) |
|
std_interval = np.std(intervals) |
|
|
|
significant_gap = mean_interval + (1.5 * std_interval) if std_interval > 0 else mean_interval * 1.3 |
|
else: |
|
significant_gap = 0 |
|
else: |
|
|
|
strong_threshold = np.max(beat_strengths) * 0.8 if beat_strengths else 1.0 |
|
significant_gap = 0 |
|
|
|
|
|
for i in range(len(beat_times)): |
|
current_phrase.append(i) |
|
|
|
|
|
if i < len(beat_times) - 1: |
|
|
|
is_stronger_next = False |
|
if i < len(beat_strengths) - 1: |
|
is_stronger_next = beat_strengths[i+1] > strong_threshold and beat_strengths[i+1] > beat_strengths[i] * 1.1 |
|
|
|
|
|
is_longer_gap = False |
|
if i < len(beat_times) - 1 and intervals and i < len(intervals): |
|
is_longer_gap = intervals[i] > significant_gap |
|
|
|
|
|
is_measure_boundary = (i + 1) % time_signature == 0 and i > 0 |
|
|
|
|
|
if ((is_stronger_next or is_longer_gap) and len(current_phrase) >= 2) or \ |
|
(is_measure_boundary and len(current_phrase) >= time_signature): |
|
phrases.append(current_phrase) |
|
current_phrase = [] |
|
|
|
|
|
if current_phrase and len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
|
|
|
|
if not phrases and len(beat_times) >= 2: |
|
|
|
for i in range(0, len(beat_times), time_signature): |
|
end = min(i + time_signature, len(beat_times)) |
|
if end - i >= 2: |
|
phrases.append(list(range(i, end))) |
|
|
|
|
|
return { |
|
"tempo": tempo, |
|
"beat_frames": beat_frames, |
|
"beat_times": beat_times, |
|
"beat_count": len(beat_times), |
|
"beat_strengths": beat_strengths, |
|
"intervals": intervals, |
|
"time_signature": time_signature, |
|
"phrases": phrases |
|
} |
|
|
|
def detect_sections(y, sr): |
|
""" |
|
Advanced detection of musical sections with adaptive segmentation and improved classification. |
|
|
|
Parameters: |
|
y: Audio time series |
|
sr: Sample rate |
|
|
|
Returns: |
|
A list of section dictionaries with type, start time, end time, and duration |
|
""" |
|
|
|
|
|
hop_length = 512 |
|
|
|
|
|
S = np.abs(librosa.stft(y, hop_length=hop_length)) |
|
contrast = librosa.feature.spectral_contrast(S=S, sr=sr) |
|
|
|
|
|
chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=hop_length) |
|
|
|
|
|
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, hop_length=hop_length) |
|
|
|
|
|
rms = librosa.feature.rms(y=y, hop_length=hop_length) |
|
|
|
|
|
y_harmonic, y_percussive = librosa.effects.hpss(y) |
|
percussive_rms = librosa.feature.rms(y=y_percussive, hop_length=hop_length) |
|
|
|
|
|
|
|
duration = librosa.get_duration(y=y, sr=sr) |
|
|
|
|
|
|
|
feature_stack = np.vstack([ |
|
librosa.util.normalize(contrast), |
|
librosa.util.normalize(chroma), |
|
librosa.util.normalize(mfcc), |
|
librosa.util.normalize(rms) |
|
]) |
|
|
|
|
|
feature_matrix = feature_stack.T |
|
|
|
|
|
|
|
|
|
|
|
from sklearn.decomposition import PCA |
|
|
|
|
|
n_components = min(8, feature_matrix.shape[0], feature_matrix.shape[1]) |
|
|
|
if feature_matrix.shape[0] > n_components and feature_matrix.shape[1] > 0: |
|
try: |
|
pca = PCA(n_components=n_components) |
|
reduced_features = pca.fit_transform(feature_matrix) |
|
except Exception as e: |
|
print(f"PCA failed, falling back to original features: {e}") |
|
|
|
reduced_features = feature_matrix |
|
else: |
|
|
|
reduced_features = feature_matrix |
|
|
|
|
|
|
|
|
|
|
|
min_segments = max(2, int(duration / 60)) |
|
max_segments = min(10, int(duration / 20)) |
|
|
|
|
|
min_segments = max(2, min(min_segments, 4)) |
|
max_segments = max(min_segments + 1, min(max_segments, 8)) |
|
|
|
|
|
best_segments = min_segments |
|
best_score = -1 |
|
|
|
from sklearn.metrics import silhouette_score |
|
from sklearn.cluster import AgglomerativeClustering |
|
|
|
|
|
if reduced_features.shape[0] > max_segments: |
|
for n_segments in range(min_segments, max_segments + 1): |
|
try: |
|
|
|
clustering = AgglomerativeClustering(n_clusters=n_segments) |
|
labels = clustering.fit_predict(reduced_features) |
|
|
|
|
|
if len(np.unique(labels)) > 1 and len(labels) > n_segments + 1: |
|
score = silhouette_score(reduced_features, labels) |
|
|
|
if score > best_score: |
|
best_score = score |
|
best_segments = n_segments |
|
except Exception as e: |
|
print(f"Clustering with {n_segments} segments failed: {e}") |
|
continue |
|
|
|
|
|
n_segments = best_segments |
|
|
|
|
|
|
|
|
|
|
|
try: |
|
clustering = AgglomerativeClustering(n_clusters=n_segments) |
|
labels = clustering.fit_predict(reduced_features) |
|
|
|
|
|
boundaries = [0] |
|
|
|
for i in range(1, len(labels)): |
|
if labels[i] != labels[i-1]: |
|
boundaries.append(i) |
|
|
|
boundaries.append(len(labels)) |
|
|
|
|
|
bounds_frames = np.array(boundaries) |
|
|
|
except Exception as e: |
|
print(f"Final clustering failed: {e}") |
|
|
|
bounds_frames = librosa.segment.agglomerative(feature_stack, n_segments) |
|
|
|
|
|
|
|
|
|
|
|
tonnetz = librosa.feature.tonnetz(y=y_harmonic, sr=sr) |
|
|
|
|
|
harmonic_changes = [] |
|
|
|
if tonnetz.shape[1] > 1: |
|
tonnetz_diff = np.sum(np.abs(np.diff(tonnetz, axis=1)), axis=0) |
|
|
|
if np.max(tonnetz_diff) > 0: |
|
tonnetz_diff = tonnetz_diff / np.max(tonnetz_diff) |
|
|
|
|
|
threshold = np.percentile(tonnetz_diff, 90) |
|
for i in range(len(tonnetz_diff)): |
|
if tonnetz_diff[i] > threshold: |
|
harmonic_changes.append(i) |
|
|
|
|
|
|
|
bounds_times = librosa.frames_to_time(bounds_frames, sr=sr, hop_length=hop_length) |
|
|
|
|
|
sections = [] |
|
|
|
for i in range(len(bounds_times) - 1): |
|
start = bounds_times[i] |
|
end = bounds_times[i+1] |
|
duration = end - start |
|
|
|
|
|
if duration < 4 and i > 0 and i < len(bounds_times) - 2: |
|
continue |
|
|
|
|
|
|
|
|
|
|
|
start_idx = bounds_frames[i] |
|
end_idx = bounds_frames[i+1] |
|
|
|
|
|
if i == 0: |
|
section_type = "intro" |
|
elif i == len(bounds_times) - 2: |
|
section_type = "outro" |
|
else: |
|
|
|
section_type = "chorus" if i % 2 == 1 else "verse" |
|
|
|
|
|
if end_idx > start_idx: |
|
|
|
|
|
|
|
energy = np.mean(rms[0, start_idx:end_idx]) |
|
|
|
|
|
rhythm_intensity = np.mean(percussive_rms[0, start_idx:end_idx]) |
|
|
|
|
|
if chroma.shape[1] > 0: |
|
chroma_var = np.var(chroma[:, start_idx:end_idx]) |
|
else: |
|
chroma_var = 0 |
|
|
|
|
|
if mfcc.shape[1] > 0: |
|
mfcc_mean = np.mean(mfcc[:, start_idx:end_idx], axis=1) |
|
mfcc_var = np.var(mfcc[:, start_idx:end_idx], axis=1) |
|
else: |
|
mfcc_mean = np.zeros(mfcc.shape[0]) |
|
mfcc_var = np.zeros(mfcc.shape[0]) |
|
|
|
|
|
has_harmonic_change = False |
|
for change_idx in harmonic_changes: |
|
if start_idx <= change_idx < end_idx: |
|
has_harmonic_change = True |
|
break |
|
|
|
|
|
relative_energy = energy / np.mean(rms) |
|
relative_rhythm = rhythm_intensity / np.mean(percussive_rms) |
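
            # Compare this section's energy and rhythm against the whole
            # track; a lift above 1.1 on both is treated as chorus-like below.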
|
|
|
|
|
|
|
|
|
if (relative_energy > 1.1 and relative_rhythm > 1.1 and |
|
section_type != "intro" and section_type != "outro"): |
|
section_type = "chorus" |
|
|
|
|
|
elif (0.8 <= relative_energy <= 1.1 and chroma_var > np.mean(np.var(chroma, axis=1)) and |
|
section_type != "intro" and section_type != "outro"): |
|
section_type = "verse" |
|
|
|
|
|
if (section_type not in ["intro", "outro"] and |
|
(has_harmonic_change or |
|
(0.5 <= relative_energy <= 0.9 and duration < 30) or |
|
np.any(mfcc_var > np.percentile(np.var(mfcc, axis=1), 75)))): |
|
section_type = "bridge" |
|
|
|
|
|
sections.append({ |
|
"type": section_type, |
|
"start": start, |
|
"end": end, |
|
"duration": duration |
|
}) |
|
|
|
|
|
for i in range(1, len(sections) - 1): |
|
|
|
if sections[i]["duration"] < 8 and sections[i]["type"] not in ["intro", "outro", "bridge"]: |
|
|
|
prev_type = sections[i-1]["type"] |
|
next_type = sections[i+1]["type"] if i+1 < len(sections) else "outro" |
|
|
|
|
|
sections[i]["type"] = prev_type |
|
|
|
|
|
sections = [s for s in sections if s["duration"] >= 5 or |
|
s["type"] == "intro" or s["type"] == "outro"] |
|
|
|
return sections |
|
|
|
def create_flexible_syllable_templates(beats_info, genre=None, phrase_mode='default'): |
|
""" |
|
Create enhanced syllable templates based on beat patterns with improved musical intelligence. |
|
|
|
Parameters: |
|
beats_info: Dictionary containing beat analysis data |
|
genre: Optional genre to influence template creation |
|
phrase_mode: 'default' uses provided phrases, 'auto' forces recalculation |
|
|
|
Returns: |
|
String of syllable templates with embedded strength values and flexible timing |
|
""" |
|
import numpy as np |
|
from sklearn.cluster import KMeans |
|
|
|
|
|
beat_times = beats_info.get("beat_times", []) |
|
beat_strengths = beats_info.get("beat_strengths", [1.0] * len(beat_times)) |
|
tempo = beats_info.get("tempo", 120) |
|
time_signature = beats_info.get("time_signature", 4) |
|
|
|
|
|
if len(beat_times) < 2: |
|
return "S(1.0):1-w(0.5):1|S(1.0):1-w(0.5):1" |
|
|
|
|
|
|
|
if len(beat_strengths) >= 6: |
|
|
|
X = np.array(beat_strengths).reshape(-1, 1) |
|
|
|
|
|
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X) |
|
|
|
|
|
centroids = sorted([float(c[0]) for c in kmeans.cluster_centers_]) |
|
|
|
|
|
if len(centroids) >= 3: |
|
medium_threshold = (centroids[0] + centroids[1]) / 2 |
|
strong_threshold = (centroids[1] + centroids[2]) / 2 |
|
else: |
|
|
|
medium_threshold = np.percentile(beat_strengths, 33) |
|
strong_threshold = np.percentile(beat_strengths, 66) |
|
else: |
|
|
|
medium_threshold = np.percentile(beat_strengths, 33) |
|
strong_threshold = np.percentile(beat_strengths, 66) |
|
|
|
|
|
|
|
phrases = beats_info.get("phrases", []) |
|
|
|
if phrase_mode == 'auto' or not phrases: |
|
|
|
phrases = [] |
|
current_phrase = [] |
|
|
|
for i in range(len(beat_times)): |
|
current_phrase.append(i) |
|
|
|
|
|
if (i + 1) % time_signature == 0 or i == len(beat_times) - 1: |
|
if len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
current_phrase = [] |
|
|
|
|
|
if current_phrase and len(current_phrase) >= 2: |
|
phrases.append(current_phrase) |
|
|
|
|
|
|
|
def tempo_to_syllable_base(tempo): |
|
"""Continuous function mapping tempo to syllable base count""" |
|
|
|
if tempo > 180: |
|
return 1.0 |
|
elif tempo > 140: |
|
return 1.0 + (180 - tempo) * 0.02 |
|
elif tempo > 100: |
|
return 1.8 + (140 - tempo) * 0.01 |
|
elif tempo > 70: |
|
return 2.2 + (100 - tempo) * 0.02 |
|
else: |
|
return 2.8 + max(0, (70 - tempo) * 0.04) |
|
|
|
|
|
|
|
syllable_templates = [] |
|
|
|
for phrase in phrases: |
|
|
|
if not phrase: |
|
continue |
|
|
|
|
|
phrase_strengths = [beat_strengths[i] for i in phrase if i < len(beat_strengths)] |
|
if not phrase_strengths: |
|
phrase_strengths = [1.0] * len(phrase) |
|
|
|
|
|
stress_pattern = [] |
|
for i, strength in enumerate(phrase_strengths): |
|
|
|
metrical_position = i % time_signature |
|
|
|
|
|
position_boost = 0.15 if metrical_position == 0 else 0 |
|
|
|
if time_signature == 4 and metrical_position == 2: |
|
position_boost = 0.08 |
|
|
|
effective_strength = strength + position_boost |
|
|
|
if effective_strength >= strong_threshold: |
|
stress_pattern.append(("S", effective_strength)) |
|
elif effective_strength >= medium_threshold: |
|
stress_pattern.append(("m", effective_strength)) |
|
else: |
|
stress_pattern.append(("w", effective_strength)) |
|
|
|
|
|
|
|
detailed_template = [] |
|
|
|
for i, (stress_type, strength) in enumerate(stress_pattern): |
|
|
|
base_syllables = tempo_to_syllable_base(tempo) |
|
|
|
|
|
if stress_type == "S": |
|
syllable_factor = 1.2 |
|
elif stress_type == "m": |
|
syllable_factor = 1.0 |
|
else: |
|
syllable_factor = 0.8 |
|
|
|
|
|
genre_factor = 1.0 |
|
if genre: |
|
genre = genre.lower() |
|
if any(term in genre for term in ["rap", "hip hop", "hip-hop"]): |
|
genre_factor = 1.4 |
|
elif any(term in genre for term in ["folk", "country", "ballad"]): |
|
genre_factor = 0.8 |
|
|
|
|
|
raw_count = base_syllables * syllable_factor * genre_factor |
|
|
|
|
|
|
|
rounded_count = round(raw_count * 2) / 2 |
|
|
|
|
|
syllable_count = max(0.5, min(4, rounded_count)) |
|
|
|
|
|
|
|
strength_pct = int(strength * 100) / 100 |
|
detailed_template.append(f"{stress_type}({strength_pct}):{syllable_count}") |
|
|
|
|
|
phrase_template = "-".join(detailed_template) |
|
syllable_templates.append(phrase_template) |
|
|
|
|
|
|
|
if not syllable_templates: |
|
|
|
if time_signature == 3: |
|
syllable_templates = ["S(0.95):2-w(0.4):1-w(0.35):1"] |
|
else: |
|
syllable_templates = ["S(0.95):2-w(0.4):1-m(0.7):1.5-w(0.35):1"] |
|
|
|
|
|
return "|".join(syllable_templates) |
|
|
|
def format_syllable_templates_for_prompt(syllable_templates, arrow="→", line_wrap=10, |
|
structured_output=False, beat_types=None): |
|
""" |
|
Convert technical syllable templates into clear, human-readable instructions with |
|
enhanced flexibility and customization options. |
|
|
|
Parameters: |
|
syllable_templates: String or list of templates |
|
arrow: Symbol to use between beats (default: "→") |
|
line_wrap: Number of beats before automatic line wrapping (0 = no wrapping) |
|
structured_output: If True, return structured data instead of text |
|
beat_types: Custom mapping for beat types (default: None, uses standard mapping) |
|
|
|
Returns: |
|
Human-readable instructions or structured data depending on parameters |
|
""" |
|
if not syllable_templates: |
|
return {} if structured_output else "" |
|
|
|
|
|
default_beat_types = { |
|
"S": {"name": "STRONG", "description": "stressed syllable"}, |
|
"m": {"name": "medium", "description": "medium-stressed syllable"}, |
|
"w": {"name": "weak", "description": "unstressed syllable"}, |
|
"X": {"name": "EXTRA", "description": "extra strong syllable"}, |
|
"L": {"name": "legato", "description": "connected/tied syllable"} |
|
} |
|
|
|
|
|
beat_types = beat_types or default_beat_types |
|
|
|
|
|
structured_data = {"lines": [], "explanations": []} if structured_output else None |
|
|
|
|
|
is_enhanced_format = False |
|
|
|
|
|
if isinstance(syllable_templates, str): |
|
|
|
if any(bt + "(" in syllable_templates or bt + ":" in syllable_templates or bt + "[" in syllable_templates |
|
for bt in beat_types.keys()): |
|
is_enhanced_format = True |
|
|
|
elif "|" in syllable_templates: |
|
is_enhanced_format = True |
|
|
|
|
|
output = [] |
|
|
|
if is_enhanced_format: |
|
|
|
phrases = syllable_templates.split("|") if "|" in syllable_templates else [syllable_templates] |
|
|
|
|
|
for i, phrase in enumerate(phrases): |
|
|
|
has_swing = "(swing)" in phrase |
|
if has_swing: |
|
phrase = phrase.replace("(swing)", "") |
|
|
|
beats = phrase.split("-") |
|
            beat_instructions = []
            parsed_beats = []  # beat_info dicts collected for structured output
|
|
|
|
|
for j, beat in enumerate(beats): |
|
|
|
beat_info = {"original": beat, "type": None, "count": None, "strength": None} |
|
|
|
|
|
if "(" in beat and ")" in beat and ":" in beat: |
|
parts = beat.split(":") |
|
beat_type = parts[0].split("(")[0] |
|
strength = parts[0].split("(")[1].rstrip(")") |
|
count = parts[1] |
|
|
|
beat_info["type"] = beat_type |
|
beat_info["count"] = count |
|
beat_info["strength"] = strength |
|
|
|
|
|
elif any(beat.startswith(bt) for bt in beat_types.keys()) and len(beat) > 1: |
|
beat_type = beat[0] |
|
count = beat[1:] |
|
|
|
beat_info["type"] = beat_type |
|
beat_info["count"] = count |
|
|
|
|
|
else: |
|
beat_instructions.append(beat) |
|
continue |
|
|
|
|
|
if beat_info["type"] in beat_types: |
|
type_name = beat_types[beat_info["type"]]["name"] |
|
if beat_info["strength"]: |
|
beat_instructions.append(f"{type_name}({beat_info['count']}) [{beat_info['strength']}]") |
|
else: |
|
beat_instructions.append(f"{type_name}({beat_info['count']})") |
|
else: |
|
|
|
                    beat_instructions.append(beat)

                parsed_beats.append(beat_info)
|
|
|
|
|
if line_wrap > 0 and len(beat_instructions) > line_wrap: |
|
wrapped_instructions = [] |
|
for k in range(0, len(beat_instructions), line_wrap): |
|
section = beat_instructions[k:k+line_wrap] |
|
wrapped_instructions.append(f"{arrow} ".join(section)) |
|
line_desc = f"\n {arrow} ".join(wrapped_instructions) |
|
else: |
|
line_desc = f" {arrow} ".join(beat_instructions) |
|
|
|
|
|
if has_swing: |
|
line_desc += " [with swing feel]" |
|
|
|
|
|
line_output = f"Line {i+1}: {line_desc}" |
|
output.append(line_output) |
|
|
|
            if structured_output:
                structured_data["lines"].append({
                    "line_number": i+1,
                    "beats": [{"original": b["original"],
                               "type": b.get("type"),
                               "count": b.get("count"),
                               "strength": b.get("strength")}
                              for b in parsed_beats],
                    "has_swing": has_swing
                })
|
|
|
|
|
explanation = [ |
|
"\n📝 UNDERSTANDING THE NOTATION:" |
|
] |
|
|
|
|
|
used_beat_types = set() |
|
for phrase in phrases: |
|
for beat in phrase.split("-"): |
|
for bt in beat_types.keys(): |
|
if beat.startswith(bt): |
|
used_beat_types.add(bt) |
|
|
|
for bt in used_beat_types: |
|
if bt in beat_types: |
|
name = beat_types[bt]["name"] |
|
desc = beat_types[bt]["description"] |
|
explanation.append(f"- {name}(n): Place a {desc} here, plus (n-1) unstressed syllables") |
|
|
|
explanation.extend([ |
|
f"- {arrow}: Indicates flow from one beat to the next", |
|
"- [0.xx]: Beat strength value (higher = more emphasis needed)" |
|
]) |
|
|
|
output.extend(explanation) |
|
|
|
if structured_output: |
|
structured_data["explanations"] = explanation |
|
|
|
|
|
has_half_syllables = any((".5" in beat) for phrase in phrases for beat in phrase.split("-")) |
|
if has_half_syllables: |
|
half_syllable_examples = [ |
|
"\n🎵 HALF-SYLLABLE EXAMPLES:", |
|
"- STRONG(1.5): One stressed syllable followed by an unstressed half-syllable", |
|
" Example: \"LOVE you\" where \"LOVE\" is stressed and \"you\" is quick", |
|
"- medium(2.5): One medium syllable plus one-and-a-half unstressed syllables", |
|
" Example: \"Wait for the\" where \"Wait\" is medium-stressed and \"for the\" is quick" |
|
] |
|
output.extend(half_syllable_examples) |
|
|
|
if structured_output: |
|
structured_data["half_syllable_examples"] = half_syllable_examples |
|
|
|
|
|
if any("swing" in phrase for phrase in phrases): |
|
swing_guide = [ |
|
"\n🎶 SWING RHYTHM GUIDE:", |
|
"- In swing, syllables should be unevenly timed (long-short pattern)", |
|
"- Example: \"SUM-mer TIME\" in swing feels like \"SUM...mer-TIME\" with delay" |
|
] |
|
output.extend(swing_guide) |
|
|
|
if structured_output: |
|
structured_data["swing_guide"] = swing_guide |
|
|
|
|
|
else: |
|
formatted_lines = [] |
|
|
|
if isinstance(syllable_templates, list): |
|
for i, template in enumerate(syllable_templates): |
|
if isinstance(template, dict) and "syllable_template" in template: |
|
line = f"Line {i+1}: {template['syllable_template']} syllables" |
|
formatted_lines.append(line) |
|
|
|
if structured_output: |
|
structured_data["lines"].append({ |
|
"line_number": i+1, |
|
"syllable_count": template["syllable_template"] |
|
}) |
|
elif isinstance(template, str): |
|
line = f"Line {i+1}: {template} syllables" |
|
formatted_lines.append(line) |
|
|
|
if structured_output: |
|
structured_data["lines"].append({ |
|
"line_number": i+1, |
|
"syllable_count": template |
|
}) |
|
|
|
output = formatted_lines |
|
else: |
|
output = [str(syllable_templates)] |
|
|
|
if structured_output: |
|
structured_data["raw_content"] = str(syllable_templates) |
|
|
|
|
|
application_tips = [ |
|
"\n💡 APPLICATION TIPS:", |
|
"1. Strong beats need naturally stressed syllables (like the START of \"RE-mem-ber\")", |
|
"2. Place important words on strong beats for natural emphasis", |
|
"3. Vowel sounds work best for sustained or emphasized syllables", |
|
"4. Keep consonant clusters (like \"str\" or \"thr\") on weak beats" |
|
] |
|
output.extend(application_tips) |
|
|
|
if structured_output: |
|
structured_data["application_tips"] = application_tips |
|
return structured_data |
|
|
|
return "\n".join(output) |
|
|
|
def verify_flexible_syllable_counts(lyrics, templates): |
|
""" |
|
Enhanced verification of syllable counts and stress patterns with precise alignment analysis |
|
and detailed feedback for all phrases in a template. |
|
""" |
|
    import re
    import pronouncing
    import numpy as np
    import functools
|
|
|
|
|
@functools.lru_cache(maxsize=512) |
|
def cached_phones_for_word(word): |
|
return pronouncing.phones_for_word(word) |
|
|
|
@functools.lru_cache(maxsize=512) |
|
def count_syllables_for_word(word): |
|
"""Count syllables in a single word with caching for performance.""" |
|
|
|
pronunciations = cached_phones_for_word(word.lower()) |
|
if pronunciations: |
|
return pronouncing.syllable_count(pronunciations[0]) |
|
|
|
|
|
vowels = "aeiouy" |
|
word = word.lower() |
|
count = 0 |
|
prev_is_vowel = False |
|
|
|
for char in word: |
|
is_vowel = char in vowels |
|
if is_vowel and not prev_is_vowel: |
|
count += 1 |
|
prev_is_vowel = is_vowel |
|
|
|
|
|
        # Mirror count_syllables: a silent final 'e' removes a syllable and a
        # consonant + 'le' ending restores it (e.g. "whale" -> 1, "little" -> 2)
        if word.endswith('e'):
            count -= 1
        if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
            count += 1
|
if count == 0: |
|
count = 1 |
|
|
|
return count |
|
|
|
@functools.lru_cache(maxsize=512) |
|
def get_word_stress(word): |
|
"""Get the stress pattern for a word with improved fallback handling.""" |
|
pronunciations = cached_phones_for_word(word.lower()) |
|
if pronunciations: |
|
return pronouncing.stresses(pronunciations[0]) |
|
|
|
|
|
syllables = count_syllables_for_word(word) |
|
|
|
|
|
if syllables == 1: |
|
return "1" |
|
elif syllables == 2: |
|
|
|
|
|
second_syllable_stress = ["ing", "er", "or", "ize", "ise", "ate", "ect", "end", "ure"] |
|
if any(word.endswith(ending) for ending in second_syllable_stress): |
|
return "01" |
|
else: |
|
return "10" |
|
elif syllables == 3: |
|
|
|
if any(word.endswith(ending) for ending in ["ity", "ety", "ify", "ogy", "graphy"]): |
|
return "100" |
|
elif any(word.endswith(ending) for ending in ["ation", "ious", "itis"]): |
|
return "010" |
|
else: |
|
return "100" |
|
else: |
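            # Rough default: primary stress on the first syllable, e.g. a
            # 4-syllable unknown word maps to "1000".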
|
|
|
return "1" + "0" * (syllables - 1) |
|
|
|
|
|
lines = [line.strip() for line in lyrics.split("\n") if line.strip()] |
|
|
|
|
|
verification_notes = [] |
|
detailed_analysis = [] |
|
stress_misalignments = [] |
|
total_mismatch_count = 0 |
|
|
|
|
|
for i, line in enumerate(lines): |
|
if i >= len(templates): |
|
break |
|
|
|
template = templates[i] |
|
|
|
|
|
if isinstance(template, dict) and "syllable_template" in template: |
|
template_str = template["syllable_template"] |
|
elif isinstance(template, str): |
|
template_str = template |
|
else: |
|
continue |
|
|
|
|
|
template_phrases = [template_str] |
|
if "|" in template_str: |
|
template_phrases = template_str.split("|") |
|
|
|
|
|
best_match_diff = float('inf') |
|
best_match_phrase = None |
|
best_phrase_beats = None |
|
actual_count = count_syllables(line) |
|
|
|
for phrase_idx, phrase in enumerate(template_phrases): |
|
|
|
beats_info = [] |
|
total_expected = 0 |
|
|
|
|
|
if "-" in phrase: |
|
beat_templates = phrase.split("-") |
|
|
|
|
|
for beat in beat_templates: |
|
beat_info = {"original": beat, "type": None, "count": 1, "strength": None} |
|
|
|
|
|
if "(" in beat and ")" in beat and ":" in beat: |
|
parts = beat.split(":") |
|
beat_type = parts[0].split("(")[0] |
|
try: |
|
strength = float(parts[0].split("(")[1].rstrip(")")) |
|
except ValueError: |
|
strength = 1.0 |
|
|
|
|
|
try: |
|
count = float(parts[1]) |
|
|
|
if count == int(count): |
|
count = int(count) |
|
except ValueError: |
|
count = 1 |
|
|
|
beat_info.update({ |
|
"type": beat_type, |
|
"count": count, |
|
"strength": strength |
|
}) |
|
|
|
|
|
elif any(beat.startswith(x) for x in ["S", "m", "w", "X", "L"]): |
|
beat_type = beat[0] |
|
|
|
|
|
try: |
|
count_str = beat[1:] |
|
count = float(count_str) |
|
if count == int(count): |
|
count = int(count) |
|
except ValueError: |
|
count = 1 |
|
|
|
beat_info.update({ |
|
"type": beat_type, |
|
"count": count |
|
}) |
|
|
|
|
|
else: |
|
try: |
|
count = float(beat) |
|
if count == int(count): |
|
count = int(count) |
|
beat_info["count"] = count |
|
except ValueError: |
|
pass |
|
|
|
beats_info.append(beat_info) |
|
total_expected += beat_info["count"] |
|
|
|
|
|
phrase_diff = abs(actual_count - total_expected) |
|
|
|
|
|
|
|
|
|
|
if phrase_diff < best_match_diff: |
|
best_match_diff = phrase_diff |
|
best_match_phrase = phrase |
|
best_phrase_beats = beats_info |
|
|
|
|
|
else: |
|
try: |
|
total_expected = float(phrase) |
|
phrase_diff = abs(actual_count - total_expected) |
|
if phrase_diff < best_match_diff: |
|
best_match_diff = phrase_diff |
|
best_match_phrase = phrase |
|
best_phrase_beats = [{"count": total_expected}] |
|
except ValueError: |
|
pass |
|
|
|
|
|
if best_match_phrase and best_phrase_beats: |
|
total_expected = sum(beat["count"] for beat in best_phrase_beats) |
|
|
|
|
|
expected_ratio = 0.15 if total_expected > 10 else 0.25 |
|
threshold = max(1, round(total_expected * expected_ratio)) |
|
|
|
|
|
if total_expected > 0 and best_match_diff > threshold: |
|
verification_notes.append(f"Line {i+1}: Expected {total_expected} syllables, got {actual_count}") |
|
total_mismatch_count += 1 |
|
|
|
|
|
words = re.findall(r'\b[a-zA-Z]+\b', line.lower()) |
|
|
|
|
|
word_analysis = [] |
|
cumulative_syllables = 0 |
|
|
|
for word in words: |
|
syllable_count = count_syllables_for_word(word) |
|
|
|
|
|
stress_pattern = get_word_stress(word) |
|
|
|
word_analysis.append({ |
|
"word": word, |
|
"syllables": syllable_count, |
|
"stress_pattern": stress_pattern, |
|
"position": cumulative_syllables |
|
}) |
|
|
|
cumulative_syllables += syllable_count |
|
|
|
|
|
if best_phrase_beats and any(b.get("type") == "S" for b in best_phrase_beats if "type" in b): |
|
|
|
strong_positions = [] |
|
current_pos = 0 |
|
|
|
for beat in best_phrase_beats: |
|
if beat.get("type") == "S": |
|
strong_positions.append(current_pos) |
|
current_pos += beat.get("count", 1) |
|
|
|
|
|
alignment_issues = [] |
|
|
|
for pos in strong_positions: |
|
|
|
|
|
|
for word_info in word_analysis: |
|
word_start = word_info["position"] |
|
word_end = word_start + word_info["syllables"] |
|
|
|
if word_start <= pos < word_end: |
|
|
|
syllable_in_word = pos - word_start |
|
|
|
|
|
stress = word_info["stress_pattern"] |
|
|
|
|
|
if stress and syllable_in_word < len(stress) and stress[syllable_in_word] != '1': |
|
misaligned_word = word_info["word"] |
|
alignment_issues.append(f"'{word_info['word']}' (unstressed syllable on strong beat)") |
|
stress_misalignments.append({ |
|
"line": i+1, |
|
"word": word_info["word"], |
|
"position": pos, |
|
"suggestion": get_stress_aligned_alternatives(word_info["word"], syllable_in_word) |
|
}) |
|
break |
|
|
|
if alignment_issues: |
|
verification_notes.append(f" → Stress misalignments: {', '.join(alignment_issues)}") |
|
|
|
|
|
alignment_map = generate_alignment_visualization(line, best_phrase_beats, word_analysis) |
|
if alignment_map: |
|
detailed_analysis.append(f"Line {i+1} Alignment Analysis:\n{alignment_map}") |
|
else: |
|
|
|
verification_notes.append(f"Line {i+1}: Unable to find matching template pattern") |
|
|
|
|
|
if verification_notes: |
|
lyrics += "\n\n[Note: Potential rhythm mismatches detected in these lines:]\n" |
|
lyrics += "\n".join(verification_notes) |
|
|
|
if detailed_analysis: |
|
lyrics += "\n\n[Detailed Alignment Analysis:]\n" |
|
lyrics += "\n\n".join(detailed_analysis) |
|
|
|
lyrics += "\n\n[How to fix rhythm mismatches:]\n" |
|
lyrics += "1. Make sure stressed syllables (like 'LO' in 'LOV-er') fall on STRONG beats\n" |
|
lyrics += "2. Adjust syllable counts to match the template (add/remove words or use different words)\n" |
|
lyrics += "3. Try using words where natural stress aligns with musical rhythm\n" |
|
|
|
|
|
if stress_misalignments: |
|
lyrics += "\n[Specific word replacement suggestions:]\n" |
|
for issue in stress_misalignments[:5]: |
|
if issue["suggestion"]: |
|
lyrics += f"Line {issue['line']}: Consider replacing '{issue['word']}' with: {issue['suggestion']}\n" |
|
|
|
return lyrics |
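
# Example (sketch): verifying generated lyrics against two templates.
#
#   templates = ["S(0.9):2-w(0.4):1-m(0.7):1", "S(0.9):1-w(0.3):2"]
#   checked = verify_flexible_syllable_counts(
#       "Hold me close tonight\nStay with me", templates
#   )
#   # `checked` is the lyrics plus appended rhythm notes, if any
#   # mismatches or stress misalignments were found.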
|
|
|
def generate_alignment_visualization(line, beats_info, word_analysis): |
|
"""Generate a visual representation of syllable alignment with beats.""" |
|
if not beats_info or not word_analysis: |
|
return None |
|
|
|
|
|
syllable_breakdown = [] |
|
syllable_stresses = [] |
|
|
|
for word_info in word_analysis: |
|
word = word_info["word"] |
|
syllables = word_info["syllables"] |
|
stress = word_info["stress_pattern"] or "" |
|
|
|
|
|
while len(stress) < syllables: |
|
stress += "0" |
|
|
|
|
|
parts = naive_syllable_split(word, syllables) |
|
|
|
for i, part in enumerate(parts): |
|
syllable_breakdown.append(part) |
|
if i < len(stress): |
|
syllable_stresses.append(stress[i]) |
|
else: |
|
syllable_stresses.append("0") |
|
|
|
|
|
beat_types = [] |
|
current_pos = 0 |
|
|
|
for beat in beats_info: |
|
beat_type = beat.get("type", "-") |
|
count = beat.get("count", 1) |
|
|
|
|
|
if isinstance(count, int): |
|
beat_types.extend([beat_type] * count) |
|
else: |
|
|
|
whole_part = int(count) |
|
frac_part = count - whole_part |
|
|
|
if whole_part > 0: |
|
beat_types.extend([beat_type] * whole_part) |
|
|
|
if frac_part > 0: |
|
beat_types.append(f"{beat_type}½") |
|
|
|
|
|
while len(beat_types) < len(syllable_breakdown): |
|
beat_types.append("-") |
|
|
|
|
|
beat_types = beat_types[:len(syllable_breakdown)] |
|
|
|
|
|
result = [] |
|
|
|
|
|
syllable_display = [] |
|
for i, syllable in enumerate(syllable_breakdown): |
|
if i < len(syllable_stresses) and syllable_stresses[i] == "1": |
|
syllable_display.append(syllable.upper()) |
|
else: |
|
syllable_display.append(syllable.lower()) |
|
|
|
result.append(" - ".join(syllable_display)) |
|
|
|
|
|
    beat_indicators = []
    for stress_char, beat_type in zip(syllable_stresses, beat_types):
        if beat_type == "S" or beat_type.startswith("S"):
            if stress_char == "1":
                beat_indicators.append("↑")
            else:
                beat_indicators.append("❌")
|
elif beat_type == "m" or beat_type.startswith("m"): |
|
beat_indicators.append("•") |
|
elif beat_type == "w" or beat_type.startswith("w"): |
|
beat_indicators.append("·") |
|
else: |
|
beat_indicators.append(" ") |
|
|
|
result.append(" ".join(beat_indicators)) |
|
|
|
|
|
result.append(" - ".join(beat_types)) |
|
|
|
return "\n".join(result) |
|
|
|
@functools.lru_cache(maxsize=256) |
|
def naive_syllable_split(word, syllable_count): |
|
"""Naively split a word into the specified number of syllables, with caching for performance.""" |
|
if syllable_count <= 1: |
|
return [word] |
|
|
|
|
|
vowels = "aeiouy" |
|
consonants = "bcdfghjklmnpqrstvwxz" |
|
|
|
|
|
    splits = []
    for i in range(1, len(word) - 1):
        if word[i] in consonants and word[i-1] in vowels:
            if i not in splits:
                splits.append(i)
        elif word[i] in vowels and word[i-1] in consonants and word[i+1] in consonants:
            if i + 1 not in splits:
                splits.append(i + 1)
|
|
|
|
|
    while len(splits) < syllable_count - 1:
        made_split = False
        for i in range(1, len(word)):
            if i not in splits:
                splits.append(i)
                made_split = True
                break
        if not made_split:
            break  # word too short to split further; avoid an infinite loop
|
|
|
|
|
splits.sort() |
|
splits = splits[:syllable_count - 1] |
|
|
|
|
|
result = [] |
|
prev = 0 |
|
for pos in splits: |
|
result.append(word[prev:pos]) |
|
prev = pos |
|
|
|
result.append(word[prev:]) |
|
return result |
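
# e.g. naive_syllable_split("wonderful", 3) -> ["wo", "nde", "rful"]
# (boundaries are approximate; the splitter only guesses at consonant/vowel
# transitions)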
|
|
|
def get_stress_aligned_alternatives(word, position_to_stress): |
|
"""Suggest alternative words with proper stress at the required position.""" |
|
|
|
|
|
    # count_syllables_for_word is local to verify_flexible_syllable_counts,
    # so use the module-level counter here.
    syllable_count = count_syllables(word)
|
|
|
|
|
if syllable_count == 2: |
|
if position_to_stress == 0: |
|
first_stress = ["love-ly", "won-der", "beau-ty", "danc-ing", "dream-ing", |
|
"heart-beat", "sun-light", "moon-light", "star-light"] |
|
return ", ".join(first_stress[:3]) |
|
else: |
|
second_stress = ["be-LIEVE", "a-BOVE", "a-ROUND", "to-DAY", "a-LIVE", |
|
"a-LONE", "be-HOLD", "re-TURN", "de-LIGHT"] |
|
return ", ".join(second_stress[:3]) |
|
elif syllable_count == 3: |
|
if position_to_stress == 0: |
|
return "MEM-o-ry, WON-der-ful, BEAU-ti-ful" |
|
elif position_to_stress == 1: |
|
return "a-MAZE-ing, to-GE-ther, for-EV-er" |
|
else: |
|
return "un-der-STAND, o-ver-COME, ne-ver-MORE" |
|
|
|
|
|
return f"a word with stress on syllable {position_to_stress + 1}" |
|
|
|
def generate_lyrics(genre, duration, emotion_results, song_structure=None): |
|
""" |
|
Generate lyrics based on the genre, emotion, and structure analysis with enhanced rhythmic alignment. |
|
|
|
This improved version uses advanced template creation, better formatting, and verification with |
|
potential refinement for lyrics that perfectly match the musical rhythm patterns. |
|
|
|
Parameters: |
|
genre: Musical genre of the audio |
|
duration: Duration of the audio in seconds |
|
emotion_results: Dictionary containing emotional analysis results |
|
song_structure: Optional dictionary containing song structure analysis |
|
|
|
Returns: |
|
Generated lyrics aligned with the rhythm patterns of the music |
|
""" |
|
|
|
primary_emotion = emotion_results["emotion_analysis"]["primary_emotion"] |
|
primary_theme = emotion_results["theme_analysis"]["primary_theme"] |
|
|
|
|
|
try: |
|
tempo = float(emotion_results["rhythm_analysis"]["tempo"]) |
|
except (KeyError, ValueError, TypeError): |
|
tempo = 0.0 |
|
|
|
key = emotion_results["tonal_analysis"]["key"] |
|
mode = emotion_results["tonal_analysis"]["mode"] |
|
|
|
|
|
syllable_guidance = "" |
|
templates_for_verification = [] |
|
|
|
|
|
structure_visualization = "=== MUSIC-LYRICS STRUCTURE MATCHING ===\n\n" |
|
structure_visualization += f"Song Duration: {duration:.1f} seconds\n" |
|
structure_visualization += f"Tempo: {tempo:.1f} BPM\n\n" |
|
|
|
if song_structure: |
|
|
|
if "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
flexible = song_structure["flexible_structure"] |
|
if "segments" in flexible and flexible["segments"]: |
|
|
|
segments = flexible["segments"] |
|
|
|
|
|
structure_visualization += f"Total segments: {len(segments)}\n" |
|
structure_visualization += "Each segment represents one musical phrase for which you should write ONE line of lyrics.\n\n" |
|
|
|
|
|
enhanced_templates = [] |
|
|
|
for i, segment in enumerate(segments): |
|
if i < 30: |
|
|
|
segment_start = segment["start"] |
|
segment_end = segment["end"] |
|
|
|
|
|
structure_visualization += f"Segment {i+1}: {segment_start:.1f}s - {segment_end:.1f}s (duration: {segment_end-segment_start:.1f}s)\n" |
|
|
|
|
|
segment_beats = [] |
|
beat_times = flexible["beats"]["beat_times"] |
|
beat_strengths = flexible["beats"].get("beat_strengths", []) |
|
|
|
for j, beat_time in enumerate(beat_times): |
|
if segment_start <= beat_time < segment_end: |
|
|
|
segment_beats.append(j) |
|
|
|
|
|
segment_beats_info = { |
|
"beat_times": [beat_times[j] for j in segment_beats], |
|
"tempo": flexible["beats"].get("tempo", 120) |
|
} |
|
|
|
if beat_strengths: |
|
segment_beats_info["beat_strengths"] = [ |
|
beat_strengths[j] for j in segment_beats |
|
if j < len(beat_strengths) |
|
] |
|
|
|
|
|
segment_beats_info["phrases"] = [segment_beats] |
|
|
|
|
|
enhanced_template = create_flexible_syllable_templates( |
|
segment_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if i == 0 else 'default' |
|
) |
|
enhanced_templates.append(enhanced_template) |
|
templates_for_verification.append(enhanced_template) |
|
|
|
|
|
structure_visualization += f" Template: {enhanced_template}\n" |
|
|
|
|
|
|
|
section_types = [] |
|
pattern_groups = {} |
|
|
|
for i, template in enumerate(enhanced_templates): |
|
|
|
simple_pattern = template.replace("(", "").replace(")", "").replace(":", "") |
|
|
|
|
|
found_match = False |
|
for group, patterns in pattern_groups.items(): |
|
if any(simple_pattern == p.replace("(", "").replace(")", "").replace(":", "") for p in patterns): |
|
pattern_groups[group].append(template) |
|
section_types.append(group) |
|
found_match = True |
|
break |
|
|
|
if not found_match: |
|
|
|
group_name = f"Group_{len(pattern_groups) + 1}" |
|
pattern_groups[group_name] = [template] |
|
section_types.append(group_name) |
|
|
|
|
|
section_mapping = {} |
|
if len(pattern_groups) >= 1: |
|
|
|
most_common = max(pattern_groups.items(), key=lambda x: len(x[1]))[0] |
|
section_mapping[most_common] = "verse" |
|
|
|
if len(pattern_groups) >= 2: |
|
|
|
sorted_groups = sorted(pattern_groups.items(), key=lambda x: len(x[1]), reverse=True) |
|
if len(sorted_groups) > 1: |
|
section_mapping[sorted_groups[1][0]] = "chorus" |
|
|
|
if len(pattern_groups) >= 3: |
|
|
|
sorted_groups = sorted(pattern_groups.items(), key=lambda x: len(x[1]), reverse=True) |
|
if len(sorted_groups) > 2: |
|
section_mapping[sorted_groups[2][0]] = "bridge" |
|
|
|
|
|
mapped_section_types = [] |
|
for section_type in section_types: |
|
if section_type in section_mapping: |
|
mapped_section_types.append(section_mapping[section_type]) |
|
else: |
|
mapped_section_types.append("verse") |
|
|
|
|
|
structure_visualization += "\nPredicted Song Structure:\n" |
|
for i, section_type in enumerate(mapped_section_types): |
|
if i < len(enhanced_templates): |
|
structure_visualization += f"Line {i+1}: [{section_type.upper()}] {enhanced_templates[i]}\n" |
|
|
|
|
|
total_lines = len(enhanced_templates) |
|
verse_lines = mapped_section_types.count("verse") |
|
chorus_lines = mapped_section_types.count("chorus") |
|
bridge_lines = mapped_section_types.count("bridge") |
|
|
|
|
|
structure_visualization += f"\nTotal Lines Required: {total_lines}\n" |
|
structure_visualization += f"Verse Lines: {verse_lines}\n" |
|
structure_visualization += f"Chorus Lines: {chorus_lines}\n" |
|
structure_visualization += f"Bridge Lines: {bridge_lines}\n" |
|
|
|
|
|
syllable_guidance = "CRITICAL RHYTHM INSTRUCTIONS:\n" |
|
syllable_guidance += "Each line of lyrics MUST match exactly with one musical phrase/segment.\n" |
|
syllable_guidance += "Follow these rhythm patterns for each line (STRONG beats need stressed syllables):\n\n" |
|
|
|
|
|
formatted_templates = [] |
|
for i, template in enumerate(enhanced_templates): |
|
if i < len(mapped_section_types): |
|
section_type = mapped_section_types[i].upper() |
|
if i > 0 and mapped_section_types[i] != mapped_section_types[i-1]: |
|
|
|
formatted_templates.append(f"\n[{section_type}]") |
|
elif i == 0: |
|
|
|
formatted_templates.append(f"[{section_type}]") |
|
                formatted_templates.append(format_syllable_templates_for_prompt(template, arrow="→", line_wrap=8))
|
|
|
syllable_guidance += "\n".join(formatted_templates) |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
if verse_lines > 0: |
|
verse_lines = min(verse_lines, total_lines // 2) |
|
else: |
|
verse_lines = total_lines // 2 |
|
|
|
if chorus_lines > 0: |
|
chorus_lines = min(chorus_lines, total_lines // 3) |
|
else: |
|
chorus_lines = total_lines // 3 |
|
|
|
if bridge_lines > 0: |
|
bridge_lines = min(bridge_lines, total_lines // 6) |
|
else: |
|
bridge_lines = 0 |
|
|
|
|
|
elif "syllables" in song_structure and song_structure["syllables"]: |
|
syllable_guidance = "RHYTHM PATTERN INSTRUCTIONS:\n" |
|
syllable_guidance += "Follow these syllable patterns for each section. Each line should match ONE phrase:\n\n" |
|
|
|
|
|
section_counts = {"verse": 0, "chorus": 0, "bridge": 0, "intro": 0, "outro": 0} |
|
|
|
for section in song_structure["syllables"]: |
|
section_counts[section["type"]] = section_counts.get(section["type"], 0) + 1 |
|
|
|
if "syllable_template" in section: |
|
|
|
section_beats_info = { |
|
"beat_times": [beat for beat in song_structure["beats"]["beat_times"] |
|
if section["start"] <= beat < section["end"]], |
|
"tempo": song_structure["beats"].get("tempo", 120) |
|
} |
|
|
|
if "beat_strengths" in song_structure["beats"]: |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(song_structure["beats"]["beat_strengths"]) |
|
if i < len(song_structure["beats"]["beat_times"]) and |
|
section["start"] <= song_structure["beats"]["beat_times"][i] < section["end"] |
|
] |
|
|
|
|
|
section_beats_info["phrases"] = [list(range(len(section_beats_info["beat_times"])))] |
|
|
|
|
|
enhanced_template = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=genre, |
|
phrase_mode='auto' if section['type'] == 'verse' else 'default' |
|
) |
|
|
|
syllable_guidance += f"[{section['type'].capitalize()}]:\n" |
|
syllable_guidance += format_syllable_templates_for_prompt( |
|
enhanced_template, |
|
arrow="→", |
|
line_wrap=6 |
|
) + "\n\n" |
|
templates_for_verification.append(section) |
|
elif "syllable_count" in section: |
|
syllable_guidance += f"[{section['type'].capitalize()}]: ~{section['syllable_count']} syllables total\n" |
|
|
|
|
|
structure_visualization += "Using traditional section-based structure:\n" |
|
for section_type, count in section_counts.items(): |
|
if count > 0: |
|
structure_visualization += f"{section_type.capitalize()}: {count} sections\n" |
|
|
|
|
|
verse_lines = max(2, section_counts.get("verse", 0) * 4) |
|
chorus_lines = max(2, section_counts.get("chorus", 0) * 4) |
|
bridge_lines = max(0, section_counts.get("bridge", 0) * 2) |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
if not syllable_guidance: |
|
syllable_guidance = "RHYTHM ALIGNMENT INSTRUCTIONS:\n\n" |
|
syllable_guidance += "1. Align stressed syllables with strong beats (usually beats 1 and 3 in 4/4 time)\n" |
|
syllable_guidance += "2. Use unstressed syllables on weak beats (usually beats 2 and 4 in 4/4 time)\n" |
|
syllable_guidance += "3. Use appropriate syllable counts based on tempo:\n" |
|
syllable_guidance += " - Fast tempo (>120 BPM): 4-6 syllables per line\n" |
|
syllable_guidance += " - Medium tempo (90-120 BPM): 6-8 syllables per line\n" |
|
syllable_guidance += " - Slow tempo (<90 BPM): 8-10 syllables per line\n" |
|
|
|
|
|
structure_visualization += "Using estimated structure (no detailed analysis available):\n" |
|
|
|
|
|
estimated_lines = max(8, int(duration / 10)) |
|
structure_visualization += f"Estimated total lines: {estimated_lines}\n" |
|
|
|
|
|
verse_lines = estimated_lines // 2 |
|
chorus_lines = estimated_lines // 3 |
|
bridge_lines = estimated_lines // 6 if estimated_lines > 12 else 0 |
|
|
|
|
|
use_sections = True |
|
|
|
|
|
syllable_guidance += "\nEXAMPLES OF PERFECT RHYTHM ALIGNMENT:\n" |
|
syllable_guidance += "Pattern: S(0.95):1 → w(0.4):1 → m(0.7):1 → w(0.3):1\n" |
|
syllable_guidance += "Lyric: 'HEAR the MU-sic PLAY'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S w m w <- BEAT TYPE\n\n" |
|
|
|
syllable_guidance += "Pattern: S(0.9):2 → w(0.3):1 → S(0.85):1 → w(0.4):2\n" |
|
syllable_guidance += "Lyric: 'DANC-ing TO the RHYTHM of LOVE'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S S w S w w <- BEAT TYPE\n\n" |
|
|
|
syllable_guidance += "Pattern: S(0.92):1 → m(0.65):2 → S(0.88):1 → w(0.35):1\n" |
|
syllable_guidance += "Lyric: 'TIME keeps FLOW-ing ON and ON'\n" |
|
syllable_guidance += " ↑ ↑ ↑ ↑ ↑ ↑\n" |
|
syllable_guidance += " S m m S w w <- BEAT TYPE\n\n" |
|
|
|
|
|
genre_guidance = "" |
|
if any(term in genre.lower() for term in ["rap", "hip-hop", "hip hop"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR RAP/HIP-HOP RHYTHMS:\n" |
|
genre_guidance += "- Use more syllables per beat for rapid-fire sections\n" |
|
genre_guidance += "- Create internal rhymes within lines, not just at line endings\n" |
|
genre_guidance += "- Emphasize the first beat of each bar with strong consonants\n" |
|
elif any(term in genre.lower() for term in ["electronic", "edm", "techno", "house", "dance"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ELECTRONIC MUSIC RHYTHMS:\n" |
|
genre_guidance += "- Use repetitive phrases that build and release tension\n" |
|
genre_guidance += "- Match syllables precisely to the beat grid\n" |
|
genre_guidance += "- Use short, percussive words on strong beats\n" |
|
elif any(term in genre.lower() for term in ["rock", "metal", "punk", "alternative"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR ROCK RHYTHMS:\n" |
|
genre_guidance += "- Use powerful, emotive words on downbeats\n" |
|
genre_guidance += "- Create contrast between verse and chorus energy levels\n" |
|
genre_guidance += "- Emphasize hooks with simple, memorable phrases\n" |
|
elif any(term in genre.lower() for term in ["folk", "country", "acoustic", "ballad"]): |
|
genre_guidance += "\nSPECIFIC GUIDANCE FOR FOLK/ACOUSTIC RHYTHMS:\n" |
|
genre_guidance += "- Focus on storytelling with clear narrative flow\n" |
|
genre_guidance += "- Use natural speech patterns that flow conversationally\n" |
|
genre_guidance += "- Place important words at the start of phrases\n" |
|
|
|
|
|
syllable_guidance += genre_guidance |
|
|
|
|
|
syllable_guidance_text = syllable_guidance |
|
|
|
|
|
if song_structure and "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
|
|
if "segments" in song_structure["flexible_structure"]: |
|
segments = song_structure["flexible_structure"]["segments"] |
|
if len(segments) > 4: |
|
use_sections = False |
|
|
|
|
|
if use_sections: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original {genre} song lyrics for a song that is {duration:.1f} seconds long. |
|
|
|
Music analysis has detected the following qualities in the music: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
Think step by step about how to match words to the rhythm pattern: |
|
1. First, identify the strong beats in each line pattern |
|
2. Choose words where stressed syllables naturally fall on strong beats |
|
3. Count syllables carefully to ensure they match the pattern precisely |
|
4. Test your line against the pattern by mapping each syllable |
|
|
|
IMPORTANT: Each line of lyrics must match exactly to ONE musical phrase/segment. |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Follow the structure patterns provided above |
|
- Be completely original |
|
- Match the song duration of {duration:.1f} seconds |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
else: |
|
|
|
content = f""" |
|
You are a talented songwriter who specializes in {genre} music. |
|
Write original lyrics that match the rhythm of a {genre} music segment that is {duration:.1f} seconds long. |
|
|
|
Music analysis has detected the following qualities: |
|
- Tempo: {tempo:.1f} BPM |
|
- Key: {key} {mode} |
|
- Primary emotion: {primary_emotion} |
|
- Primary theme: {primary_theme} |
|
|
|
{syllable_guidance} |
|
|
|
CRITICAL PRINCIPLES FOR RHYTHMIC ALIGNMENT: |
|
1. STRESSED syllables MUST fall on STRONG beats (marked with STRONG in the pattern) |
|
2. Natural word stress patterns must match the beat strength (strong words on strong beats) |
|
3. Line breaks should occur at phrase endings for natural breathing |
|
4. Consonant clusters should be avoided on fast notes and strong beats |
|
5. Open vowels (a, e, o) work better for sustained notes and syllables |
|
6. Pay attention to strength values in the pattern (higher values like 0.95 need stronger emphasis) |
|
7. For half-syllable positions (like S1.5 or m2.5), use short, quick syllables or words with weak vowels |
|
|
|
Think step by step about how to match words to the rhythm pattern: |
|
1. First, identify the strong beats in each line pattern |
|
2. Choose words where stressed syllables naturally fall on strong beats |
|
3. Count syllables carefully to ensure they match the pattern precisely |
|
4. Test your line against the pattern by mapping each syllable |
|
|
|
CRITICAL: Each line of lyrics must match exactly to ONE musical phrase/segment. |
|
|
|
For perfect alignment examples: |
|
- "FEEL the RHY-thm in your SOUL" – stressed syllables on strong beats |
|
- "to-DAY we DANCE a-LONG" – natural speech stress matches musical stress |
|
- "WAIT-ing FOR the SUN to RISE" – syllable emphasis aligns with beat emphasis |
|
|
|
The lyrics should: |
|
- Perfectly capture the essence and style of {genre} music |
|
- Express the {primary_emotion} emotion and {primary_theme} theme |
|
- Be completely original |
|
- Maintain a consistent theme throughout |
|
- Match the audio segment duration of {duration:.1f} seconds |
|
|
|
Include any section labels like [Verse] or [Chorus] as indicated in the rhythm patterns above. |
|
Each line of lyrics must follow the corresponding segment's rhythm pattern EXACTLY. |
|
|
|
IMPORTANT: Your generated lyrics must be followed by a section titled "[RHYTHM_ANALYSIS_SECTION]" |
|
where you analyze how well the lyrics align with the musical rhythm. This section MUST appear |
|
even if there are no rhythm issues. Include the following in your analysis: |
|
1. Syllable counts for each line and how they match the rhythm pattern |
|
2. Where stressed syllables align with strong beats |
|
3. Any potential misalignments or improvements |
|
|
|
Your lyrics: |
|
""" |
|
|
|
|
|
messages = [ |
|
{"role": "user", "content": content} |
|
] |
|
|
|
|
|
text = llm_tokenizer.apply_chat_template( |
|
messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
|
|
model_inputs = llm_tokenizer([text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
generation_params = { |
|
"do_sample": True, |
|
"temperature": 0.6, |
|
"top_p": 0.95, |
|
"top_k": 50, |
|
"repetition_penalty": 1.2, |
|
"max_new_tokens": 2048 |
|
} |
|
|
|
|
|
generated_ids = llm_model.generate( |
|
**model_inputs, |
|
**generation_params |
|
) |
|
|
|
|
|
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() |
|
|
|
|
|
lyrics = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
if "<thinking>" in lyrics and "</thinking>" in lyrics: |
|
lyrics = lyrics.split("</thinking>")[1].strip() |
|
|
|
|
|
thinking_markers = ["<think>", "</think>", "[thinking]", "[/thinking]", "I'll think step by step:"] |
|
for marker in thinking_markers: |
|
if marker in lyrics: |
|
parts = lyrics.split(marker) |
|
if len(parts) > 1: |
|
lyrics = parts[-1].strip() |
|
|
|
|
|
if templates_for_verification: |
|
verified_lyrics = verify_flexible_syllable_counts(lyrics, templates_for_verification) |
|
|
|
|
|
if "[Note: Potential rhythm mismatches" in verified_lyrics and "Detailed Alignment Analysis" in verified_lyrics: |
|
|
|
original_lyrics = lyrics.split("[Note:")[0].strip() |
|
|
|
|
|
analysis = verified_lyrics.split("[Note:")[1] |
|
|
|
|
|
if "stress misalignments" in analysis and len(templates_for_verification) > 0: |
|
|
|
refinement_prompt = f""" |
|
You need to fix rhythm issues in these lyrics. Here's the analysis of the problems: |
|
|
|
{analysis} |
|
|
|
Revise the lyrics to perfectly match the rhythm pattern while maintaining the theme. |
|
Focus on fixing the stress misalignments by placing stressed syllables on STRONG beats. |
|
|
|
Original lyrics: |
|
{original_lyrics} |
|
|
|
Improved lyrics with fixed rhythm: |
|
""" |
|
|
|
refinement_messages = [ |
|
{"role": "user", "content": refinement_prompt} |
|
] |
|
|
|
|
|
refinement_text = llm_tokenizer.apply_chat_template( |
|
refinement_messages, |
|
tokenize=False, |
|
add_generation_prompt=True |
|
) |
|
|
|
try: |
|
|
|
refinement_inputs = llm_tokenizer([refinement_text], return_tensors="pt").to(llm_model.device) |
|
|
|
|
|
refinement_params = { |
|
"do_sample": True, |
|
"temperature": 0.4, |
|
"top_p": 0.9, |
|
"repetition_penalty": 1.3, |
|
"max_new_tokens": 1024 |
|
} |
|
|
|
refined_ids = llm_model.generate( |
|
**refinement_inputs, |
|
**refinement_params |
|
) |
|
|
|
|
|
refined_output_ids = refined_ids[0][len(refinement_inputs.input_ids[0]):].tolist() |
|
refined_lyrics = llm_tokenizer.decode(refined_output_ids, skip_special_tokens=True).strip() |
|
|
|
|
|
refined_verified_lyrics = verify_flexible_syllable_counts(refined_lyrics, templates_for_verification) |
|
|
|
|
|
if "[Note: Potential rhythm mismatches" not in refined_verified_lyrics: |
|
lyrics = refined_lyrics |
|
elif refined_verified_lyrics.count("misalignments") < verified_lyrics.count("misalignments"): |
|
lyrics = refined_verified_lyrics |
|
else: |
|
lyrics = verified_lyrics |
|
except Exception as e: |
|
print(f"Error in lyrics refinement: {str(e)}") |
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
else: |
|
|
|
lyrics = verified_lyrics |
|
|
|
|
|
if "[RHYTHM_ANALYSIS_SECTION]" in lyrics: |
|
|
|
parts = lyrics.split("[RHYTHM_ANALYSIS_SECTION]") |
|
clean_lyrics = parts[0].strip() |
|
rhythm_analysis = parts[1].strip() |
|
|
|
|
|
lyrics = clean_lyrics + "\n\n[Note: Rhythm Analysis]\n" + rhythm_analysis |
|
|
|
|
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
|
|
pass |
|
else: |
|
|
|
lyrics = lyrics + "\n\n[Note: Rhythm Analysis]\nNo rhythm issues detected. All syllables align well with the beat pattern." |
|
|
|
|
|
if isinstance(lyrics, str): |
|
|
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] |
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] |
|
else: |
|
clean_lyrics = lyrics |
|
rhythm_analysis = "No rhythm analysis available" |
|
|
|
|
|
syllable_analysis = "=== SYLLABLE ANALYSIS ===\n\n" |
|
if templates_for_verification: |
|
syllable_analysis += "Template Analysis:\n" |
|
for i, template in enumerate(templates_for_verification): |
|
if i < min(len(templates_for_verification), 30): |
|
syllable_analysis += f"Line {i+1}:\n" |
|
if isinstance(template, dict): |
|
if "syllable_template" in template: |
|
syllable_analysis += f" Template: {template['syllable_template']}\n" |
|
if "syllable_count" in template: |
|
syllable_analysis += f" Expected syllables: {template['syllable_count']}\n" |
|
elif isinstance(template, str): |
|
syllable_analysis += f" Template: {template}\n" |
|
syllable_analysis += "\n" |
|
|
|
if len(templates_for_verification) > 30: |
|
syllable_analysis += f"... and {len(templates_for_verification) - 30} more lines\n\n" |
|
|
|
|
|
syllable_analysis += "\n" + structure_visualization |
|
|
|
|
|
prompt_template = "=== PROMPT TEMPLATE ===\n\n" |
|
prompt_template += "Genre: " + genre + "\n" |
|
prompt_template += f"Duration: {duration:.1f} seconds\n" |
|
prompt_template += f"Tempo: {tempo:.1f} BPM\n" |
|
prompt_template += f"Key: {key} {mode}\n" |
|
prompt_template += f"Primary Emotion: {primary_emotion}\n" |
|
prompt_template += f"Primary Theme: {primary_theme}\n\n" |
|
prompt_template += "Syllable Guidance:\n" + syllable_guidance_text |
|
|
|
|
|
return { |
|
"lyrics": clean_lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template |
|
} |
|
|
|
return lyrics |
|
|
|
def process_audio(audio_file): |
|
"""Main function to process audio file, classify genre, and generate lyrics with enhanced rhythm analysis.""" |
|
if audio_file is None: |
|
return "Please upload an audio file.", None, None |
|
|
|
try: |
|
print("Step 1/5: Extracting audio features...") |
|
|
|
audio_data = extract_audio_features(audio_file) |
|
|
|
print("Step 2/5: Verifying audio contains music...") |
|
|
|
try: |
|
is_music, ast_results = detect_music(audio_data) |
|
except Exception as e: |
|
print(f"Error in music detection: {str(e)}") |
|
return f"Error in music detection: {str(e)}", None, ast_results |
|
|
|
if not is_music: |
|
return "The uploaded audio does not appear to be music. Please upload a music file.", None, ast_results |
|
|
|
print("Step 3/5: Classifying music genre...") |
|
|
|
try: |
|
top_genres = classify_genre(audio_data) |
|
|
|
genre_results = format_genre_results(top_genres) |
|
except Exception as e: |
|
print(f"Error in genre classification: {str(e)}") |
|
return f"Error in genre classification: {str(e)}", None, ast_results |
|
|
|
print("Step 4/5: Analyzing music emotions, themes, and structure...") |
|
|
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
|
|
emotion_results = { |
|
"emotion_analysis": {"primary_emotion": "Unknown"}, |
|
"theme_analysis": {"primary_theme": "Unknown"}, |
|
"rhythm_analysis": {"tempo": 0}, |
|
"tonal_analysis": {"key": "Unknown", "mode": ""}, |
|
"summary": {"tempo": 0, "key": "Unknown", "mode": "", "primary_emotion": "Unknown", "primary_theme": "Unknown"} |
|
} |
|
|
|
|
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
|
|
beats_info = detect_beats(y, sr) |
|
sections_info = detect_sections(y, sr) |
|
|
|
|
|
segments = [] |
|
|
|
|
|
|
|
if sections_info and len(sections_info) > 1: |
|
min_segment_duration = 1.5 |
|
|
|
for section in sections_info: |
|
section_start = section["start"] |
|
section_end = section["end"] |
|
section_duration = section["duration"] |
|
|
|
|
|
if section_duration < min_segment_duration * 1.5: |
|
segments.append({ |
|
"start": section_start, |
|
"end": section_end |
|
}) |
|
else: |
|
|
|
|
|
ideal_segment_duration = 3.0 |
|
segment_count = max(1, int(section_duration / ideal_segment_duration)) |
|
|
|
|
|
segment_duration = section_duration / segment_count |
|
for i in range(segment_count): |
|
segment_start = section_start + i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end |
|
}) |
|
|
|
elif beats_info and len(beats_info["beat_times"]) > 4: |
|
beats = beats_info["beat_times"] |
|
time_signature = beats_info.get("time_signature", 4) |
|
|
|
|
|
measure_size = time_signature |
|
for i in range(0, len(beats), measure_size): |
|
if i + 1 < len(beats): |
|
measure_start = beats[i] |
|
|
|
if i + measure_size < len(beats): |
|
measure_end = beats[i + measure_size] |
|
else: |
|
|
|
if i > 0: |
|
beat_interval = beats[i] - beats[i-1] |
|
measure_end = beats[-1] + (beat_interval * (measure_size - (len(beats) - i))) |
|
else: |
|
measure_end = audio_data["duration"] |
|
|
|
segments.append({ |
|
"start": measure_start, |
|
"end": measure_end |
|
}) |
|
|
|
else: |
|
|
|
segment_duration = 3.0 |
|
total_segments = max(4, int(audio_data["duration"] / segment_duration)) |
|
segment_duration = audio_data["duration"] / total_segments |
|
|
|
for i in range(total_segments): |
|
segment_start = i * segment_duration |
|
segment_end = segment_start + segment_duration |
|
segments.append({ |
|
"start": segment_start, |
|
"end": segment_end |
|
}) |
|
|
|
|
|
flexible_structure = { |
|
"beats": beats_info, |
|
"segments": segments |
|
} |
|
|
|
|
|
song_structure = { |
|
"beats": beats_info, |
|
"sections": sections_info, |
|
"flexible_structure": flexible_structure |
|
} |
|
|
|
|
|
song_structure["syllables"] = [] |
|
for section in sections_info: |
|
|
|
section_beats_info = { |
|
"beat_times": [beat for beat in beats_info["beat_times"] |
|
if section["start"] <= beat < section["end"]], |
|
"tempo": beats_info.get("tempo", 120) |
|
} |
|
if "beat_strengths" in beats_info: |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(beats_info["beat_strengths"]) |
|
if i < len(beats_info["beat_times"]) and |
|
section["start"] <= beats_info["beat_times"][i] < section["end"] |
|
] |
|
|
|
|
|
syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) |
|
|
|
section_info = { |
|
"type": section["type"], |
|
"start": section["start"], |
|
"end": section["end"], |
|
"duration": section["duration"], |
|
"syllable_count": syllable_count, |
|
"beat_count": len(section_beats_info["beat_times"]) |
|
} |
|
|
|
|
|
if len(section_beats_info["beat_times"]) >= 2: |
|
section_info["syllable_template"] = create_flexible_syllable_templates( |
|
section_beats_info, |
|
genre=top_genres[0][0] |
|
) |
|
|
|
song_structure["syllables"].append(section_info) |
|
|
|
print(f"Successfully analyzed song structure with {len(segments)} segments") |
|
|
|
except Exception as e: |
|
print(f"Error analyzing song structure: {str(e)}") |
|
|
|
song_structure = None |
|
|
|
print("Step 5/5: Generating rhythmically aligned lyrics...") |
|
|
|
try: |
|
primary_genre, _ = top_genres[0] |
|
lyrics_result = generate_lyrics(primary_genre, audio_data["duration"], emotion_results, song_structure) |
|
|
|
|
|
if isinstance(lyrics_result, dict): |
|
lyrics = lyrics_result["lyrics"] |
|
rhythm_analysis = lyrics_result["rhythm_analysis"] |
|
syllable_analysis = lyrics_result["syllable_analysis"] |
|
prompt_template = lyrics_result["prompt_template"] |
|
else: |
|
lyrics = lyrics_result |
|
rhythm_analysis = "No detailed rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
except Exception as e: |
|
print(f"Error generating lyrics: {str(e)}") |
|
lyrics = f"Error generating lyrics: {str(e)}" |
|
rhythm_analysis = "No rhythm analysis available" |
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
|
|
results = { |
|
"genre_results": genre_results, |
|
"lyrics": lyrics, |
|
"rhythm_analysis": rhythm_analysis, |
|
"syllable_analysis": syllable_analysis, |
|
"prompt_template": prompt_template, |
|
"ast_results": ast_results |
|
} |
|
|
|
return results |
|
|
|
except Exception as e: |
|
error_msg = f"Error processing audio: {str(e)}" |
|
print(error_msg) |
|
return error_msg, None, [] |
|
|
|
def format_beat_timeline(audio_file, lyrics=None): |
|
"""Creates a formatted timeline showing beat timings and their syllable patterns""" |
|
if audio_file is None: |
|
return "Please upload an audio file to see beat timeline." |
|
|
|
try: |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
|
|
|
|
beats_info = detect_beats(y, sr) |
|
|
|
|
|
timeline = "=== BEAT & SYLLABLE TIMELINE ===\n\n" |
|
|
|
tempo = float(beats_info['tempo']) if isinstance(beats_info['tempo'], np.ndarray) else beats_info['tempo'] |
|
timeline += f"Tempo: {tempo:.1f} BPM\n" |
|
timeline += f"Time Signature: {beats_info['time_signature']}/4\n" |
|
timeline += f"Total Beats: {beats_info['beat_count']}\n\n" |
|
|
|
|
|
timeline += "| Beat # | Time (s) | Beat Strength | Syllable Pattern |\n" |
|
timeline += "|--------|----------|--------------|------------------|\n" |
|
|
|
|
|
for i, (time, strength) in enumerate(zip(beats_info['beat_times'], beats_info['beat_strengths'])): |
|
|
|
time = float(time) if isinstance(time, np.ndarray) else time |
|
strength = float(strength) if isinstance(strength, np.ndarray) else strength |
|
|
|
|
|
if strength >= 0.8: |
|
beat_type = "STRONG" |
|
elif strength >= 0.5: |
|
beat_type = "medium" |
|
else: |
|
beat_type = "weak" |
|
|
|
|
|
if i % beats_info['time_signature'] == 0: |
|
pattern = "S" |
|
elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3: |
|
pattern = "m" |
|
else: |
|
pattern = "w" |
|
|
|
|
|
timeline += f"| {i+1:<6} | {time:.2f}s | {beat_type:<12} | {pattern}:{1.5 if pattern=='S' else 1.0} |\n" |
|
|
|
|
|
if i >= 29: |
|
timeline += f"... and {beats_info['beat_count'] - 30} more beats ...\n" |
|
break |
|
|
|
|
|
timeline += "\n=== VISUAL BEAT TIMELINE ===\n\n" |
|
timeline += "Each character represents 0.5 seconds. Beats are marked as:\n" |
|
timeline += "S = Strong beat | m = Medium beat | w = Weak beat | · = No beat\n\n" |
|
|
|
|
|
if beats_info['beat_times']: |
|
|
|
max_beat_time = float(max(beats_info['beat_times'])) if isinstance(max(beats_info['beat_times']), np.ndarray) else max(beats_info['beat_times']) |
|
total_duration = max_beat_time + 2 |
|
else: |
|
total_duration = 30 |
|
|
|
time_markers = "" |
|
for i in range(0, int(total_duration) + 1, 5): |
|
time_markers += f"{i:<5}" |
|
timeline += time_markers + " (seconds)\n" |
|
|
|
|
|
ruler = "" |
|
for i in range(0, int(total_duration) + 1): |
|
if i % 5 == 0: |
|
ruler += "+" |
|
else: |
|
ruler += "-" |
|
ruler += "-" * 9 |
|
timeline += ruler + "\n" |
|
|
|
|
|
beat_line = ["·"] * int(total_duration * 2) |
|
|
|
for i, time in enumerate(beats_info['beat_times']): |
|
if i >= len(beats_info['beat_strengths']): |
|
break |
|
|
|
|
|
time_val = float(time) if isinstance(time, np.ndarray) else time |
|
|
|
|
|
pos = int(time_val * 2) |
|
if pos >= len(beat_line): |
|
continue |
|
|
|
|
|
strength = beats_info['beat_strengths'][i] |
|
|
|
strength = float(strength) if isinstance(strength, np.ndarray) else strength |
|
|
|
if i % beats_info['time_signature'] == 0: |
|
beat_line[pos] = "S" |
|
elif strength >= 0.8: |
|
beat_line[pos] = "S" |
|
elif i % beats_info['time_signature'] == beats_info['time_signature'] // 2 and beats_info['time_signature'] > 3: |
|
beat_line[pos] = "m" |
|
elif strength >= 0.5: |
|
beat_line[pos] = "m" |
|
else: |
|
beat_line[pos] = "w" |
|
|
|
|
|
beat_visualization = "" |
|
for i in range(0, len(beat_line), 10): |
|
beat_visualization += "".join(beat_line[i:i+10]) |
|
if i + 10 < len(beat_line): |
|
beat_visualization += " " |
|
timeline += beat_visualization + "\n\n" |
|
|
|
|
|
timeline += "=== MEASURE MARKERS ===\n\n" |
|
|
|
|
|
measure_starts = [] |
|
for i, time in enumerate(beats_info['beat_times']): |
|
if i % beats_info['time_signature'] == 0: |
|
|
|
time_val = float(time) if isinstance(time, np.ndarray) else time |
|
measure_starts.append((i // beats_info['time_signature'] + 1, time_val)) |
|
|
|
|
|
if measure_starts: |
|
timeline += "| Measure # | Start Time | Duration |\n" |
|
timeline += "|-----------|------------|----------|\n" |
|
|
|
for i in range(len(measure_starts)): |
|
measure_num, start_time = measure_starts[i] |
|
|
|
|
|
if i < len(measure_starts) - 1: |
|
end_time = measure_starts[i+1][1] |
|
elif beats_info['beat_times']: |
|
|
|
last_beat = beats_info['beat_times'][-1] |
|
end_time = float(last_beat) if isinstance(last_beat, np.ndarray) else last_beat |
|
else: |
|
end_time = start_time + 2.0 |
|
|
|
duration = end_time - start_time |
|
|
|
timeline += f"| {measure_num:<9} | {start_time:.2f}s | {duration:.2f}s |\n" |
|
|
|
|
|
if i >= 9: |
|
timeline += f"... and {len(measure_starts) - 10} more measures ...\n" |
|
break |
|
|
|
|
|
if beats_info['phrases']: |
|
timeline += "\n=== MUSICAL PHRASES ===\n\n" |
|
for i, phrase in enumerate(beats_info['phrases']): |
|
if i < 10: |
|
if not phrase: |
|
continue |
|
|
|
start_beat = phrase[0] |
|
end_beat = phrase[-1] |
|
if start_beat >= len(beats_info['beat_times']) or end_beat >= len(beats_info['beat_times']): |
|
continue |
|
|
|
|
|
phrase_start = beats_info['beat_times'][start_beat] |
|
phrase_start = float(phrase_start) if isinstance(phrase_start, np.ndarray) else phrase_start |
|
|
|
phrase_end = beats_info['beat_times'][end_beat] |
|
phrase_end = float(phrase_end) if isinstance(phrase_end, np.ndarray) else phrase_end |
|
|
|
timeline += f"Phrase {i+1}: Beats {start_beat+1}-{end_beat+1} ({phrase_start:.2f}s - {phrase_end:.2f}s)\n" |
|
|
|
|
|
phrase_beats = { |
|
"beat_times": [float(beats_info['beat_times'][j]) if isinstance(beats_info['beat_times'][j], np.ndarray) |
|
else beats_info['beat_times'][j] |
|
for j in phrase if j < len(beats_info['beat_times'])], |
|
"beat_strengths": [float(beats_info['beat_strengths'][j]) if isinstance(beats_info['beat_strengths'][j], np.ndarray) |
|
else beats_info['beat_strengths'][j] |
|
for j in phrase if j < len(beats_info['beat_strengths'])], |
|
"tempo": float(beats_info['tempo']) if isinstance(beats_info['tempo'], np.ndarray) else beats_info['tempo'], |
|
"time_signature": beats_info['time_signature'], |
|
"phrases": [list(range(len(phrase)))] |
|
} |
|
|
|
template = create_flexible_syllable_templates(phrase_beats) |
|
timeline += f" Syllable Template: {template}\n" |
|
|
|
|
|
if phrase_start < total_duration and phrase_end < total_duration: |
|
|
|
phrase_visualization = ["·"] * int(total_duration * 2) |
|
|
|
|
|
start_pos = int(phrase_start * 2) |
|
end_pos = int(phrase_end * 2) |
|
|
|
if start_pos < len(phrase_visualization): |
|
phrase_visualization[start_pos] = "[" |
|
|
|
if end_pos < len(phrase_visualization): |
|
phrase_visualization[end_pos] = "]" |
|
|
|
|
|
for j in phrase: |
|
if j < len(beats_info['beat_times']): |
|
beat_time = beats_info['beat_times'][j] |
|
beat_time = float(beat_time) if isinstance(beat_time, np.ndarray) else beat_time |
|
beat_pos = int(beat_time * 2) |
|
|
|
if beat_pos < len(phrase_visualization) and beat_pos != start_pos and beat_pos != end_pos: |
|
|
|
if j % beats_info['time_signature'] == 0: |
|
phrase_visualization[beat_pos] = "S" |
|
elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2: |
|
phrase_visualization[beat_pos] = "m" |
|
else: |
|
phrase_visualization[beat_pos] = "w" |
|
|
|
|
|
phrase_visual = "" |
|
for k in range(0, len(phrase_visualization), 10): |
|
phrase_visual += "".join(phrase_visualization[k:k+10]) |
|
if k + 10 < len(phrase_visualization): |
|
phrase_visual += " " |
|
|
|
timeline += f" Timeline: {phrase_visual}\n\n" |
|
|
|
if len(beats_info['phrases']) > 10: |
|
timeline += f"... and {len(beats_info['phrases']) - 10} more phrases ...\n" |
|
|
|
|
|
if lyrics and isinstance(lyrics, str): |
|
timeline += "\n=== LYRICS-BEAT ALIGNMENT ===\n\n" |
|
|
|
if "[Note:" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
else: |
|
clean_lyrics = lyrics |
|
|
|
lines = clean_lyrics.strip().split('\n') |
|
|
|
|
|
for i, line in enumerate(lines[:10]): |
|
if not line.strip() or line.startswith('['): |
|
continue |
|
|
|
timeline += f"Line: \"{line}\"\n" |
|
|
|
|
|
syllable_count = count_syllables(line) |
|
timeline += f" Syllables: {syllable_count}\n" |
|
|
|
|
|
if beats_info['phrases'] and i < len(beats_info['phrases']): |
|
phrase = beats_info['phrases'][i] |
|
if phrase and phrase[0] < len(beats_info['beat_times']) and phrase[-1] < len(beats_info['beat_times']): |
|
start_beat = phrase[0] |
|
end_beat = phrase[-1] |
|
|
|
start_time = beats_info['beat_times'][start_beat] |
|
start_time = float(start_time) if isinstance(start_time, np.ndarray) else start_time |
|
|
|
end_time = beats_info['beat_times'][end_beat] |
|
end_time = float(end_time) if isinstance(end_time, np.ndarray) else end_time |
|
|
|
timeline += f" Timing: {start_time:.2f}s - {end_time:.2f}s\n" |
|
|
|
|
|
timeline += " Alignment: " |
|
|
|
|
|
phrase_duration = end_time - start_time |
|
syllable_viz = [] |
|
|
|
|
|
for j in phrase: |
|
if j < len(beats_info['beat_times']): |
|
beat_time = beats_info['beat_times'][j] |
|
beat_time = float(beat_time) if isinstance(beat_time, np.ndarray) else beat_time |
|
relative_pos = int((beat_time - start_time) / phrase_duration * syllable_count) |
|
|
|
while len(syllable_viz) <= relative_pos: |
|
syllable_viz.append("·") |
|
|
|
if j % beats_info['time_signature'] == 0: |
|
syllable_viz[relative_pos] = "S" |
|
elif j % beats_info['time_signature'] == beats_info['time_signature'] // 2: |
|
syllable_viz[relative_pos] = "m" |
|
else: |
|
syllable_viz[relative_pos] = "w" |
|
|
|
|
|
while len(syllable_viz) < syllable_count: |
|
syllable_viz.append("·") |
|
|
|
|
|
syllable_viz = syllable_viz[:syllable_count] |
|
|
|
|
|
timeline += "".join(syllable_viz) + "\n" |
|
|
|
timeline += "\n" |
|
|
|
if len(lines) > 10: |
|
timeline += f"... and {len(lines) - 10} more lines ...\n" |
|
|
|
return timeline |
|
|
|
except Exception as e: |
|
print(f"Error generating beat timeline: {str(e)}") |
|
return f"Error generating beat timeline: {str(e)}" |
|
|
|
|
|
with gr.Blocks(title="Music Genre Classifier & Lyrics Generator") as demo: |
|
gr.Markdown("# Music Genre Classifier & Lyrics Generator") |
|
gr.Markdown("Upload a music file to classify its genre, analyze its emotions, and generate perfectly aligned lyrics.") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
audio_input = gr.Audio(label="Upload Music", type="filepath") |
|
submit_btn = gr.Button("Analyze & Generate", variant="primary") |
|
|
|
|
|
with gr.Accordion("About Music Genres", open=False): |
|
gr.Markdown(""" |
|
The system recognizes various music genres including: |
|
- Pop, Rock, Hip-Hop, R&B |
|
- Electronic, Dance, Techno, House |
|
- Jazz, Blues, Classical |
|
- Folk, Country, Acoustic |
|
- Metal, Punk, Alternative |
|
- And many others! |
|
|
|
For best results, use high-quality audio files (MP3, WAV, FLAC) with at least 10 seconds of music. |
|
""") |
|
|
|
with gr.Column(scale=2): |
|
|
|
with gr.Tabs(): |
|
with gr.TabItem("Analysis Results"): |
|
genre_output = gr.Textbox(label="Detected Genres", lines=4) |
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
emotion_output = gr.Textbox(label="Emotion & Structure Analysis", lines=8) |
|
with gr.Column(): |
|
ast_output = gr.Textbox(label="Audio Classification", lines=8) |
|
|
|
with gr.TabItem("Generated Lyrics"): |
|
lyrics_output = gr.Textbox(label="Lyrics", lines=18) |
|
|
|
with gr.TabItem("Rhythm Analysis"): |
|
rhythm_analysis_output = gr.Textbox(label="Syllable-Beat Alignment Analysis", lines=16) |
|
|
|
with gr.TabItem("Syllable Analysis"): |
|
syllable_analysis_output = gr.Textbox(label="Detailed Syllable Analysis", lines=16) |
|
prompt_template_output = gr.Textbox(label="Prompt Template", lines=16) |
|
|
|
with gr.TabItem("Beat & Syllable Timeline"): |
|
beat_timeline_output = gr.Textbox(label="Beat Timings & Syllable Patterns", lines=16) |
|
|
|
|
|
def display_results(audio_file): |
|
if audio_file is None: |
|
return "Please upload an audio file.", "No emotion analysis available.", "No audio classification available.", "No lyrics generated.", "No rhythm analysis available.", "No syllable analysis available.", "No prompt template available.", "No beat timeline available." |
|
|
|
try: |
|
|
|
results = process_audio(audio_file) |
|
|
|
|
|
if isinstance(results, str) and "Error" in results: |
|
return results, "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available", "No beat timeline available" |
|
elif isinstance(results, tuple) and isinstance(results[0], str) and "Error" in results[0]: |
|
return results[0], "Error in analysis", "Error in classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available", "No beat timeline available" |
|
|
|
|
|
if isinstance(results, dict): |
|
genre_results = results.get("genre_results", "Genre classification failed") |
|
lyrics = results.get("lyrics", "Lyrics generation failed") |
|
ast_results = results.get("ast_results", []) |
|
|
|
|
|
clean_lyrics = results.get("clean_lyrics", lyrics) |
|
rhythm_analysis = results.get("rhythm_analysis", "No detailed rhythm analysis available") |
|
|
|
|
|
syllable_analysis = results.get("syllable_analysis", "No syllable analysis available") |
|
prompt_template = results.get("prompt_template", "No prompt template available") |
|
else: |
|
|
|
genre_results, lyrics, ast_results = results |
|
clean_lyrics = lyrics |
|
|
|
|
|
rhythm_analysis = "No detailed rhythm analysis available" |
|
if isinstance(lyrics, str): |
|
|
|
if "[Note: Rhythm Analysis]" in lyrics: |
|
clean_lyrics = lyrics.split("[Note: Rhythm Analysis]")[0].strip() |
|
rhythm_analysis = lyrics.split("[Note: Rhythm Analysis]")[1] |
|
|
|
elif "[Note: Potential rhythm mismatches" in lyrics: |
|
clean_lyrics = lyrics.split("[Note:")[0].strip() |
|
rhythm_analysis = "[Note:" + lyrics.split("[Note:")[1] |
|
|
|
|
|
syllable_analysis = "No syllable analysis available" |
|
prompt_template = "No prompt template available" |
|
|
|
|
|
beat_timeline = format_beat_timeline(audio_file, clean_lyrics) |
|
|
|
|
|
try: |
|
emotion_results = music_analyzer.analyze_music(audio_file) |
|
emotion_text = f"Tempo: {emotion_results['summary']['tempo']:.1f} BPM\n" |
|
emotion_text += f"Key: {emotion_results['summary']['key']} {emotion_results['summary']['mode']}\n" |
|
emotion_text += f"Primary Emotion: {emotion_results['summary']['primary_emotion']}\n" |
|
emotion_text += f"Primary Theme: {emotion_results['summary']['primary_theme']}" |
|
|
|
|
|
try: |
|
audio_data = extract_audio_features(audio_file) |
|
|
|
y, sr = load_audio(audio_file, SAMPLE_RATE) |
|
beats_info = detect_beats(y, sr) |
|
sections_info = detect_sections(y, sr) |
|
|
|
|
|
song_structure = { |
|
"beats": beats_info, |
|
"sections": sections_info, |
|
"syllables": [] |
|
} |
|
|
|
|
|
for section in sections_info: |
|
|
|
section_beats_info = { |
|
"beat_times": [beat for beat in beats_info["beat_times"] |
|
if section["start"] <= beat < section["end"]], |
|
"tempo": beats_info.get("tempo", 120) |
|
} |
|
if "beat_strengths" in beats_info: |
|
section_beats_info["beat_strengths"] = [ |
|
strength for i, strength in enumerate(beats_info["beat_strengths"]) |
|
if i < len(beats_info["beat_times"]) and |
|
section["start"] <= beats_info["beat_times"][i] < section["end"] |
|
] |
|
|
|
|
|
syllable_count = int(section["duration"] * (beats_info.get("tempo", 120) / 60) * 1.5) |
|
|
|
section_info = { |
|
"type": section["type"], |
|
"start": section["start"], |
|
"end": section["end"], |
|
"duration": section["duration"], |
|
"syllable_count": syllable_count, |
|
"beat_count": len(section_beats_info["beat_times"]) |
|
} |
|
|
|
|
|
if len(section_beats_info["beat_times"]) >= 2: |
|
section_info["syllable_template"] = create_flexible_syllable_templates( |
|
section_beats_info |
|
) |
|
|
|
song_structure["syllables"].append(section_info) |
|
|
|
emotion_text += "\n\nSong Structure:\n" |
|
for section in song_structure["syllables"]: |
|
emotion_text += f"- {section['type'].capitalize()}: {section['start']:.1f}s to {section['end']:.1f}s " |
|
emotion_text += f"({section['duration']:.1f}s, {section['beat_count']} beats, " |
|
|
|
if "syllable_template" in section: |
|
emotion_text += f"template: {section['syllable_template']})\n" |
|
else: |
|
emotion_text += f"~{section['syllable_count']} syllables)\n" |
|
|
|
|
|
if "flexible_structure" in song_structure and song_structure["flexible_structure"]: |
|
flexible = song_structure["flexible_structure"] |
|
if "segments" in flexible and flexible["segments"]: |
|
emotion_text += "\nDetailed Rhythm Analysis:\n" |
|
for i, segment in enumerate(flexible["segments"][:5]): |
|
emotion_text += f"- Segment {i+1}: {segment['start']:.1f}s to {segment['end']:.1f}s, " |
|
emotion_text += f"pattern: {segment.get('syllable_template', 'N/A')}\n" |
|
|
|
if len(flexible["segments"]) > 5: |
|
emotion_text += f" (+ {len(flexible['segments']) - 5} more segments)\n" |
|
|
|
except Exception as e: |
|
print(f"Error displaying song structure: {str(e)}") |
|
|
|
|
|
except Exception as e: |
|
print(f"Error in emotion analysis: {str(e)}") |
|
emotion_text = f"Error in emotion analysis: {str(e)}" |
|
|
|
|
|
if ast_results and isinstance(ast_results, list): |
|
ast_text = "Audio Classification Results:\n" |
|
for result in ast_results[:5]: |
|
ast_text += f"{result['label']}: {result['score']*100:.2f}%\n" |
|
else: |
|
ast_text = "No valid audio classification results available." |
|
|
|
|
|
return genre_results, emotion_text, ast_text, clean_lyrics, rhythm_analysis, syllable_analysis, prompt_template, beat_timeline |
|
|
|
except Exception as e: |
|
error_msg = f"Error: {str(e)}" |
|
print(error_msg) |
|
return error_msg, "Error in emotion analysis", "Error in audio classification", "No lyrics generated", "No rhythm analysis available", "No syllable analysis available", "No prompt template available", "No beat timeline available" |
|
|
|
|
|
submit_btn.click( |
|
fn=display_results, |
|
inputs=[audio_input], |
|
outputs=[genre_output, emotion_output, ast_output, lyrics_output, rhythm_analysis_output, syllable_analysis_output, prompt_template_output, beat_timeline_output] |
|
) |
|
|
|
|
|
with gr.Accordion("How it works", open=False): |
|
gr.Markdown(""" |
|
## Advanced Lyrics Generation Process |
|
|
|
1. **Audio Analysis**: The system analyzes your uploaded music file using multiple machine learning models. |
|
|
|
2. **Genre Classification**: A specialized neural network identifies the musical genre, detecting subtle patterns in the audio. |
|
|
|
3. **Emotional Analysis**: The system examines harmonic, rhythmic, and timbral features to determine the emotional qualities of the music. |
|
|
|
4. **Rhythm Mapping**: Advanced beat detection algorithms create a detailed rhythmic map of the music, identifying: |
|
- Strong and weak beats |
|
- Natural phrase boundaries |
|
- Time signature and tempo variations |
|
|
|
5. **Syllable Template Creation**: For each musical phrase, the system generates precise syllable templates that reflect: |
|
- Beat stress patterns (strong, medium, weak) |
|
- Appropriate syllable counts based on tempo |
|
- Genre-specific rhythmic qualities |
|
|
|
6. **Lyrics Generation**: Using the detected genre, emotion, and rhythm patterns, a large language model generates lyrics that: |
|
- Match the emotional quality of the music |
|
- Follow the precise syllable templates |
|
- Align stressed syllables with strong beats |
|
- Maintain genre-appropriate style and themes |
|
|
|
7. **Rhythm Verification**: The system verifies the generated lyrics, analyzing: |
|
- Syllable count accuracy |
|
- Stress alignment with strong beats |
|
- Word stress patterns |
|
|
|
8. **Refinement**: If significant rhythm mismatches are detected, the system can automatically refine the lyrics for better alignment. |
|
|
|
This multi-step process creates lyrics that feel naturally connected to the music, as if they were written specifically for it. |
|
""") |
|
|
|
|
|
demo.launch() |