Spaces:

jacob-c
/

syllables_matching_experiment

Paused

syllables_matching_experiment / beat_analysis.py

root

8515dc5 about 2 months ago

17.4 kB

	import librosa
	import numpy as np
	import pronouncing
	import re
	from functools import lru_cache
	import string
	from nltk.corpus import cmudict
	import nltk

	# Make sure the CMU Pronouncing Dictionary corpus is available locally.
	# nltk.data.find raises LookupError when the corpus is missing, in which
	# case it is downloaded once so cmudict.dict() can be used later.
	try:
	    nltk.data.find('corpora/cmudict')
	except LookupError:
	    nltk.download('cmudict')

	class BeatAnalyzer:
	    """Analyze the beat structure of audio and score lyrics against it.

	    The analyzer groups detected beats into bars ("phrases") according to a
	    time signature, labels every beat as Strong/Medium/Weak, builds textual
	    guides for lyric writing, and checks how well a line of lyrics fits a
	    phrase in terms of syllable count and stress placement.
	    """

	    # Letter used in a phrase's stress pattern for each numeric beat weight.
	    _STRESS_LABELS = {1.0: "S", 0.5: "M", 0.0: "W"}

	    # Word shown in phrase guides for each stress letter (others are "weak").
	    _STRESS_WORDS = {"S": "STRONG", "M": "medium"}

	    def __init__(self):
	        # Standard stress weights per time signature.
	        # Strong (1.0), Medium (0.5), Weak (0.0)
	        self.stress_patterns = {
	            "4/4": [1.0, 0.0, 0.5, 0.0],  # Strong, weak, medium, weak
	            "3/4": [1.0, 0.0, 0.0],  # Strong, weak, weak
	            "2/4": [1.0, 0.0],  # Strong, weak
	            "6/8": [1.0, 0.0, 0.0, 0.5, 0.0, 0.0],  # Strong, weak, weak, medium, weak, weak
	        }

	        self.cmudict = None
	        try:
	            self.cmudict = cmudict.dict()
	        except Exception:
	            # Best effort: without the corpus we fall back to rule-based
	            # syllable counting instead of failing at construction time.
	            self.cmudict = None

	        # Created lazily by count_syllables (one bounded cache per instance).
	        self._syllable_cache = None

	        # Genre-specific syllable-to-beat ratio guidelines
	        # Genre: (min_ratio, typical_ratio, max_ratio)
	        self.genre_syllable_ratios = {
	            'pop': (0.7, 1.5, 3.0),  # Pop tends to have more syllables per beat
	            'rock': (0.7, 1.2, 2.5),  # Rock can vary widely
	            'hiphop': (1.5, 3.0, 5.0),  # Hip hop often has many syllables per beat
	            'rap': (2.0, 4.0, 7.0),  # Rap often has very high syllable counts
	            'folk': (0.8, 1.0, 1.5),  # Folk often has close to 1:1 ratio
	            'country': (0.7, 1.2, 2.0),  # Country tends to be moderate
	            'jazz': (0.5, 1.0, 3.0),  # Jazz can be very flexible
	            'reggae': (0.6, 1.0, 1.5),  # Reggae often emphasizes specific beats
	            'soul': (0.7, 1.2, 2.0),  # Soul music tends to be expressive
	            'r&b': (0.8, 1.5, 2.5),  # R&B can have melisma
	            'electronic': (0.5, 1.0, 2.0),  # Electronic music varies widely
	            'disco': (1.0, 1.5, 2.5),  # Disco tends to have more syllables
	            'classical': (0.5, 1.0, 2.0),  # Classical can vary by subgenre
	            'metal': (0.8, 1.5, 3.0),  # Metal often has more syllables on strong beats
	            'blues': (0.5, 0.8, 1.5),  # Blues often extends syllables
	            'default': (0.7, 1.5, 3.0),  # Default for unknown genres
	        }

	    def count_syllables(self, word):
	        """Return the number of syllables in ``word``.

	        The CMU dictionary is used when it is loaded and contains the word;
	        otherwise a rule-based estimate is returned. Results are memoized in
	        a bounded per-instance cache (a method-level ``lru_cache`` would key
	        on ``self`` and keep every analyzer alive for the cache's lifetime).

	        Args:
	            word: The word to analyze; case and non-letters are ignored.

	        Returns:
	            The syllable count, or 0 when ``word`` contains no letters a-z.
	        """
	        cache = getattr(self, "_syllable_cache", None)
	        if cache is None:
	            cache = lru_cache(maxsize=128)(self._count_syllables_uncached)
	            self._syllable_cache = cache
	        return cache(word)

	    def _count_syllables_uncached(self, word):
	        """Normalize ``word`` and count its syllables without caching."""
	        word = re.sub(r'[^a-z]', '', word.lower().strip())  # Keep letters only

	        if not word:
	            return 0

	        # Try using CMUDict first if available. Vowel phonemes carry a stress
	        # digit, so counting them counts syllables; with several
	        # pronunciations the longest one wins.
	        if self.cmudict and word in self.cmudict:
	            return max(
	                sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
	                for pronunciation in self.cmudict[word]
	            )

	        return self._estimate_syllables(word)

	    @staticmethod
	    def _estimate_syllables(word):
	        """Estimate syllables of a lowercase a-z word from its spelling."""
	        vowels = "aeiouy"

	        # Every run of consecutive vowels counts as one syllable nucleus.
	        count = 0
	        prev_was_vowel = False
	        for char in word:
	            is_vowel = char in vowels
	            if is_vowel and not prev_was_vowel:
	                count += 1
	            prev_was_vowel = is_vowel

	        # A final 'e' that follows a consonant was counted above but is
	        # usually silent ("make", "whale"). The exception is consonant + "le"
	        # ("table", "apple"), where it stands for a real syllable. A final
	        # 'e' inside a vowel run ("agree") was never counted separately, so
	        # nothing is subtracted for it.
	        if word.endswith('e') and len(word) > 1 and word[-2] not in vowels:
	            syllabic_le = (
	                word.endswith('le') and len(word) > 2 and word[-3] not in vowels
	            )
	            if not syllabic_le:
	                count -= 1

	        # Ensure at least one syllable
	        return max(count, 1)

	    def analyze_beat_pattern(self, audio_path, sr=22050, time_signature="4/4"):
	        """Analyze beat patterns and stresses in an audio file.

	        Args:
	            audio_path: Path of the audio file, passed to ``librosa.load``.
	            sr: Sample rate requested from ``librosa.load``.
	            time_signature: Signature such as "4/4"; anything that cannot be
	                parsed as "<positive int>/<positive int>" is treated as 4/4.

	        Returns:
	            A dict with 'tempo' (float), 'time_signature' (as passed in),
	            'num_beats', 'beat_times' and 'beat_strengths' (lists), and
	            'phrases' (one entry per bar, see ``_build_phrases``).
	        """
	        # Load audio
	        y, sr = librosa.load(audio_path, sr=sr)

	        # Get tempo and beat frames
	        tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
	        beat_times = librosa.frames_to_time(beat_frames, sr=sr)

	        # Get beat strengths using onset envelope
	        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
	        beat_strengths = onset_env[beat_frames]

	        # Min-max normalize beat strengths (skipped when all are equal)
	        if len(beat_strengths) > 0:
	            lowest = np.min(beat_strengths)
	            highest = np.max(beat_strengths)
	            if highest > lowest:
	                beat_strengths = (beat_strengths - lowest) / (highest - lowest)

	        phrases = self._build_phrases(beat_times, beat_strengths, time_signature)

	        return {
	            # beat_track may return the tempo as a scalar or a 1-element
	            # array; normalize to a plain float so the result serializes.
	            'tempo': float(np.atleast_1d(tempo)[0]),
	            'time_signature': time_signature,
	            'num_beats': len(beat_times),
	            'beat_times': beat_times.tolist(),
	            'beat_strengths': beat_strengths.tolist(),
	            'phrases': phrases
	        }

	    @staticmethod
	    def _parse_time_signature(time_signature):
	        """Return (beats_per_bar, beat_unit), defaulting to (4, 4) if invalid."""
	        try:
	            numerator, denominator = (
	                int(part) for part in time_signature.split('/')
	            )
	        except (AttributeError, TypeError, ValueError):
	            return 4, 4
	        if numerator <= 0 or denominator <= 0:
	            return 4, 4  # A non-positive bar length cannot group beats
	        return numerator, denominator

	    def _bar_stress_labels(self, signature_key, beats_per_bar):
	        """Return the stress letter for every metrical position of a bar."""
	        weights = self.stress_patterns.get(signature_key)
	        if weights is None or len(weights) != beats_per_bar:
	            # Default pattern for other time signatures: strong downbeat,
	            # everything else weak.
	            return ["S"] + ["W"] * (beats_per_bar - 1)
	        return [self._STRESS_LABELS.get(weight, "W") for weight in weights]

	    def _build_phrases(self, beat_times, beat_strengths, time_signature):
	        """Group beats into bars and describe each bar as a phrase.

	        Each phrase holds 'id', 'num_beats', 'beats', 'stress_pattern',
	        'start_time' and 'end_time'. The end time extends the last beat by
	        the previous inter-beat gap, or by 0.5 s for a single-beat bar.
	        """
	        beats_per_bar, beat_unit = self._parse_time_signature(time_signature)
	        labels = self._bar_stress_labels(
	            f"{beats_per_bar}/{beat_unit}", beats_per_bar
	        )

	        beats = [
	            {
	                'time': float(beat_time),
	                'strength': float(strength),
	                'stress': labels[index % beats_per_bar],
	                'metrical_position': index % beats_per_bar
	            }
	            for index, (beat_time, strength)
	            in enumerate(zip(beat_times, beat_strengths))
	        ]

	        # One phrase per bar; the final bar may be partial.
	        phrases = []
	        for start in range(0, len(beats), beats_per_bar):
	            bar = beats[start:start + beats_per_bar]
	            if len(bar) > 1:
	                last_gap = bar[-1]['time'] - bar[-2]['time']
	            else:
	                last_gap = 0.5
	            phrases.append({
	                'id': len(phrases),
	                'num_beats': len(bar),
	                'beats': bar,
	                'stress_pattern': ''.join(beat['stress'] for beat in bar),
	                'start_time': bar[0]['time'],
	                'end_time': bar[-1]['time'] + last_gap,
	            })
	        return phrases

	    def create_lyric_template(self, beat_analysis):
	        """Create one lyric template per phrase of a beat analysis.

	        Args:
	            beat_analysis: Result of ``analyze_beat_pattern``.

	        Returns:
	            A list of dicts with the phrase timing, beat count, stress
	            pattern and a textual 'syllable_guide'; empty when
	            ``beat_analysis`` is falsy or has no 'phrases' key.
	        """
	        if not beat_analysis or 'phrases' not in beat_analysis:
	            return []

	        return [
	            {
	                'id': phrase['id'],
	                'start_time': phrase['start_time'],
	                'end_time': phrase['end_time'],
	                'duration': phrase['end_time'] - phrase['start_time'],
	                'num_beats': phrase['num_beats'],
	                'stress_pattern': phrase['stress_pattern'],
	                'syllable_guide': self.generate_phrase_guide(phrase)
	            }
	            for phrase in beat_analysis['phrases']
	        ]

	    def generate_phrase_guide(self, template, words_per_beat=0.5):
	        """Generate a one-line guide for a phrase to help the LLM.

	        Args:
	            template: Mapping with 'num_beats' and 'stress_pattern'.
	            words_per_beat: Rough number of words expected per beat.

	        Returns:
	            A string with the estimated word count, a syllable range of one
	            to three syllables per beat, and the spelled-out stress pattern.
	        """
	        num_beats = template['num_beats']

	        # Spell out the pattern: S = STRONG, M = medium, anything else = weak
	        visual_pattern = "".join(
	            self._STRESS_WORDS.get(stress, "weak") + " "
	            for stress in template['stress_pattern']
	        )

	        # Estimate number of words based on beats (very rough estimate)
	        est_words = max(1, int(num_beats * words_per_beat))

	        # Typical song might have 1-3 syllables per beat
	        min_syllables = num_beats
	        max_syllables = num_beats * 3

	        return (
	            f"~{est_words} words, ~{min_syllables}-{max_syllables} syllables"
	            f" | Pattern: {visual_pattern}"
	        )

	    def check_syllable_stress_match(self, text, template, genre="pop"):
	        """Check whether lyrics fit a phrase's syllable count and stresses.

	        Args:
	            text: The lyric line; words are split on whitespace.
	            template: Mapping with 'num_beats' and 'stress_pattern'.
	            genre: Key into ``genre_syllable_ratios``; unknown or non-string
	                values use the 'default' ratios.

	        Returns:
	            A dict with the syllable count, the genre-based expected range,
	            range/exact-match flags, the stress match score (a match is a
	            score of at least 0.7), closeness to the genre's ideal count,
	            and the per-word syllable counts.
	        """
	        words = text.split()
	        word_syllables = [self.count_syllables(word) for word in words]
	        syllable_count = sum(word_syllables)

	        # Get expected syllable count based on number of beats
	        expected_count = template['num_beats']

	        # Get syllable-to-beat ratios based on genre
	        genre_key = genre.lower() if isinstance(genre, str) else 'default'
	        min_ratio, typical_ratio, max_ratio = self.genre_syllable_ratios.get(
	            genre_key, self.genre_syllable_ratios['default']
	        )

	        # Calculate flexible min and max syllable expectations based on genre
	        min_expected = max(1, int(expected_count * min_ratio))
	        max_expected = int(expected_count * max_ratio)

	        # Check if syllable count falls within genre-appropriate range
	        within_range = min_expected <= syllable_count <= max_expected

	        # How close are we to the ideal for this genre? The range width is
	        # kept at 1 or more so a zero-beat template cannot divide by zero.
	        ideal_count = int(expected_count * typical_ratio)
	        range_width = max(max_expected - min_expected + 1, 1)
	        closeness_to_ideal = 1.0 - min(
	            abs(syllable_count - ideal_count) / range_width, 1.0
	        )

	        # Analyze stress pattern match using a flexible syllable/beat mapping
	        stress_pattern = template['stress_pattern']
	        syllable_to_beat_mapping = self._map_syllables_to_beats(
	            word_syllables, stress_pattern
	        )
	        stress_match_percentage = self._calculate_stress_match(
	            words, word_syllables, syllable_to_beat_mapping, stress_pattern
	        )

	        # Consider a stress match if the percentage is high enough
	        stress_matches = stress_match_percentage >= 0.7

	        return {
	            'syllable_count': syllable_count,
	            'expected_count': expected_count,
	            'min_expected': min_expected,
	            'max_expected': max_expected,
	            'within_range': within_range,
	            'matches_beat_count': syllable_count == expected_count,  # Exact match (strict)
	            'close_match': within_range,  # Flexible match (based on genre)
	            'stress_matches': stress_matches,
	            'stress_match_percentage': stress_match_percentage,
	            'closeness_to_ideal': closeness_to_ideal,
	            'word_syllables': word_syllables,
	            'ideal_syllable_count': ideal_count
	        }

	    def _map_syllables_to_beats(self, word_syllables, stress_pattern):
	        """Map syllable indices to beat indices.

	        Returns:
	            A list of ``(syllable_index, beat_index)`` pairs. With no more
	            syllables than beats, syllable i lands on beat i and later beats
	            stay empty. Otherwise the syllables are spread over the beats in
	            order, so some beats carry several syllables. Empty when there
	            are no beats.
	        """
	        total_syllables = sum(word_syllables)
	        total_beats = len(stress_pattern)

	        if total_beats == 0:
	            return []

	        if total_syllables <= total_beats:
	            # Fewer syllables than beats - trailing beats have no syllables
	            return [(index, index) for index in range(total_syllables)]

	        # More syllables than beats - beat b takes the syllables in
	        # [b * S // B, (b + 1) * S // B). Integer arithmetic keeps the
	        # boundaries exact, so no syllable is lost to float rounding.
	        mapping = []
	        for beat_index in range(total_beats):
	            start_syllable = beat_index * total_syllables // total_beats
	            end_syllable = (beat_index + 1) * total_syllables // total_beats
	            mapping.extend(
	                (syllable_index, beat_index)
	                for syllable_index in range(start_syllable, end_syllable)
	            )
	        return mapping

	    def _calculate_stress_match(self, words, word_syllables, syllable_to_beat_mapping, stress_pattern):
	        """Calculate how well syllable stresses match beat stresses.

	        This is a simplified model: the first syllable of every word is
	        assumed stressed and all others unstressed. A stressed syllable on a
	        Strong beat or an unstressed syllable on a Weak beat scores 1, a
	        stressed syllable on a Medium beat scores 0.7, anything else 0.

	        Returns:
	            The mean score over all mapped syllables, or 0.0 if none mapped.
	        """
	        # Flat list of all syllables with their stress (1 = stressed, 0 = unstressed)
	        syllable_stresses = []
	        for _, syllable_count in zip(words, word_syllables):
	            if syllable_count > 0:
	                syllable_stresses.append(1)
	                syllable_stresses.extend([0] * (syllable_count - 1))

	        beat_weights = {'S': 1.0, 'M': 0.5}  # Every other letter is weak (0.0)

	        matches = 0.0
	        total_mapped = 0

	        for syllable_index, beat_index in syllable_to_beat_mapping:
	            if syllable_index >= len(syllable_stresses):
	                continue

	            beat_stress = beat_weights.get(stress_pattern[beat_index], 0.0)
	            if syllable_stresses[syllable_index] == 1:
	                if beat_stress > 0.5:  # Stressed syllable on Strong beat
	                    matches += 1
	                elif beat_stress == 0.5:  # Stressed syllable on Medium beat
	                    matches += 0.7
	            elif beat_stress < 0.5:  # Unstressed syllable on Weak beat
	                matches += 1

	            total_mapped += 1

	        if total_mapped == 0:
	            return 0.0

	        return matches / total_mapped