root committed on
Commit
5b33796
1 Parent(s): c95399f
Files changed (4)
  1. README.md +27 -23
  2. app.py +0 -0
  3. emotionanalysis.py +558 -36
  4. requirements.txt +0 -1
README.md CHANGED
@@ -11,37 +11,41 @@ license: mit
  short_description: AI music genre detection and lyrics generation
  ---

- # Music Genre Classifier & Lyrics Generator

- This Hugging Face Space application provides two AI-powered features:
-
- 1. **Music Genre Classification**: Upload a music file and get an analysis of its genre using the [dima806/music_genres_classification](https://huggingface.co/dima806/music_genres_classification) model.
-
- 2. **Lyrics Generation**: Based on the detected genre, the app generates original lyrics using [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) that match both the style of the genre and approximate length of the song.

  ## Features

- - Upload any music file for instant genre classification
- - Receive genre predictions with confidence scores
- - Get AI-generated lyrics tailored to the detected music genre
- - Lyrics length is automatically adjusted based on the song duration
- - Simple and intuitive user interface

- ## Usage

- 1. Visit the live application on Hugging Face Spaces
- 2. Upload your music file using the provided interface
- 3. Click "Analyze & Generate" to process the audio
- 4. View the detected genre and generated lyrics in the output panels

  ## Technical Details

- - Uses MFCC features extraction from audio for genre classification
- - Leverages 4-bit quantization for efficient LLM inference on T4 GPU
- - Implements a specialized prompt engineering approach to generate genre-specific lyrics
- - Automatically scales lyrics length based on audio duration

- ## Links

- - [Music Genre Classification Model](https://huggingface.co/dima806/music_genres_classification)
- - [Llama 3.1 8B Instruct Model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
 
  short_description: AI music genre detection and lyrics generation
  ---

+ # Music Analysis & Lyrics Generator

+ This Hugging Face Space application analyzes music files and generates lyrics that match the musical characteristics.

  ## Features

+ - **Music Analysis**: Detects tempo, time signature, key, emotion, and theme
+ - **Genre Classification**: Identifies the music genre using a pre-trained classifier
+ - **Lyrics Generation**: Creates lyrics that match the style, emotion, and length of your music using Qwen3-32B

+ ## How to Use

+ 1. Upload a music file or record audio directly in the app
+ 2. Click "Analyze and Generate Lyrics"
+ 3. View the analysis results showing tempo, key, emotion, theme, and genre
+ 4. Check the generated lyrics tailored to match your music

  ## Technical Details

+ This application uses:
+ - **MusicAnalyzer**: Custom analysis tool for detecting musical features
+ - **Hugging Face Transformers**: Pre-trained models for genre classification and lyrics generation
+ - **Gradio**: For the user interface
+ - **Librosa**: For audio processing
+
+ ## Requirements
+
+ See requirements.txt for detailed dependencies.
+
+ ## Limitations
+
+ - Large audio files may take longer to process
+ - The quality of lyrics generation depends on the clarity of the audio and the detected musical features

+ ## Credits

+ - Genre classification model: dima806/music_genres_classification
+ - LLM for lyrics generation: Qwen/Qwen3-32B
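For orientation, here is a minimal sketch of the two model calls credited in the new README, using the Hugging Face Transformers pipeline API. It is illustrative only: the actual app.py wiring (Gradio callbacks, prompt construction, generation settings) is not reproduced here, and the helper function name is a placeholder.

```python
# Sketch of the genre classifier and lyrics LLM named in the README.
# Assumptions: models are loaded via the generic pipeline API; the real
# app's prompt and generation parameters differ.
from transformers import pipeline

# Genre classification on the uploaded audio file
genre_classifier = pipeline("audio-classification",
                            model="dima806/music_genres_classification")

# Lyrics generation with the LLM credited above (a very large model;
# shown here only to illustrate the call shape)
lyric_writer = pipeline("text-generation", model="Qwen/Qwen3-32B")

def generate_lyrics(audio_path):
    genre = genre_classifier(audio_path)[0]["label"]
    prompt = f"Write original song lyrics in the {genre} genre."
    return lyric_writer(prompt, max_new_tokens=300)[0]["generated_text"]
```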
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
emotionanalysis.py CHANGED
@@ -1,5 +1,7 @@
  import librosa
  import numpy as np
  try:
      import matplotlib.pyplot as plt
  except ImportError:
@@ -7,6 +9,7 @@ except ImportError:
  from scipy.stats import mode
  import warnings
  warnings.filterwarnings('ignore') # Suppress librosa warnings
  class MusicAnalyzer:
      def __init__(self):
          # Emotion feature mappings - these define characteristics of different emotions
@@ -31,6 +34,40 @@ class MusicAnalyzer:

          # Musical key mapping
          self.key_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']

      def load_audio(self, file_path, sr=22050, duration=None):
          """Load audio file and return time series and sample rate"""
@@ -56,8 +93,12 @@ class MusicAnalyzer:
          ac = librosa.autocorrelate(onset_env, max_size=sr // 2)
          ac = librosa.util.normalize(ac, norm=np.inf)

-         # Time signature estimation - a challenging task with many limitations
-         estimated_signature = self._estimate_time_signature(y, sr, beat_times, onset_env)

          # Compute onset strength to get a measure of rhythm intensity
          rhythm_intensity = np.mean(onset_env) / np.max(onset_env) if np.max(onset_env) > 0 else 0
@@ -65,48 +106,509 @@ class MusicAnalyzer:
          # Rhythm complexity based on variation in onset strength
          rhythm_complexity = np.std(onset_env) / np.mean(onset_env) if np.mean(onset_env) > 0 else 0

          return {
              "tempo": float(tempo),
-             "beat_times": beat_times.tolist(),
-             "beat_intervals": beat_intervals.tolist(),
              "beat_regularity": float(beat_regularity),
              "rhythm_intensity": float(rhythm_intensity),
              "rhythm_complexity": float(rhythm_complexity),
-             "estimated_time_signature": estimated_signature
          }

-     def _estimate_time_signature(self, y, sr, beat_times, onset_env):
-         """Estimate the time signature based on beat patterns"""
-         # This is a simplified approach - accurate time signature detection is complex
-         if len(beat_times) < 4:
-             return "Unknown"
-
-         # Analyze beat emphasis patterns to detect meter
-         beat_intervals = np.diff(beat_times)
-
-         # Look for periodicity in the onset envelope
-         ac = librosa.autocorrelate(onset_env, max_size=sr)
-
-         # Find peaks in autocorrelation after the first one (which is at lag 0)
-         peaks = librosa.util.peak_pick(ac, pre_max=20, post_max=20, pre_avg=20, post_avg=20, delta=0.1, wait=1)
-         peaks = peaks[peaks > 0] # Remove the first peak which is at lag 0
-
          if len(peaks) == 0:
-             return "4/4" # Default to most common
-
-         # Convert first significant peak to beats
-         first_peak_time = peaks[0] / sr
-         beats_per_bar = round(first_peak_time / np.median(beat_intervals))
-
-         # Map to common time signatures
-         if beats_per_bar == 4 or beats_per_bar == 8:
-             return "4/4"
-         elif beats_per_bar == 3 or beats_per_bar == 6:
-             return "3/4"
-         elif beats_per_bar == 2:
-             return "2/4"
          else:
-             return f"{beats_per_bar}/4" # Default assumption

      def analyze_tonality(self, y, sr):
          """Analyze tonal features: key, mode, harmonic features"""
@@ -355,6 +857,26 @@ class MusicAnalyzer:
          emotion_data = self.analyze_emotion(rhythm_data, tonal_data, energy_data)
          theme_data = self.analyze_theme(rhythm_data, tonal_data, emotion_data)

          # Combine all results
          return {
              "file": file_path,
@@ -364,7 +886,7 @@ class MusicAnalyzer:
              "emotion_analysis": emotion_data,
              "theme_analysis": theme_data,
              "summary": {
-                 "tempo": rhythm_data["tempo"],
                  "time_signature": rhythm_data["estimated_time_signature"],
                  "key": tonal_data["key"],
                  "mode": tonal_data["mode"],
 
  import librosa
  import numpy as np
+ from scipy import signal
+ from collections import Counter
  try:
      import matplotlib.pyplot as plt
  except ImportError:
 
  from scipy.stats import mode
  import warnings
  warnings.filterwarnings('ignore') # Suppress librosa warnings
+
  class MusicAnalyzer:
      def __init__(self):
          # Emotion feature mappings - these define characteristics of different emotions
 

          # Musical key mapping
          self.key_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
+
+         # Common time signatures and their beat patterns with weights for prior probability
+         self.common_time_signatures = {
+             "4/4": {"beats_per_bar": 4, "beat_pattern": [1.0, 0.2, 0.5, 0.2], "weight": 0.35},
+             "3/4": {"beats_per_bar": 3, "beat_pattern": [1.0, 0.2, 0.3], "weight": 0.25},
+             "2/4": {"beats_per_bar": 2, "beat_pattern": [1.0, 0.3], "weight": 0.15},
+             "6/8": {"beats_per_bar": 6, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3], "weight": 0.25},
+             "5/4": {"beats_per_bar": 5, "beat_pattern": [1.0, 0.2, 0.4, 0.7, 0.2], "weight": 0.10},
+             "7/8": {"beats_per_bar": 7, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.2, 0.3], "weight": 0.10},
+             "9/8": {"beats_per_bar": 9, "beat_pattern": [1.0, 0.2, 0.3, 0.8, 0.2, 0.3, 0.7, 0.2, 0.3], "weight": 0.10},
+             "12/8": {"beats_per_bar": 12, "beat_pattern": [1.0, 0.2, 0.3, 0.6, 0.2, 0.3, 0.8, 0.2, 0.3, 0.6, 0.2, 0.3], "weight": 0.15}
+         }
+
+         # Add common accent patterns for different time signatures
+         self.accent_patterns = {
+             "4/4": [[1, 0, 0, 0], [1, 0, 2, 0], [1, 0, 2, 0, 3, 0, 2, 0]],
+             "3/4": [[1, 0, 0], [1, 0, 2]],
+             "2/4": [[1, 0], [1, 2]],
+             "6/8": [[1, 0, 0, 2, 0, 0], [1, 0, 0, 2, 0, 3]],
+             "5/4": [[1, 0, 0, 2, 0], [1, 0, 2, 0, 0]],
+             "7/8": [[1, 0, 0, 2, 0, 0, 0], [1, 0, 0, 2, 0, 3, 0]],
+             "9/8": [[1, 0, 0, 2, 0, 0, 3, 0, 0]],
+             "12/8": [[1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0]]
+         }
+
+         # Expected rhythm density (relative note density per beat) for different time signatures
+         self.rhythm_density = {
+             "4/4": [1.0, 0.7, 0.8, 0.6],
+             "3/4": [1.0, 0.6, 0.7],
+             "6/8": [1.0, 0.5, 0.4, 0.8, 0.5, 0.4],
+             "2/4": [1.0, 0.6],
+             "5/4": [1.0, 0.6, 0.8, 0.7, 0.6],
+             "7/8": [1.0, 0.5, 0.4, 0.8, 0.5, 0.4, 0.5]
+         }

      def load_audio(self, file_path, sr=22050, duration=None):
          """Load audio file and return time series and sample rate"""
 
          ac = librosa.autocorrelate(onset_env, max_size=sr // 2)
          ac = librosa.util.normalize(ac, norm=np.inf)

+         # Advanced time signature detection
+         time_sig_result = self._detect_time_signature(y, sr)
+
+         # Extract results from the time signature detection
+         estimated_signature = time_sig_result["time_signature"]
+         time_sig_confidence = time_sig_result["confidence"]

          # Compute onset strength to get a measure of rhythm intensity
          rhythm_intensity = np.mean(onset_env) / np.max(onset_env) if np.max(onset_env) > 0 else 0
 
          # Rhythm complexity based on variation in onset strength
          rhythm_complexity = np.std(onset_env) / np.mean(onset_env) if np.mean(onset_env) > 0 else 0

+         # Convert numpy arrays to regular Python types for JSON serialization
+         beat_times_list = [float(t) for t in beat_times.tolist()]
+         beat_intervals_list = [float(i) for i in beat_intervals.tolist()]
+
          return {
              "tempo": float(tempo),
+             "beat_times": beat_times_list,
+             "beat_intervals": beat_intervals_list,
              "beat_regularity": float(beat_regularity),
              "rhythm_intensity": float(rhythm_intensity),
              "rhythm_complexity": float(rhythm_complexity),
+             "estimated_time_signature": estimated_signature,
+             "time_signature_confidence": float(time_sig_confidence),
+             "time_signature_candidates": time_sig_result.get("all_candidates", {})
          }

+     def _detect_time_signature(self, y, sr):
+         """
+         Multi-method approach to time signature detection
+
+         Args:
+             y: Audio signal
+             sr: Sample rate
+
+         Returns:
+             dict with detected time signature and confidence
+         """
+         # 1. Compute onset envelope and beat positions
+         onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=512)
+
+         # Get tempo and beat frames
+         tempo, beat_frames = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
+         beat_times = librosa.frames_to_time(beat_frames, sr=sr)
+
+         # Return default if not enough beats detected
+         if len(beat_times) < 8:
+             return {"time_signature": "4/4", "confidence": 0.5}
+
+         # 2. Extract beat strengths and normalize
+         beat_strengths = self._get_beat_strengths(y, sr, beat_times, onset_env)
+
+         # 3. Compute various time signature features using different methods
+         results = {}
+
+         # Method 1: Beat pattern autocorrelation
+         autocorr_result = self._detect_by_autocorrelation(onset_env, sr)
+         results["autocorrelation"] = autocorr_result
+
+         # Method 2: Beat strength pattern matching
+         pattern_result = self._detect_by_pattern_matching(beat_strengths)
+         results["pattern_matching"] = pattern_result
+
+         # Method 3: Spectral rhythmic analysis
+         spectral_result = self._detect_by_spectral_analysis(onset_env, sr)
+         results["spectral"] = spectral_result
+
+         # Method 4: Note density analysis
+         density_result = self._detect_by_note_density(y, sr, beat_times)
+         results["note_density"] = density_result
+
+         # Method 5: Tempo-based estimation
+         tempo_result = self._estimate_from_tempo(tempo)
+         results["tempo_based"] = tempo_result
+
+         # 4. Combine results with weighted voting
+         final_result = self._combine_detection_results(results, tempo)
+
+         return final_result
+
+     def _get_beat_strengths(self, y, sr, beat_times, onset_env):
+         """Extract normalized strengths at beat positions"""
+         # Convert beat times to frames
+         beat_frames = librosa.time_to_frames(beat_times, sr=sr, hop_length=512)
+         beat_frames = [min(f, len(onset_env)-1) for f in beat_frames]
+
+         # Get beat strengths from onset envelope
+         beat_strengths = np.array([onset_env[f] for f in beat_frames])
+
+         # Also look at energy and spectral flux at beat positions
+         hop_length = 512
+         frame_length = 2048
+
+         # Get energy at each beat
+         energy = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
+         beat_energy = np.array([energy[min(f, len(energy)-1)] for f in beat_frames])
+
+         # Combine onset strength with energy (weighted average)
+         beat_strengths = 0.7 * beat_strengths + 0.3 * beat_energy
+
+         # Normalize
+         if np.max(beat_strengths) > 0:
+             beat_strengths = beat_strengths / np.max(beat_strengths)
+
+         return beat_strengths
+
+     def _detect_by_autocorrelation(self, onset_env, sr):
+         """Detect meter using autocorrelation of onset strength"""
+         # Calculate autocorrelation of onset envelope
+         hop_length = 512
+         ac = librosa.autocorrelate(onset_env, max_size=4 * sr // hop_length)
+         ac = librosa.util.normalize(ac)
+
+         # Find significant peaks in autocorrelation
+         peaks = signal.find_peaks(ac, height=0.2, distance=sr//(8*hop_length))[0]
+
+         if len(peaks) < 2:
+             return {"time_signature": "4/4", "confidence": 0.4}
+
+         # Analyze peak intervals in terms of beats
+         peak_intervals = np.diff(peaks)
+
+         # Convert peaks to time
+         peak_times = peaks * hop_length / sr
+
+         # Analyze for common time signature patterns
+         time_sig_votes = {}
+
+         # Check if peaks match expected bar lengths
+         for ts, info in self.common_time_signatures.items():
+             beats_per_bar = info["beats_per_bar"]
+
+             # Check how well peaks match this meter
+             score = 0
+             for interval in peak_intervals:
+                 # Check if this interval corresponds to this time signature
+                 # Allow some tolerance around the expected value
+                 expected = beats_per_bar * (hop_length / sr) # in seconds
+                 tolerance = 0.25 * expected
+
+                 if abs(interval * hop_length / sr - expected) < tolerance:
+                     score += 1
+
+             if len(peak_intervals) > 0:
+                 time_sig_votes[ts] = score / len(peak_intervals)
+
+         # Return most likely time signature
+         if time_sig_votes:
+             best_ts = max(time_sig_votes.items(), key=lambda x: x[1])
+             return {"time_signature": best_ts[0], "confidence": best_ts[1]}
+
+         return {"time_signature": "4/4", "confidence": 0.4}
+
+     def _detect_by_pattern_matching(self, beat_strengths):
+         """Match beat strength patterns against known time signature patterns"""
+         if len(beat_strengths) < 6:
+             return {"time_signature": "4/4", "confidence": 0.4}
+
+         results = {}
+
+         # Try each possible time signature
+         for ts, info in self.common_time_signatures.items():
+             beats_per_bar = info["beats_per_bar"]
+             expected_pattern = info["beat_pattern"]
+
+             # Calculate correlation scores for overlapping segments
+             scores = []
+
+             # We need at least one complete pattern
+             if len(beat_strengths) >= beats_per_bar:
+                 # Try different offsets to find best alignment
+                 for offset in range(min(beats_per_bar, len(beat_strengths) - beats_per_bar + 1)):
+                     # Calculate scores for each complete pattern
+                     pattern_scores = []
+
+                     for i in range(offset, len(beat_strengths) - beats_per_bar + 1, beats_per_bar):
+                         segment = beat_strengths[i:i+beats_per_bar]
+
+                         # If expected pattern is longer than segment, truncate it
+                         pattern = expected_pattern[:len(segment)]
+
+                         # Normalize segment and pattern
+                         if np.std(segment) > 0 and np.std(pattern) > 0:
+                             # Calculate correlation
+                             corr = np.corrcoef(segment, pattern)[0, 1]
+                             if not np.isnan(corr):
+                                 pattern_scores.append(corr)
+
+                     if pattern_scores:
+                         scores.append(np.mean(pattern_scores))
+
+             # Use the best score among different offsets
+             if scores:
+                 confidence = max(scores)
+                 results[ts] = confidence
+
+         # Find best match
+         if results:
+             best_ts = max(results.items(), key=lambda x: x[1])
+             return {"time_signature": best_ts[0], "confidence": best_ts[1]}
+
+         # Default
+         return {"time_signature": "4/4", "confidence": 0.5}
+
+     def _detect_by_spectral_analysis(self, onset_env, sr):
+         """Analyze rhythm in frequency domain"""
+         # Get rhythm periodicity through Fourier Transform
+         # Focus on periods corresponding to typical bar lengths (1-8 seconds)
+         hop_length = 512
+
+         # Calculate rhythm periodicity
+         fft_size = 2**13 # Large enough to give good frequency resolution
+         S = np.abs(np.fft.rfft(onset_env, n=fft_size))
+
+         # Convert frequency to tempo in BPM
+         freqs = np.fft.rfftfreq(fft_size, d=hop_length/sr)
+         tempos = 60 * freqs
+
+         # Focus on reasonable tempo range (40-240 BPM)
+         tempo_mask = (tempos >= 40) & (tempos <= 240)
+         S_tempo = S[tempo_mask]
+         tempos = tempos[tempo_mask]
+
+         # Find peaks in spectrum
+         peaks = signal.find_peaks(S_tempo, height=np.max(S_tempo)*0.1, distance=5)[0]
+
          if len(peaks) == 0:
+             return {"time_signature": "4/4", "confidence": 0.4}
+
+         # Get peak tempos and strengths
+         peak_tempos = tempos[peaks]
+         peak_strengths = S_tempo[peaks]
+
+         # Sort by strength
+         peak_indices = np.argsort(peak_strengths)[::-1]
+         peak_tempos = peak_tempos[peak_indices]
+         peak_strengths = peak_strengths[peak_indices]
+
+         # Analyze relationships between peaks
+         # For example, 3/4 typically has peaks at multiples of 3 beats
+         # 4/4 has peaks at multiples of 4 beats
+
+         time_sig_scores = {}
+
+         # Check relationships between top peaks
+         if len(peak_tempos) >= 2:
+             tempo_ratios = []
+             for i in range(len(peak_tempos)):
+                 for j in range(i+1, len(peak_tempos)):
+                     if peak_tempos[j] > 0:
+                         ratio = peak_tempos[i] / peak_tempos[j]
+                         tempo_ratios.append(ratio)
+
+             # Check for patterns indicative of different time signatures
+             for ts in self.common_time_signatures:
+                 score = 0
+
+                 if ts == "4/4" or ts == "2/4":
+                     # Look for ratios close to 2 or 4
+                     for ratio in tempo_ratios:
+                         if abs(ratio - 2) < 0.2 or abs(ratio - 4) < 0.2:
+                             score += 1
+
+                 elif ts == "3/4" or ts == "6/8":
+                     # Look for ratios close to 3 or 6
+                     for ratio in tempo_ratios:
+                         if abs(ratio - 3) < 0.2 or abs(ratio - 6) < 0.3:
+                             score += 1
+
+                 # Normalize score
+                 if tempo_ratios:
+                     time_sig_scores[ts] = min(1.0, score / len(tempo_ratios) + 0.4)
+
+         # If we have meaningful scores, return best match
+         if time_sig_scores:
+             best_ts = max(time_sig_scores.items(), key=lambda x: x[1])
+             return {"time_signature": best_ts[0], "confidence": best_ts[1]}
+
+         # Default fallback
+         return {"time_signature": "4/4", "confidence": 0.4}
+
+     def _detect_by_note_density(self, y, sr, beat_times):
+         """Analyze note density patterns between beats"""
+         if len(beat_times) < 6:
+             return {"time_signature": "4/4", "confidence": 0.4}
+
+         # Extract note onsets (not just beats)
+         onset_times = librosa.onset.onset_detect(y=y, sr=sr, units='time')
+
+         if len(onset_times) < len(beat_times):
+             return {"time_signature": "4/4", "confidence": 0.4}
+
+         # Count onsets between consecutive beats
+         note_counts = []
+         for i in range(len(beat_times) - 1):
+             start = beat_times[i]
+             end = beat_times[i+1]
+
+             # Count onsets in this beat
+             count = sum(1 for t in onset_times if start <= t < end)
+             note_counts.append(count)
+
+         # Look for repeating patterns in the note counts
+         time_sig_scores = {}
+
+         for ts, info in self.common_time_signatures.items():
+             beats_per_bar = info["beats_per_bar"]
+
+             # Skip if we don't have enough data
+             if len(note_counts) < beats_per_bar:
+                 continue
+
+             # Calculate pattern similarity for this time signature
+             scores = []
+
+             for offset in range(min(beats_per_bar, len(note_counts) - beats_per_bar + 1)):
+                 similarities = []
+
+                 for i in range(offset, len(note_counts) - beats_per_bar + 1, beats_per_bar):
+                     # Get current bar pattern
+                     pattern = note_counts[i:i+beats_per_bar]
+
+                     # Compare with expected density pattern
+                     expected = self.rhythm_density.get(ts, [1.0] * beats_per_bar)
+                     expected = expected[:len(pattern)] # Truncate if needed
+
+                     # Normalize both patterns
+                     if sum(pattern) > 0 and sum(expected) > 0:
+                         pattern_norm = [p/max(1, sum(pattern)) for p in pattern]
+                         expected_norm = [e/sum(expected) for e in expected]
+
+                         # Calculate similarity (1 - distance)
+                         distance = sum(abs(p - e) for p, e in zip(pattern_norm, expected_norm)) / len(pattern)
+                         similarity = 1 - min(1.0, distance)
+                         similarities.append(similarity)
+
+                 if similarities:
+                     scores.append(np.mean(similarities))
+
+             # Use the best score
+             if scores:
+                 time_sig_scores[ts] = max(scores)
+
+         # Return best match
+         if time_sig_scores:
+             best_ts = max(time_sig_scores.items(), key=lambda x: x[1])
+             return {"time_signature": best_ts[0], "confidence": best_ts[1]}
+
+         # Default
+         return {"time_signature": "4/4", "confidence": 0.4}
+
+     def _estimate_from_tempo(self, tempo):
+         """Use tempo to help estimate likely time signature"""
+         # Statistical tendencies: slower tempos often in compound meters (6/8, 12/8)
+         # Very fast tempos often counted in cut time (2/2 instead of 4/4)
+
+         scores = {}
+
+         if tempo < 70:
+             # Slow tempos favor compound meters
+             scores = {
+                 "4/4": 0.4,
+                 "3/4": 0.5,
+                 "6/8": 0.7,
+                 "12/8": 0.6
+             }
+         elif 70 <= tempo <= 120:
+             # Medium tempos favor 4/4, 3/4
+             scores = {
+                 "4/4": 0.7,
+                 "3/4": 0.6,
+                 "2/4": 0.4,
+                 "6/8": 0.5
+             }
          else:
+             # Fast tempos favor simpler meters
+             scores = {
+                 "4/4": 0.6,
+                 "2/4": 0.7,
+                 "2/2": 0.6,
+                 "3/4": 0.4
+             }
+
+         # Find best match
+         best_ts = max(scores.items(), key=lambda x: x[1])
+         return {"time_signature": best_ts[0], "confidence": best_ts[1]}
+
+     def _combine_detection_results(self, results, tempo):
+         """Combine results from different detection methods"""
+         # Define weights for different methods
+         method_weights = {
+             "autocorrelation": 0.25,
+             "pattern_matching": 0.30,
+             "spectral": 0.20,
+             "note_density": 0.20,
+             "tempo_based": 0.05
+         }
+
+         # Prior probability (based on frequency in music)
+         prior_weights = {ts: info["weight"] for ts, info in self.common_time_signatures.items()}
+
+         # Combine votes
+         total_votes = {ts: prior_weights.get(ts, 0.1) for ts in self.common_time_signatures}
+
+         for method, result in results.items():
+             ts = result["time_signature"]
+             confidence = result["confidence"]
+             weight = method_weights.get(method, 0.1)
+
+             # Add weighted vote
+             if ts in total_votes:
+                 total_votes[ts] += confidence * weight
+             else:
+                 total_votes[ts] = confidence * weight
+
+         # Special case: disambiguate between 3/4 and 6/8
+         if "3/4" in total_votes and "6/8" in total_votes:
+             # If the two are close, use tempo to break tie
+             if abs(total_votes["3/4"] - total_votes["6/8"]) < 0.1:
+                 if tempo < 100: # Slower tempo favors 6/8
+                     total_votes["6/8"] += 0.1
+                 else: # Faster tempo favors 3/4
+                     total_votes["3/4"] += 0.1
+
+         # Get highest scoring time signature
+         best_ts = max(total_votes.items(), key=lambda x: x[1])
+
+         # Calculate confidence score (normalize to 0-1)
+         confidence = best_ts[1] / (sum(total_votes.values()) + 0.001)
+         confidence = min(0.95, max(0.4, confidence)) # Bound confidence
+
+         return {
+             "time_signature": best_ts[0],
+             "confidence": confidence,
+             "all_candidates": {ts: float(score) for ts, score in total_votes.items()}
+         }
+
+     def _evaluate_beat_pattern(self, beat_strengths, pattern_length):
+         """
+         Evaluate how consistently a specific pattern length fits the beat strengths
+
+         Args:
+             beat_strengths: Array of normalized beat strengths
+             pattern_length: Length of pattern to evaluate
+
+         Returns:
+             score: How well this pattern length explains the data (0-1)
+         """
+         if len(beat_strengths) < pattern_length * 2:
+             return 0.0
+
+         # Calculate correlation between consecutive patterns
+         correlations = []
+
+         num_full_patterns = len(beat_strengths) // pattern_length
+         for i in range(num_full_patterns - 1):
+             pattern1 = beat_strengths[i*pattern_length:(i+1)*pattern_length]
+             pattern2 = beat_strengths[(i+1)*pattern_length:(i+2)*pattern_length]
+
+             # Calculate similarity between consecutive patterns
+             if len(pattern1) == len(pattern2) and len(pattern1) > 0:
+                 corr = np.corrcoef(pattern1, pattern2)[0, 1]
+                 if not np.isnan(corr):
+                     correlations.append(corr)
+
+         # Calculate variance of beat strengths within each position
+         variance_score = 0
+         if num_full_patterns >= 2:
+             position_values = [[] for _ in range(pattern_length)]
+
+             for i in range(num_full_patterns):
+                 for pos in range(pattern_length):
+                     idx = i * pattern_length + pos
+                     if idx < len(beat_strengths):
+                         position_values[pos].append(beat_strengths[idx])
+
+             # Calculate variance ratio (higher means consistent accent patterns)
+             between_pos_var = np.var([np.mean(vals) for vals in position_values if vals])
+             within_pos_var = np.mean([np.var(vals) for vals in position_values if len(vals) > 1])
+
+             if within_pos_var > 0:
+                 variance_score = between_pos_var / within_pos_var
+                 variance_score = min(1.0, variance_score / 2.0) # Normalize
+
+         # Combine correlation and variance scores
+         if correlations:
+             correlation_score = np.mean(correlations)
+             return 0.7 * correlation_score + 0.3 * variance_score
+
+         return 0.5 * variance_score # Lower confidence if we couldn't calculate correlations
+
+     def _extract_average_pattern(self, beat_strengths, pattern_length):
+         """
+         Extract the average beat pattern of specified length
+
+         Args:
+             beat_strengths: Array of beat strengths
+             pattern_length: Length of pattern to extract
+
+         Returns:
+             Average pattern of the specified length
+         """
+         if len(beat_strengths) < pattern_length:
+             return np.array([])
+
+         # Number of complete patterns
+         num_patterns = len(beat_strengths) // pattern_length
+
+         if num_patterns == 0:
+             return np.array([])
+
+         # Reshape to stack patterns and calculate average
+         patterns = beat_strengths[:num_patterns * pattern_length].reshape((num_patterns, pattern_length))
+         return np.mean(patterns, axis=0)

      def analyze_tonality(self, y, sr):
          """Analyze tonal features: key, mode, harmonic features"""
 
          emotion_data = self.analyze_emotion(rhythm_data, tonal_data, energy_data)
          theme_data = self.analyze_theme(rhythm_data, tonal_data, emotion_data)

+         # Convert any remaining numpy values to native Python types
+         def convert_numpy_to_python(obj):
+             if isinstance(obj, dict):
+                 return {k: convert_numpy_to_python(v) for k, v in obj.items()}
+             elif isinstance(obj, list):
+                 return [convert_numpy_to_python(item) for item in obj]
+             elif isinstance(obj, np.ndarray):
+                 return obj.tolist()
+             elif isinstance(obj, np.number):
+                 return float(obj)
+             else:
+                 return obj
+
+         # Ensure all numpy values are converted
+         rhythm_data = convert_numpy_to_python(rhythm_data)
+         tonal_data = convert_numpy_to_python(tonal_data)
+         energy_data = convert_numpy_to_python(energy_data)
+         emotion_data = convert_numpy_to_python(emotion_data)
+         theme_data = convert_numpy_to_python(theme_data)
+
          # Combine all results
          return {
              "file": file_path,
 
              "emotion_analysis": emotion_data,
              "theme_analysis": theme_data,
              "summary": {
+                 "tempo": float(rhythm_data["tempo"]),
                  "time_signature": rhythm_data["estimated_time_signature"],
                  "key": tonal_data["key"],
                  "mode": tonal_data["mode"],
requirements.txt CHANGED
@@ -13,4 +13,3 @@ scipy>=1.12.0
  soundfile>=0.12.1
  matplotlib>=3.7.0
  pronouncing>=0.2.0
- pyannote.audio>=2.1.1
 
  soundfile>=0.12.1
  matplotlib>=3.7.0
  pronouncing>=0.2.0