# Audio utility helpers: loading, duration/MFCC feature extraction, and
# model-input preprocessing for the lyrics-generation pipeline.
import torch | |
import numpy as np | |
import librosa | |
def load_audio(audio_file, sr=22050):
    """Load an audio file and convert to mono if needed.

    Tries librosa first (handles resampling to *sr* and mono mixdown);
    on failure falls back to soundfile with a manual channel average.

    Args:
        audio_file: Path to the audio file.
        sr: Target sample rate for the librosa loader (default 22050).

    Returns:
        Tuple of (waveform, sample_rate).

    Raises:
        ValueError: If neither backend can read the file.
    """
    try:
        waveform, sample_rate = librosa.load(audio_file, sr=sr, mono=True)
    except Exception as primary_err:
        print(f"Error loading audio with librosa: {str(primary_err)}")
    else:
        return waveform, sample_rate

    # Fallback backend: soundfile returns the native sample rate and may
    # yield multi-channel data, so average channels down to mono ourselves.
    import soundfile as sf
    try:
        waveform, sample_rate = sf.read(audio_file)
        if len(waveform.shape) > 1:
            waveform = waveform.mean(axis=1)
        return waveform, sample_rate
    except Exception as fallback_err:
        print(f"Error loading audio with soundfile: {str(fallback_err)}")
        raise ValueError(f"Could not load audio file: {audio_file}")
def extract_audio_duration(y, sr):
    """Get the duration of audio in seconds.

    Args:
        y: Waveform samples (any sized sequence).
        sr: Sample rate in Hz.

    Returns:
        Duration in seconds as a float (sample count divided by rate).
    """
    num_samples = len(y)
    return num_samples / sr
def extract_mfcc_features(y, sr, n_mfcc=20):
    """Extract MFCC features from audio.

    Computes the mean MFCC vector across time frames; any failure in the
    librosa pipeline is reported and replaced by a zero vector so callers
    always receive a fixed-length feature.

    Args:
        y: Mono waveform samples.
        sr: Sample rate in Hz.
        n_mfcc: Number of MFCC coefficients (default 20).

    Returns:
        1-D numpy array of length ``n_mfcc``.
    """
    try:
        coeffs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(coeffs.T, axis=0)
    except Exception as err:
        print(f"Error extracting MFCCs: {str(err)}")
        # Fallback feature vector keeps downstream shapes consistent.
        return np.zeros(n_mfcc)
def calculate_lyrics_length(duration, words_per_minute=90, words_per_line=9):
    """Calculate an appropriate number of lyric lines for an audio duration.

    NOTE(review): the original docstring/comments claimed 45 words per
    minute, but the code has always multiplied by 90 — the documentation
    here now matches the actual behavior, and the rate is exposed as a
    parameter so callers who want the documented 45 wpm can pass it.

    Args:
        duration: Audio length in seconds.
        words_per_minute: Estimated lyric density (default 90, matching
            the original hard-coded constant).
        words_per_line: Average words per lyric line (default 9).

    Returns:
        Suggested total number of lyric lines, clamped between 2 and 15
        according to a simplified song-structure heuristic.
    """
    duration_minutes = duration / 60
    total_words = int(duration_minutes * words_per_minute)
    total_lines = total_words // words_per_line

    # Clamp by rough song structure, shortest first (guard-clause style).
    if total_lines < 6:
        # Very short song - keep it simple
        return max(2, total_lines)
    if total_lines < 10:
        # Short song - one verse and chorus
        return min(6, total_lines)
    if total_lines < 15:
        # Medium song - two verses and chorus
        return min(10, total_lines)
    # Longer song - two verses, chorus, and bridge
    return min(15, total_lines)
def format_genre_results(top_genres):
    """Format genre classification results for display.

    Args:
        top_genres: Iterable of (genre_name, confidence) pairs, where
            confidence is a fraction in [0, 1].

    Returns:
        Multi-line string with one "- genre: NN.NN%" bullet per entry.
    """
    parts = ["Top Detected Genres:\n"]
    parts.extend(
        f"- {genre}: {confidence*100:.2f}%\n" for genre, confidence in top_genres
    )
    return "".join(parts)
def ensure_cuda_availability():
    """Check and report CUDA availability for informational purposes.

    Prints a human-readable summary of the detected CUDA devices (or the
    CPU fallback) without changing any runtime configuration.

    Returns:
        True if CUDA is available, False otherwise.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available. Using CPU for inference.")
        return False
    num_devices = torch.cuda.device_count()
    # device_count() can in principle be 0 even when is_available() is True.
    first_device = torch.cuda.get_device_name(0) if num_devices > 0 else "Unknown"
    print(f"CUDA is available with {num_devices} device(s). Using: {first_device}")
    return True
def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
    """Preprocess audio for model input (resample, pad/trim).

    Args:
        waveform: 1-D mono waveform samples.
        sample_rate: Sample rate of *waveform* in Hz.
        target_sample_rate: Rate the model expects (default 16000).
        max_length: Exact sample count the model expects (default 16000,
            i.e. one second at the target rate).

    Returns:
        Waveform resampled to ``target_sample_rate`` and trimmed or
        zero-padded to exactly ``max_length`` samples.
    """
    if sample_rate != target_sample_rate:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)

    # Force the fixed length the model was trained on: trim the tail if
    # too long, zero-pad the tail if too short.
    excess = len(waveform) - max_length
    if excess > 0:
        waveform = waveform[:max_length]
    elif excess < 0:
        waveform = np.pad(waveform, (0, -excess), 'constant')
    return waveform