root
ss
93aef48
import torch
import numpy as np
import librosa
def load_audio(audio_file, sr=22050):
    """Load an audio file as a mono signal at the requested sample rate.

    ``librosa.load`` is tried first. If it raises, the file is read with
    ``soundfile`` instead, multi-channel data is averaged down to one
    channel, and the signal is resampled to ``sr`` so that both code paths
    honour the requested rate.

    Args:
        audio_file: Path (or file-like object) accepted by the loaders.
        sr: Target sample rate in Hz. ``None`` keeps the file's native rate.

    Returns:
        A ``(y, sr)`` tuple: the mono samples and their sample rate.

    Raises:
        ValueError: If neither librosa nor soundfile can load the file.
    """
    try:
        # Primary path: librosa handles decoding, mono mixdown and resampling.
        y, sr = librosa.load(audio_file, sr=sr, mono=True)
        return y, sr
    except Exception as e:
        print(f"Error loading audio with librosa: {str(e)}")

    # Fallback path. The import lives inside the try so that a missing
    # soundfile package is reported as the same ValueError as a bad file.
    try:
        import soundfile as sf

        y, native_sr = sf.read(audio_file)
        # Convert to mono if the data has a channel axis.
        if len(y.shape) > 1:
            y = y.mean(axis=1)
        # soundfile returns the native rate; match the requested one.
        if sr is not None and native_sr != sr:
            y = librosa.resample(y, orig_sr=native_sr, target_sr=sr)
            native_sr = sr
        return y, native_sr
    except Exception as e2:
        print(f"Error loading audio with soundfile: {str(e2)}")
        raise ValueError(f"Could not load audio file: {audio_file}") from e2
def extract_audio_duration(y, sr):
    """Return the length of the signal ``y`` in seconds.

    The duration is the number of samples (``len(y)``) divided by the
    sample rate ``sr``.
    """
    sample_count = len(y)
    seconds = sample_count / sr
    return seconds
def extract_mfcc_features(y, sr, n_mfcc=20):
    """Compute a mean MFCC vector for an audio signal.

    The MFCC matrix from ``librosa.feature.mfcc`` is transposed and averaged
    over axis 0 (presumably the frame axis), producing one value per
    coefficient. If anything in the extraction raises, the error is printed
    and a zero vector of length ``n_mfcc`` is returned instead.
    """
    try:
        coefficients = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        averaged = np.mean(coefficients.T, axis=0)
    except Exception as err:
        print(f"Error extracting MFCCs: {str(err)}")
        # Best-effort fallback so callers always receive a feature vector.
        return np.zeros(n_mfcc)
    return averaged
def calculate_lyrics_length(duration, words_per_minute=90, words_per_line=9):
    """Estimate how many lyric lines fit an audio clip of a given duration.

    The word budget is ``duration`` (seconds) converted to minutes times
    ``words_per_minute``, truncated to an integer. Dividing by
    ``words_per_line`` gives a raw line count, which is then snapped to a
    simplified song structure:

    - fewer than 6 raw lines: the raw count, but never less than 2
    - 6 to 9 raw lines: 6 lines (one verse and chorus)
    - 10 to 14 raw lines: 10 lines (two verses and chorus)
    - 15 or more raw lines: 15 lines (two verses, chorus and bridge)

    Args:
        duration: Audio duration in seconds.
        words_per_minute: Assumed singing rate. Defaults to 90.
        words_per_line: Assumed average words per lyric line. Defaults to 9.

    Returns:
        The number of lyric lines to generate, an int between 2 and 15.

    Raises:
        ValueError: If ``words_per_line`` is not positive.
    """
    if words_per_line <= 0:
        raise ValueError("words_per_line must be positive")

    # Convert duration to minutes, then to a whole-word budget.
    duration_minutes = duration / 60
    total_words = int(duration_minutes * words_per_minute)
    total_lines = total_words // words_per_line

    # Each tier returns its lower bound; within a tier the raw count is
    # always at least that bound, so no extra clamping is needed.
    if total_lines < 6:
        # Very short song - keep it simple
        return max(2, total_lines)
    if total_lines < 10:
        # Short song - one verse and chorus
        return 6
    if total_lines < 15:
        # Medium song - two verses and chorus
        return 10
    # Longer song - two verses, chorus, and bridge
    return 15
def format_genre_results(top_genres):
    """Render genre predictions as a multi-line, human-readable string.

    ``top_genres`` is an iterable of ``(genre, confidence)`` pairs. Each
    confidence is multiplied by 100 and shown with two decimals. The output
    starts with a header line and every line ends with a newline.
    """
    header = "Top Detected Genres:\n"
    rows = [
        f"- {genre}: {confidence*100:.2f}%\n"
        for genre, confidence in top_genres
    ]
    return header + "".join(rows)
def ensure_cuda_availability():
    """Print whether CUDA can be used and return that as a boolean.

    When CUDA is available, the device count and the name of device 0 are
    printed ("Unknown" if the count is zero). Otherwise a CPU notice is
    printed. The function only reports; it does not select a device.
    """
    has_cuda = torch.cuda.is_available()
    if not has_cuda:
        print("CUDA is not available. Using CPU for inference.")
        return has_cuda

    n_devices = torch.cuda.device_count()
    if n_devices > 0:
        primary_name = torch.cuda.get_device_name(0)
    else:
        primary_name = "Unknown"
    print(f"CUDA is available with {n_devices} device(s). Using: {primary_name}")
    return has_cuda
def preprocess_audio_for_model(waveform, sample_rate, target_sample_rate=16000, max_length=16000):
    """Bring a waveform to the sample rate and length a model expects.

    The waveform is resampled with ``librosa.resample`` when ``sample_rate``
    differs from ``target_sample_rate``. It is then cut to the first
    ``max_length`` samples, or zero-padded at the end up to ``max_length``.
    A waveform that already has the right length is returned as is.
    """
    needs_resample = sample_rate != target_sample_rate
    if needs_resample:
        waveform = librosa.resample(
            waveform, orig_sr=sample_rate, target_sr=target_sample_rate
        )

    # Positive: samples missing; negative: samples in excess.
    shortfall = max_length - len(waveform)
    if shortfall < 0:
        waveform = waveform[:max_length]
    elif shortfall > 0:
        waveform = np.pad(waveform, (0, shortfall), 'constant')
    return waveform