Spaces:
Sleeping
Sleeping
import numpy as np | |
import librosa | |
import torch | |
import torch.nn as nn | |
# import pywt | |
from scipy import signal | |
def compute_cwt_power_spectrum(audio, sample_rate, num_freqs=128, f_min=20, f_max=None, w=5.0):
    """
    Compute the power spectrum of a continuous wavelet transform (Morlet wavelet).

    The transform is evaluated directly (complex Morlet kernel + convolution)
    rather than through ``scipy.signal.cwt`` / ``scipy.signal.morlet2``, which
    were deprecated in SciPy 1.12 and removed in SciPy 1.15. The kernel and the
    convolution follow the same formulas those functions used.

    Parameters:
        audio: torch.Tensor
            Mono input signal. Singleton dimensions (e.g. shape ``(1, n)``)
            are squeezed away before the transform.
        sample_rate: int
            Sampling rate of the audio in Hz.
        num_freqs: int
            Number of logarithmically spaced frequency bins.
        f_min: float
            Lowest analysed frequency in Hz (must be positive).
        f_max: float or None
            Highest analysed frequency in Hz (defaults to ``sample_rate / 2``).
        w: float
            Morlet centre-frequency parameter (number of oscillations under
            the Gaussian envelope). Defaults to 5.0.

    Returns:
        torch.Tensor: float32 tensor of shape ``(num_freqs, n_samples)``
        holding ``|CWT|**2``; row ``i`` belongs to the ``i``-th frequency of
        the logarithmic grid from ``f_min`` to ``f_max``.

    Raises:
        ValueError: if ``f_min``/``f_max`` are not positive or the audio is
            not one-dimensional after squeezing.
    """
    if f_max is None:
        f_max = sample_rate / 2
    if f_min <= 0 or f_max <= 0:
        raise ValueError("f_min and f_max must be positive frequencies")

    samples = np.atleast_1d(np.squeeze(audio.detach().cpu().numpy())).astype(np.float64)
    if samples.ndim != 1:
        raise ValueError(
            f"expected a mono (1-D) signal, got an array of shape {samples.shape}"
        )

    # Logarithmically spaced analysis frequencies.
    frequencies = np.logspace(np.log10(f_min), np.log10(f_max), num=num_freqs)

    # A Morlet wavelet of scale s is centred on f = w * fs / (2 * pi * s), so
    # the scale for a target frequency must include the factor w. Leaving it
    # out makes every row analyse w times its nominal frequency.
    widths = w * sample_rate / (2 * np.pi * frequencies)

    n_samples = samples.shape[0]
    power = np.zeros((len(widths), n_samples), dtype=np.float64)
    if n_samples == 0:
        return torch.from_numpy(power).to(torch.float32)

    for row, width in enumerate(widths):
        # Truncate the wavelet to ten scale widths, never longer than the signal.
        support = min(10 * width, n_samples)
        x = (np.arange(0, support) - (support - 1.0) / 2) / width
        wavelet = (
            np.exp(1j * w * x) * np.exp(-0.5 * x ** 2) * np.pi ** (-0.25)
        ) / np.sqrt(width)
        kernel = np.conj(wavelet[::-1])
        coefficients = signal.convolve(samples, kernel, mode="same")
        # Power = squared magnitude of the complex coefficients.
        power[row] = np.abs(coefficients) ** 2

    return torch.from_numpy(power).to(torch.float32)
# def compute_wavelet_transform(audio, wavelet, decompos_level): | |
# """Compute wavelet decomposition of the audio signal.""" | |
# # Convert to numpy and ensure 1D | |
# audio_np = audio.cpu().numpy() | |
# | |
# # Perform wavelet decomposition | |
# coeffs = pywt.wavedec(audio_np, wavelet, level=decompos_level) | |
# | |
# # Stack coefficients into a 2D array | |
# # First, pad all coefficient arrays to the same length | |
# max_len = max(len(c) for c in coeffs) | |
# padded_coeffs = [] | |
# for coeff in coeffs: | |
# pad_len = max_len - len(coeff) | |
# if pad_len > 0: | |
# padded_coeff = np.pad(coeff, (0, pad_len), mode='constant') | |
# else: | |
# padded_coeff = coeff | |
# padded_coeffs.append(padded_coeff) | |
# | |
# # Stack into 2D array where each row is a different scale | |
# wavelet_features = np.stack(padded_coeffs) | |
# | |
# # Convert to tensor | |
# return torch.FloatTensor(wavelet_features) | |
def compute_melspectrogram(audio, sample_rate, n_mels=128):
    """
    Compute a mel spectrogram of the audio, expressed in decibels.

    Parameters:
        audio: torch.Tensor
            Input audio signal (detached and moved to the CPU before analysis).
        sample_rate: int
            Sampling rate of the audio.
        n_mels: int
            Number of mel bands (defaults to 128, the previously fixed value).

    Returns:
        torch.Tensor: float32 tensor holding the output of
        ``librosa.power_to_db`` applied to ``librosa.feature.melspectrogram``.
    """
    # detach() so tensors that require grad can be converted to NumPy.
    waveform = audio.detach().cpu().numpy()
    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=n_mels,
    )
    return torch.tensor(librosa.power_to_db(mel_power), dtype=torch.float32)
def compute_mfcc(audio, sample_rate, n_mfcc=20):
    """
    Compute Mel-frequency cepstral coefficients of the audio.

    Parameters:
        audio: torch.Tensor
            Input audio signal (detached and moved to the CPU before analysis).
        sample_rate: int
            Sampling rate of the audio.
        n_mfcc: int
            Number of coefficients to compute (defaults to 20, the previously
            fixed value).

    Returns:
        torch.Tensor: float32 tensor holding the output of
        ``librosa.feature.mfcc``.
    """
    # detach() so tensors that require grad can be converted to NumPy.
    waveform = audio.detach().cpu().numpy()
    coefficients = librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=n_mfcc,
    )
    return torch.tensor(coefficients, dtype=torch.float32)
def compute_chroma(audio, sample_rate):
    """
    Compute an STFT-based chromagram of the audio.

    Parameters:
        audio: torch.Tensor
            Input audio signal (moved to the CPU before analysis).
        sample_rate: int
            Sampling rate of the audio.

    Returns:
        torch.Tensor: float tensor holding the output of
        ``librosa.feature.chroma_stft`` with its default settings.
    """
    waveform = audio.cpu().numpy()
    chromagram = librosa.feature.chroma_stft(y=waveform, sr=sample_rate)
    return torch.FloatTensor(chromagram)
def compute_time_domain_features(audio, sample_rate, frame_length=2048, hop_length=128):
    """
    Compute time-domain summary features from an audio signal.

    Parameters:
        audio: torch.Tensor
            Input audio signal (detached and moved to the CPU before analysis).
        sample_rate: int
            Sampling rate of the audio.
        frame_length: int
            Frame size used for framing, zero-crossing rate, RMS and the STFT.
        hop_length: int
            Hop between consecutive frames.

    Returns:
        dict: maps feature name to a float32 tensor, in this order:
            'zcr'        - zero-crossing rate summed over all frames
            'rms_energy' - mean RMS energy over frames
            'mean'       - mean of the per-frame sample means
            'std'        - mean of the per-frame standard deviations
            'max'        - mean of the per-frame maxima
            'tempo'      - tempo estimate derived from the onset strength
            'envelope'   - mean STFT magnitude
    """
    # detach() so tensors that require grad can be converted to NumPy.
    waveform = audio.detach().cpu().numpy()

    def as_feature(value):
        # Every scalar feature is stored as a 1-element float32 tensor.
        return torch.tensor([float(value)], dtype=torch.float32)

    features = {}

    # 1. Zero Crossing Rate (summed, not averaged, over frames)
    zcr = librosa.feature.zero_crossing_rate(
        y=waveform,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    features['zcr'] = as_feature(zcr.sum())

    # 2. Root Mean Square Energy
    rms = librosa.feature.rms(
        y=waveform,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    features['rms_energy'] = as_feature(rms.mean())

    # 3. Temporal Statistics: reduce within each frame, then average over frames
    frames = librosa.util.frame(
        waveform, frame_length=frame_length, hop_length=hop_length
    )
    features['mean'] = as_feature(np.mean(frames, axis=0).mean())
    features['std'] = as_feature(np.std(frames, axis=0).mean())
    features['max'] = as_feature(np.max(frames, axis=0).mean())

    # 4. Tempo and Beat Features
    onset_env = librosa.onset.onset_strength(y=waveform, sr=sample_rate)
    # librosa 0.10 moved the estimator to librosa.feature.tempo and later
    # releases dropped librosa.beat.tempo; prefer the new location and fall
    # back to the old one on older installations.
    estimate_tempo = getattr(librosa.feature, "tempo", None) or librosa.beat.tempo
    tempo = estimate_tempo(onset_envelope=onset_env, sr=sample_rate)
    features['tempo'] = torch.as_tensor(np.atleast_1d(tempo), dtype=torch.float32)

    # 5. Amplitude Envelope (approximated by the mean STFT magnitude)
    envelope = np.abs(
        librosa.stft(waveform, n_fft=frame_length, hop_length=hop_length)
    )
    features['envelope'] = as_feature(np.mean(envelope, axis=0).mean())

    return features
def _feature_or_nan(name, compute):
    """Evaluate ``compute()`` and wrap the scalar in a 1-element float32 tensor (NaN on failure)."""
    import logging

    try:
        value = float(compute())
    except Exception as exc:  # best-effort: one failing feature must not lose the others
        logging.getLogger(__name__).warning("Could not compute %s: %s", name, exc)
        value = float("nan")
    return torch.tensor([value], dtype=torch.float32)


def _spectral_shape(magnitude, sample_rate, n_fft):
    """Return the frame-averaged spectral skewness, kurtosis and slope of a magnitude spectrogram."""
    mag = np.asarray(magnitude, dtype=np.float64)
    if mag.ndim != 2:
        raise ValueError(
            f"expected a (freq, time) magnitude spectrogram, got shape {mag.shape}"
        )

    # Centre frequency (Hz) of every STFT bin, as a column for broadcasting.
    freqs = np.fft.rfftfreq(n_fft, d=1.0 / sample_rate)[:, np.newaxis]

    # Slope: least-squares slope of magnitude against frequency, per frame.
    centred_freqs = freqs - freqs.mean()
    slope = (centred_freqs * (mag - mag.mean(axis=0))).sum(axis=0) / (
        centred_freqs ** 2
    ).sum()

    # Skewness / kurtosis: standardised 3rd and 4th moments of frequency, with
    # each frame's normalised magnitude used as the weighting distribution.
    # Silent frames (no energy) and frames with zero spread have no defined
    # moments and are left out of the average.
    energy = mag.sum(axis=0)
    active = mag[:, energy > 0]
    weights = active / active.sum(axis=0)
    offset = freqs - (freqs * weights).sum(axis=0)
    spread = np.sqrt((weights * offset ** 2).sum(axis=0))
    usable = spread > 0
    skewness = (weights * offset ** 3).sum(axis=0)[usable] / spread[usable] ** 3
    kurtosis = (weights * offset ** 4).sum(axis=0)[usable] / spread[usable] ** 4

    def mean_or_nan(values):
        return float(values.mean()) if values.size else float("nan")

    return {
        'skewness': mean_or_nan(skewness),
        'kurtosis': mean_or_nan(kurtosis),  # non-excess: a Gaussian shape gives 3
        'slope': mean_or_nan(slope),
    }


def compute_frequency_domain_features(audio, sample_rate, n_fft=2048, hop_length=512):
    """
    Compute frequency-domain summary features from an audio signal.

    Each feature is computed independently: if one of them fails, the failure
    is logged and that entry is NaN, so the remaining features are still
    returned.

    Spectral kurtosis, skewness and slope are derived from the STFT magnitude
    with NumPy. librosa provides no ``spectral_kurtosis`` / ``spectral_skewness``
    / ``spectral_slope`` functions, so calling them (as this function used to)
    always failed and left these three entries NaN.

    Parameters:
        audio: torch.Tensor
            Input audio signal (detached and moved to the CPU before analysis).
            The STFT-derived features assume a mono signal.
        sample_rate: int
            Sampling rate of the audio.
        n_fft: int
            FFT size.
        hop_length: int
            Hop between consecutive analysis frames.

    Returns:
        dict: maps feature name to a 1-element float32 tensor, in this order:
        'spectral_centroid', 'spectral_rolloff', 'spectral_bandwidth',
        'spectral_contrast', 'spectral_flatness', 'spectral_flux',
        'mfcc_mean', 'chroma_mean', 'spectral_kurtosis', 'spectral_skewness',
        'spectral_slope', 'tonnetz_mean'.
    """
    # detach() so tensors that require grad can be converted to NumPy.
    waveform = audio.detach().cpu().numpy()
    shared = dict(y=waveform, sr=sample_rate, n_fft=n_fft, hop_length=hop_length)

    # The STFT magnitude and the shape statistics are computed lazily and
    # cached, so a failure surfaces inside the per-feature error handling.
    cache = {}

    def magnitude():
        if 'magnitude' not in cache:
            cache['magnitude'] = np.abs(
                librosa.stft(waveform, n_fft=n_fft, hop_length=hop_length)
            )
        return cache['magnitude']

    def shape_stat(key):
        if 'shape' not in cache:
            cache['shape'] = _spectral_shape(magnitude(), sample_rate, n_fft)
        return cache['shape'][key]

    def spectral_flux():
        # Frame-to-frame magnitude change; the first frame is padded with zeros.
        flux = np.diff(magnitude(), axis=1)
        flux = np.pad(flux, ((0, 0), (1, 0)), mode='constant')
        return np.std(flux)

    # (name, zero-argument callable) pairs; order defines the dict key order.
    extractors = [
        ('spectral_centroid',
         lambda: librosa.feature.spectral_centroid(**shared).max()),
        ('spectral_rolloff',
         lambda: librosa.feature.spectral_rolloff(**shared).max()),
        ('spectral_bandwidth',
         lambda: librosa.feature.spectral_bandwidth(**shared).max()),
        ('spectral_contrast',
         lambda: librosa.feature.spectral_contrast(
             fmin=20,        # lower minimum frequency
             n_bands=4,      # reduced number of bands
             quantile=0.02,
             **shared,
         ).mean()),
        ('spectral_flatness',
         lambda: librosa.feature.spectral_flatness(
             y=waveform, n_fft=n_fft, hop_length=hop_length
         ).max()),
        ('spectral_flux', spectral_flux),
        ('mfcc_mean',
         lambda: librosa.feature.mfcc(n_mfcc=13, **shared).mean()),
        ('chroma_mean',
         lambda: librosa.feature.chroma_stft(**shared).mean()),
        ('spectral_kurtosis', lambda: shape_stat('kurtosis')),
        ('spectral_skewness', lambda: shape_stat('skewness')),
        ('spectral_slope', lambda: shape_stat('slope')),
        ('tonnetz_mean',
         lambda: librosa.feature.tonnetz(y=waveform, sr=sample_rate).mean()),
    ]

    return {name: _feature_or_nan(name, compute) for name, compute in extractors}
def compute_all_features(audio, sample_rate, wavelet='db1', decompos_level=4):
    """
    Return the feature dictionary for an audio signal.

    Only the frequency-domain features are produced at present: the result is
    exactly what ``compute_frequency_domain_features(audio, sample_rate)``
    returns. The other extractors in this module (mel spectrogram, MFCC,
    chroma, CWT power spectrum, time-domain features) are not invoked here.

    Parameters:
        audio: torch.Tensor
            Input audio signal.
        sample_rate: int
            Sampling rate of the audio.
        wavelet: str
            Currently unused; kept so existing callers keep working.
        decompos_level: int
            Currently unused; kept so existing callers keep working.

    Returns:
        dict: feature name -> tensor, as returned by
        ``compute_frequency_domain_features``.
    """
    return compute_frequency_domain_features(audio, sample_rate)