import numpy as np
import librosa
import textgrids
import os
import python_speech_features
from tqdm import tqdm
# Function for reading labels from a .TextGrid file:
def readLabels(path, sample_rate):
    labeled_list = []
    grid = textgrids.TextGrid(path)
    for interval in grid['silences']:
        # "-" and " " mark silence intervals; everything else is speech
        if interval.text == "-" or interval.text == " ":
            label = 0
        else:
            label = 1
        dur = interval.dur
        dur_samples = int(np.round(dur * sample_rate))  # sec -> num of samples
        labeled_list.extend([label] * dur_samples)
    return np.array(labeled_list)
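
# Usage sketch for readLabels (the path is a placeholder; the function expects
# a TextGrid containing a tier named 'silences'):
#
#     labels = readLabels("corpus/utt_001.TextGrid", sample_rate=22050)
#     # one 0/1 label per audio sample, e.g. labels.shape == (66150,) for 3 s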
def load_files(audio_path, audio_extension=".wav"):
    """
    Recursively loads audio files from a specified directory.

    Args:
        audio_path (str): The root directory to search for audio files.
        audio_extension (str, optional): The audio file extension to filter
            for (default is ".wav").

    Returns:
        list: A sorted list of full paths to the found audio files.

    Raises:
        FileNotFoundError: If the specified audio_path does not exist.
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio path '{audio_path}' not found.")
    audio_files = []
    for root, _, files in os.walk(audio_path):
        for file in files:
            if file.endswith(audio_extension):
                audio_files.append(os.path.join(root, file))
    return sorted(audio_files)
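
# Usage sketch for load_files ("corpus/" is a placeholder directory):
#
#     wav_files = load_files("corpus/", ".wav")
#     grid_files = load_files("corpus/", ".TextGrid")
#     # sorted() keeps the two lists aligned when audio and annotation files
#     # share the same base names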
def max_signal_length(audio_files):
    """
    Determines the maximum signal length among a list of audio files.

    Args:
        audio_files (list): A list of paths to audio files.

    Returns:
        int: The maximum signal length found among the audio files.

    Raises:
        ValueError: If the input list is empty.
        IOError: If any audio file cannot be loaded.
    """
    if not audio_files:
        raise ValueError("Audio file list cannot be empty")
    max_length = 0
    for audio_file in audio_files:
        try:
            # librosa.load resamples to 22050 Hz by default
            signal, _ = librosa.load(audio_file)
            max_length = max(max_length, len(signal))
        except Exception as e:  # Catch potential loading errors
            raise IOError(f"Error loading audio file '{audio_file}': {e}")
    return max_length
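
# Usage sketch (wav_files as in the sketch above):
#
#     max_len = max_signal_length(wav_files)
#     # at librosa's default 22050 Hz, a 10-second file gives max_len == 220500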
def object_padding(array, length):
    """
    Pad or truncate a time series to the given length.

    Args:
        array (np.array): Time series object (audio samples or labels).
        length (int): Desired length to pad/truncate the series to.

    Returns:
        np.array: Processed time series of exactly `length` elements.
    """
    if len(array) < length:
        padding_length = length - len(array)
        # Pad with zeros at the end
        array = np.pad(array, (0, padding_length), mode="constant")
    else:
        # Truncate to the desired length
        array = array[:length]
    return np.array(array)
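
# Quick illustration of object_padding's behavior:
#
#     object_padding(np.array([1, 2, 3]), 5)   # -> array([1, 2, 3, 0, 0])
#     object_padding(np.array([1, 2, 3]), 2)   # -> array([1, 2])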
def fbank_features_extraction(audio_files, max_length, preemphasis_coef=0.97,
                              window_length=0.025, window_step=0.01,
                              window_function=np.hamming, num_nfft=551,
                              num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features from a list of audio files.

    Args:
        audio_files (list): List of paths to audio files.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        np.ndarray: 3D array of shape (num_files, num_frames, num_features + 1),
            where the extra column is the log energy feature.
    """
    # Filter Bank
    fbank_features = list()
    for i in tqdm(range(len(audio_files))):
        # Load the signal and sample rate
        signal, sample_rate = librosa.load(audio_files[i])
        # Pad/truncate so every file yields the same number of frames
        signal = object_padding(signal, max_length)
        # Extract features:
        # features_fbank: Mel filterbank energies, capturing the spectral
        # content of the signal on a perceptual (Mel) frequency scale
        # feature_energy: the total energy of each analysis frame
        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)
        # Log fbank and log energy (fbank() floors zeros at eps, so log is safe)
        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)
        # Merge log energy (first column) with the log filterbank features
        features = np.hstack((feature_logenergy.reshape(-1, 1), features_logfbank))
        # Store the fbank features for this audio file
        fbank_features.append(features)
    # Return the features in numpy array format
    return np.array(fbank_features)
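
# Shape sketch for fbank_features_extraction (wav_files and max_len are the
# hypothetical values from the sketches above):
#
#     feats = fbank_features_extraction(wav_files, max_len)
#     # feats.shape == (num_files, num_frames, 41): 40 log Mel filterbank
#     # values plus one log-energy column per 25 ms frame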
def supervised_features_extraction(audio_files, annotation_files, max_length,
                                   preemphasis_coef=0.97, window_length=0.025,
                                   window_step=0.01, window_function=np.hamming,
                                   num_nfft=551, num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features and frame-level speech/silence
    labels from lists of audio and annotation files.

    Args:
        audio_files (list): List of paths to audio files.
        annotation_files (list): List of paths to annotation (.TextGrid) files.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        tuple: A pair (features, labels) where features is a 3D array of shape
            (num_files, num_frames, num_features + 1), the extra column being
            the log energy feature, and labels is a 2D array of shape
            (num_files, num_frames) holding one 0/1 speech label per frame.
    """
    # Filter Bank
    fbank_features = list()
    labels = list()
    for i in tqdm(range(len(audio_files))):
        # Load the signal and sample rate
        signal, sample_rate = librosa.load(audio_files[i])
        signal = object_padding(signal, max_length)
        # Sample-level 0/1 labels, padded to the same length as the signal
        truth_labels = readLabels(path=annotation_files[i], sample_rate=sample_rate)
        truth_labels = object_padding(truth_labels, max_length)
        # Extract features:
        # features_fbank: Mel filterbank energies, capturing the spectral
        # content of the signal on a perceptual (Mel) frequency scale
        # feature_energy: the total energy of each analysis frame
        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)
        # Log fbank and log energy (fbank() floors zeros at eps, so log is safe)
        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)
        # Merge log energy (first column) with the log filterbank features
        features = np.hstack((feature_logenergy.reshape(-1, 1), features_logfbank))
        # Frame the sample-level labels with the same window/step as the features
        temp_label = python_speech_features.sigproc.framesig(sig=truth_labels,
                                                             frame_len=window_length * sample_rate,
                                                             frame_step=window_step * sample_rate,
                                                             winfunc=np.ones)
        # Majority vote per frame: a frame is speech (1) if more than half of
        # its samples are labeled speech. Note the threshold is half the frame
        # length (shape[1]), not half the number of frames (shape[0]).
        label = (temp_label.sum(axis=1) > temp_label.shape[1] / 2).astype(int)
        # Store the fbank features and label vector for this audio file
        fbank_features.append(features)
        labels.append(label)
    # Return the features and labels in numpy array format
    return np.array(fbank_features), np.array(labels)
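
if __name__ == "__main__":
    # Minimal end-to-end sketch. The "corpus/" directory and the pairing of
    # .wav files with .TextGrid files of the same base names are assumptions;
    # adjust the paths to your own dataset layout.
    wav_files = load_files("corpus/", ".wav")
    grid_files = load_files("corpus/", ".TextGrid")
    max_len = max_signal_length(wav_files)
    features, labels = supervised_features_extraction(wav_files, grid_files, max_len)
    print(f"features: {features.shape}, labels: {labels.shape}")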