import numpy as np
import librosa
import textgrids
import os
import python_speech_features
from tqdm import tqdm

# Read per-sample speech/silence labels from a .TextGrid annotation file.
def readLabels(path, sample_rate):
    labeled_list = []
    grid = textgrids.TextGrid(path)

    # Intervals marked "-" or " " in the 'silences' tier are silence (0);
    # everything else is speech (1).
    for interval in grid['silences']:
        label = 0 if interval.text in ("-", " ") else 1

        dur_samples = int(np.round(interval.dur * sample_rate))  # sec -> num of samples
        labeled_list.extend([label] * dur_samples)

    return np.array(labeled_list)
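
# Example (illustrative): for a 'silences' tier containing
#   [0.00, 1.20] "-"        -> silence
#   [1.20, 3.45] "speech"   -> speech
# readLabels returns int(1.20 * sr) zeros followed by a run of ones
# covering the remaining 2.25 seconds.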

def load_files(audio_path, audio_extension=".wav"):
    """
    Recursively loads audio files from a specified directory.

    Args:
        audio_path (str): The root directory to search for audio files.
        audio_extension (str, optional): The audio file extension to filter
                                         for (default: ".wav").

    Returns:
        list: A sorted list of full paths to the found audio files.

    Raises:
        FileNotFoundError: If the specified audio_path does not exist.
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio path '{audio_path}' not found.")

    audio_files = []
    for root, _, files in os.walk(audio_path):
        for file in files:
            if file.endswith(audio_extension):
                audio_files.append(os.path.join(root, file))

    return sorted(audio_files)

def max_signal_length(audio_files):
    """
    Determines the maximum signal length (in samples) among a list of audio files.

    Args:
        audio_files (list): A list of paths to audio files.

    Returns:
        int: The maximum signal length found among the audio files.

    Raises:
        ValueError: If the input list is empty.
        IOError: If any audio file cannot be loaded.
    """
    if not audio_files:
        raise ValueError("Audio file list cannot be empty")

    max_length = 0
    for audio_file in audio_files:
        try:
            signal, _ = librosa.load(audio_file)
            max_length = max(max_length, len(signal))
        except Exception as e:  # Catch potential loading errors
            raise IOError(f"Error loading audio file '{audio_file}': {e}")

    return max_length

def object_padding(array, length):
    """
    Pad or truncate a time series object to the given length.

    Args:
        array (np.ndarray): Time series object.
        length (int): Desired length to pad/truncate the object to.

    Returns:
        np.ndarray: The processed time series object.
    """
    if len(array) < length:
        # Pad with zeros at the end
        padding_length = length - len(array)
        array = np.pad(array, (0, padding_length), mode="constant")
    else:
        # Truncate to the desired length
        array = array[:length]
    return np.array(array)
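
# Example (illustrative):
#   object_padding(np.array([1, 2, 3]), 5) -> array([1, 2, 3, 0, 0])
#   object_padding(np.array([1, 2, 3]), 2) -> array([1, 2])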

def fbank_features_extraction(audio_files, max_length, preemphasis_coef=0.97, window_length=0.025, window_step=0.01, window_function=np.hamming, num_nfft=551, num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features from a list of audio files.

    Args:
        audio_files (list): List of paths to audio files.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        np.ndarray: 3D array of shape (num_files, num_frames, num_features + 1),
                    where the extra feature is the per-frame log energy.
    """
    fbank_features = []
    for audio_file in tqdm(audio_files):
        # Load the signal and sample rate
        signal, sample_rate = librosa.load(audio_file)
        # Pad/truncate the audio so every file yields the same number of frames
        signal = object_padding(signal, max_length)
        # Extract features:
        # features_fbank (Mel filterbank energies): capture the spectral content of the
        #   signal on a perceptually motivated (Mel) frequency scale
        # feature_energy: the total energy of each analysis frame
        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)
        # Log fbank and log energy (fbank() clamps zero energies to eps, so log is safe)
        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)
        # Prepend the log energy column to the log filterbank features
        features = np.hstack((feature_logenergy.reshape(-1, 1), features_logfbank))
        # Store the fbank features for this audio file
        fbank_features.append(features)
    # Return the features as a numpy array
    return np.array(fbank_features)

def supervised_features_extraction(audio_files, annotation_files, max_length, preemphasis_coef=0.97, window_length=0.025, window_step=0.01, window_function=np.hamming, num_nfft=551, num_features=40):
    """
    Extracts log Mel-filterbank (fbank) features and frame-level labels from
    a list of audio files and their annotation files.

    Args:
        audio_files (list): List of paths to audio files.
        annotation_files (list): List of paths to annotation (.TextGrid) files.
        max_length (int): Desired length to pad/truncate signals to.
        preemphasis_coef (float): Pre-emphasis filter coefficient (default: 0.97).
        window_length (float): Length of the analysis window in seconds (default: 0.025).
        window_step (float): Step between successive windows in seconds (default: 0.01).
        window_function (callable): Window function to apply (default: np.hamming).
        num_nfft (int): Number of FFT points (default: 551).
        num_features (int): Number of Mel filters (default: 40).

    Returns:
        tuple: A pair (features, labels) where features is a 3D array of shape
               (num_files, num_frames, num_features + 1) with the extra feature
               being the per-frame log energy, and labels is a 2D array of shape
               (num_files, num_frames) with one speech/silence label per frame.
    """
    fbank_features = []
    labels = []
    for i in tqdm(range(len(audio_files))):
        # Load the signal and sample rate
        signal, sample_rate = librosa.load(audio_files[i])
        # Pad/truncate the audio and its per-sample labels to the same length
        signal = object_padding(signal, max_length)
        truth_labels = readLabels(path=annotation_files[i], sample_rate=sample_rate)
        truth_labels = object_padding(truth_labels, max_length)
        # Extract features:
        # features_fbank (Mel filterbank energies): capture the spectral content of the
        #   signal on a perceptually motivated (Mel) frequency scale
        # feature_energy: the total energy of each analysis frame
        features_fbank, feature_energy = python_speech_features.base.fbank(signal=signal,
                                                                           samplerate=sample_rate,
                                                                           winlen=window_length,
                                                                           winstep=window_step,
                                                                           nfilt=num_features,
                                                                           nfft=num_nfft,
                                                                           lowfreq=0,
                                                                           highfreq=None,
                                                                           preemph=preemphasis_coef,
                                                                           winfunc=window_function)
        # Log fbank and log energy (fbank() clamps zero energies to eps, so log is safe)
        features_logfbank = np.log(features_fbank)
        feature_logenergy = np.log(feature_energy)
        # Prepend the log energy column to the log filterbank features
        features = np.hstack((feature_logenergy.reshape(-1, 1), features_logfbank))

        # Frame the per-sample labels with the same windowing as the features,
        # so each feature frame gets exactly one label:
        temp_label = python_speech_features.sigproc.framesig(sig=truth_labels,
                                                             frame_len=window_length * sample_rate,
                                                             frame_step=window_step * sample_rate,
                                                             winfunc=np.ones)
        # Majority vote within each frame: label 1 (speech) if more than half of the
        # frame's samples are speech, else 0 (silence). The vote compares against half
        # the frame length (temp_label.shape[1]), not the number of frames.
        label = np.array([1 if np.sum(temp_label[j]) > temp_label.shape[1] / 2 else 0
                          for j in range(temp_label.shape[0])])
        # Store the fbank features and labels for this audio file
        fbank_features.append(features)
        labels.append(label)
    # Return the features and labels as numpy arrays
    return np.array(fbank_features), np.array(labels)
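
# Minimal usage sketch (illustrative; the directory names "data/audio" and
# "data/annotations" are placeholders, and the .wav and .TextGrid files are
# assumed to pair up one-to-one when sorted):
if __name__ == "__main__":
    audio_files = load_files("data/audio", ".wav")
    annotation_files = load_files("data/annotations", ".TextGrid")

    # Pad every signal to the longest one so all files yield the same frame count
    max_length = max_signal_length(audio_files)

    # Unsupervised: features only
    features = fbank_features_extraction(audio_files, max_length)
    print(features.shape)  # (num_files, num_frames, num_features + 1)

    # Supervised: features plus one speech/silence label per frame
    features, labels = supervised_features_extraction(audio_files, annotation_files, max_length)
    print(features.shape, labels.shape)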