File size: 6,277 Bytes
eafbf97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
"""Audio utils"""
import librosa
import numpy as np
import matplotlib.pyplot as plt


def load_audio(audio_path: str, sr: int = None, max_duration: float = 10., start: float = 0, stop: float = None):
    """Load an audio file, optionally crop it, and pad/trim it to a fixed length.

    Args:
        audio_path: Path of the audio file; handed to ``librosa.load``.
        sr: Target sampling rate forwarded to ``librosa.load``.
        max_duration: Length of the returned clip in seconds.
        start: Offset in seconds at which the crop begins (``None`` or 0
            means the beginning of the file).
        stop: Offset in seconds at which the crop ends; ``None`` keeps
            everything up to the end of the file.

    Returns:
        Tuple ``(data, sr)`` where ``data`` holds exactly
        ``int(max_duration * sr)`` samples (zero-padded at the end or
        truncated) and ``sr`` is the sampling rate returned by librosa.
    """
    data, sr = librosa.load(audio_path, sr=sr)

    # Defensive down-mix. It runs before cropping so the slice below always
    # acts on the sample axis. librosa lays multi-channel audio out as
    # (channels, samples), hence the average over axis 0.
    if data.ndim > 1:
        data = np.mean(data, axis=0)

    # Crop. `start` is honoured even when `stop` is not given.
    start_idx = 0 if start is None else int(start * sr)
    stop_idx = None if stop is None else int(stop * sr)
    data = data[start_idx:stop_idx]

    # Force the clip to the requested number of samples
    n_frames = int(max_duration * sr)
    if len(data) > n_frames:
        data = data[:n_frames]
    elif len(data) < n_frames:
        data = np.pad(data, (0, n_frames - len(data)), "constant")
    return data, sr
    

# def compute_spectrogram(data: np.ndarray, sr: int):
#     D = librosa.stft(data)  # STFT of y
#     S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)
#     return S_db


def compute_spec_freq_mean(S_db: np.ndarray, eps=1e-5):
    """Collapse a 2-D spectrogram into one value per column.

    Every row is standardised with its own mean and standard deviation
    (both taken along axis 1, with `eps` added to the std to avoid a
    division by zero). The standardised rows are then added up along axis 0.
    Despite the function name the result is a sum, not a mean.

    Presumably rows are frequency bins and columns are time frames, as
    produced by an STFT -- the code itself only fixes the axes.

    Args:
        S_db: Spectrogram array.
        eps: Small constant added to each row's standard deviation.

    Returns:
        Array with the row axis summed out.
    """
    row_mean = S_db.mean(axis=1, keepdims=True)
    row_std = S_db.std(axis=1, keepdims=True)
    standardized = (S_db - row_mean) / (row_std + eps)
    return standardized.sum(axis=0)


def process_audiofile(
    audio_path,
    functions=("load_audio", "compute_spectrogram", "compute_spec_freq_mean"),
    n_fft=512,
    hop_length=256,
    margin=1.0,
):
    """Run a named sequence of processing steps on an audio file.

    The file is always loaded first with `load_audio`; the remaining steps
    are applied in the order given, each one consuming the previous output.

    Args:
        audio_path: Path of the audio file.
        functions: Iterable of step names. Supported names are
            ``"load_audio"`` (accepted but a no-op, since loading always
            happens), ``"compute_spectrogram"`` and
            ``"compute_spec_freq_mean"``.
        n_fft: FFT size passed to `compute_spectrogram`.
        hop_length: Hop length passed to `compute_spectrogram`.
        margin: HPSS margin passed to `compute_spectrogram`.
            NOTE(review): the three spectrogram defaults were chosen here
            (512/256 as in `show_spectrogram`, 1.0 being librosa's HPSS
            default) -- confirm they match the intended pipeline.

    Returns:
        The output of the last step that was applied.

    Raises:
        ValueError: If `functions` contains an unknown step name.
    """
    data, _sr = load_audio(audio_path)
    for function in functions:
        if function == "load_audio":
            # Already done above; kept so existing step lists stay valid
            continue
        if function == "compute_spectrogram":
            # compute_spectrogram needs the STFT/HPSS settings, not the
            # sampling rate, so they are passed explicitly by keyword.
            data = compute_spectrogram(
                data, n_fft=n_fft, hop_length=hop_length, margin=margin
            )
        elif function == "compute_spec_freq_mean":
            data = compute_spec_freq_mean(data)
        else:
            raise ValueError(f"Unknown function {function}")
    return data



"""PyDub's silence detection is based on the energy of the audio signal."""
import numpy as np


def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of `x`.

    Works element-wise on NumPy arrays as well as on plain numbers.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


class SilenceDetector:
    """Score how likely an audio clip is silent, based on its RMS energy.

    The clip's RMS amplitude is compared with a threshold given in dB
    (converted to an amplitude with pydub's `db_to_float` and scaled by the
    clip's maximum possible amplitude). The relative distance to that
    threshold is squashed with a sigmoid and reported as a percentage.
    """

    def __init__(self, silence_thresh=-36) -> None:
        # Threshold in dB; converted to an amplitude in __call__
        self.silence_thresh = silence_thresh

    def __call__(self, audio_path: str, start=None, end=None):
        """Return the silence score (0-100) of a clip.

        Args:
            audio_path: Path of the audio file, read with pydub.
            start: Start of the clip in seconds; ``None`` means the
                beginning of the file.
            end: End of the clip in seconds; ``None`` means the end of the
                file. An explicit ``0`` is honoured and yields an empty
                clip rather than the whole file.

        Returns:
            ``100.0`` when the file cannot be loaded or the clip has zero
            RMS. Otherwise ``100 * sigmoid((thresh - rms) / rms)`` rounded
            to two decimals, so a clip whose RMS equals the threshold
            scores 50.0 and louder clips score lower.
        """
        # Local imports: pydub is only required once a detector is called
        import pydub
        from pydub.utils import db_to_float

        try:
            waveform = pydub.AudioSegment.from_file(audio_path)
        except Exception:
            # Best-effort: an unreadable file is reported and treated as
            # silent. KeyboardInterrupt/SystemExit are allowed to propagate.
            print("Error loading audio file: ", audio_path)
            return 100.0

        # pydub segments are sliced in milliseconds
        start_ms = 0 if start is None else int(start * 1000)
        end_ms = len(waveform) if end is None else int(end * 1000)
        waveform = waveform[start_ms:end_ms]

        # convert silence threshold to a float value (so we can compare it to rms)
        silence_thresh = db_to_float(self.silence_thresh) * waveform.max_possible_amplitude

        rms = waveform.rms
        if rms == 0:
            # Avoids the division by zero below; digital silence is 100 %
            return 100.0

        silence_prob = sigmoid((silence_thresh - rms) / rms)
        return np.round(100 * silence_prob, 2)


def frequency_bin_to_value(bin_index, sr, n_fft):
    """Convert an FFT bin index into its frequency, truncated to an int.

    Computes ``bin_index * sr / n_fft`` and drops the fractional part.
    """
    frequency = bin_index * sr / n_fft
    return int(frequency)


def time_bin_to_value(bin_index, hop_length, sr):
    """Convert a frame index into a time offset.

    One frame spans ``hop_length / sr``; the result is that span multiplied
    by `bin_index` (seconds, if `sr` is in samples per second).
    """
    frame_step = hop_length / sr
    return bin_index * frame_step


def add_time_annotations(ax, nt_bins, hop_length, sr, skip=50):
    """Label the x-axis of `ax` with time stamps.

    Args:
        ax: Matplotlib axes to annotate.
        nt_bins: Number of time bins (columns) shown on the axes.
        hop_length: Hop length used to compute the frames, in samples.
        sr: Sampling rate.
        skip: Place a tick on every `skip`-th bin.
    """
    # Time of every bin (bin index * hop_length / sr), rounded to 0.1
    t_bins = np.arange(nt_bins)
    t_vals = np.round(t_bins * (hop_length / sr), 1)

    # Positions and labels are set in two calls because that form is
    # accepted by every matplotlib release, so no failure has to be hidden.
    ax.set_xticks(t_bins[::skip])
    ax.set_xticklabels(t_vals[::skip])
    ax.set_xlabel("Time (s)")


def add_freq_annotations(ax, nf_bins, sr, n_fft, skip=50):
    """Label the y-axis of `ax` with frequency values.

    Args:
        ax: Matplotlib axes to annotate.
        nf_bins: Number of frequency bins (rows) shown on the axes.
        sr: Sampling rate.
        n_fft: FFT size used to compute the spectrogram.
        skip: Place a tick on every `skip`-th bin.
    """
    # Frequency of every bin (bin index * sr / n_fft), truncated to an int
    f_bins = np.arange(nf_bins)
    f_vals = (f_bins * sr / n_fft).astype(int)

    # Positions and labels are set in two calls because that form is
    # accepted by every matplotlib release, so no failure has to be hidden.
    ax.set_yticks(f_bins[::skip])
    ax.set_yticklabels(f_vals[::skip])
    ax.set_ylabel("Frequency (Hz)")


def show_single_spectrogram(
    spec,
    sr,
    n_fft,
    hop_length,
    ax=None,
    fig=None,
    figsize=(10, 2),
    cmap="viridis",
    colorbar=True,
    show=True,
    format='%+2.0f dB',
    xlabel='Time (s)',
    ylabel="Frequency (Hz)",
    title=None,
    show_dom_freq=False,
):
    """Draw a spectrogram with `imshow` and annotate its axes.

    Args:
        spec: 2-D array; rows are treated as frequency bins and columns as
            time bins.
        sr: Sampling rate, used for the tick labels.
        n_fft: FFT size, used for the frequency tick labels.
        hop_length: Hop length, used for the time tick labels.
        ax: Axes to draw on; a new figure is created when ``None``.
        fig: Figure owning `ax`, needed for the colorbar. When `ax` is
            given without `fig`, the figure is taken from ``ax.figure``.
        figsize: Size of the figure created when `ax` is ``None``.
        cmap: Colormap name for `imshow`.
        colorbar: Whether to add a vertical colorbar.
        show: Whether to call ``plt.show()`` at the end.
        format: Tick format of the colorbar.
        xlabel: Label of the x-axis.
        ylabel: Label of the y-axis; frequency ticks are only added when it
            contains the word "frequency" (case-insensitive).
        title: Title of the axes.
        show_dom_freq: Overlay, for every column, the row index with the
            largest value as a white dot.
    """
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    elif fig is None:
        # The colorbar needs a figure; fall back to the one that owns `ax`
        # instead of failing on None.
        fig = ax.figure
    axim = ax.imshow(spec, origin="lower", cmap=cmap)

    nf_bins, nt_bins = spec.shape

    if "frequency" in ylabel.lower():
        # Show frequency (Hz) values on the y-axis
        add_freq_annotations(ax, nf_bins, sr, n_fft)

    # Show time (s) values on the x-axis
    add_time_annotations(ax, nt_bins, hop_length, sr)

    # Caller-supplied labels are applied after the annotation helpers so
    # they take precedence over the helpers' default axis labels.
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if colorbar:
        fig.colorbar(axim, ax=ax, orientation='vertical', fraction=0.01, format=format)

    if show_dom_freq:
        # Row index of the maximum in each column
        fmax = spec.argmax(axis=0)
        ax.scatter(np.arange(spec.shape[1]), fmax, color="white", s=0.2)

    if show:
        plt.show()


def compute_spectrogram(y, n_fft, hop_length, margin, n_mels=None):
    """Compute a dB-scaled spectrogram of the first HPSS component of `y`.

    Steps: STFT -> harmonic/percussive separation (only the first returned
    component is kept) -> magnitude -> dB relative to the maximum ->
    optional mel projection.

    Args:
        y: Audio waveform.
        n_fft: FFT size for the STFT.
        hop_length: Hop length for the STFT.
        margin: Margin passed to ``librosa.decompose.hpss``.
        n_mels: When given, the dB spectrogram is passed through
            ``librosa.feature.melspectrogram`` with that many mel bands.

    Returns:
        The dB spectrogram, or its mel projection when `n_mels` is set.
    """
    stft_matrix = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)

    # The second component returned by HPSS is discarded
    first_component, _ = librosa.decompose.hpss(stft_matrix, margin=margin)

    magnitude = np.abs(first_component)
    spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    if n_mels is None:
        return spec_db

    # NOTE(review): the mel filterbank is applied to dB values here, while
    # melspectrogram's `S` is documented as a power spectrogram, and `sr`
    # is left at librosa's default -- confirm this is intended.
    return librosa.feature.melspectrogram(S=spec_db, n_mels=n_mels)


def show_spectrogram(S, sr, n_fft=512, hop_length=256, figsize=(10, 3), n_mels=None, ax=None, show=True):
    """Plot a spectrogram with ``librosa.display.specshow``.

    Args:
        S: Spectrogram to display.
        sr: Sampling rate, used to scale the axes.
        n_fft: FFT size used to compute `S`.
        hop_length: Hop length used to compute `S`.
        figsize: Size of the figure created when `ax` is ``None``.
        n_mels: Only checked against ``None``: when set, the y-axis uses a
            mel scale and the title says "LogMelSpectrogram".
        ax: Axes to draw on; a new figure is created when ``None``.
        show: Whether to call ``plt.show()`` at the end.
    """
    # `import librosa` alone does not load the display submodule on
    # releases without lazy submodule loading, so import it explicitly.
    import librosa.display

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=figsize)

    is_mel = n_mels is not None
    librosa.display.specshow(
        S,
        sr=sr,
        hop_length=hop_length,
        n_fft=n_fft,
        y_axis="mel" if is_mel else "linear",
        x_axis='time',
        ax=ax,
    )
    ax.set_title("LogMelSpectrogram" if is_mel else "LogSpectrogram")
    if show:
        plt.show()


def show_frame_and_spectrogram(frame, S, sr, figsize=(12, 4), show=True, axes=None, **spec_args):
    """Show an image next to a spectrogram.

    Args:
        frame: Image drawn with `imshow` on the first axes (ticks hidden).
        S: Spectrogram drawn on the second axes via `show_spectrogram`.
        sr: Sampling rate forwarded to `show_spectrogram`.
        figsize: Size of the figure created when `axes` is ``None``.
        show: Whether to call ``plt.show()`` at the end.
        axes: Pair of axes to draw on; when ``None`` a 1x2 figure is created
            with width ratios 0.2 / 0.8.
        **spec_args: Extra keyword arguments for `show_spectrogram`.
    """
    if axes is None:
        layout = {"width_ratios": [0.2, 0.8]}
        _, axes = plt.subplots(1, 2, figsize=figsize, gridspec_kw=layout)

    # Left panel: the bare image, without tick marks
    frame_ax = axes[0]
    frame_ax.imshow(frame)
    frame_ax.set_xticks([])
    frame_ax.set_yticks([])

    # Right panel: the spectrogram; displaying is deferred to this function
    show_spectrogram(S=S, sr=sr, ax=axes[1], show=False, **spec_args)

    plt.tight_layout()

    if show:
        plt.show()