|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Compute input examples for VGGish from audio waveform.""" |
|
|
|
import numpy as np |
|
import resampy |
|
|
|
import mel_features |
|
import vggish_params |
|
|
|
# soundfile is an optional dependency: wav_read() is always defined, but it
# can only actually read audio when the import below succeeds.
try:
    import soundfile as sf
except ImportError:

    def wav_read(wav_file):
        """Stand-in used when soundfile cannot be imported; always raises."""
        raise NotImplementedError('WAV file reading requires soundfile package.')

else:

    def wav_read(wav_file):
        """Reads wav_file through soundfile, requesting int16 samples.

        Args:
          wav_file: Passed straight to soundfile.read.

        Returns:
          A (samples, sample_rate) tuple as returned by soundfile.read.
        """
        samples, rate = sf.read(wav_file, dtype='int16')
        return samples, rate
|
|
|
|
|
def waveform_to_examples(data, sample_rate):
    """Turns an audio waveform into a batch of VGGish input examples.

    Args:
      data: np.array holding the waveform. A one-dimensional array is treated
        as mono; an array with more dimensions is averaged over axis 1 to
        obtain a mono signal, so channels are expected along axis 1.
        Each sample is generally expected to lie in the range [-1.0, +1.0],
        although this is not required.
      sample_rate: Sample rate of data.

    Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands] which
      represents a sequence of examples, each of which contains a patch of
      log mel spectrogram, covering num_frames frames of audio and num_bands
      mel frequency bands, where the frame length is
      vggish_params.STFT_HOP_LENGTH_SECONDS.
    """
    # Collapse multi-channel input to mono by averaging over axis 1.
    waveform = np.mean(data, axis=1) if data.ndim > 1 else data

    # Bring the signal to the sample rate configured in vggish_params.
    target_rate = vggish_params.SAMPLE_RATE
    if sample_rate != target_rate:
        waveform = resampy.resample(waveform, sample_rate, target_rate)

    # Log mel spectrogram of the (mono, resampled) waveform.
    spectrogram_options = dict(
        audio_sample_rate=target_rate,
        log_offset=vggish_params.LOG_OFFSET,
        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
        num_mel_bins=vggish_params.NUM_MEL_BINS,
        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
        upper_edge_hertz=vggish_params.MEL_MAX_HZ,
    )
    log_mel = mel_features.log_mel_spectrogram(waveform, **spectrogram_options)

    # Spectrogram frames produced per second of audio.
    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS

    def seconds_to_frames(seconds):
        """Converts a duration in seconds to a whole number of frames."""
        return int(round(seconds * frames_per_second))

    # Slice the spectrogram into (possibly overlapping) example windows.
    return mel_features.frame(
        log_mel,
        window_length=seconds_to_frames(vggish_params.EXAMPLE_WINDOW_SECONDS),
        hop_length=seconds_to_frames(vggish_params.EXAMPLE_HOP_SECONDS))
|
|
|
|
|
def wavfile_to_examples(wav_file):
    """Convenience wrapper around waveform_to_examples() for a common WAV format.

    Args:
      wav_file: String path to a file, or a file-like object. The file
        is assumed to contain WAV audio data with signed 16-bit PCM samples.

    Returns:
      See waveform_to_examples.

    Raises:
      AssertionError: If wav_read() returns samples whose dtype is not int16.
    """
    wav_data, sr = wav_read(wav_file)

    # Explicit raise rather than an `assert` statement, so the check is not
    # stripped when Python runs with -O; the scaling below is only valid for
    # int16 input. AssertionError is kept as the exception type for backward
    # compatibility with existing callers.
    if wav_data.dtype != np.int16:
        raise AssertionError('Bad sample type: %r' % wav_data.dtype)

    # Scale int16 samples by 1/32768 so they fall in [-1.0, +1.0).
    samples = wav_data / 32768.0
    return waveform_to_examples(samples, sr)
|
|