NCTCMumbai's picture
Upload 2583 files
97b6013 verified
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility class for extracting features from the text and audio input."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import codecs
import numpy as np
def compute_spectrogram_feature(samples, sample_rate, stride_ms=10.0,
                                window_ms=20.0, max_freq=None, eps=1e-14):
    """Compute the log power spectrogram of the input samples (waveform).

    The waveform is cut into overlapping windows, each window is weighted by
    a Hann window and Fourier-transformed, and the log of the scaled squared
    magnitude is returned. More about spectrogram computation, please refer
    to: https://en.wikipedia.org/wiki/Short-time_Fourier_transform.

    Args:
      samples: 1-D array-like holding the waveform samples.
      sample_rate: number of samples per second of the waveform.
      stride_ms: offset between the starts of consecutive windows, in ms.
      window_ms: length of each window, in ms.
      max_freq: highest frequency to keep, in the same unit as sample_rate.
        Defaults to sample_rate / 2.
      eps: small constant added before taking the log, so that empty bins do
        not produce log(0).

    Returns:
      A 2-D numpy array of shape (num_windows, num_frequency_bins).

    Raises:
      ValueError: if max_freq is greater than half the sample rate, if the
        stride is greater than the window, if the stride or the window is
        shorter than one sample, or if `samples` is shorter than one window.
    """
    if max_freq is None:
        max_freq = sample_rate / 2
    if max_freq > sample_rate / 2:
        raise ValueError(
            "max_freq must not be greater than half of sample rate.")

    if stride_ms > window_ms:
        raise ValueError("Stride size must not be greater than window size.")

    stride_size = int(0.001 * sample_rate * stride_ms)
    window_size = int(0.001 * sample_rate * window_ms)
    if stride_size < 1 or window_size < 1:
        # A zero stride would otherwise surface as a ZeroDivisionError below.
        raise ValueError(
            "Stride and window must each span at least one sample.")

    samples = np.asarray(samples)
    num_samples = len(samples)
    if num_samples < window_size:
        raise ValueError(
            "Input must contain at least one full window of samples.")

    # Extract strided windows. Trailing samples that do not fill a complete
    # stride are dropped first so that every window lies inside the buffer.
    truncate_size = (num_samples - window_size) % stride_size
    samples = samples[:num_samples - truncate_size]
    num_windows = (len(samples) - window_size) // stride_size + 1
    nshape = (window_size, num_windows)
    nstrides = (samples.strides[0], samples.strides[0] * stride_size)
    windows = np.lib.stride_tricks.as_strided(
        samples, shape=nshape, strides=nstrides)
    # Sanity-check the stride arithmetic against a plain slice. The second
    # window only exists when there is more than one window.
    if num_windows > 1:
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])

    # Window weighting, squared Fast Fourier Transform (fft), scaling
    weighting = np.hanning(window_size)[:, None]
    fft = np.fft.rfft(windows * weighting, axis=0)
    fft = np.absolute(fft)
    fft = fft**2

    # Every bin is divided by `scale`; all bins except the first and the last
    # are additionally doubled.
    scale = np.sum(weighting**2) * sample_rate
    fft[1:-1, :] *= (2.0 / scale)
    fft[(0, -1), :] /= scale

    # Prepare fft frequency list
    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])

    # Compute spectrogram feature: keep the bins up to max_freq (inclusive)
    # and return them with windows along the first axis.
    ind = np.where(freqs <= max_freq)[0][-1] + 1
    specgram = np.log(fft[:ind, :] + eps)
    return np.transpose(specgram, (1, 0))
class AudioFeaturizer(object):
    """Holds the settings used to extract spectrogram features from audio."""

    def __init__(self, sample_rate=16000, window_ms=20.0, stride_ms=10.0):
        """Store the spectrogram configuration on the instance.

        Args:
          sample_rate: sample rate of the input waveform.
          window_ms: length of a spectrogram frame, in ms.
          stride_ms: frame stride, in ms.
        """
        # Plain configuration holder: the values are stored unchanged.
        self.stride_ms = stride_ms
        self.window_ms = window_ms
        self.sample_rate = sample_rate
def compute_label_feature(text, token_to_idx):
    """Map each character of a transcript to its integer index.

    The text is stripped of surrounding whitespace and lower-cased before
    every remaining character is looked up in `token_to_idx`.

    Args:
      text: the transcript string.
      token_to_idx: mapping from a single-character token to its index.

    Returns:
      A list with one index per character of the normalized text.
    """
    normalized = text.strip().lower()
    return [token_to_idx[char] for char in normalized]
class TextFeaturizer(object):
    """Extract text feature based on char-level granularity.

    By looking up the vocabulary table, each input string (one line of
    transcript) will be converted to a sequence of integer indexes.

    Attributes set by the constructor:
      token_to_index: dict mapping each vocabulary token to its index.
      index_to_token: dict mapping each index back to its token.
      speech_labels: all tokens concatenated in index order.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary table from a UTF-8 text file.

        Every line holds one token. Lines starting with "#" are comments and
        are skipped; the remaining lines are numbered from 0 in file order.

        Args:
          vocab_file: path of the vocabulary file.
        """
        with codecs.open(vocab_file, "r", "utf-8") as fin:
            lines = fin.readlines()

        tokens = []
        for raw_line in lines:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would truncate the final token of a file that
            # has no trailing newline, and would leave a stray '\r' on files
            # with CRLF endings. Other whitespace is kept because a token may
            # itself be a space.
            token = raw_line.rstrip("\r\n")
            if token.startswith("#"):
                # Skip from reading comment line.
                continue
            tokens.append(token)

        # If a token is listed twice, its last index wins in token_to_index.
        self.token_to_index = {
            token: index for index, token in enumerate(tokens)}
        self.index_to_token = dict(enumerate(tokens))
        self.speech_labels = "".join(tokens)