Spaces:
Sleeping
Sleeping
# !/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# | |
# Copyright 2016-2099 Ailemon.net | |
# | |
# This file is part of ASRT Speech Recognition Tool. | |
# | |
# ASRT is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# ASRT is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with ASRT. If not, see <https://www.gnu.org/licenses/>. | |
# ============================================================================ | |
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications | |
# Author: James Lyons 2012 | |
""" | |
@author: nl8590687 | |
ASRT语音识别声学特征基础库模块,一些基础函数实现 | |
""" | |
from __future__ import division | |
import numpy | |
from scipy.fftpack import dct | |
from .sigproc import preemphasis, framesig, powspec | |
def calculate_nfft(samplerate, winlen): | |
"""Calculates the FFT size as a power of two greater than or equal to | |
the number of samples in a single window length. | |
Having an FFT less than the window length loses precision by dropping | |
many of the samples; a longer FFT than the window allows zero-padding | |
of the FFT buffer which is neutral in terms of frequency domain conversion. | |
:param samplerate: The sample rate of the signal we are working with, in Hz. | |
:param winlen: The length of the analysis window in seconds. | |
""" | |
window_length_samples = winlen * samplerate | |
nfft = 1 | |
while nfft < window_length_samples: | |
nfft *= 2 | |
return nfft | |
def mfcc(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13, | |
nfilt=26, nfft=None, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True, | |
winfunc=lambda x: numpy.ones((x,))): | |
"""Compute MFCC features from an audio signal. | |
:param signal: the audio signal from which to compute features. Should be an N*1 array | |
:param samplerate: the sample rate of the signal we are working with, in Hz. | |
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) | |
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) | |
:param numcep: the number of cepstrum to return, default 13 | |
:param nfilt: the number of filters in the filterbank, default 26. | |
:param nfft: the FFT size. Default is None, which uses the calculate_nfft function to choose the smallest size that does not drop sample data. | |
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0. | |
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 | |
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. | |
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. | |
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. | |
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming | |
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. | |
""" | |
nfft = nfft or calculate_nfft(samplerate, winlen) | |
feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq, preemph, winfunc) | |
feat = numpy.log(feat) | |
feat = dct(feat, type=2, axis=1, norm='ortho')[:, :numcep] | |
feat = lifter(feat, ceplifter) | |
if appendEnergy: feat[:, 0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy | |
return feat | |
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01, | |
nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, | |
winfunc=lambda x: numpy.ones((x,))): | |
"""Compute Mel-filterbank energy features from an audio signal. | |
:param signal: the audio signal from which to compute features. Should be an N*1 array | |
:param samplerate: the sample rate of the signal we are working with, in Hz. | |
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) | |
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) | |
:param nfilt: the number of filters in the filterbank, default 26. | |
:param nfft: the FFT size. Default is 512. | |
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0. | |
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 | |
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. | |
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming | |
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The | |
second return value is the energy in each frame (total energy, unwindowed) | |
""" | |
highfreq = highfreq or samplerate / 2 | |
signal = preemphasis(signal, preemph) | |
frames = framesig(signal, winlen * samplerate, winstep * samplerate, winfunc) | |
pspec = powspec(frames, nfft) | |
energy = numpy.sum(pspec, 1) # this stores the total energy in each frame | |
energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy) # if energy is zero, we get problems with log | |
fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq) | |
feat = numpy.dot(pspec, fb.T) # compute the filterbank energies | |
feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat) # if feat is zero, we get problems with log | |
return feat, energy | |
def logfbank(signal, samplerate=16000, winlen=0.025, winstep=0.01, | |
nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, | |
winfunc=lambda x: numpy.ones((x,))): | |
"""Compute log Mel-filterbank energy features from an audio signal. | |
:param signal: the audio signal from which to compute features. Should be an N*1 array | |
:param samplerate: the sample rate of the signal we are working with, in Hz. | |
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) | |
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) | |
:param nfilt: the number of filters in the filterbank, default 26. | |
:param nfft: the FFT size. Default is 512. | |
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0. | |
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 | |
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. | |
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming | |
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. | |
""" | |
feat, energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq, preemph, winfunc) | |
return numpy.log(feat) | |
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01, | |
nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, | |
winfunc=lambda x: numpy.ones((x,))): | |
"""Compute Spectral Subband Centroid features from an audio signal. | |
:param signal: the audio signal from which to compute features. Should be an N*1 array | |
:param samplerate: the sample rate of the signal we are working with, in Hz. | |
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) | |
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) | |
:param nfilt: the number of filters in the filterbank, default 26. | |
:param nfft: the FFT size. Default is 512. | |
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0. | |
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 | |
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. | |
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming | |
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. | |
""" | |
highfreq = highfreq or samplerate / 2 | |
signal = preemphasis(signal, preemph) | |
frames = framesig(signal, winlen * samplerate, winstep * samplerate, winfunc) | |
pspec = powspec(frames, nfft) | |
pspec = numpy.where(pspec == 0, numpy.finfo(float).eps, pspec) # if things are all zeros we get problems | |
fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq) | |
feat = numpy.dot(pspec, fb.T) # compute the filterbank energies | |
R = numpy.tile(numpy.linspace(1, samplerate / 2, numpy.size(pspec, 1)), (numpy.size(pspec, 0), 1)) | |
return numpy.dot(pspec * R, fb.T) / feat | |
def hz2mel(hz): | |
"""Convert a value in Hertz to Mels | |
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. | |
:returns: a value in Mels. If an array was passed in, an identical sized array is returned. | |
""" | |
return 2595 * numpy.log10(1 + hz / 700.) | |
def mel2hz(mel): | |
"""Convert a value in Mels to Hertz | |
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. | |
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned. | |
""" | |
return 700 * (10 ** (mel / 2595.0) - 1) | |
def get_filterbanks(nfilt=20, nfft=512, samplerate=16000, lowfreq=0, highfreq=None): | |
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond | |
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) | |
:param nfilt: the number of filters in the filterbank, default 20. | |
:param nfft: the FFT size. Default is 512. | |
:param samplerate: the sample rate of the signal we are working with, in Hz. Affects mel spacing. | |
:param lowfreq: lowest band edge of mel filters, default 0 Hz | |
:param highfreq: highest band edge of mel filters, default samplerate/2 | |
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. | |
""" | |
highfreq = highfreq or samplerate / 2 | |
assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2" | |
# compute points evenly spaced in mels | |
lowmel = hz2mel(lowfreq) | |
highmel = hz2mel(highfreq) | |
melpoints = numpy.linspace(lowmel, highmel, nfilt + 2) | |
# our points are in Hz, but we use fft bins, so we have to convert | |
# from Hz to fft bin number | |
bin = numpy.floor((nfft + 1) * mel2hz(melpoints) / samplerate) | |
fbank = numpy.zeros([nfilt, nfft // 2 + 1]) | |
for j in range(0, nfilt): | |
for i in range(int(bin[j]), int(bin[j + 1])): | |
fbank[j, i] = (i - bin[j]) / (bin[j + 1] - bin[j]) | |
for i in range(int(bin[j + 1]), int(bin[j + 2])): | |
fbank[j, i] = (bin[j + 2] - i) / (bin[j + 2] - bin[j + 1]) | |
return fbank | |
def lifter(cepstra, L=22): | |
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the | |
magnitude of the high frequency DCT coeffs. | |
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size. | |
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter. | |
""" | |
if L > 0: | |
nframes, ncoeff = numpy.shape(cepstra) | |
n = numpy.arange(ncoeff) | |
lift = 1 + (L / 2.) * numpy.sin(numpy.pi * n / L) | |
return lift * cepstra | |
else: | |
# values of L <= 0, do nothing | |
return cepstra | |
def delta(feat, N): | |
"""Compute delta features from a feature vector sequence. | |
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. | |
:param N: For each frame, calculate delta features based on preceding and following N frames | |
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. | |
""" | |
if N < 1: | |
raise ValueError('N must be an integer >= 1') | |
NUMFRAMES = len(feat) | |
denominator = 2 * sum([i ** 2 for i in range(1, N + 1)]) | |
delta_feat = numpy.empty_like(feat) | |
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat | |
for t in range(NUMFRAMES): | |
delta_feat[t] = numpy.dot(numpy.arange(-N, N + 1), | |
padded[t: t + 2 * N + 1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] | |
return delta_feat | |