Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# | |
# Copyright 2016-2099 Ailemon.net | |
# | |
# This file is part of ASRT Speech Recognition Tool. | |
# | |
# ASRT is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# ASRT is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with ASRT. If not, see <https://www.gnu.org/licenses/>. | |
# ============================================================================ | |
# This file includes routines for basic signal processing including framing and computing power spectra. | |
# Author: James Lyons 2012 | |
''' | |
@author: nl8590687 | |
ASRT语音识别声学特征计算的信号处理计算的函数库 | |
''' | |
import decimal | |
import logging | |
import math | |
import numpy | |
def round_half_up(number): | |
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) | |
def rolling_window(a, window, step=1): | |
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick | |
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) | |
strides = a.strides + (a.strides[-1],) | |
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] | |
def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True): | |
"""Frame a signal into overlapping frames. | |
:param sig: the audio signal to frame. | |
:param frame_len: length of each frame measured in samples. | |
:param frame_step: number of samples after the start of the previous frame that the next frame should begin. | |
:param winfunc: the analysis window to apply to each frame. By default no window is applied. | |
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster | |
:returns: an array of frames. Size is NUMFRAMES by frame_len. | |
""" | |
slen = len(sig) | |
frame_len = int(round_half_up(frame_len)) | |
frame_step = int(round_half_up(frame_step)) | |
if slen <= frame_len: | |
numframes = 1 | |
else: | |
numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) | |
padlen = int((numframes - 1) * frame_step + frame_len) | |
zeros = numpy.zeros((padlen - slen,)) | |
padsignal = numpy.concatenate((sig, zeros)) | |
if stride_trick: | |
win = winfunc(frame_len) | |
frames = rolling_window(padsignal, window=frame_len, step=frame_step) | |
else: | |
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( | |
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T | |
indices = numpy.array(indices, dtype=numpy.int32) | |
frames = padsignal[indices] | |
win = numpy.tile(winfunc(frame_len), (numframes, 1)) | |
return frames * win | |
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): | |
"""Does overlap-add procedure to undo the action of framesig. | |
:param frames: the array of frames. | |
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. | |
:param frame_len: length of each frame measured in samples. | |
:param frame_step: number of samples after the start of the previous frame that the next frame should begin. | |
:param winfunc: the analysis window to apply to each frame. By default no window is applied. | |
:returns: a 1-D signal. | |
""" | |
frame_len = round_half_up(frame_len) | |
frame_step = round_half_up(frame_step) | |
numframes = numpy.shape(frames)[0] | |
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' | |
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( | |
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T | |
indices = numpy.array(indices, dtype=numpy.int32) | |
padlen = (numframes - 1) * frame_step + frame_len | |
if siglen <= 0: siglen = padlen | |
rec_signal = numpy.zeros((padlen,)) | |
window_correction = numpy.zeros((padlen,)) | |
win = winfunc(frame_len) | |
for i in range(0, numframes): | |
window_correction[indices[i, :]] = window_correction[ | |
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero | |
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] | |
rec_signal = rec_signal / window_correction | |
return rec_signal[0:siglen] | |
def magspec(frames, NFFT): | |
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). | |
:param frames: the array of frames. Each row is a frame. | |
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. | |
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. | |
""" | |
if numpy.shape(frames)[1] > NFFT: | |
logging.warn( | |
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', | |
numpy.shape(frames)[1], NFFT) | |
complex_spec = numpy.fft.rfft(frames, NFFT) | |
return numpy.absolute(complex_spec) | |
def powspec(frames, NFFT): | |
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). | |
:param frames: the array of frames. Each row is a frame. | |
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. | |
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. | |
""" | |
return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)) | |
def logpowspec(frames, NFFT, norm=1): | |
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). | |
:param frames: the array of frames. Each row is a frame. | |
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. | |
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. | |
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. | |
""" | |
ps = powspec(frames, NFFT) | |
ps[ps <= 1e-30] = 1e-30 | |
lps = 10 * numpy.log10(ps) | |
if norm: | |
return lps - numpy.max(lps) | |
else: | |
return lps | |
def preemphasis(signal, coeff=0.95): | |
"""perform preemphasis on the input signal. | |
:param signal: The signal to filter. | |
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. | |
:returns: the filtered signal. | |
""" | |
return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]) | |