Spaces:
Sleeping
Sleeping
File size: 9,030 Bytes
ea41881 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2016-2099 Ailemon.net
#
# This file is part of ASRT Speech Recognition Tool.
#
# ASRT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ASRT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ASRT. If not, see <https://www.gnu.org/licenses/>.
# ============================================================================
"""
@author: nl8590687
ASRT语音识别内置声学特征提取模块,定义了几个常用的声学特征类
"""
import random
import numpy as np
from scipy.fftpack import fft
from .base import mfcc, delta, logfbank
class SpeechFeatureMeta:
    """
    Base class for every acoustic feature extractor in ASRT.

    Subclasses are expected to override :meth:`run`; the base implementation
    only raises ``NotImplementedError``.

    :param framesamplerate: sample rate stored on the instance as
        ``self.framesamplerate``. Default is 16000.
    """

    def __init__(self, framesamplerate=16000):
        # The only state the base class keeps is the configured sample rate.
        self.framesamplerate = framesamplerate

    def run(self, wavsignal, fs=16000):
        """
        Extract features from ``wavsignal``; must be overridden by subclasses.

        :param wavsignal: the audio signal to process (unused here).
        :param fs: sample rate of ``wavsignal`` (unused here). Default is 16000.
        :raises NotImplementedError: always, because the base class has no
            feature extraction of its own.
        """
        message = '[ASRT] `run()` method is not implemented.'
        raise NotImplementedError(message)
class MFCC(SpeechFeatureMeta):
    """
    MFCC acoustic feature extractor built into ASRT.

    Compute MFCC features from an audio signal.

    :param framesamplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    """

    def __init__(self, framesamplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 preemph=0.97):
        # The base class stores ``framesamplerate`` on the instance.
        super().__init__(framesamplerate)
        self.winlen, self.winstep = winlen, winstep
        self.numcep, self.nfilt = numcep, nfilt
        self.preemph = preemph

    def run(self, wavsignal, fs=16000):
        """
        Compute MFCC features: static coefficients plus first- and
        second-order differences.

        Only the first row of ``wavsignal`` is analysed (presumably the first
        audio channel -- confirm against the caller). ``fs`` is accepted for
        interface compatibility but is not used: the sample rate passed to
        ``mfcc`` is the ``framesamplerate`` given at construction time.

        :param wavsignal: 2-D array-like of samples; row 0 is processed.
        :param fs: unused, see above.
        :returns: A numpy array of size (NUMFRAMES by numcep * 3) containing features. Each row holds 1 feature vector.
        """
        samples = np.array(wavsignal, dtype=np.float64)

        mfcc_options = dict(
            samplerate=self.framesamplerate,
            winlen=self.winlen,
            winstep=self.winstep,
            numcep=self.numcep,
            nfilt=self.nfilt,
            preemph=self.preemph,
        )
        static = mfcc(samples[0], **mfcc_options)

        # Differences are taken with the second argument of ``delta`` set to 2;
        # the second-order term is the difference of the first-order term.
        first_order = delta(static, 2)
        second_order = delta(first_order, 2)

        # Concatenate the three matrices column-wise, one row per frame.
        return np.column_stack((static, first_order, second_order))
class Logfbank(SpeechFeatureMeta):
    """
    logfbank acoustic feature extractor built into ASRT.

    :param framesamplerate: sample rate stored on the instance by the base
        class. Default is 16000.
    :param nfilt: value forwarded to ``logfbank`` as its ``nfilt`` argument,
        default 26.
    """

    def __init__(self, framesamplerate=16000, nfilt=26):
        self.nfilt = nfilt
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute logfbank features by delegating to the ``logfbank`` helper.

        NOTE(review): unlike ``MFCC.run``, the whole array (not just row 0) is
        handed to the helper, and the per-call ``fs`` is used instead of
        ``self.framesamplerate`` -- confirm that both differences are intended.

        :param wavsignal: array-like of samples, converted to float64.
        :param fs: sample rate passed to ``logfbank``. Default is 16000.
        :returns: whatever ``logfbank`` returns for the converted signal.
        """
        signal = np.array(wavsignal, dtype=np.float64)
        return logfbank(signal, fs, nfilt=self.nfilt)
class Spectrogram(SpeechFeatureMeta):
    """
    Spectrogram acoustic feature extractor built into ASRT.

    Each output row is ``log(1 + |FFT|)`` of one Hamming-windowed frame taken
    from the first row of the input, keeping the first half of the FFT bins.

    :param framesamplerate: sample rate stored on the instance, default 16000.
    :param timewindow: window length in milliseconds, stored as
        ``self.time_window``. Default is 25.
    :param timeshift: stored as ``self.timeshift``. Default is 10.

    Note that :meth:`run` currently works with fixed values (25 ms window,
    400 samples per frame, 160 samples between frames) and does not read
    ``time_window``, ``window_length`` or ``timeshift`` from the instance.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples; with the defaults this is 400.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The Hamming window is fixed at 400 points. To support other sample
        # rates in the future, build it from ``self.window_length`` instead of
        # the literal 400.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))  # Hamming window
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute the log-magnitude spectrogram of the first row of ``wavsignal``.

        :param wavsignal: 2-D array-like of samples; only row 0 is analysed.
        :param fs: sample rate of the audio; must be 16000.
        :returns: float64 array of shape ``(frames, 200)``. A signal shorter
            than one 400-sample window yields an empty ``(0, 200)`` array.
        :raises ValueError: if ``fs`` is not 16000.
        """
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        time_window = 25  # window length in ms
        window_length = int(fs / 1000 * time_window)  # 400 samples at 16 kHz
        hop_length = 160  # 10 ms between frame starts at 16 kHz
        # Only half of the spectrum is kept because it is symmetric.
        n_bins = window_length // 2

        wav_arr = np.array(wavsignal)
        n_samples = len(wavsignal[0])

        # A clip shorter than one window has no complete frame. Previously,
        # depending on the exact length, this either returned an empty matrix
        # or crashed (negative array dimension, or a short final slice that
        # could not be multiplied by the 400-point window). Always return the
        # empty matrix.
        if n_samples < window_length:
            return np.zeros((0, n_bins), dtype=np.float64)

        # Number of frames; the formula is kept unchanged so that feature
        # shapes stay identical for every signal that worked before.
        n_frames = int(n_samples / fs * 1000 - time_window) // 10 + 1
        data_input = np.zeros((n_frames, n_bins), dtype=np.float64)

        for i in range(n_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply window
            data_input[i] = np.abs(fft(frame))[:n_bins]

        return np.log(data_input + 1)
class SpecAugment(SpeechFeatureMeta):
    """
    Reproduction of Google's SpecAugment data augmentation, built on top of
    the same spectrogram features as ``Spectrogram``.

    :param framesamplerate: sample rate stored on the instance, default 16000.
    :param timewindow: window length in milliseconds, stored as
        ``self.time_window``. Default is 25.
    :param timeshift: stored as ``self.timeshift``. Default is 10.

    Note that :meth:`run` currently works with fixed values (25 ms window,
    400 samples per frame, 160 samples between frames) and does not read
    ``time_window``, ``window_length`` or ``timeshift`` from the instance.
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        # Window length in samples; with the defaults this is 400.
        self.window_length = int(framesamplerate / 1000 * self.time_window)
        self.timeshift = timeshift

        # The Hamming window is fixed at 400 points. To support other sample
        # rates in the future, build it from ``self.window_length`` instead of
        # the literal 400.
        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))  # Hamming window
        super().__init__(framesamplerate)

    def _spectrogram(self, wavsignal, fs):
        """Return the log-magnitude spectrogram of row 0 of ``wavsignal``."""
        time_window = 25  # window length in ms
        window_length = int(fs / 1000 * time_window)  # 400 samples at 16 kHz
        hop_length = 160  # 10 ms between frame starts at 16 kHz
        # Only half of the spectrum is kept because it is symmetric.
        n_bins = window_length // 2

        wav_arr = np.array(wavsignal)
        n_samples = len(wavsignal[0])

        # A clip shorter than one window has no complete frame; return an
        # empty matrix instead of failing on a negative dimension or on a
        # short final slice.
        if n_samples < window_length:
            return np.zeros((0, n_bins), dtype=np.float64)

        # Number of frames; the formula is kept unchanged so that feature
        # shapes stay identical for every signal that worked before.
        n_frames = int(n_samples / fs * 1000 - time_window) // 10 + 1
        data_input = np.zeros((n_frames, n_bins), dtype=np.float64)

        for i in range(n_frames):
            p_start = i * hop_length
            frame = wav_arr[0, p_start:p_start + window_length] * self.w  # apply window
            data_input[i] = np.abs(fft(frame))[:n_bins]

        return np.log(data_input + 1)

    def run(self, wavsignal, fs=16000):
        """
        Compute spectrogram features and randomly mask parts of them.

        Rows of the result are frames and columns are frequency bins. One of
        four outcomes is chosen at random on every call:

        * 60%: features are returned unchanged;
        * 15%: a band of consecutive rows (frames) is zeroed;
        * 15%: a band of consecutive columns (bins) is zeroed;
        * 10%: both a row band and a column band are zeroed.

        Band start positions are drawn from ``1..size`` and band widths from
        ``1..100``; a band that runs past the edge is simply clipped.

        :param wavsignal: 2-D array-like of samples; only row 0 is analysed.
        :param fs: sample rate of the audio; must be 16000.
        :returns: float64 array of shape ``(frames, 200)``. A signal shorter
            than one 400-sample window yields an empty, unmasked ``(0, 200)``
            array.
        :raises ValueError: if ``fs`` is not 16000.
        """
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        data_input = self._spectrogram(wavsignal, fs)

        # Nothing to mask, and ``random.randint(1, 0)`` below would raise.
        if data_input.shape[0] == 0:
            return data_input

        # Apply SpecAugment. All five values are always drawn, in this order,
        # so the random stream is consumed the same way whichever mode is hit.
        mode = random.randint(1, 100)
        h_start = random.randint(1, data_input.shape[0])
        h_width = random.randint(1, 100)
        v_start = random.randint(1, data_input.shape[1])
        v_width = random.randint(1, 100)

        if mode <= 60:  # unchanged features, 60%
            pass
        elif mode <= 75:  # mask a band of rows (frames), 15%
            data_input[h_start:h_start + h_width, :] = 0
        elif mode <= 90:  # mask a band of columns (bins), 15%
            data_input[:, v_start:v_start + v_width] = 0
        else:  # both masks combined, 10%
            # The previous single slice ``:v_start:v_start + v_width`` had a
            # stray leading colon, which made ``v_start`` the stop and the sum
            # the step, so only column 0 of the selected rows was cleared.
            # Apply both bands, as the branch is described.
            data_input[h_start:h_start + h_width, :] = 0
            data_input[:, v_start:v_start + v_width] = 0

        return data_input
|