#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2016-2099 Ailemon.net
#
# This file is part of ASRT Speech Recognition Tool.
#
# ASRT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# ASRT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ASRT.  If not, see <https://www.gnu.org/licenses/>.
# ============================================================================

"""
@author: nl8590687
Built-in acoustic feature extraction module for ASRT speech recognition, defining several commonly used acoustic feature classes
"""

import random
import numpy as np
from scipy.fftpack import fft
from .base import mfcc, delta, logfbank


class SpeechFeatureMeta:
    """
    Base class for all acoustic feature extraction classes in ASRT speech recognition
    """

    def __init__(self, framesamplerate=16000):
        self.framesamplerate = framesamplerate

    def run(self, wavsignal, fs=16000):
        """
        Run feature extraction on the given wav signal. Subclasses must override this method.
        """
        raise NotImplementedError('[ASRT] `run()` method is not implemented.')
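
# All feature classes below share this interface, so callers can swap them
# freely. A minimal polymorphic-use sketch (an illustration, not code that
# ships with ASRT):
#
#     extractor = Spectrogram()   # or MFCC(), Logfbank(), SpecAugment()
#     features = extractor.run(wavsignal, fs=16000)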


class MFCC(SpeechFeatureMeta):
    """
    Built-in MFCC acoustic feature extraction class for ASRT speech recognition

    Compute MFCC features from an audio signal.

    :param framesamplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstral coefficients to return, default 13.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    """

    def __init__(self, framesamplerate=16000,
                 winlen=0.025,
                 winstep=0.01,
                 numcep=13,
                 nfilt=26,
                 preemph=0.97):
        self.framesamplerate = framesamplerate
        self.winlen = winlen
        self.winstep = winstep
        self.numcep = numcep
        self.nfilt = nfilt
        self.preemph = preemph
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        """
        Compute MFCC acoustic features: the static coefficients plus their first- and second-order deltas.

        :returns: A numpy array of size (NUMFRAMES by numcep * 3) containing features. Each row holds 1 feature vector.
        """
        wavsignal = np.array(wavsignal, dtype=np.float64)
        # Compute the base (static) MFCC features from the first channel
        feat_mfcc = mfcc(wavsignal[0], samplerate=self.framesamplerate, winlen=self.winlen,
                         winstep=self.winstep, numcep=self.numcep, nfilt=self.nfilt, preemph=self.preemph)
        feat_mfcc_d = delta(feat_mfcc, 2)
        feat_mfcc_dd = delta(feat_mfcc_d, 2)
        # Stack the static MFCC matrix with its first- and second-order delta matrices
        wav_feature = np.column_stack((feat_mfcc, feat_mfcc_d, feat_mfcc_dd))
        return wav_feature
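
# A minimal usage sketch for MFCC (the file name and loader are assumptions
# for illustration; ASRT passes `wavsignal` as a (1, num_samples) array-like
# holding a 16 kHz mono signal):
#
#     from scipy.io import wavfile
#     fs, samples = wavfile.read('speech.wav')   # hypothetical 16 kHz mono file
#     feat = MFCC(framesamplerate=fs).run([samples], fs)
#     # feat.shape == (num_frames, 13 * 3): static MFCCs + deltas + delta-deltas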


class Logfbank(SpeechFeatureMeta):
    """
    Built-in log-filterbank (logfbank) acoustic feature extraction class for ASRT speech recognition
    """

    def __init__(self, framesamplerate=16000, nfilt=26):
        self.nfilt = nfilt
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        wavsignal = np.array(wavsignal, dtype=np.float64)
        # Compute the log filterbank features; like MFCC.run(), the input is
        # expected as a (1, num_samples) array, so take the first channel
        wav_feature = logfbank(wavsignal[0], fs, nfilt=self.nfilt)
        return wav_feature
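
# A minimal usage sketch for Logfbank (reusing `samples` from the MFCC sketch
# above; `nfilt` sets the output width):
#
#     feat = Logfbank(framesamplerate=16000, nfilt=26).run([samples], 16000)
#     # feat.shape == (num_frames, 26): one log filterbank energy per filter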


class Spectrogram(SpeechFeatureMeta):
    """
    Built-in spectrogram acoustic feature extraction class for ASRT speech recognition
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        self.window_length = int(framesamplerate / 1000 * self.time_window)  # window length in samples; currently always 400 (25 ms at 16 kHz)
        self.timeshift = timeshift

        '''
        # Reserved for future support of other sample rates:
        self.x=np.linspace(0, self.window_length - 1, self.window_length, dtype = np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (self.window_length - 1) ) # Hamming window
        '''

        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))  # Hamming window
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        # Slide a 25 ms time window over the waveform with a 10 ms shift
        time_window = 25  # in milliseconds
        window_length = int(fs / 1000 * time_window)  # window length in samples; currently always 400

        wav_arr = np.array(wavsignal)

        range0_end = int(len(wavsignal[0]) / fs * 1000 - time_window) // 10 + 1  # number of windows (frames) the loop produces
        data_input = np.zeros((range0_end, window_length // 2), dtype=np.float64)  # holds the final frequency-domain features

        for i in range(0, range0_end):
            p_start = i * 160  # 10 ms shift = 160 samples at 16 kHz
            p_end = p_start + 400  # 25 ms window = 400 samples at 16 kHz

            data_line = wav_arr[0, p_start:p_end]
            data_line = data_line * self.w  # apply the Hamming window
            data_line = np.abs(fft(data_line))

            data_input[i] = data_line[0: window_length // 2]  # keep the first half (200 bins); the FFT of a real signal is symmetric

        data_input = np.log(data_input + 1)
        return data_input
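
# A worked example of the framing arithmetic in run() (the numbers follow from
# the 16 kHz / 25 ms / 10 ms constants): one second of audio is 16000 samples
# = 1000 ms, giving (1000 - 25) // 10 + 1 = 98 frames, each a 200-bin half
# spectrum of a 400-point FFT.
#
#     spec = Spectrogram().run([samples], 16000)   # `samples`: 1 s of 16 kHz audio
#     # spec.shape == (98, 200)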


class SpecAugment(SpeechFeatureMeta):
    """
    Reimplementation of Google's SpecAugment data-augmentation algorithm, built on the basic Spectrogram feature
    """

    def __init__(self, framesamplerate=16000, timewindow=25, timeshift=10):
        self.time_window = timewindow
        self.window_length = int(framesamplerate / 1000 * self.time_window)  # window length in samples; currently always 400 (25 ms at 16 kHz)
        self.timeshift = timeshift

        '''
        # Reserved for future support of other sample rates:
        self.x=np.linspace(0, self.window_length - 1, self.window_length, dtype = np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (self.window_length - 1) ) # Hamming window
        '''

        self.x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
        self.w = 0.54 - 0.46 * np.cos(2 * np.pi * (self.x) / (400 - 1))  # Hamming window
        super().__init__(framesamplerate)

    def run(self, wavsignal, fs=16000):
        if fs != 16000:
            raise ValueError(
                f"[Error] ASRT currently only supports wav audio files with a sampling rate of 16000 Hz, but this "
                f"audio is {fs} Hz.")

        # Slide a 25 ms time window over the waveform with a 10 ms shift
        time_window = 25  # in milliseconds
        window_length = int(fs / 1000 * time_window)  # window length in samples; currently always 400

        wav_arr = np.array(wavsignal)

        range0_end = int(len(wavsignal[0]) / fs * 1000 - time_window) // 10 + 1  # number of windows (frames) the loop produces
        data_input = np.zeros((range0_end, window_length // 2), dtype=np.float64)  # holds the final frequency-domain features

        for i in range(0, range0_end):
            p_start = i * 160  # 10 ms shift = 160 samples at 16 kHz
            p_end = p_start + 400  # 25 ms window = 400 samples at 16 kHz

            data_line = wav_arr[0, p_start:p_end]
            data_line = data_line * self.w  # apply the Hamming window
            data_line = np.abs(fft(data_line))

            data_input[i] = data_line[0: window_length // 2]  # keep the first half (200 bins); the FFT of a real signal is symmetric

        data_input = np.log(data_input + 1)

        # Apply SpecAugment masking to the resulting spectrogram features
        mode = random.randint(1, 100)
        h_start = random.randint(1, data_input.shape[0])
        h_width = random.randint(1, 100)

        v_start = random.randint(1, data_input.shape[1])
        v_width = random.randint(1, 100)

        if mode <= 60:  # unmodified features: 60%
            pass
        elif 60 < mode <= 75:  # horizontal (time-frame) mask: 15%
            data_input[h_start:h_start + h_width, :] = 0
        elif 75 < mode <= 90:  # vertical (frequency-bin) mask: 15%
            data_input[:, v_start:v_start + v_width] = 0
        else:  # both masks applied together: 10%
            data_input[h_start:h_start + h_width, :] = 0
            data_input[:, v_start:v_start + v_width] = 0

        return data_input
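
# A minimal usage sketch for SpecAugment (meant for training-time augmentation
# only; each call draws fresh random masks, so repeated calls on the same
# signal differ):
#
#     aug = SpecAugment().run([samples], 16000)
#     # aug has the same shape as the plain Spectrogram output; 40% of calls
#     # zero out a horizontal and/or vertical stripe of the spectrogram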