ZhongYing committed on
Commit 503ec99
1 Parent(s): 6ffe62c

first commit

.gitignore ADDED
@@ -0,0 +1 @@
+ **/__pycache__/
configs/__init__.py ADDED
@@ -0,0 +1,18 @@
+ import re
+ import yaml
+
+
+ def load_yaml(path):
+     loader = yaml.SafeLoader
+     loader.add_implicit_resolver(
+         u'tag:yaml.org,2002:float',
+         re.compile(u'''^(?:
+          [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
+         |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
+         |\\.[0-9_]+(?:[eE][-+][0-9]+)?
+         |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
+         |[-+]?\\.(?:inf|Inf|INF)
+         |\\.(?:nan|NaN|NAN))$''', re.X),
+         list(u'-+0123456789.'))
+     with open(path, "r", encoding="utf-8") as file:
+         return yaml.load(file, Loader=loader)
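
A quick note on why the custom resolver above matters (illustrative sketch, not part of the commit): PyYAML's default SafeLoader does not treat exponent-only literals such as 1e-4 as floats, so values like max_lr would otherwise come back as strings.

import yaml
from configs import load_yaml

print(yaml.safe_load("max_lr: 1e-4"))                                  # {'max_lr': '1e-4'} -- a string
print(load_yaml("configs/config.yml")["optimizer_config"]["max_lr"])   # 0.0001 -- a float
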
configs/config.py ADDED
@@ -0,0 +1,47 @@
+ # Copyright 2023 by zhongying
+ #
+
+
+ from . import load_yaml
+ from util.utils import preprocess_paths
+
+
+ class Config:
+     """ User config class for training, testing or inferring """
+
+     def __init__(self, path: str):
+         print('configs file path:', path)
+         config = load_yaml(preprocess_paths(path))
+         self.speech_config = config.get("speech_config", {})
+         self.model_config = config.get("model_config", {})
+         self.dataset_config = config.get("dataset_config", {})
+         self.optimizer_config = config.get("optimizer_config", {})
+         self.running_config = config.get("running_config", {})
+
+     def print(self):
+         print('==================================================')
+         print('speech configs:', self.speech_config)
+         print('--------------------------------------------------')
+         print('model configs:', self.model_config)
+         print('--------------------------------------------------')
+         print('dataset configs:', self.dataset_config)
+         print('--------------------------------------------------')
+         print('optimizer configs:', self.optimizer_config)
+         print('--------------------------------------------------')
+         print('running configs:', self.running_config)
+         print('==================================================')
+
+     def toString(self):
+         string = ''
+         string += '#==================================================' + '\n'
+         string += '#speech config: ' + str(self.speech_config) + '\n'
+         string += '#--------------------------------------------------' + '\n'
+         string += '#model config: ' + str(self.model_config) + '\n'
+         string += '#--------------------------------------------------' + '\n'
+         string += '#dataset config: ' + str(self.dataset_config) + '\n'
+         string += '#--------------------------------------------------' + '\n'
+         string += '#optimizer config: ' + str(self.optimizer_config) + '\n'
+         string += '#--------------------------------------------------' + '\n'
+         string += '#running config: ' + str(self.running_config) + '\n'
+         string += '#==================================================' + '\n'
+         return string
configs/config.yml ADDED
@@ -0,0 +1,49 @@
+ speech_config:
+   sample_rate: 16000
+   frame_ms: 25
+   stride_ms: 10
+   num_feature_bins: 80
+   feature_type: log_mel_spectrogram
+   preemphasis: 0.97
+   normalize_signal: True
+   normalize_feature: True
+   normalize_per_feature: False
+
+ model_config:
+   name: acrnn
+   d_model: 64
+   filters: [32,64,64]
+   kernel_size: [[11,5],[11,5],[11,5]]
+   rnn_cell: 256
+   seq_mask: True
+
+ dataset_config:
+   vocabulary: vocab/vocab.txt
+   data_path: ./data/wavs/
+   corpus_name: ./data/demo_txt/demo
+   file_nums: 1
+   max_audio_length: 2000
+   shuffle_size: 1200
+   data_length: null  # use the full data list; set an integer to truncate
+   suffix: .txt
+   load_type: txt
+   train: train
+   dev: dev
+   test: test
+
+ optimizer_config:
+   init_steps: 0
+   warmup_steps: 10000
+   max_lr: 1e-4
+   beta1: 0.9
+   beta2: 0.999
+   epsilon: 1e-9
+
+ running_config:
+   prefetch: False
+   load_weights: ./saved_weights/20230228-084356/last/model
+   num_epochs: 100
+   batch_size: 1
+   train_steps: 50
+   dev_steps: 10
+   test_steps: 10
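
A minimal usage sketch (not part of the commit) of loading this file with the Config class above; it assumes the repo root as working directory and that util.utils.preprocess_paths (imported by configs/config.py but not included in this diff) is available:

from configs.config import Config

config = Config("configs/config.yml")
config.print()                                          # dumps the five sections above
num_bins = config.speech_config["num_feature_bins"]     # 80
max_lr = config.optimizer_config["max_lr"]              # 0.0001 (a float, thanks to load_yaml)
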
dataset.py ADDED
@@ -0,0 +1,263 @@
+ from featurizers.speech_featurizers import SpeechFeaturizer
+ from configs.config import Config
+ from random import shuffle
+ import numpy as np
+ from vocab.vocab import Vocab
+ import os
+ import math
+ import librosa
+ import tensorflow as tf
+
+
+ def wav_padding(wav_data_lst, wav_max_len, fbank_dim):
+     wav_lens = [len(data) for data in wav_data_lst]
+     # an input wav of 1200 frames is downsampled 8 times to 150 frames
+     wav_lens = [math.ceil(x / 8) for x in wav_lens]
+     wav_lens = np.array(wav_lens)
+     new_wav_data_lst = np.zeros((len(wav_data_lst), wav_max_len, fbank_dim))
+     for i in range(len(wav_data_lst)):
+         new_wav_data_lst[i, :wav_data_lst[i].shape[0], :] = wav_data_lst[i]
+     return new_wav_data_lst, wav_lens
+
+
+ class DatDataSet:
+     def __init__(self,
+                  batch_size,
+                  data_type,
+                  vocab: Vocab,
+                  speech_featurizer: SpeechFeaturizer,
+                  config: Config):
+         self.batch_size = batch_size
+         self.data_type = data_type
+         self.vocab = vocab
+         self.data_path = config.dataset_config['data_path']
+         self.corpus_name = config.dataset_config['corpus_name']
+         self.fbank_dim = config.speech_config['num_feature_bins']
+         self.max_audio_length = config.dataset_config['max_audio_length']
+         self.mel_banks = config.speech_config['num_feature_bins']
+         self.file_nums = config.dataset_config['file_nums']
+         self.language_classes = config.running_config['language_classes']
+         self.suffix = config.dataset_config['suffix']
+         self.READ_BUFFER_SIZE = 2 * 1024 * 1024 * 1024
+         self.shuffle = True
+         self.blank = 0
+         self.source_init()
+
+     def source_init(self):
+         self.dat_file_list, self.txt_file_list = self.get_dat_txt_list(self.data_type)
+         print('>>', self.data_type, 'load dat files:', len(self.dat_file_list))
+         print('>>', self.data_type, 'load txt files:', len(self.txt_file_list))
+         max_binary_file_size = max([os.path.getsize(dat) for dat in self.dat_file_list])
+         print('>> max binary file size:', max_binary_file_size)
+         # allocate one buffer large enough for the biggest .dat file
+         self.feature_binary = np.zeros(max_binary_file_size // 4 + 1, np.float32)
+
+     def get_dat_txt_list(self, dir_name):
+         corpus_dir = self.data_path + '/' + self.corpus_name + '/'
+         print('!!', corpus_dir)
+         file_lst = os.listdir(corpus_dir)
+         txt_file_lst = []
+         dat_file_lst = []
+
+         for align_file in file_lst:
+             if align_file.endswith(self.suffix):
+                 file_name = align_file[:-len(self.suffix)]
+                 dat_file = file_name + '.dat'
+                 if dir_name in file_name:
+                     # if dir_name in ['dev', 'test']:
+                     #     dat_file = dat_file.replace(dir_name, 'train')
+                     dat_file_lst.append(corpus_dir + dat_file)
+                     txt_file_lst.append(corpus_dir + align_file)
+         print('*********', dir_name, txt_file_lst, dat_file_lst)
+         return dat_file_lst, txt_file_lst
+
+     def load_dat_file(self, dat_file_path):
+         f = open(dat_file_path, 'rb')
+         pos = 0
+         buf = f.read(self.READ_BUFFER_SIZE)
+         while len(buf) > 0:
+             nbuf = np.frombuffer(buf, np.float32)
+             self.feature_binary[pos: pos + len(nbuf)] = nbuf
+             pos += len(nbuf)
+             buf = f.read(self.READ_BUFFER_SIZE)
+
+     def get_batch(self):
+         while 1:
+             shuffle_did_list = [i for i in range(len(self.dat_file_list))]
+             if self.shuffle:
+                 shuffle(shuffle_did_list)
+             for did in shuffle_did_list:
+                 wav_lst = []
+                 label_lst = []
+                 self.load_dat_file(self.dat_file_list[did])
+                 txt_file = open(self.txt_file_list[did], 'r', encoding='utf8')
+                 utt_lines = txt_file.readlines()
+                 txt_lines = utt_lines
+                 if self.shuffle:
+                     shuffle(txt_lines)
+                 # sort lines by wav len
+                 # txt_lines = sorted(
+                 #     txt_lines,
+                 #     key=lambda line: int(line.split('\t')[0].split(':')[2]) - int(line.split('\t')[0].split(':')[1]),
+                 #     reverse=False)
+                 for line in txt_lines:
+                     wav_file, label = line.split('\t')
+                     wav_lst.append(wav_file)
+                     label_lst.append(label.strip('\n'))
+                 shuffle_list = [i for i in range(len(wav_lst) // self.batch_size)]
+                 if self.shuffle:
+                     shuffle(shuffle_list)
+                 for i in shuffle_list:
+                     begin = i * self.batch_size
+                     end = begin + self.batch_size
+                     sub_list = list(range(begin, end, 1))
+                     # label batch
+                     label_data_lst = [label_lst[index] for index in sub_list]
+                     prediction = np.array(
+                         [self.vocab.token_list.index(line) for
+                          line in label_data_lst],
+                         dtype=np.int32)
+
+                     feature_lst = []
+                     wav_path = []
+                     get_next_batch = False
+                     for index in sub_list:
+                         # data_aishell/wav/test/S0764/BAC009S0764W0121.wav:0:33680 chinese
+                         _, start, end = wav_lst[index].split(':')
+                         feature = self.feature_binary[int(start): int(end)]
+                         feature = np.reshape(feature, (-1, 80))
+                         feature = feature[:self.max_audio_length, :]
+                         feature_lst.append(feature)
+                         wav_path.append(wav_lst[index])
+
+                     if get_next_batch:
+                         continue
+                     features, input_length = wav_padding(feature_lst, self.max_audio_length, self.fbank_dim)
+
+                     yield features, input_length, prediction
+
+
+ class TxtDataSet:
+     def __init__(self,
+                  batch_size,
+                  data_type,
+                  vocab,
+                  speech_featurizer: SpeechFeaturizer,
+                  config: Config
+                  ):
+         self.batch_size = batch_size
+         self.data_type = data_type
+         self.vocab = vocab
+         self.feature_extracter = speech_featurizer
+         self.data_path = config.dataset_config['data_path']
+         self.corpus_name = config.dataset_config['corpus_name']
+         self.fbank_dim = config.speech_config['num_feature_bins']
+         self.max_audio_length = config.dataset_config['max_audio_length']
+         self.mel_banks = config.speech_config['num_feature_bins']
+         self.file_nums = config.dataset_config['file_nums']
+         self.data_length = config.dataset_config['data_length']
+         self.shuffle = True
+         self.sentence_list = []
+         self.wav_lst = []
+         self.label_lst = []
+         self.max_sentence_length = 0
+         self.source_init()
+
+     def source_init(self):
+         read_files = []
+         if self.data_type == 'train':
+             read_files.append(self.corpus_name + '_train.txt')
+         elif self.data_type == 'dev':
+             read_files.append(self.corpus_name + '_dev.txt')
+         elif self.data_type == 'test':
+             read_files.append(self.corpus_name + '_test.txt')
+         print('data type:{} \n files:{}'.format(self.data_type, read_files))
+         total_lines = 0
+         for sub_file in read_files:
+             with open(sub_file, 'r', encoding='utf8') as f:
+                 for line in f:
+                     wav_file, label = line.split(' ', 1)
+                     label = label.strip('\n').split()
+
+                     self.label_lst.append(label)
+                     self.wav_lst.append(wav_file)
+                     total_lines += 1
+                     if self.data_length:
+                         if total_lines == self.data_length:
+                             break
+                     if total_lines % 10000 == 0:
+                         print('\rload', total_lines, end='', flush=True)
+
+         if self.data_length:
+             self.wav_lst = self.wav_lst[:self.data_length]
+             self.label_lst = self.label_lst[:self.data_length]
+         print('number of', self.data_type, 'data:', len(self.wav_lst))
+
+     def get_batch(self):
+         shuffle_list = [i for i in range(len(self.wav_lst))]
+         while 1:
+             if self.shuffle:
+                 shuffle(shuffle_list)
+             for i in range(len(self.wav_lst) // self.batch_size):
+                 begin = i * self.batch_size
+                 end = begin + self.batch_size
+                 sub_list = shuffle_list[begin:end]
+
+                 label_data_lst = [self.label_lst[index] for index in sub_list]
+                 prediction = np.array(
+                     [self.vocab.token_list.index(line[0]) for
+                      line in label_data_lst],
+                     dtype=np.int32)
+                 feature_lst = []
+                 wav_path = []
+                 get_next_batch = False
+                 for index in sub_list:
+                     # start = time.time()
+                     audio, _ = librosa.load(self.data_path + self.wav_lst[index], sr=16000)
+                     if len(audio) == 0:
+                         get_next_batch = True
+                         break
+                     feature = self.feature_extracter.extract(audio)
+
+                     feature_lst.append(feature)
+                     wav_path.append(self.wav_lst[index])
+
+                 if get_next_batch:
+                     continue  # get next batch
+
+                 features, input_length = wav_padding(feature_lst, self.max_audio_length, self.fbank_dim)
+
+                 yield features, input_length, prediction
+
+
+ def create_dataset(batch_size, load_type, data_type, speech_featurizer, config, vocab):
+     """
+     batch_size: global batch size
+     load_type: how the data is loaded, either 'txt' (wav paths + labels) or 'dat' (pre-extracted binary features)
+     data_type: which split to read, e.g. 'train', 'dev' or 'test'
+     """
+     if load_type == 'dat':
+         dataset = DatDataSet(batch_size, data_type, vocab, speech_featurizer, config)
+         dataset = tf.data.Dataset.from_generator(dataset.get_batch,
+                                                  output_types=(tf.float32, tf.int32, tf.int32),
+                                                  output_shapes=([None, None, config.speech_config['num_feature_bins']],
+                                                                 [None], [None]))
+     elif load_type == 'txt':
+         dataset = TxtDataSet(batch_size, data_type, vocab, speech_featurizer, config)
+         dataset = tf.data.Dataset.from_generator(dataset.get_batch,
+                                                  output_types=(tf.float32, tf.int32, tf.int32),
+                                                  output_shapes=([None, None, config.speech_config['num_feature_bins']],
+                                                                 [None], [None]))
+     else:
+         print('load_type must be dat or txt!!')
+         return
+
+     options = tf.data.Options()
+     options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
+     dataset = dataset.with_options(options)
+     return dataset
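
A short sketch (not part of the commit) of building the txt pipeline with create_dataset and pulling one batch; it assumes the repo root as working directory and that the demo wavs, ./data/demo_txt/demo_train.txt and vocab/vocab.txt referenced by the config actually exist:

from configs.config import Config
from featurizers.speech_featurizers import NumpySpeechFeaturizer
from vocab.vocab import Vocab
from dataset import create_dataset

config = Config("configs/config.yml")
vocab = Vocab(config.dataset_config["vocabulary"])
featurizer = NumpySpeechFeaturizer(config.speech_config)
ds = create_dataset(batch_size=1, load_type="txt", data_type="train",
                    speech_featurizer=featurizer, config=config, vocab=vocab)
features, input_length, label = next(iter(ds))   # shapes: [1, 2000, 80], [1], [1]
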
featurizers/__init__.py ADDED
File without changes
featurizers/gammatone.py ADDED
@@ -0,0 +1,233 @@
+ """ This code is inspired from https://github.com/detly/gammatone """
+
+ import numpy as np
+ import tensorflow as tf
+
+ from util.utils import shape_list
+
+ pi = tf.constant(np.pi, dtype=tf.complex64)
+
+ DEFAULT_FILTER_NUM = 100
+ DEFAULT_LOW_FREQ = 100
+ DEFAULT_HIGH_FREQ = 44100 / 4
+
+
+ def fft_weights(
+         nfft,
+         fs,
+         nfilts,
+         width,
+         fmin,
+         fmax,
+         maxlen):
+     """
+     :param nfft: the source FFT size
+     :param fs: sampling rate (Hz)
+     :param nfilts: the number of output bands required (default 64)
+     :param width: the constant width of each band in Bark (default 1)
+     :param fmin: lower limit of frequencies (Hz)
+     :param fmax: upper limit of frequencies (Hz)
+     :param maxlen: number of bins to truncate the rows to
+
+     :return: the weight matrix used to combine FFT bins into Gammatone bins,
+         transposed to shape [maxlen, nfilts]
+
+     Generate a matrix of weights to combine FFT bins into Gammatone bins.
+
+     Note about `maxlen` parameter: While wts has nfft columns, the second half
+     are all zero. Hence, aud spectrum is::
+
+         fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft))
+
+     `maxlen` truncates the rows to this many bins.
+
+     | (c) 2004-2009 Dan Ellis [email protected] based on rastamat/audspec.m
+     | (c) 2012 Jason Heeris (Python implementation)
+     """
+     ucirc = tf.exp(1j * 2 * pi * tf.cast(tf.range(0, nfft / 2 + 1),
+                                          tf.complex64) / nfft)[None, ...]
+
+     # Common ERB filter code factored out
+     cf_array = erb_space(fmin, fmax, nfilts)[::-1]
+
+     _, A11, A12, A13, A14, _, _, _, B2, gain = make_erb_filters(fs, cf_array, width)
+
+     A11, A12, A13, A14 = A11[..., None], A12[..., None], A13[..., None], A14[..., None]
+
+     r = tf.cast(tf.sqrt(B2), tf.complex64)
+     theta = 2 * pi * cf_array / fs
+     pole = (r * tf.exp(1j * theta))[..., None]
+
+     GTord = 4
+
+     weights = (
+         tf.abs(ucirc + A11 * fs) * tf.abs(ucirc + A12 * fs)
+         * tf.abs(ucirc + A13 * fs) * tf.abs(ucirc + A14 * fs)
+         * tf.abs(fs * (pole - ucirc) * (tf.math.conj(pole) - ucirc)) ** (-GTord)
+         / tf.cast(gain[..., None], tf.float32)
+     )
+
+     weights = tf.pad(weights, [[0, 0], [0, nfft - shape_list(weights)[-1]]])
+
+     weights = weights[:, 0:int(maxlen)]
+
+     return tf.transpose(weights, perm=[1, 0])
+
+
+ def erb_point(low_freq, high_freq, fraction):
+     """
+     Calculates a single point on an ERB scale between ``low_freq`` and
+     ``high_freq``, determined by ``fraction``. When ``fraction`` is ``1``,
+     ``low_freq`` will be returned. When ``fraction`` is ``0``, ``high_freq``
+     will be returned.
+
+     ``fraction`` can actually be outside the range ``[0, 1]``, which in general
+     isn't very meaningful, but might be useful when ``fraction`` is rounded a
+     little above or below ``[0, 1]`` (eg. for plot axis labels).
+     """
+     # Change the following three parameters if you wish to use a different ERB
+     # scale. Must change in MakeERBCoeffs too.
+     # TODO: Factor these parameters out
+     ear_q = 9.26449  # Glasberg and Moore Parameters
+     min_bw = 24.7
+
+     # All of the following expressions are derived in Apple TR #35, "An
+     # Efficient Implementation of the Patterson-Holdsworth Cochlear Filter
+     # Bank." See pages 33-34.
+     erb_point = (
+         -ear_q * min_bw
+         + tf.exp(
+             fraction * (
+                 -tf.math.log(high_freq + ear_q * min_bw)
+                 + tf.math.log(low_freq + ear_q * min_bw)
+             )
+         ) *
+         (high_freq + ear_q * min_bw)
+     )
+
+     return tf.cast(erb_point, tf.complex64)
+
+
+ def erb_space(
+         low_freq=DEFAULT_LOW_FREQ,
+         high_freq=DEFAULT_HIGH_FREQ,
+         num=DEFAULT_FILTER_NUM):
+     """
+     This function computes an array of ``num`` frequencies uniformly spaced
+     between ``high_freq`` and ``low_freq`` on an ERB scale.
+
+     For a definition of ERB, see Moore, B. C. J., and Glasberg, B. R. (1983).
+     "Suggested formulae for calculating auditory-filter bandwidths and
+     excitation patterns," J. Acoust. Soc. Am. 74, 750-753.
+     """
+     return erb_point(
+         low_freq,
+         high_freq,
+         tf.range(1, num + 1, dtype=tf.float32) / num
+     )
+
+
+ def make_erb_filters(fs, centre_freqs, width=1.0):
+     """
+     This function computes the filter coefficients for a bank of
+     Gammatone filters. These filters were defined by Patterson and Holdsworth for
+     simulating the cochlea.
+
+     The result is returned as a :class:`ERBCoeffArray`. Each row of the
+     filter arrays contains the coefficients for four second order filters. The
+     transfer function for these four filters share the same denominator (poles)
+     but have different numerators (zeros). All of these coefficients are
+     assembled into one vector that the ERBFilterBank can take apart to implement
+     the filter.
+
+     The filter bank contains "numChannels" channels that extend from
+     half the sampling rate (fs) to "lowFreq". Alternatively, if the numChannels
+     input argument is a vector, then the values of this vector are taken to be
+     the center frequency of each desired filter. (The lowFreq argument is
+     ignored in this case.)
+
+     Note this implementation fixes a problem in the original code by
+     computing four separate second order filters. This avoids a big problem with
+     round off errors in cases of very small cfs (100Hz) and large sample rates
+     (44kHz). The problem is caused by roundoff error when a number of poles are
+     combined, all very close to the unit circle. Small errors in the eighth order
+     coefficient are multiplied when the eighth root is taken to give the pole
+     location. These small errors lead to poles outside the unit circle and
+     instability. Thanks to Julius Smith for leading me to the proper
+     explanation.
+
+     Execute the following code to evaluate the frequency response of a 10
+     channel filterbank::
+
+         fcoefs = MakeERBFilters(16000,10,100);
+         y = ERBFilterBank([1 zeros(1,511)], fcoefs);
+         resp = 20*log10(abs(fft(y')));
+         freqScale = (0:511)/512*16000;
+         semilogx(freqScale(1:255),resp(1:255,:));
+         axis([100 16000 -60 0])
+         xlabel('Frequency (Hz)'); ylabel('Filter Response (dB)');
+
+     | Rewritten by Malcolm Slaney@Interval. June 11, 1998.
+     | (c) 1998 Interval Research Corporation
+     |
+     | (c) 2012 Jason Heeris (Python implementation)
+     """
+     T = 1 / fs
+     # Change the following three parameters if you wish to use a different
+     # ERB scale. Must change in ERBSpace too.
+     # TODO: factor these out
+     ear_q = 9.26449  # Glasberg and Moore Parameters
+     min_bw = 24.7
+     order = 1
+
+     erb = width * ((centre_freqs / ear_q) ** order + min_bw ** order) ** (1 / order)
+     B = 1.019 * 2 * pi * erb
+
+     arg = 2 * centre_freqs * pi * T
+     vec = tf.exp(2j * arg)
+
+     A0 = T
+     A2 = 0
+     B0 = 1
+     B1 = -2 * tf.cos(arg) / tf.exp(B * T)
+     B2 = tf.exp(-2 * B * T)
+
+     rt_pos = tf.cast(tf.sqrt(3 + 2 ** 1.5), tf.complex64)
+     rt_neg = tf.cast(tf.sqrt(3 - 2 ** 1.5), tf.complex64)
+
+     common = -T * tf.exp(-(B * T))
+
+     # TODO: This could be simplified to a matrix calculation involving the
+     # constant first term and the alternating rt_pos/rt_neg and +/-1 second
+     # terms
+     k11 = tf.cos(arg) + rt_pos * tf.sin(arg)
+     k12 = tf.cos(arg) - rt_pos * tf.sin(arg)
+     k13 = tf.cos(arg) + rt_neg * tf.sin(arg)
+     k14 = tf.cos(arg) - rt_neg * tf.sin(arg)
+
+     A11 = common * k11
+     A12 = common * k12
+     A13 = common * k13
+     A14 = common * k14
+
+     gain_arg = tf.exp(1j * arg - B * T)
+
+     gain = tf.cast(tf.abs(
+         (vec - gain_arg * k11)
+         * (vec - gain_arg * k12)
+         * (vec - gain_arg * k13)
+         * (vec - gain_arg * k14)
+         * (T * tf.exp(B * T)
+            / (-1 / tf.exp(B * T) + 1 + vec * (1 - tf.exp(B * T)))
+            )**4
+     ), tf.complex64)
+
+     allfilts = tf.ones_like(centre_freqs, dtype=tf.complex64)
+
+     fcoefs = tf.stack([
+         A0 * allfilts, A11, A12, A13, A14, A2 * allfilts,
+         B0 * allfilts, B1, B2,
+         gain
+     ], axis=1)
+
+     return tf.transpose(fcoefs, perm=[1, 0])
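
A small sketch (not part of the commit) of calling fft_weights the same way the featurizers below do, for the 16 kHz / 80-bin setup in configs/config.yml; it assumes util.utils.shape_list (imported above but not included in this diff) is available:

from featurizers.gammatone import fft_weights

nfft = 512   # next power of two above the 400-sample (25 ms) frame
weights = fft_weights(nfft, 16000, nfilts=80, width=1.0,
                      fmin=0, fmax=8000, maxlen=nfft / 2 + 1)
print(weights.shape)   # (257, 80): maps rFFT bins onto 80 gammatone bands
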
featurizers/speech_featurizers.py ADDED
@@ -0,0 +1,453 @@
+ import os
+ import io
+ import abc
+ import six
+ import numpy as np
+ import librosa
+ import soundfile as sf
+ import tensorflow as tf
+
+ from util.utils import log10
+ from .gammatone import fft_weights
+
+
+ def read_raw_audio(audio, sample_rate=16000):
+     if isinstance(audio, str):
+         wave, _ = librosa.load(os.path.expanduser(audio), sr=sample_rate)
+     elif isinstance(audio, bytes):
+         wave, sr = sf.read(io.BytesIO(audio))
+         wave = np.asfortranarray(wave)
+         if sr != sample_rate:
+             wave = librosa.resample(wave, sr, sample_rate)
+     elif isinstance(audio, np.ndarray):
+         return audio
+     else:
+         raise ValueError("input audio must be either a path or bytes")
+     return wave
+
+
+ def slice_signal(signal, window_size, stride=0.5) -> np.ndarray:
+     """ Return windows of the given signal by sweeping in stride fractions of window """
+     assert signal.ndim == 1, signal.ndim
+     n_samples = signal.shape[0]
+     offset = int(window_size * stride)
+     slices = []
+     for beg_i, end_i in zip(range(0, n_samples, offset),
+                             range(window_size, n_samples + offset,
+                                   offset)):
+         slice_ = signal[beg_i:end_i]
+         if slice_.shape[0] < window_size:
+             slice_ = np.pad(
+                 slice_, (0, window_size - slice_.shape[0]), 'constant', constant_values=0.0)
+         if slice_.shape[0] == window_size:
+             slices.append(slice_)
+     return np.array(slices, dtype=np.float32)
+
+
+ def tf_merge_slices(slices: tf.Tensor) -> tf.Tensor:
+     # slices shape = [batch, window_size]
+     return tf.keras.backend.flatten(slices)  # return shape = [-1, ]
+
+
+ def merge_slices(slices: np.ndarray) -> np.ndarray:
+     # slices shape = [batch, window_size]
+     return np.reshape(slices, [-1])
+
+
+ def normalize_audio_feature(audio_feature: np.ndarray, per_feature=False):
+     """ Mean and variance normalization """
+     axis = 0 if per_feature else None
+     mean = np.mean(audio_feature, axis=axis)
+     std_dev = np.std(audio_feature, axis=axis) + 1e-9
+     normalized = (audio_feature - mean) / std_dev
+     return normalized
+
+
+ def tf_normalize_audio_features(audio_feature: tf.Tensor, per_feature=False):
+     """
+     TF mean and variance feature normalization
+     Args:
+         audio_feature: tf.Tensor with shape [T, F]
+
+     Returns:
+         normalized audio features with shape [T, F]
+     """
+     axis = 0 if per_feature else None
+     mean = tf.reduce_mean(audio_feature, axis=axis)
+     std_dev = tf.math.reduce_std(audio_feature, axis=axis) + 1e-9
+     return (audio_feature - mean) / std_dev
+
+
+ def normalize_signal(signal: np.ndarray):
+     """ Normalize signal to [-1, 1] range """
+     gain = 1.0 / (np.max(np.abs(signal)) + 1e-9)
+     return signal * gain
+
+
+ def tf_normalize_signal(signal: tf.Tensor):
+     """
+     TF normalize signal to [-1, 1] range
+     Args:
+         signal: tf.Tensor with shape [None]
+
+     Returns:
+         normalized signal with shape [None]
+     """
+     gain = 1.0 / (tf.reduce_max(tf.abs(signal), axis=-1) + 1e-9)
+     return signal * gain
+
+
+ def preemphasis(signal: np.ndarray, coeff=0.97):
+     if not coeff or coeff <= 0.0:
+         return signal
+     return np.append(signal[0], signal[1:] - coeff * signal[:-1])
+
+
+ def tf_preemphasis(signal: tf.Tensor, coeff=0.97):
+     """
+     TF pre-emphasis
+     Args:
+         signal: tf.Tensor with shape [None]
+         coeff: Float that indicates the preemphasis coefficient
+
+     Returns:
+         pre-emphasized signal with shape [None]
+     """
+     if not coeff or coeff <= 0.0: return signal
+     s0 = tf.expand_dims(signal[0], axis=-1)
+     s1 = signal[1:] - coeff * signal[:-1]
+     return tf.concat([s0, s1], axis=-1)
+
+
+ def depreemphasis(signal: np.ndarray, coeff=0.97):
+     if not coeff or coeff <= 0.0: return signal
+     x = np.zeros(signal.shape[0], dtype=np.float32)
+     x[0] = signal[0]
+     for n in range(1, signal.shape[0], 1):
+         x[n] = coeff * x[n - 1] + signal[n]
+     return x
+
+
+ def tf_depreemphasis(signal: tf.Tensor, coeff=0.97):
+     """
+     TF depreemphasis
+     Args:
+         signal: tf.Tensor with shape [B, None]
+         coeff: Float that indicates the preemphasis coefficient
+
+     Returns:
+         depre-emphasized signal with shape [B, None]
+     """
+     if not coeff or coeff <= 0.0: return signal
+
+     def map_fn(elem):
+         x = tf.expand_dims(elem[0], axis=-1)
+         for n in range(1, elem.shape[0], 1):
+             current = coeff * x[n - 1] + elem[n]
+             x = tf.concat([x, [current]], axis=0)
+         return x
+
+     return tf.map_fn(map_fn, signal)
+
+
+ class SpeechFeaturizer(metaclass=abc.ABCMeta):
+     def __init__(self, speech_config: dict):
+         """
+         We should use TFSpeechFeaturizer for training to avoid differences
+         between tf and librosa when converting to tflite in the post-training stage
+         speech_config = {
+             "sample_rate": int,
+             "frame_ms": int,
+             "stride_ms": int,
+             "num_feature_bins": int,
+             "feature_type": str,
+             "delta": bool,
+             "delta_delta": bool,
+             "pitch": bool,
+             "normalize_signal": bool,
+             "normalize_feature": bool,
+             "normalize_per_feature": bool
+         }
+         """
+         # Samples
+         self.sample_rate = speech_config.get("sample_rate", 16000)
+         self.frame_length = int(self.sample_rate * (speech_config.get("frame_ms", 25) / 1000))
+         self.frame_step = int(self.sample_rate * (speech_config.get("stride_ms", 10) / 1000))
+         # Features
+         self.num_feature_bins = speech_config.get("num_feature_bins", 80)
+         self.feature_type = speech_config.get("feature_type", "log_mel_spectrogram")
+         self.preemphasis = speech_config.get("preemphasis", None)
+         # Normalization
+         self.normalize_signal = speech_config.get("normalize_signal", True)
+         self.normalize_feature = speech_config.get("normalize_feature", True)
+         self.normalize_per_feature = speech_config.get("normalize_per_feature", False)
+         # librosa mel filter
+         self.mel_filter = None
+
+     @property
+     def nfft(self) -> int:
+         """ Number of FFT """
+         return 2 ** (self.frame_length - 1).bit_length()
+
+     @property
+     def shape(self) -> list:
+         """ The shape of extracted features """
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def stft(self, signal):
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
+         raise NotImplementedError()
+
+     @abc.abstractmethod
+     def extract(self, signal):
+         """ Function to perform feature extraction """
+         raise NotImplementedError()
+
+
+ class NumpySpeechFeaturizer(SpeechFeaturizer):
+     def __init__(self, speech_config: dict):
+         super(NumpySpeechFeaturizer, self).__init__(speech_config)
+         self.delta = speech_config.get("delta", False)
+         self.delta_delta = speech_config.get("delta_delta", False)
+         self.pitch = speech_config.get("pitch", False)
+
+     @property
+     def shape(self) -> list:
+         # None for time dimension
+         channel_dim = 1
+
+         if self.delta:
+             channel_dim += 1
+
+         if self.delta_delta:
+             channel_dim += 1
+
+         if self.pitch:
+             channel_dim += 1
+
+         return [None, self.num_feature_bins, channel_dim]
+
+     def stft(self, signal):
+         return np.square(
+             np.abs(librosa.core.stft(signal, n_fft=self.nfft, hop_length=self.frame_step,
+                                      win_length=self.frame_length, center=True, window="hann")))
+
+     def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
+         return librosa.power_to_db(S, ref=ref, amin=amin, top_db=top_db)
+
+     def extract(self, signal: np.ndarray) -> np.ndarray:
+         signal = np.asfortranarray(signal)
+         if self.normalize_signal:
+             signal = normalize_signal(signal)
+         signal = preemphasis(signal, self.preemphasis)
+
+         if self.feature_type == "mfcc":
+             features = self.compute_mfcc(signal)
+         elif self.feature_type == "log_mel_spectrogram":
+             features = self.compute_log_mel_spectrogram(signal)
+         elif self.feature_type == "spectrogram":
+             features = self.compute_spectrogram(signal)
+         elif self.feature_type == "log_gammatone_spectrogram":
+             features = self.compute_log_gammatone_spectrogram(signal)
+         else:
+             raise ValueError("feature_type must be either 'mfcc', "
+                              "'log_mel_spectrogram', 'log_gammatone_spectrogram' "
+                              "or 'spectrogram'")
+
+         if self.normalize_feature:
+             features = normalize_audio_feature(features, per_feature=self.normalize_per_feature)
+
+         # features = np.expand_dims(features, axis=-1)
+
+         return features
+
+     def compute_pitch(self, signal: np.ndarray) -> np.ndarray:
+         pitches, _ = librosa.core.piptrack(
+             y=signal, sr=self.sample_rate,
+             n_fft=self.nfft, hop_length=self.frame_step,
+             fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=True
+         )
+
+         pitches = pitches.T
+
+         assert self.num_feature_bins <= self.frame_length // 2 + 1, \
+             "num_features for spectrogram should \
+             be <= (sample_rate * window_size // 2 + 1)"
+
+         return pitches[:, :self.num_feature_bins]
+
+     def compute_spectrogram(self, signal: np.ndarray) -> np.ndarray:
+         powspec = self.stft(signal)
+         features = self.power_to_db(powspec.T)
+
+         assert self.num_feature_bins <= self.frame_length // 2 + 1, \
+             "num_features for spectrogram should \
+             be <= (sample_rate * window_size // 2 + 1)"
+
+         # cut high frequency part, keep num_feature_bins features
+         features = features[:, :self.num_feature_bins]
+
+         return features
+
+     def compute_mfcc(self, signal: np.ndarray) -> np.ndarray:
+         S = self.stft(signal)
+
+         mel = librosa.filters.mel(self.sample_rate, self.nfft,
+                                   n_mels=self.num_feature_bins,
+                                   fmin=0.0, fmax=int(self.sample_rate / 2))
+
+         mel_spectrogram = np.dot(S.T, mel.T)
+
+         mfcc = librosa.feature.mfcc(sr=self.sample_rate,
+                                     S=self.power_to_db(mel_spectrogram).T,
+                                     n_mfcc=self.num_feature_bins)
+
+         return mfcc.T
+
+     def compute_log_mel_spectrogram(self, signal: np.ndarray) -> np.ndarray:
+         S = self.stft(signal)
+
+         mel = librosa.filters.mel(self.sample_rate, self.nfft,
+                                   n_mels=self.num_feature_bins,
+                                   fmin=0.0, fmax=int(self.sample_rate / 2))
+
+         mel_spectrogram = np.dot(S.T, mel.T)
+
+         return self.power_to_db(mel_spectrogram)
+
+     def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray:
+         S = self.stft(signal)
+
+         gammatone = fft_weights(self.nfft, self.sample_rate,
+                                 self.num_feature_bins, width=1.0,
+                                 fmin=0, fmax=int(self.sample_rate / 2),
+                                 maxlen=(self.nfft / 2 + 1))
+
+         gammatone = gammatone.numpy().astype(np.float32)
+
+         gammatone_spectrogram = np.dot(S.T, gammatone)
+
+         return self.power_to_db(gammatone_spectrogram)
+
+
+ class TFSpeechFeaturizer(SpeechFeaturizer):
+     @property
+     def shape(self) -> list:
+         # None for time dimension
+         return [None, self.num_feature_bins, 1]
+
+     def stft(self, signal):
+         signal = tf.pad(signal, [[self.nfft // 2, self.nfft // 2]], mode="REFLECT")
+         window = tf.signal.hann_window(self.frame_length, periodic=True)
+         left_pad = (self.nfft - self.frame_length) // 2
+         right_pad = self.nfft - self.frame_length - left_pad
+         window = tf.pad(window, [[left_pad, right_pad]])
+         framed_signals = tf.signal.frame(signal, frame_length=self.nfft, frame_step=self.frame_step)
+         framed_signals *= window
+         return tf.square(tf.abs(tf.signal.rfft(framed_signals, [self.nfft])))
+
+     def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
+         if amin <= 0:
+             raise ValueError('amin must be strictly positive')
+
+         magnitude = S
+
+         if six.callable(ref):
+             # User supplied a function to calculate reference power
+             ref_value = ref(magnitude)
+         else:
+             ref_value = np.abs(ref)
+
+         log_spec = 10.0 * log10(tf.maximum(amin, magnitude))
+         log_spec -= 10.0 * log10(tf.maximum(amin, ref_value))
+
+         if top_db is not None:
+             if top_db < 0:
+                 raise ValueError('top_db must be non-negative')
+             log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)
+
+         return log_spec
+
+     def extract(self, signal: np.ndarray) -> np.ndarray:
+         signal = np.asfortranarray(signal)
+         features = self.tf_extract(tf.convert_to_tensor(signal, dtype=tf.float32))
+         return features.numpy()
+
+     def tf_extract(self, signal: tf.Tensor) -> tf.Tensor:
+         """
+         Extract speech features from signals (for use in tflite)
+         Args:
+             signal: tf.Tensor with shape [None]
+
+         Returns:
+             features: tf.Tensor with shape [T, F]
+         """
+         if self.normalize_signal:
+             signal = tf_normalize_signal(signal)
+         signal = tf_preemphasis(signal, self.preemphasis)
+
+         if self.feature_type == "spectrogram":
+             features = self.compute_spectrogram(signal)
+         elif self.feature_type == "log_mel_spectrogram":
+             features = self.compute_log_mel_spectrogram(signal)
+         elif self.feature_type == "mfcc":
+             features = self.compute_mfcc(signal)
+         elif self.feature_type == "log_gammatone_spectrogram":
+             features = self.compute_log_gammatone_spectrogram(signal)
+         else:
+             raise ValueError("feature_type must be either 'mfcc', "
+                              "'log_mel_spectrogram', 'log_gammatone_spectrogram' "
+                              "or 'spectrogram'")
+
+         if self.normalize_feature:
+             features = tf_normalize_audio_features(
+                 features, per_feature=self.normalize_per_feature)
+
+         # features = tf.expand_dims(features, axis=-1)
+
+         return features
+
+     def compute_log_mel_spectrogram(self, signal):
+         spectrogram = self.stft(signal)
+         if self.mel_filter is None:
+             linear_to_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
+                 num_mel_bins=self.num_feature_bins,
+                 num_spectrogram_bins=spectrogram.shape[-1],
+                 sample_rate=self.sample_rate,
+                 lower_edge_hertz=0.0, upper_edge_hertz=(self.sample_rate / 2)
+             )
+         else:
+             linear_to_weight_matrix = self.mel_filter
+
+         mel_spectrogram = tf.tensordot(spectrogram, linear_to_weight_matrix, 1)
+         return self.power_to_db(mel_spectrogram)
+
+     def compute_spectrogram(self, signal):
+         S = self.stft(signal)
+         spectrogram = self.power_to_db(S)
+         return spectrogram[:, :self.num_feature_bins]
+
+     def compute_mfcc(self, signal):
+         log_mel_spectrogram = self.compute_log_mel_spectrogram(signal)
+         return tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram)
+
+     def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray:
+         S = self.stft(signal)
+
+         gammatone = fft_weights(self.nfft, self.sample_rate,
+                                 self.num_feature_bins, width=1.0,
+                                 fmin=0, fmax=int(self.sample_rate / 2),
+                                 maxlen=(self.nfft / 2 + 1))
+
+         gammatone_spectrogram = tf.tensordot(S, gammatone, 1)
+
+         return self.power_to_db(gammatone_spectrogram)
+
+     def set_mel_filter(self, librosa_mel_filter):
+         """
+         Set librosa mel filter.
+         """
+         self.mel_filter = librosa_mel_filter
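
A short sketch (not part of the commit) of extracting log-mel features with the numpy featurizer, using the speech_config values from configs/config.yml; it assumes a librosa version compatible with the positional librosa.filters.mel call used above:

import numpy as np
from featurizers.speech_featurizers import NumpySpeechFeaturizer

speech_config = {"sample_rate": 16000, "frame_ms": 25, "stride_ms": 10,
                 "num_feature_bins": 80, "feature_type": "log_mel_spectrogram",
                 "preemphasis": 0.97, "normalize_signal": True,
                 "normalize_feature": True, "normalize_per_feature": False}
featurizer = NumpySpeechFeaturizer(speech_config)
signal = np.random.randn(16000).astype(np.float32)   # 1 s of dummy audio
features = featurizer.extract(signal)
print(features.shape)   # about (101, 80): one 80-dim log-mel frame every 10 ms
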
librosa_mel_filter.csv ADDED
The diff for this file is too large to render. See raw diff
 
models/__init__.py ADDED
File without changes
models/layers/__init__.py ADDED
File without changes
models/layers/attention.py ADDED
@@ -0,0 +1,35 @@
+
+ import tensorflow as tf
+
+
+ class Attention(tf.keras.layers.Layer):
+     def __init__(self, hidden_size,
+                  attention_size=1,
+                  name=None,
+                  **kwargs):
+         super().__init__(name=name, **kwargs)
+         self.w_kernel = self.add_weight('w_kernel', [hidden_size, attention_size])
+         self.w_bias = self.add_weight('w_bias', [attention_size])
+         self.bias = self.add_weight('bias', [attention_size])
+
+     def call(self, inputs, inp_len, maxlen=150, mask=None, training=False, **kwargs):
+         """
+         inp_len: length of the input audio
+         maxlen: audio length after downsampling (cnn (twice downsample) and maxpool); in our experiments
+             the input length is 1200 frames, so after downsampling the sequence length is 1200//8=150
+             (8=2*2*2, see the model parameters for details).
+             If you change the input length or the number of downsampling steps,
+             please reset the maxlen parameter!!!!
+         """
+         # In case of a Bi-RNN, concatenate the forward and the backward RNN outputs.
+         if isinstance(inputs, tuple):
+             inputs = tf.concat(inputs, 2)
+         v = tf.sigmoid(tf.tensordot(inputs, self.w_kernel, axes=1) + self.w_bias)
+         vu = tf.tensordot(v, self.bias, axes=1)
+         alphas = tf.nn.softmax(vu)  # (B, T)
+         if mask is not None:
+             alphas = alphas * tf.cast(tf.sequence_mask(inp_len, maxlen), dtype=tf.float32)
+         output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
+
+         return output
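
A shape sketch (not part of the commit) for the layer above, assuming a batch of GRU outputs with rnn_cell=256 units and 150 downsampled frames:

import tensorflow as tf
from models.layers.attention import Attention

att = Attention(hidden_size=256)
x = tf.random.normal([4, 150, 256])        # [batch, time, rnn_cell]
x_len = tf.constant([150, 120, 90, 60])    # valid frames per utterance
pooled = att(x, x_len, maxlen=150, mask=True)
print(pooled.shape)                         # (4, 256): one pooled vector per utterance
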
models/model.py ADDED
@@ -0,0 +1,98 @@
+ import tensorflow as tf
+ from featurizers.speech_featurizers import SpeechFeaturizer
+ from .layers.attention import Attention
+
+
+ L2 = tf.keras.regularizers.l2(1e-6)
+
+
+ def shape_list(x, out_type=tf.int32):
+     """Deal with dynamic shape in tensorflow cleanly."""
+     static = x.shape.as_list()
+     dynamic = tf.shape(x, out_type=out_type)
+     return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+
+
+ def merge_two_last_dims(x):
+     b, _, f, c = shape_list(x)
+     return tf.reshape(x, shape=[b, -1, f * c])
+
+
+ class MulSpeechLR(tf.keras.Model):
+     def __init__(self, name, filters, kernel_size, d_model, rnn_cell, seq_mask, vocab_size, dropout=0.5):
+         super(MulSpeechLR, self).__init__()
+         self.filters1 = filters[0]
+         self.filters2 = filters[1]
+         self.filters3 = filters[2]
+         self.kernel_size1 = kernel_size[0]
+         self.kernel_size2 = kernel_size[1]
+         self.kernel_size3 = kernel_size[2]
+         # during training, self.mask can be set true, but during inference, it must be false
+         self.mask = seq_mask
+         self.conv1 = tf.keras.layers.Conv2D(filters=self.filters1, kernel_size=self.kernel_size1,
+                                             strides=(2, 2), padding='same', activation='relu')
+         self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2))
+
+         self.conv2 = tf.keras.layers.Conv2D(filters=self.filters2, kernel_size=self.kernel_size2,
+                                             strides=(2, 2), padding='same', activation='relu')
+         self.conv3 = tf.keras.layers.Conv2D(filters=self.filters3, kernel_size=self.kernel_size3,
+                                             strides=(1, 1), padding='same', activation='relu')
+         self.ln1 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_1")
+         self.ln2 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_2")
+         self.ln3 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_3")
+         # self.linear1 = tf.keras.layers.Dense(d_model*2, name=f"{name}_dense_1")
+         self.linear2 = tf.keras.layers.Dense(d_model, name=f"{name}_dense_2")
+         self.rnn = tf.keras.layers.GRU(rnn_cell, return_sequences=True, return_state=True, name=f"{name}_gru")
+         self.attention = Attention(rnn_cell)
+         self.class_layer = tf.keras.layers.Dense(vocab_size)
+         self.res_add = tf.keras.layers.Add(name=f"{name}_add")
+
+     def call(self, inputs):
+         x, x_len = inputs
+         # mask = tf.cast(tf.sequence_mask(x_len, maxlen=150), dtype=tf.float32)
+         x = tf.expand_dims(x, axis=-1)
+         x = self.conv1(x)
+         x = self.ln1(x)
+         x = self.maxpool1(x)
+         x = self.conv2(x)
+         x = self.ln2(x)
+         x = self.conv3(x)
+         x = self.ln3(x)
+         x = merge_two_last_dims(x)
+         x, final_state = self.rnn(x)
+         x = self.attention(x, x_len, self.mask)
+         x = self.res_add([x, final_state])
+         output = self.linear2(x)
+         output = tf.nn.relu(output)
+         output = self.class_layer(output)
+
+         return output
+
+     def init_build(self, input_shape):
+         x = tf.keras.Input(shape=input_shape, dtype=tf.float32)
+         l = tf.keras.Input(shape=[], dtype=tf.int32)
+         self([x, l], training=False)
+
+     def add_featurizers(self,
+                         speech_featurizer: SpeechFeaturizer):
+         """
+         Function to add a featurizer to the model to convert it to end2end tflite
+         Args:
+             speech_featurizer: SpeechFeaturizer instance
+         """
+         self.speech_featurizer = speech_featurizer
+
+     @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.float32)])
+     def predict_pb(self, signal):
+         features = self.speech_featurizer.tf_extract(signal)
+         input_len = tf.expand_dims(tf.shape(features)[0], axis=0)
+         input = tf.expand_dims(features, axis=0)
+         output = self([input, input_len], training=False)
+         output = tf.nn.softmax(output)
+         output1 = tf.squeeze(output)
+         output = tf.argmax(output1, axis=-1)
+
+         return output, tf.gather(output1, output)
optimizers/__init.py ADDED
File without changes
optimizers/schedules.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright 2023 by zhongying
+
+ import tensorflow as tf
+ from tensorflow.python.framework import ops
+ from tensorflow.python.ops import math_ops
+ from tensorflow.keras.optimizers.schedules import ExponentialDecay
+
+
+ class TransformerLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+     """ Transformer learning rate schedule """
+
+     def __init__(self, d_model, init_steps=0, warmup_steps=4000, max_lr=None):
+         super(TransformerLRSchedule, self).__init__()
+
+         self.d_model = d_model
+         self.d_model = tf.cast(self.d_model, tf.float32)
+         self.max_lr = max_lr
+         self.warmup_steps = warmup_steps
+         self.init_steps = init_steps
+
+     def __call__(self, step):
+         # lr = (d_model^-0.5) * min(step^-0.5, step*(warm_up^-1.5))
+         step = tf.cast(step, tf.float32) + self.init_steps  # step may arrive as an integer tensor
+         arg1 = tf.math.rsqrt(step)
+         arg2 = step * (self.warmup_steps ** -1.5)
+         lr = tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
+         if self.max_lr is not None:
+             return tf.math.minimum(self.max_lr, lr)
+         return lr
+
+     def get_config(self):
+         return {
+             "d_model": self.d_model,
+             "warmup_steps": self.warmup_steps,
+             "max_lr": self.max_lr
+         }
+
+
+ class SANSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
+     def __init__(self, lamb, d_model, warmup_steps=4000):
+         super(SANSchedule, self).__init__()
+
+         self.lamb = tf.cast(lamb, tf.float32)
+         self.d_model = tf.cast(d_model, tf.float32)
+
+         self.warmup_steps = tf.cast(warmup_steps, tf.float32)
+
+     def __call__(self, step):
+         arg1 = step / (self.warmup_steps ** 1.5)
+         arg2 = 1 / tf.math.sqrt(step)
+
+         return (self.lamb / tf.math.sqrt(self.d_model)) * tf.math.minimum(arg1, arg2)
+
+     def get_config(self):
+         return {
+             "lamb": self.lamb,
+             "d_model": self.d_model,
+             "warmup_steps": self.warmup_steps
+         }
+
+
+ class BoundExponentialDecay(ExponentialDecay):
+     def __init__(self, min_lr=0.0, **kwargs):
+         super().__init__(**kwargs)
+         self.min_lr = min_lr
+
+     def __call__(self, step):
+         with ops.name_scope_v2(self.name or "ExponentialDecay") as name:
+             initial_learning_rate = ops.convert_to_tensor(
+                 self.initial_learning_rate, name="initial_learning_rate")
+             dtype = initial_learning_rate.dtype
+             decay_steps = math_ops.cast(self.decay_steps, dtype)
+             decay_rate = math_ops.cast(self.decay_rate, dtype)
+
+             global_step_recomp = math_ops.cast(step, dtype)
+             p = global_step_recomp / decay_steps
+             if self.staircase:
+                 p = math_ops.floor(p)
+             new_lr = math_ops.multiply(
+                 initial_learning_rate, math_ops.pow(decay_rate, p), name=name)
+             return math_ops.maximum(self.min_lr, new_lr)
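
A small sketch (not part of the commit) of what TransformerLRSchedule produces with the d_model=64, warmup_steps=10000, max_lr=1e-4 values from configs/config.yml:

import tensorflow as tf
from optimizers.schedules import TransformerLRSchedule

# lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5), clipped at max_lr
schedule = TransformerLRSchedule(d_model=64, warmup_steps=10000, max_lr=1e-4)
for step in [100.0, 10000.0, 100000.0]:
    print(int(step), float(schedule(step)))
# the rate ramps up linearly over the warmup and then decays as step^-0.5;
# the schedule object can be passed directly to a Keras optimizer:
optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)
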
predict_by_pb.py ADDED
@@ -0,0 +1,30 @@
+ import tensorflow as tf
+ gpus = tf.config.list_physical_devices('GPU')
+ tf.config.set_visible_devices(gpus[0:1], 'GPU')
+ from vocab.vocab import Vocab
+ import librosa
+ import numpy as np
+ import sys
+ import os
+ from tqdm import tqdm
+ from sklearn.metrics import accuracy_score
+
+
+ vocab = Vocab("vocab/vocab.txt")
+ model = tf.saved_model.load('saved_models/lang14/pb/2/')
+
+
+ def predict_wav(wav_path):
+     signal, _ = librosa.load(wav_path, sr=16000)
+     output, prob = model.predict_pb(signal)
+     language = vocab.token_list[output.numpy()]
+     print(language, prob.numpy() * 100)
+
+     return output.numpy(), prob.numpy()
+
+
+ if __name__ == '__main__':
+     wav_path = sys.argv[1]
+     predict_wav(wav_path)
+
predict_by_weights.py ADDED
@@ -0,0 +1,36 @@
+ import tensorflow as tf
+ gpus = tf.config.list_physical_devices('GPU')
+ tf.config.set_visible_devices(gpus[0:1], 'GPU')
+ from vocab.vocab import Vocab
+ from dataset import create_dataset
+ from configs.config import Config
+ import sys
+ from featurizers.speech_featurizers import TFSpeechFeaturizer, NumpySpeechFeaturizer
+ from models.model import MulSpeechLR as Model
+ import librosa
+
+
+ weights_dir = './saved_weights/20230228-084356/'
+ config_file = weights_dir + 'config.yml'
+ model_file = weights_dir + 'last/model'
+ vocab_file = weights_dir + 'vocab.txt'
+ config = Config(config_file)
+ speech_featurizer = TFSpeechFeaturizer(config.speech_config)
+ lr_vocab = Vocab(vocab_file)
+ lr_model = Model(**config.model_config, vocab_size=len(lr_vocab.token_list))
+ lr_model.load_weights(model_file)
+ lr_model.add_featurizers(speech_featurizer)
+ lr_model.init_build([None, config.speech_config['num_feature_bins']])
+ lr_model.summary()
+
+
+ def predict_wav(wav_path):
+     sample_rate = 16000
+     signal, _ = librosa.load(wav_path, sr=sample_rate)
+     predict, prob = lr_model.predict_pb(signal)
+     language = lr_vocab.token_list[predict.numpy()]
+     print("predict language={} prob={:.4f}".format(language, prob.numpy() * 100))
+
+
+ if __name__ == '__main__':
+     wav_path = sys.argv[1]
+     predict_wav(wav_path)
train.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # copyright by speechflow 2023/03/17
3
+
4
+ import argparse
5
+ import tensorflow as tf
6
+ gpus = tf.config.list_physical_devices('GPU')
7
+ # tf.config.set_visible_devices(gpus[0:1], 'GPU')
8
+ import datetime
9
+ import time
10
+ import os
11
+ from shutil import copyfile
12
+ import matplotlib.pyplot as plt
13
+ from vocab.vocab import Vocab
14
+ from configs.config import Config
15
+ from models.model import MulSpeechLR as Model
16
+ from termcolor import colored
17
+ from featurizers.speech_featurizers import NumpySpeechFeaturizer
18
+ from dataset import create_dataset
19
+ import tensorflow_addons as tfa
20
+ from sklearn.metrics import f1_score, recall_score, precision_score
21
+ mirrored_strategy = tf.distribute.MirroredStrategy()
22
+
23
+
24
+ def train(config_file):
25
+ config = Config(config_file)
26
+ current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
27
+ dir_log_root = "./saved_weights/"
28
+ if not os.path.exists(dir_log_root):
29
+ os.mkdir(dir_log_root)
30
+ dir_current = dir_log_root + current_time
31
+ if not os.path.isdir(dir_log_root):
32
+ os.mkdir(dir_log_root)
33
+ if not os.path.isdir(dir_current):
34
+ os.mkdir(dir_current)
35
+ copyfile(config_file, dir_current + '/config.yml')
36
+ log_file = open(dir_current + '/log.txt', 'w')
37
+ copyfile(config.dataset_config['vocabulary'], dir_current + '/vocab.txt')
38
+
39
+
40
+ config.print()
41
+ log_file.write(config.toString())
42
+ # vocab_file.write(config.toString())
43
+ log_file.flush()
44
+
45
+ vocab = Vocab(config.dataset_config['vocabulary'])
46
+ batch_size = config.running_config['batch_size']
47
+ global_batch_size = batch_size * mirrored_strategy.num_replicas_in_sync
48
+ speech_featurizer = NumpySpeechFeaturizer(config.speech_config)
49
+ model = Model(**config.model_config, vocab_size=len(vocab.token_list))
50
+ if config.running_config['load_weights'] is not None:
51
+ model.load_weights(config.running_config['load_weights'])
52
+ model.add_featurizers(speech_featurizer)
53
+ model.init_build([None, config.speech_config['num_feature_bins']])
54
+ model.summary()
55
+
56
+ train_dataset = create_dataset(batch_size=global_batch_size,
57
+ load_type=config.dataset_config['load_type'],
58
+ data_type=config.dataset_config['train'],
59
+ speech_featurizer=speech_featurizer,
60
+ config = config,
61
+ vocab = vocab)
62
+ eval_dataset = create_dataset(batch_size=global_batch_size,
63
+ load_type=config.dataset_config['load_type'],
64
+ data_type=config.dataset_config['dev'],
65
+ speech_featurizer=speech_featurizer,
66
+ config = config,
67
+ vocab = vocab)
68
+ test_dataset = create_dataset(batch_size=global_batch_size,
69
+ load_type=config.dataset_config['load_type'],
70
+ data_type=config.dataset_config['test'],
71
+ speech_featurizer=speech_featurizer,
72
+ config = config,
73
+ vocab = vocab)
74
+ train_dist_batch = mirrored_strategy.experimental_distribute_dataset(train_dataset)
75
+ dev_dist_batch = mirrored_strategy.experimental_distribute_dataset(eval_dataset)
76
+ test_dist_batch = mirrored_strategy.experimental_distribute_dataset(test_dataset)
77
+ dev_loss = tf.keras.metrics.Mean(name='dev_loss')
78
+ train_loss = tf.keras.metrics.Mean(name='train_loss')
79
+ dev_accuracy = tf.keras.metrics.Mean(name='train_accuracy')
80
+ init_steps = config.optimizer_config['init_steps']
81
+ step = tf.Variable(init_steps)
82
+
83
+ optimizer = tf.keras.optimizers.Adam(lr=config.optimizer_config['max_lr'])
84
+ ckpt = tf.train.Checkpoint(step=step, optimizer=optimizer, model=model)
85
+ ckpt_manager = tf.train.CheckpointManager(ckpt, dir_current + '/ckpt', max_to_keep=5)
86
+ loss_object = tfa.losses.SigmoidFocalCrossEntropy(
87
+ from_logits = True,
88
+ alpha = 0.25,
89
+ gamma = 0,
90
+ reduction = tf.keras.losses.Reduction.NONE)
91
+ loss_object_label_smooth = tf.keras.losses.CategoricalCrossentropy(
92
+ from_logits=True, label_smoothing=0.1, reduction=tf.keras.losses.Reduction.NONE)
93
+
94
+ def compute_loss(real, pred, smooth=False):
95
+ if smooth:
96
+ loss_ = loss_object_label_smooth(tf.one_hot(real, len(vocab.token_list)), pred)
97
+ else:
98
+ real = tf.one_hot(real, len(vocab.token_list))
99
+ loss_ = loss_object(real, pred)
100
+ return tf.nn.compute_average_loss(loss_, global_batch_size=global_batch_size)
101
+
102
+ def accuracy_function(real, pred):
103
+ pred = tf.cast(pred, dtype=tf.int32)
104
+ accuracies = tf.equal(real, pred)
105
+
106
+ mask = tf.math.logical_not(tf.math.equal(real, 0))
107
+ accuracies = tf.math.logical_and(mask, accuracies)
108
+
109
+ accuracies = tf.cast(accuracies, dtype=tf.float32)
110
+ mask = tf.cast(mask, dtype=tf.float32)
111
+
112
+ return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)
113
+
114
+ @tf.function
+ def train_step(input, input_length, target):
+     with tf.GradientTape() as tape:
+         predictions = model([input, input_length], training=True)
+         loss = compute_loss(target, predictions, smooth=True)
+     grads = tape.gradient(loss, model.trainable_variables)
+     optimizer.apply_gradients(zip(grads, model.trainable_variables))
+     return loss
+
+ @tf.function
+ def dev_step(input, input_length, target):
+     predictions = model([input, input_length], training=False)
+     t_loss = compute_loss(target, predictions, smooth=True)
+
+     return t_loss, predictions
+
+ @tf.function
+ def test_step(input, input_length, target):
+     predictions = model([input, input_length], training=False)
+     return predictions, target
+
+ @tf.function(experimental_relax_shapes=True)
+ def distributed_train_step(x, x_len, y):
+     per_replica_losses = mirrored_strategy.run(train_step, args=(x, x_len, y))
+     # Each replica already divided by the global batch size, so a SUM
+     # across replicas gives the global mean loss.
+     mean_loss = mirrored_strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
+     return mean_loss
+
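Why a SUM (rather than MEAN) reduction is correct here: compute_loss divides every per-example loss by the global batch size, so summing the per-replica values reproduces the mean over the whole global batch. A self-contained sketch with two hypothetical replicas of four examples each:

    import tensorflow as tf

    global_batch = 8  # assumed: 2 replicas x 4 examples per replica
    replica_0 = tf.constant([1.0, 2.0, 3.0, 4.0])  # per-example losses on replica 0
    replica_1 = tf.constant([5.0, 6.0, 7.0, 8.0])  # per-example losses on replica 1

    l0 = tf.nn.compute_average_loss(replica_0, global_batch_size=global_batch)
    l1 = tf.nn.compute_average_loss(replica_1, global_batch_size=global_batch)

    print(float(l0 + l1))  # 4.5 == (1 + 2 + ... + 8) / 8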
+ @tf.function(experimental_relax_shapes=True)
+ def distributed_dev_step(x, x_len, y):
+     per_replica_losses, per_replica_preds = mirrored_strategy.run(dev_step, args=(x, x_len, y))
+     mean_loss = mirrored_strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
+     return mean_loss, per_replica_preds
+
+
+ @tf.function(experimental_relax_shapes=True)
+ def distributed_test_step(x, x_len, y):
+     return mirrored_strategy.run(test_step, args=(x, x_len, y))
+
+ plot_train_loss = []
+ plot_dev_loss = []
+ plot_acc, plot_precision = [], []
+ best_acc = 0  # tracks the best macro precision seen so far (see the test loop below)
+ train_iter = iter(train_dist_batch)
+ dev_iter = iter(dev_dist_batch)
+ test_iter = iter(test_dist_batch)
+
+ for epoch in range(1, config.running_config['num_epochs'] + 1):
+     # re-create the iterators each epoch for datasets loaded from txt lists
+     if config.dataset_config['load_type'] == 'txt':
+         train_iter = iter(train_dist_batch)
+         dev_iter = iter(dev_dist_batch)
+         test_iter = iter(test_dist_batch)
+     start = time.time()
+     # training loop (these floats shadow the tf.keras.metrics.Mean objects above)
+     train_loss = 0.0
+     dev_loss = 0.0
+     for train_batches in range(config.running_config['train_steps']):
+         inp, inp_len, target = next(train_iter)
+         train_loss += distributed_train_step(inp, inp_len, target)
+         template = '\rEpoch {} Step {} Loss {:.4f}'
+         print(colored(template.format(
+             epoch, train_batches + 1, train_loss / (train_batches + 1),
+         ), 'green'), end='', flush=True)
+         step.assign_add(1)
+
+     # validation loop
+     # (the leading zero is a placeholder entry; accuracy_function masks label 0)
+     pred_all = tf.zeros([1], dtype=tf.int32)
+     true_all = tf.zeros([1], dtype=tf.int32)
+     for dev_batches in range(config.running_config['dev_steps']):
+         inp, inp_len, target = next(dev_iter)
+         loss, predicted_result = distributed_dev_step(inp, inp_len, target)
+         dev_loss += loss
+         if mirrored_strategy.num_replicas_in_sync == 1:
+             prediction = tf.nn.softmax(predicted_result)
+             y_pred = tf.argmax(prediction, axis=-1)
+             y_pred = tf.cast(y_pred, dtype=tf.int32)
+             pred_all = tf.concat([pred_all, y_pred], axis=0)
+             true_all = tf.concat([true_all, target], axis=0)
+         else:
+             # gather predictions and targets from every replica
+             for i in range(mirrored_strategy.num_replicas_in_sync):
+                 predicted_result_per_replica = predicted_result.values[i]
+                 y_true = target.values[i]
+                 y_pred = tf.argmax(predicted_result_per_replica, axis=-1)
+                 y_pred = tf.cast(y_pred, dtype=tf.int32)
+                 pred_all = tf.concat([pred_all, y_pred], axis=0)
+                 true_all = tf.concat([true_all, y_true], axis=0)
+     dev_accuracy = accuracy_function(true_all, pred_all)
+
+     # test loop
+     pred_all = tf.zeros([1], dtype=tf.int32)
+     true_all = tf.zeros([1], dtype=tf.int32)
+     for test_batches in range(config.running_config['test_steps']):
+         inp, inp_len, target = next(test_iter)
+         predicted_result, target_result = distributed_test_step(inp, inp_len, target)
+         if mirrored_strategy.num_replicas_in_sync == 1:
+             prediction = tf.nn.softmax(predicted_result)
+             y_pred = tf.argmax(prediction, axis=-1)
+             y_pred = tf.cast(y_pred, dtype=tf.int32)
+             pred_all = tf.concat([pred_all, y_pred], axis=0)
+             true_all = tf.concat([true_all, target], axis=0)
+         else:
+             for replica in range(mirrored_strategy.num_replicas_in_sync):
+                 predicted_result_per_replica = predicted_result.values[replica]
+                 y_true = target.values[replica]
+                 y_pred = tf.argmax(predicted_result_per_replica, axis=-1)
+                 y_pred = tf.cast(y_pred, dtype=tf.int32)
+                 pred_all = tf.concat([pred_all, y_pred], axis=0)
+                 true_all = tf.concat([true_all, y_true], axis=0)
+
+     test_acc = accuracy_function(real=true_all, pred=pred_all)
+
+     test_f1 = f1_score(y_true=true_all, y_pred=pred_all, average='macro')
+     precision = precision_score(y_true=true_all, y_pred=pred_all, average='macro', zero_division=1)
+     recall = recall_score(y_true=true_all, y_pred=pred_all, average='macro')
+     # keep the weights with the best macro precision, plus the latest weights
+     if precision > best_acc:
+         best_acc = precision
+         model.save_weights(dir_current + '/best/' + 'model')
+     model.save_weights(dir_current + '/last/' + 'model')
+     template = ("\rEpoch {}, Loss: {:.4f}, Val Loss: {:.4f}, "
+                 "Val Acc: {:.4f}, test ACC: {:.4f}, F1: {:.4f}, precision: {:.4f}, recall: {:.4f}, Time Cost: {:.2f} sec")
+     text = template.format(epoch, train_loss / config.running_config['train_steps'],
+                            dev_loss / config.running_config['dev_steps'], dev_accuracy * 100,
+                            test_acc * 100, test_f1 * 100, precision * 100, recall * 100, time.time() - start)
+     print(colored(text, 'cyan'))
+     log_file.write(text + '\n')
+     log_file.flush()
+     plot_train_loss.append(train_loss / config.running_config['train_steps'])
+     plot_dev_loss.append(dev_loss / config.running_config['dev_steps'])
+     plot_acc.append(test_acc)
+     plot_precision.append(precision)
+     ckpt_manager.save()
+
+ # plot the curves collected over the epochs
+ plt.clf()
+ plt.plot(plot_train_loss, '-r', label='train_loss')
+ plt.title('Train Loss')
+ plt.xlabel('Epochs')
+ plt.savefig(dir_current + '/loss.png')
+ # plot dev loss curve
+ plt.clf()
+ plt.plot(plot_dev_loss, '-g', label='dev_loss')
+ plt.title('Dev Loss')
+ plt.xlabel('Epochs')
+ plt.savefig(dir_current + '/dev_loss.png')
+
+ # plot accuracy curve
+ plt.clf()
+ plt.plot(plot_acc, 'b-', label='acc')
+ plt.title('Accuracy')
+ plt.xlabel('Epochs')
+ plt.savefig(dir_current + '/acc.png')
+ # plot precision curve (plot_precision holds macro precision, not F1)
+ plt.clf()
+ plt.plot(plot_precision, 'y-', label='precision')
+ plt.title('Precision')
+ plt.xlabel('Epochs')
+ plt.savefig(dir_current + '/precision.png')
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Spoken language identification model training")
+     parser.add_argument("--config_file", type=str, default='./configs/config.yml', help="Config File Path")
+     args = parser.parse_args()
+     kwargs = vars(args)
+     # build and train the model inside the distribution strategy scope
+     with mirrored_strategy.scope():
+         train(**kwargs)
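Assuming this script is saved as train.py at the repository root (its filename is not fixed by this part of the diff), a run with the bundled config would be:

    python train.py --config_file ./configs/config.yml

Every GPU visible to MirroredStrategy becomes one replica, so the effective batch size is running_config's batch_size multiplied by the replica count, exactly as global_batch_size is computed above.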
util/__init__.py ADDED
File without changes
util/utils.py ADDED
@@ -0,0 +1,78 @@
+ # coding=utf-8
+ # Copyright 2020 Beijing BluePulse Corp.
+ # Created by Zhang Guanqun on 2020/6/5
+
+
+ import matplotlib.pyplot as plt
+ import os
+ import tensorflow as tf
+ from typing import Union, List
+ import unicodedata
+
+
+ def preprocess_paths(paths: Union[List, str]):
+     if isinstance(paths, list):
+         return [os.path.abspath(os.path.expanduser(path)) for path in paths]
+     return os.path.abspath(os.path.expanduser(paths)) if paths else None
+
+
+ def get_reduced_length(length, reduction_factor):
+     return tf.cast(tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), dtype=tf.int32)
+
+
+ def merge_two_last_dims(x):
+     b, _, f, c = shape_list(x)
+     return tf.reshape(x, shape=[b, -1, f * c])
+
+
+ def shape_list(x):
+     """Deal with dynamic shape in tensorflow cleanly."""
+     static = x.shape.as_list()
+     dynamic = tf.shape(x)
+     return [dynamic[i] if s is None else s for i, s in enumerate(static)]
+
+
+ # plot a training/validation metric curve from a Keras History object
+ def plot_metric(history, metric, pic_file_name):
+     train_metrics = history.history[metric]
+     val_metrics = history.history['val_' + metric]
+     epochs = range(1, len(train_metrics) + 1)
+     plt.plot(epochs, train_metrics, 'bo--')
+     plt.plot(epochs, val_metrics, 'ro-')
+     plt.title('Training and validation ' + metric)
+     plt.xlabel("Epochs")
+     plt.ylabel(metric)
+     plt.legend(["train_" + metric, 'val_' + metric])
+     plt.savefig(pic_file_name)
+
+
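plot_metric expects the History object returned by Keras fit with validation data, i.e. history.history holding both metric and 'val_' + metric. A tiny stand-in object makes that contract explicit (hypothetical values):

    class _FakeHistory:
        history = {'loss': [1.2, 0.8, 0.6], 'val_loss': [1.1, 0.9, 0.7]}

    plot_metric(_FakeHistory(), 'loss', 'loss_curve.png')  # writes the two curves to loss_curve.png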
+ # guard against looping LAS decoding: trim a repeated trailing pattern,
+ # keeping a single copy of it
+ def text_no_repeat(s):
+     repeat_times = 0
+     repeat_pattern = ''
+     for i in range(1, len(s) // 2):
+         pos = i
+         if s[0 - 2 * pos:0 - pos] == s[0 - i:]:
+             tmp_repeat_pattern = s[0 - i:]
+             tmp_repeat_times = 1
+             while pos * (tmp_repeat_times + 2) <= len(s) \
+                     and s[0 - pos * (tmp_repeat_times + 2):0 - pos * (tmp_repeat_times + 1)] == s[0 - i:]:
+                 tmp_repeat_times += 1
+             if tmp_repeat_times * len(tmp_repeat_pattern) > repeat_times * len(repeat_pattern):
+                 repeat_times = tmp_repeat_times
+                 repeat_pattern = tmp_repeat_pattern
+     # print(repeat_pattern, '*', repeat_times)
+     if len(repeat_pattern) != 1:
+         s = s[:0 - repeat_times * len(repeat_pattern)] if repeat_times > 0 else s
+     # print(s)
+     return s
+
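For example, a looped tail collapses to a single copy, while one-character repeats are deliberately left untouched:

    print(text_no_repeat('abcxyzxyzxyz'))  # -> 'abcxyz'
    print(text_no_repeat('aaaa'))          # -> 'aaaa' (patterns of length 1 are skipped)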
+ # Converts a unicode string to ascii by stripping combining marks
+ def unicode_to_ascii(s):
+     return ''.join(c for c in unicodedata.normalize('NFD', s)
+                    if unicodedata.category(c) != 'Mn')
+
+ def log10(x):
+     numerator = tf.math.log(x)
+     denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
+     return numerator / denominator
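Both helpers are easy to sanity-check in isolation (expected outputs shown as comments):

    import tensorflow as tf

    print(unicode_to_ascii('café'))           # -> 'cafe'
    print(float(log10(tf.constant(1000.0))))  # -> 3.0 (up to float rounding)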
vocab/__init__.py ADDED
File without changes
vocab/vocab.py ADDED
@@ -0,0 +1,11 @@
+
+ class Vocab:
+     def __init__(self, file_path):
+         self.token_list = []
+         self.load_vocab_from_file(file_path)
+
+     def load_vocab_from_file(self, file_path):
+         with open(file_path, 'r', encoding='utf-8') as f:
+             vocabs = f.readlines()
+             for vocab in vocabs:
+                 self.token_list.append(vocab.strip('\n'))
vocab/vocab.txt ADDED
@@ -0,0 +1,14 @@
+ chinese
+ english
+ french
+ german
+ indonesian
+ italian
+ japanese
+ korean
+ portuguese
+ russian
+ spanish
+ turkish
+ vietnamese
+ other
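Vocab reads this file into token_list in order, so a model output index maps straight to a language label; a small sketch assuming the paths used in this commit:

    from vocab.vocab import Vocab

    vocab = Vocab('vocab/vocab.txt')
    print(len(vocab.token_list))   # 14, the vocab_size handed to the model
    print(vocab.token_list[0])     # 'chinese' -- note this index-0 label is the one
                                   # accuracy_function masks out during evaluation
    print(vocab.token_list[-1])    # 'other'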