ZhongYing committed
Commit 503ec99
1 Parent(s): 6ffe62c

first commit

Files changed:
- .gitignore +1 -0
- configs/__init__.py +18 -0
- configs/config.py +47 -0
- configs/config.yml +49 -0
- dataset.py +263 -0
- featurizers/__init__.py +0 -0
- featurizers/gammatone.py +233 -0
- featurizers/speech_featurizers.py +453 -0
- librosa_mel_filter.csv +0 -0
- models/__init__.py +0 -0
- models/layers/__init__.py +0 -0
- models/layers/attention.py +35 -0
- models/model.py +98 -0
- optimizers/__init.py +0 -0
- optimizers/schedules.py +81 -0
- predict_by_pb.py +30 -0
- predict_by_weights.py +36 -0
- train.py +275 -0
- util/__init__.py +0 -0
- util/utils.py +78 -0
- vocab/__init__.py +0 -0
- vocab/vocab.py +11 -0
- vocab/vocab.txt +14 -0
.gitignore
ADDED
@@ -0,0 +1 @@
**/__pycache
configs/__init__.py
ADDED
@@ -0,0 +1,18 @@
import re
import yaml


def load_yaml(path):
    # Stock SafeLoader only resolves floats that contain a dot, so values such
    # as "1e-4" would load as strings; re-register a broader float resolver.
    loader = yaml.SafeLoader
    loader.add_implicit_resolver(
        u'tag:yaml.org,2002:float',
        re.compile(u'''^(?:
         [-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
        |[-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
        |\\.[0-9_]+(?:[eE][-+][0-9]+)?
        |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*
        |[-+]?\\.(?:inf|Inf|INF)
        |\\.(?:nan|NaN|NAN))$''', re.X),
        list(u'-+0123456789.'))
    with open(path, "r", encoding="utf-8") as file:
        return yaml.load(file, Loader=loader)
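Usage note: the custom resolver registered above matters because stock yaml.SafeLoader only recognises floats that contain a dot, so a value like max_lr: 1e-4 would otherwise come back as a string. A minimal sketch of the difference (the temporary file path is illustrative only):

    import yaml
    from configs import load_yaml

    # write a one-line config just for this check
    with open('/tmp/demo.yml', 'w', encoding='utf-8') as f:
        f.write('max_lr: 1e-4\n')

    print(type(yaml.safe_load(open('/tmp/demo.yml', encoding='utf-8'))['max_lr']))  # <class 'str'>
    print(type(load_yaml('/tmp/demo.yml')['max_lr']))                               # <class 'float'>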
configs/config.py
ADDED
@@ -0,0 +1,47 @@
# Copyright 2023 by zhongying
#

from . import load_yaml
from util.utils import preprocess_paths


class Config:
    """ User configs class for training, testing or inferring """

    def __init__(self, path: str):
        print('configs file path:', path)
        config = load_yaml(preprocess_paths(path))
        self.speech_config = config.get("speech_config", {})
        self.model_config = config.get("model_config", {})
        self.dataset_config = config.get("dataset_config", {})
        self.optimizer_config = config.get("optimizer_config", {})
        self.running_config = config.get("running_config", {})

    def print(self):
        print('==================================================')
        print('speech configs:', self.speech_config)
        print('--------------------------------------------------')
        print('model configs:', self.model_config)
        print('--------------------------------------------------')
        print('dataset configs:', self.dataset_config)
        print('--------------------------------------------------')
        print('optimizer configs:', self.optimizer_config)
        print('--------------------------------------------------')
        print('running configs:', self.running_config)
        print('==================================================')

    def toString(self):
        string = ''
        string += '#==================================================' + '\n'
        string += '#speech config: ' + str(self.speech_config) + '\n'
        string += '#--------------------------------------------------' + '\n'
        string += '#model config: ' + str(self.model_config) + '\n'
        string += '#--------------------------------------------------' + '\n'
        string += '#dataset config: ' + str(self.dataset_config) + '\n'
        string += '#--------------------------------------------------' + '\n'
        string += '#optimizer config: ' + str(self.optimizer_config) + '\n'
        string += '#--------------------------------------------------' + '\n'
        string += '#running config: ' + str(self.running_config) + '\n'
        string += '#==================================================' + '\n'
        return string
configs/config.yml
ADDED
@@ -0,0 +1,49 @@
speech_config:
  sample_rate: 16000
  frame_ms: 25
  stride_ms: 10
  num_feature_bins: 80
  feature_type: log_mel_spectrogram
  preemphasis: 0.97
  normalize_signal: True
  normalize_feature: True
  normalize_per_feature: False

model_config:
  name: acrnn
  d_model: 64
  filters: [32,64,64]
  kernel_size: [[11,5],[11,5],[11,5]]
  rnn_cell: 256
  seq_mask: True

dataset_config:
  vocabulary: vocab/vocab.txt
  data_path: ./data/wavs/
  corpus_name: ./data/demo_txt/demo
  file_nums: 1
  max_audio_length: 2000
  shuffle_size: 1200
  data_length: null  # YAML null; the literal "None" would be parsed as a string
  suffix: .txt
  load_type: txt
  train: train
  dev: dev
  test: test

optimizer_config:
  init_steps: 0
  warmup_steps: 10000
  max_lr: 1e-4
  beta1: 0.9
  beta2: 0.999
  epsilon: 1e-9

running_config:
  prefetch: False
  load_weights: ./saved_weights/20230228-084356/last/model
  num_epochs: 100
  batch_size: 1
  train_steps: 50
  dev_steps: 10
  test_steps: 10
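Usage note: a short sketch of how this file is consumed by the Config class above, assuming the repository root is the working directory and that util.utils.preprocess_paths simply resolves the path as its name suggests:

    from configs.config import Config

    config = Config('configs/config.yml')
    config.print()                            # dump every config group
    print(config.optimizer_config['max_lr'])  # 0.0001, a float thanks to the resolver in configs/__init__.py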
dataset.py
ADDED
@@ -0,0 +1,263 @@
from featurizers.speech_featurizers import SpeechFeaturizer
from configs.config import Config
from random import shuffle
import numpy as np
from vocab.vocab import Vocab
import os
import math
import librosa
import tensorflow as tf


def wav_padding(wav_data_lst, wav_max_len, fbank_dim):
    wav_lens = [len(data) for data in wav_data_lst]
    # an input wav of 1200 frames is downsampled 8 times, to 150 frames
    wav_lens = [math.ceil(x / 8) for x in wav_lens]
    wav_lens = np.array(wav_lens)
    new_wav_data_lst = np.zeros((len(wav_data_lst), wav_max_len, fbank_dim))
    for i in range(len(wav_data_lst)):
        new_wav_data_lst[i, :wav_data_lst[i].shape[0], :] = wav_data_lst[i]
    return new_wav_data_lst, wav_lens


class DatDataSet:
    def __init__(self,
                 batch_size,
                 data_type,
                 vocab: Vocab,
                 speech_featurizer: SpeechFeaturizer,
                 config: Config):
        self.batch_size = batch_size
        self.data_type = data_type
        self.vocab = vocab
        self.data_path = config.dataset_config['data_path']
        self.corpus_name = config.dataset_config['corpus_name']
        self.fbank_dim = config.speech_config['num_feature_bins']
        self.max_audio_length = config.dataset_config['max_audio_length']
        self.mel_banks = config.speech_config['num_feature_bins']
        self.file_nums = config.dataset_config['file_nums']
        self.language_classes = config.running_config['language_classes']
        self.suffix = config.dataset_config['suffix']
        self.READ_BUFFER_SIZE = 2 * 1024 * 1024 * 1024
        self.shuffle = True
        self.blank = 0
        self.source_init()

    def source_init(self):
        self.dat_file_list, self.txt_file_list = self.get_dat_txt_list(self.data_type)
        print('>>', self.data_type, 'load dat files:', len(self.dat_file_list))
        print('>>', self.data_type, 'load txt files:', len(self.txt_file_list))
        max_binary_file_size = max([os.path.getsize(dat) for dat in self.dat_file_list])
        print('>> max binary file size:', max_binary_file_size)
        # allocate one buffer large enough for the biggest .dat file (float32 = 4 bytes)
        self.feature_binary = np.zeros(max_binary_file_size // 4 + 1, np.float32)

    def get_dat_txt_list(self, dir_name):
        corpus_dir = self.data_path + '/' + self.corpus_name + '/'
        print('!!', corpus_dir)
        file_lst = os.listdir(corpus_dir)
        txt_file_lst = []
        dat_file_lst = []

        for align_file in file_lst:
            if align_file.endswith(self.suffix):
                file_name = align_file[:-len(self.suffix)]
                dat_file = file_name + '.dat'
                if dir_name in file_name:
                    # if dir_name in ['dev', 'test']:
                    #     dat_file = dat_file.replace(dir_name, 'train')
                    dat_file_lst.append(corpus_dir + dat_file)
                    txt_file_lst.append(corpus_dir + align_file)
        print('*********', dir_name, txt_file_lst, dat_file_lst)
        return dat_file_lst, txt_file_lst

    def load_dat_file(self, dat_file_path):
        with open(dat_file_path, 'rb') as f:
            pos = 0
            buf = f.read(self.READ_BUFFER_SIZE)
            while len(buf) > 0:
                nbuf = np.frombuffer(buf, np.float32)
                self.feature_binary[pos: pos + len(nbuf)] = nbuf
                pos += len(nbuf)
                buf = f.read(self.READ_BUFFER_SIZE)

    def get_batch(self):
        while 1:
            shuffle_did_list = [i for i in range(len(self.dat_file_list))]
            if self.shuffle:
                shuffle(shuffle_did_list)
            for did in shuffle_did_list:
                wav_lst = []
                label_lst = []
                self.load_dat_file(self.dat_file_list[did])
                with open(self.txt_file_list[did], 'r', encoding='utf8') as txt_file:
                    utt_lines = txt_file.readlines()
                txt_lines = utt_lines
                if self.shuffle:
                    shuffle(txt_lines)
                # sort lines by wav len
                # txt_lines = sorted(
                #     txt_lines,
                #     key=lambda line: int(line.split('\t')[0].split(':')[2]) - int(line.split('\t')[0].split(':')[1]),
                #     reverse=False)
                for line in txt_lines:
                    wav_file, label = line.split('\t')
                    wav_lst.append(wav_file)
                    label_lst.append(label.strip('\n'))
                shuffle_list = [i for i in range(len(wav_lst) // self.batch_size)]
                if self.shuffle:
                    shuffle(shuffle_list)
                for i in shuffle_list:
                    begin = i * self.batch_size
                    end = begin + self.batch_size
                    sub_list = list(range(begin, end, 1))
                    # label batch
                    label_data_lst = [label_lst[index] for index in sub_list]
                    prediction = np.array(
                        [self.vocab.token_list.index(line) for
                         line in label_data_lst],
                        dtype=np.int32)

                    feature_lst = []
                    wav_path = []
                    get_next_batch = False
                    for index in sub_list:
                        # data_aishell/wav/test/S0764/BAC009S0764W0121.wav:0:33680	chinese
                        _, start, end = wav_lst[index].split(':')
                        feature = self.feature_binary[int(start): int(end)]
                        feature = np.reshape(feature, (-1, 80))
                        feature = feature[:self.max_audio_length, :]
                        feature_lst.append(feature)
                        wav_path.append(wav_lst[index])

                    if get_next_batch:
                        continue
                    features, input_length = wav_padding(feature_lst, self.max_audio_length, self.fbank_dim)

                    yield features, input_length, prediction


class TxtDataSet:
    def __init__(self,
                 batch_size,
                 data_type,
                 vocab,
                 speech_featurizer: SpeechFeaturizer,
                 config: Config):
        self.batch_size = batch_size
        self.data_type = data_type
        self.vocab = vocab
        self.feature_extracter = speech_featurizer
        self.data_path = config.dataset_config['data_path']
        self.corpus_name = config.dataset_config['corpus_name']
        self.fbank_dim = config.speech_config['num_feature_bins']
        self.max_audio_length = config.dataset_config['max_audio_length']
        self.mel_banks = config.speech_config['num_feature_bins']
        self.file_nums = config.dataset_config['file_nums']
        self.data_length = config.dataset_config['data_length']
        self.shuffle = True
        self.sentence_list = []
        self.wav_lst = []
        self.label_lst = []
        self.max_sentence_length = 0
        self.source_init()

    def source_init(self):
        read_files = []
        if self.data_type == 'train':
            read_files.append(self.corpus_name + '_train.txt')
        elif self.data_type == 'dev':
            read_files.append(self.corpus_name + '_dev.txt')
        elif self.data_type == 'test':
            read_files.append(self.corpus_name + '_test.txt')
        print('data type:{} \n files:{}'.format(self.data_type, read_files))
        total_lines = 0
        for sub_file in read_files:
            with open(sub_file, 'r', encoding='utf8') as f:
                for line in f:
                    wav_file, label = line.split(' ', 1)
                    label = label.strip('\n').split()

                    self.label_lst.append(label)
                    self.wav_lst.append(wav_file)
                    total_lines += 1
                    if self.data_length:
                        if total_lines == self.data_length:
                            break
                    if total_lines % 10000 == 0:
                        print('\rload', total_lines, end='', flush=True)

        if self.data_length:  # truncate only when a data_length limit is set
            self.wav_lst = self.wav_lst[:self.data_length]
            self.label_lst = self.label_lst[:self.data_length]
        print('number of', self.data_type, 'data:', len(self.wav_lst))

    def get_batch(self):
        shuffle_list = [i for i in range(len(self.wav_lst))]
        while 1:
            if self.shuffle:
                shuffle(shuffle_list)
            for i in range(len(self.wav_lst) // self.batch_size):
                begin = i * self.batch_size
                end = begin + self.batch_size
                sub_list = shuffle_list[begin:end]

                label_data_lst = [self.label_lst[index] for index in sub_list]
                prediction = np.array(
                    [self.vocab.token_list.index(line[0]) for
                     line in label_data_lst],
                    dtype=np.int32)
                feature_lst = []
                wav_path = []
                get_next_batch = False
                for index in sub_list:
                    # start = time.time()
                    audio, _ = librosa.load(self.data_path + self.wav_lst[index], sr=16000)
                    if len(audio) == 0:
                        get_next_batch = True
                        break
                    feature = self.feature_extracter.extract(audio)

                    feature_lst.append(feature)
                    wav_path.append(self.wav_lst[index])

                if get_next_batch:
                    continue  # get next batch

                features, input_length = wav_padding(feature_lst, self.max_audio_length, self.fbank_dim)

                yield features, input_length, prediction


def create_dataset(batch_size, load_type, data_type, speech_featurizer, config, vocab):
    """
    batch_size: global batch size
    load_type: how the data is loaded, either 'txt' (wav files listed in text
        files) or 'dat' (features pre-packed into binary .dat files)
    data_type: which split to read: 'train', 'dev' or 'test'
    """
    if load_type == 'dat':
        dataset = DatDataSet(batch_size, data_type, vocab, speech_featurizer, config)
        dataset = tf.data.Dataset.from_generator(dataset.get_batch,
                                                 output_types=(tf.float32, tf.int32, tf.int32),
                                                 output_shapes=([None, None, config.speech_config['num_feature_bins']],
                                                                [None], [None]))
    elif load_type == 'txt':
        dataset = TxtDataSet(batch_size, data_type, vocab, speech_featurizer, config)
        dataset = tf.data.Dataset.from_generator(dataset.get_batch,
                                                 output_types=(tf.float32, tf.int32, tf.int32),
                                                 output_shapes=([None, None, config.speech_config['num_feature_bins']],
                                                                [None], [None]))
    else:
        print('load_type must be dat or txt!!')
        return

    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA
    dataset = dataset.with_options(options)
    return dataset
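Usage note: a sketch of how create_dataset plugs together with the featurizer and vocab modules, mirroring the wiring in predict_by_weights.py; it assumes the data and vocab files referenced by configs/config.yml actually exist:

    from configs.config import Config
    from dataset import create_dataset
    from featurizers.speech_featurizers import TFSpeechFeaturizer
    from vocab.vocab import Vocab

    config = Config('configs/config.yml')
    vocab = Vocab(config.dataset_config['vocabulary'])
    featurizer = TFSpeechFeaturizer(config.speech_config)

    train_ds = create_dataset(batch_size=config.running_config['batch_size'],
                              load_type=config.dataset_config['load_type'],
                              data_type=config.dataset_config['train'],
                              speech_featurizer=featurizer,
                              config=config, vocab=vocab)
    # each element is (features [B, T, 80], input_length [B], label [B])
    for features, input_length, label in train_ds.take(1):
        print(features.shape, input_length.numpy(), label.numpy())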
featurizers/__init__.py
ADDED
File without changes
featurizers/gammatone.py
ADDED
@@ -0,0 +1,233 @@
""" This code is inspired from https://github.com/detly/gammatone """

import numpy as np
import tensorflow as tf

from util.utils import shape_list

pi = tf.constant(np.pi, dtype=tf.complex64)

DEFAULT_FILTER_NUM = 100
DEFAULT_LOW_FREQ = 100
DEFAULT_HIGH_FREQ = 44100 / 4


def fft_weights(
        nfft,
        fs,
        nfilts,
        width,
        fmin,
        fmax,
        maxlen):
    """
    :param nfft: the source FFT size
    :param fs: sampling rate (Hz)
    :param nfilts: the number of output bands required (default 64)
    :param width: the constant width of each band in Bark (default 1)
    :param fmin: lower limit of frequencies (Hz)
    :param fmax: upper limit of frequencies (Hz)
    :param maxlen: number of bins to truncate the rows to

    :return: the transposed [maxlen, nfilts] weight matrix

    Generate a matrix of weights to combine FFT bins into Gammatone bins.

    Note about `maxlen` parameter: While wts has nfft columns, the second half
    are all zero. Hence, aud spectrum is::

        fft2gammatonemx(nfft,sr)*abs(fft(xincols,nfft))

    `maxlen` truncates the rows to this many bins.

    | (c) 2004-2009 Dan Ellis [email protected] based on rastamat/audspec.m
    | (c) 2012 Jason Heeris (Python implementation)
    """
    ucirc = tf.exp(1j * 2 * pi * tf.cast(tf.range(0, nfft / 2 + 1),
                                         tf.complex64) / nfft)[None, ...]

    # Common ERB filter code factored out
    cf_array = erb_space(fmin, fmax, nfilts)[::-1]

    _, A11, A12, A13, A14, _, _, _, B2, gain = make_erb_filters(fs, cf_array, width)

    A11, A12, A13, A14 = A11[..., None], A12[..., None], A13[..., None], A14[..., None]

    r = tf.cast(tf.sqrt(B2), tf.complex64)
    theta = 2 * pi * cf_array / fs
    pole = (r * tf.exp(1j * theta))[..., None]

    GTord = 4

    weights = (
        tf.abs(ucirc + A11 * fs) * tf.abs(ucirc + A12 * fs)
        * tf.abs(ucirc + A13 * fs) * tf.abs(ucirc + A14 * fs)
        * tf.abs(fs * (pole - ucirc) * (tf.math.conj(pole) - ucirc)) ** (-GTord)
        / tf.cast(gain[..., None], tf.float32)
    )

    weights = tf.pad(weights, [[0, 0], [0, nfft - shape_list(weights)[-1]]])

    weights = weights[:, 0:int(maxlen)]

    return tf.transpose(weights, perm=[1, 0])


def erb_point(low_freq, high_freq, fraction):
    """
    Calculates a single point on an ERB scale between ``low_freq`` and
    ``high_freq``, determined by ``fraction``. When ``fraction`` is ``1``,
    ``low_freq`` will be returned. When ``fraction`` is ``0``, ``high_freq``
    will be returned.

    ``fraction`` can actually be outside the range ``[0, 1]``, which in general
    isn't very meaningful, but might be useful when ``fraction`` is rounded a
    little above or below ``[0, 1]`` (eg. for plot axis labels).
    """
    # Change the following three parameters if you wish to use a different ERB
    # scale. Must change in MakeERBCoeffs too.
    # TODO: Factor these parameters out
    ear_q = 9.26449  # Glasberg and Moore Parameters
    min_bw = 24.7

    # All of the following expressions are derived in Apple TR #35, "An
    # Efficient Implementation of the Patterson-Holdsworth Cochlear Filter
    # Bank." See pages 33-34.
    erb_point = (
        -ear_q * min_bw
        + tf.exp(
            fraction * (
                -tf.math.log(high_freq + ear_q * min_bw)
                + tf.math.log(low_freq + ear_q * min_bw)
            )
        ) * (high_freq + ear_q * min_bw)
    )

    return tf.cast(erb_point, tf.complex64)


def erb_space(
        low_freq=DEFAULT_LOW_FREQ,
        high_freq=DEFAULT_HIGH_FREQ,
        num=DEFAULT_FILTER_NUM):
    """
    This function computes an array of ``num`` frequencies uniformly spaced
    between ``high_freq`` and ``low_freq`` on an ERB scale.

    For a definition of ERB, see Moore, B. C. J., and Glasberg, B. R. (1983).
    "Suggested formulae for calculating auditory-filter bandwidths and
    excitation patterns," J. Acoust. Soc. Am. 74, 750-753.
    """
    return erb_point(
        low_freq,
        high_freq,
        tf.range(1, num + 1, dtype=tf.float32) / num
    )


def make_erb_filters(fs, centre_freqs, width=1.0):
    """
    This function computes the filter coefficients for a bank of
    Gammatone filters. These filters were defined by Patterson and Holdsworth
    for simulating the cochlea.

    The result is returned as a :class:`ERBCoeffArray`. Each row of the
    filter arrays contains the coefficients for four second order filters. The
    transfer function for these four filters share the same denominator (poles)
    but have different numerators (zeros). All of these coefficients are
    assembled into one vector that the ERBFilterBank can take apart to implement
    the filter.

    The filter bank contains "numChannels" channels that extend from
    half the sampling rate (fs) to "lowFreq". Alternatively, if the numChannels
    input argument is a vector, then the values of this vector are taken to be
    the center frequency of each desired filter. (The lowFreq argument is
    ignored in this case.)

    Note this implementation fixes a problem in the original code by
    computing four separate second order filters. This avoids a big problem with
    round off errors in cases of very small cfs (100Hz) and large sample rates
    (44kHz). The problem is caused by roundoff error when a number of poles are
    combined, all very close to the unit circle. Small errors in the eighth order
    coefficient are multiplied when the eighth root is taken to give the pole
    location. These small errors lead to poles outside the unit circle and
    instability. Thanks to Julius Smith for leading me to the proper
    explanation.

    Execute the following code to evaluate the frequency response of a 10
    channel filterbank::

        fcoefs = MakeERBFilters(16000,10,100);
        y = ERBFilterBank([1 zeros(1,511)], fcoefs);
        resp = 20*log10(abs(fft(y')));
        freqScale = (0:511)/512*16000;
        semilogx(freqScale(1:255),resp(1:255,:));
        axis([100 16000 -60 0])
        xlabel('Frequency (Hz)'); ylabel('Filter Response (dB)');

    | Rewritten by Malcolm Slaney@Interval. June 11, 1998.
    | (c) 1998 Interval Research Corporation
    |
    | (c) 2012 Jason Heeris (Python implementation)
    """
    T = 1 / fs
    # Change the following three parameters if you wish to use a different
    # ERB scale. Must change in ERBSpace too.
    # TODO: factor these out
    ear_q = 9.26449  # Glasberg and Moore Parameters
    min_bw = 24.7
    order = 1

    erb = width * ((centre_freqs / ear_q) ** order + min_bw ** order) ** (1 / order)
    B = 1.019 * 2 * pi * erb

    arg = 2 * centre_freqs * pi * T
    vec = tf.exp(2j * arg)

    A0 = T
    A2 = 0
    B0 = 1
    B1 = -2 * tf.cos(arg) / tf.exp(B * T)
    B2 = tf.exp(-2 * B * T)

    rt_pos = tf.cast(tf.sqrt(3 + 2 ** 1.5), tf.complex64)
    rt_neg = tf.cast(tf.sqrt(3 - 2 ** 1.5), tf.complex64)

    common = -T * tf.exp(-(B * T))

    # TODO: This could be simplified to a matrix calculation involving the
    # constant first term and the alternating rt_pos/rt_neg and +/-1 second
    # terms
    k11 = tf.cos(arg) + rt_pos * tf.sin(arg)
    k12 = tf.cos(arg) - rt_pos * tf.sin(arg)
    k13 = tf.cos(arg) + rt_neg * tf.sin(arg)
    k14 = tf.cos(arg) - rt_neg * tf.sin(arg)

    A11 = common * k11
    A12 = common * k12
    A13 = common * k13
    A14 = common * k14

    gain_arg = tf.exp(1j * arg - B * T)

    gain = tf.cast(tf.abs(
        (vec - gain_arg * k11)
        * (vec - gain_arg * k12)
        * (vec - gain_arg * k13)
        * (vec - gain_arg * k14)
        * (T * tf.exp(B * T)
           / (-1 / tf.exp(B * T) + 1 + vec * (1 - tf.exp(B * T)))
           )**4
    ), tf.complex64)

    allfilts = tf.ones_like(centre_freqs, dtype=tf.complex64)

    fcoefs = tf.stack([
        A0 * allfilts, A11, A12, A13, A14, A2 * allfilts,
        B0 * allfilts, B1, B2,
        gain
    ], axis=1)

    return tf.transpose(fcoefs, perm=[1, 0])
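Usage note: a quick shape check for fft_weights, mirroring exactly how compute_log_gammatone_spectrogram in featurizers/speech_featurizers.py calls it; with nfft=512 at 16 kHz the transposed result should hold one column per gammatone band for each of the 257 retained FFT bins:

    from featurizers.gammatone import fft_weights

    nfft, fs, nbins = 512, 16000, 80
    weights = fft_weights(nfft, fs, nbins, width=1.0,
                          fmin=0, fmax=int(fs / 2), maxlen=(nfft / 2 + 1))
    print(weights.shape)  # (257, 80): FFT bins x gammatone bands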
featurizers/speech_featurizers.py
ADDED
@@ -0,0 +1,453 @@
import os
import io
import abc
import six
import numpy as np
import librosa
import soundfile as sf
import tensorflow as tf

from util.utils import log10
from .gammatone import fft_weights


def read_raw_audio(audio, sample_rate=16000):
    if isinstance(audio, str):
        wave, _ = librosa.load(os.path.expanduser(audio), sr=sample_rate)
    elif isinstance(audio, bytes):
        wave, sr = sf.read(io.BytesIO(audio))
        wave = np.asfortranarray(wave)
        if sr != sample_rate:
            wave = librosa.resample(wave, sr, sample_rate)
    elif isinstance(audio, np.ndarray):
        return audio
    else:
        raise ValueError("input audio must be either a path or bytes")
    return wave


def slice_signal(signal, window_size, stride=0.5) -> np.ndarray:
    """ Return windows of the given signal by sweeping in stride fractions of window """
    assert signal.ndim == 1, signal.ndim
    n_samples = signal.shape[0]
    offset = int(window_size * stride)
    slices = []
    for beg_i, end_i in zip(range(0, n_samples, offset),
                            range(window_size, n_samples + offset,
                                  offset)):
        slice_ = signal[beg_i:end_i]
        if slice_.shape[0] < window_size:
            slice_ = np.pad(
                slice_, (0, window_size - slice_.shape[0]), 'constant', constant_values=0.0)
        if slice_.shape[0] == window_size:
            slices.append(slice_)
    return np.array(slices, dtype=np.float32)


def tf_merge_slices(slices: tf.Tensor) -> tf.Tensor:
    # slices shape = [batch, window_size]
    return tf.keras.backend.flatten(slices)  # return shape = [-1, ]


def merge_slices(slices: np.ndarray) -> np.ndarray:
    # slices shape = [batch, window_size]
    return np.reshape(slices, [-1])


def normalize_audio_feature(audio_feature: np.ndarray, per_feature=False):
    """ Mean and variance normalization """
    axis = 0 if per_feature else None
    mean = np.mean(audio_feature, axis=axis)
    std_dev = np.std(audio_feature, axis=axis) + 1e-9
    normalized = (audio_feature - mean) / std_dev
    return normalized


def tf_normalize_audio_features(audio_feature: tf.Tensor, per_feature=False):
    """
    TF Mean and variance features normalization
    Args:
        audio_feature: tf.Tensor with shape [T, F]

    Returns:
        normalized audio features with shape [T, F]
    """
    axis = 0 if per_feature else None
    mean = tf.reduce_mean(audio_feature, axis=axis)
    std_dev = tf.math.reduce_std(audio_feature, axis=axis) + 1e-9
    return (audio_feature - mean) / std_dev


def normalize_signal(signal: np.ndarray):
    """ Normalize signal to [-1, 1] range """
    gain = 1.0 / (np.max(np.abs(signal)) + 1e-9)
    return signal * gain


def tf_normalize_signal(signal: tf.Tensor):
    """
    TF Normalize signal to [-1, 1] range
    Args:
        signal: tf.Tensor with shape [None]

    Returns:
        normalized signal with shape [None]
    """
    gain = 1.0 / (tf.reduce_max(tf.abs(signal), axis=-1) + 1e-9)
    return signal * gain


def preemphasis(signal: np.ndarray, coeff=0.97):
    if not coeff or coeff <= 0.0:
        return signal
    return np.append(signal[0], signal[1:] - coeff * signal[:-1])


def tf_preemphasis(signal: tf.Tensor, coeff=0.97):
    """
    TF Pre-emphasis
    Args:
        signal: tf.Tensor with shape [None]
        coeff: Float that indicates the preemphasis coefficient

    Returns:
        pre-emphasized signal with shape [None]
    """
    if not coeff or coeff <= 0.0:
        return signal
    s0 = tf.expand_dims(signal[0], axis=-1)
    s1 = signal[1:] - coeff * signal[:-1]
    return tf.concat([s0, s1], axis=-1)


def depreemphasis(signal: np.ndarray, coeff=0.97):
    if not coeff or coeff <= 0.0:
        return signal
    x = np.zeros(signal.shape[0], dtype=np.float32)
    x[0] = signal[0]
    for n in range(1, signal.shape[0], 1):
        x[n] = coeff * x[n - 1] + signal[n]
    return x


def tf_depreemphasis(signal: tf.Tensor, coeff=0.97):
    """
    TF Depreemphasis
    Args:
        signal: tf.Tensor with shape [B, None]
        coeff: Float that indicates the preemphasis coefficient

    Returns:
        depre-emphasized signal with shape [B, None]
    """
    if not coeff or coeff <= 0.0:
        return signal

    def map_fn(elem):
        x = tf.expand_dims(elem[0], axis=-1)
        for n in range(1, elem.shape[0], 1):
            current = coeff * x[n - 1] + elem[n]
            x = tf.concat([x, [current]], axis=0)
        return x

    return tf.map_fn(map_fn, signal)


class SpeechFeaturizer(metaclass=abc.ABCMeta):
    def __init__(self, speech_config: dict):
        """
        We should use TFSpeechFeaturizer for training to avoid differences
        between tf and librosa when converting to tflite in post-training stage
        speech_config = {
            "sample_rate": int,
            "frame_ms": int,
            "stride_ms": int,
            "num_feature_bins": int,
            "feature_type": str,
            "delta": bool,
            "delta_delta": bool,
            "pitch": bool,
            "normalize_signal": bool,
            "normalize_feature": bool,
            "normalize_per_feature": bool
        }
        """
        # Samples
        self.sample_rate = speech_config.get("sample_rate", 16000)
        self.frame_length = int(self.sample_rate * (speech_config.get("frame_ms", 25) / 1000))
        self.frame_step = int(self.sample_rate * (speech_config.get("stride_ms", 10) / 1000))
        # Features
        self.num_feature_bins = speech_config.get("num_feature_bins", 80)
        self.feature_type = speech_config.get("feature_type", "log_mel_spectrogram")
        self.preemphasis = speech_config.get("preemphasis", None)
        # Normalization
        self.normalize_signal = speech_config.get("normalize_signal", True)
        self.normalize_feature = speech_config.get("normalize_feature", True)
        self.normalize_per_feature = speech_config.get("normalize_per_feature", False)
        # librosa mel filter
        self.mel_filter = None

    @property
    def nfft(self) -> int:
        """ Number of FFT """
        return 2 ** (self.frame_length - 1).bit_length()

    @property
    def shape(self) -> list:
        """ The shape of extracted features """
        raise NotImplementedError()

    @abc.abstractmethod
    def stft(self, signal):
        raise NotImplementedError()

    @abc.abstractmethod
    def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
        raise NotImplementedError()

    @abc.abstractmethod
    def extract(self, signal):
        """ Function to perform feature extraction """
        raise NotImplementedError()


class NumpySpeechFeaturizer(SpeechFeaturizer):
    def __init__(self, speech_config: dict):
        super(NumpySpeechFeaturizer, self).__init__(speech_config)
        self.delta = speech_config.get("delta", False)
        self.delta_delta = speech_config.get("delta_delta", False)
        self.pitch = speech_config.get("pitch", False)

    @property
    def shape(self) -> list:
        # None for time dimension
        channel_dim = 1

        if self.delta:
            channel_dim += 1

        if self.delta_delta:
            channel_dim += 1

        if self.pitch:
            channel_dim += 1

        return [None, self.num_feature_bins, channel_dim]

    def stft(self, signal):
        return np.square(
            np.abs(librosa.core.stft(signal, n_fft=self.nfft, hop_length=self.frame_step,
                                     win_length=self.frame_length, center=True, window="hann")))

    def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
        return librosa.power_to_db(S, ref=ref, amin=amin, top_db=top_db)

    def extract(self, signal: np.ndarray) -> np.ndarray:
        signal = np.asfortranarray(signal)
        if self.normalize_signal:
            signal = normalize_signal(signal)
        signal = preemphasis(signal, self.preemphasis)

        if self.feature_type == "mfcc":
            features = self.compute_mfcc(signal)
        elif self.feature_type == "log_mel_spectrogram":
            features = self.compute_log_mel_spectrogram(signal)
        elif self.feature_type == "spectrogram":
            features = self.compute_spectrogram(signal)
        elif self.feature_type == "log_gammatone_spectrogram":
            features = self.compute_log_gammatone_spectrogram(signal)
        else:
            raise ValueError("feature_type must be either 'mfcc', "
                             "'log_mel_spectrogram', 'log_gammatone_spectrogram' "
                             "or 'spectrogram'")

        if self.normalize_feature:
            features = normalize_audio_feature(features, per_feature=self.normalize_per_feature)

        # features = np.expand_dims(features, axis=-1)

        return features

    def compute_pitch(self, signal: np.ndarray) -> np.ndarray:
        pitches, _ = librosa.core.piptrack(
            y=signal, sr=self.sample_rate,
            n_fft=self.nfft, hop_length=self.frame_step,
            fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=True
        )

        pitches = pitches.T

        assert self.num_feature_bins <= self.frame_length // 2 + 1, \
            "num_features for spectrogram should \
            be <= (sample_rate * window_size // 2 + 1)"

        return pitches[:, :self.num_feature_bins]

    def compute_spectrogram(self, signal: np.ndarray) -> np.ndarray:
        powspec = self.stft(signal)
        features = self.power_to_db(powspec.T)

        assert self.num_feature_bins <= self.frame_length // 2 + 1, \
            "num_features for spectrogram should \
            be <= (sample_rate * window_size // 2 + 1)"

        # cut high frequency part, keep num_feature_bins features
        features = features[:, :self.num_feature_bins]

        return features

    def compute_mfcc(self, signal: np.ndarray) -> np.ndarray:
        S = self.stft(signal)

        mel = librosa.filters.mel(self.sample_rate, self.nfft,
                                  n_mels=self.num_feature_bins,
                                  fmin=0.0, fmax=int(self.sample_rate / 2))

        mel_spectrogram = np.dot(S.T, mel.T)

        mfcc = librosa.feature.mfcc(sr=self.sample_rate,
                                    S=self.power_to_db(mel_spectrogram).T,
                                    n_mfcc=self.num_feature_bins)

        return mfcc.T

    def compute_log_mel_spectrogram(self, signal: np.ndarray) -> np.ndarray:
        S = self.stft(signal)

        mel = librosa.filters.mel(self.sample_rate, self.nfft,
                                  n_mels=self.num_feature_bins,
                                  fmin=0.0, fmax=int(self.sample_rate / 2))

        mel_spectrogram = np.dot(S.T, mel.T)

        return self.power_to_db(mel_spectrogram)

    def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray:
        S = self.stft(signal)

        gammatone = fft_weights(self.nfft, self.sample_rate,
                                self.num_feature_bins, width=1.0,
                                fmin=0, fmax=int(self.sample_rate / 2),
                                maxlen=(self.nfft / 2 + 1))

        gammatone = gammatone.numpy().astype(np.float32)

        gammatone_spectrogram = np.dot(S.T, gammatone)

        return self.power_to_db(gammatone_spectrogram)


class TFSpeechFeaturizer(SpeechFeaturizer):
    @property
    def shape(self) -> list:
        # None for time dimension
        return [None, self.num_feature_bins, 1]

    def stft(self, signal):
        signal = tf.pad(signal, [[self.nfft // 2, self.nfft // 2]], mode="REFLECT")
        window = tf.signal.hann_window(self.frame_length, periodic=True)
        left_pad = (self.nfft - self.frame_length) // 2
        right_pad = self.nfft - self.frame_length - left_pad
        window = tf.pad(window, [[left_pad, right_pad]])
        framed_signals = tf.signal.frame(signal, frame_length=self.nfft, frame_step=self.frame_step)
        framed_signals *= window
        return tf.square(tf.abs(tf.signal.rfft(framed_signals, [self.nfft])))

    def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0):
        if amin <= 0:
            raise ValueError('amin must be strictly positive')

        magnitude = S

        if six.callable(ref):
            # User supplied a function to calculate reference power
            ref_value = ref(magnitude)
        else:
            ref_value = np.abs(ref)

        log_spec = 10.0 * log10(tf.maximum(amin, magnitude))
        log_spec -= 10.0 * log10(tf.maximum(amin, ref_value))

        if top_db is not None:
            if top_db < 0:
                raise ValueError('top_db must be non-negative')
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

        return log_spec

    def extract(self, signal: np.ndarray) -> np.ndarray:
        signal = np.asfortranarray(signal)
        features = self.tf_extract(tf.convert_to_tensor(signal, dtype=tf.float32))
        return features.numpy()

    def tf_extract(self, signal: tf.Tensor) -> tf.Tensor:
        """
        Extract speech features from signals (for using in tflite)
        Args:
            signal: tf.Tensor with shape [None]

        Returns:
            features: tf.Tensor with shape [T, F]
        """
        if self.normalize_signal:
            signal = tf_normalize_signal(signal)
        signal = tf_preemphasis(signal, self.preemphasis)

        if self.feature_type == "spectrogram":
            features = self.compute_spectrogram(signal)
        elif self.feature_type == "log_mel_spectrogram":
            features = self.compute_log_mel_spectrogram(signal)
        elif self.feature_type == "mfcc":
            features = self.compute_mfcc(signal)
        elif self.feature_type == "log_gammatone_spectrogram":
            features = self.compute_log_gammatone_spectrogram(signal)
        else:
            raise ValueError("feature_type must be either 'mfcc', "
                             "'log_mel_spectrogram', 'log_gammatone_spectrogram' "
                             "or 'spectrogram'")

        if self.normalize_feature:
            features = tf_normalize_audio_features(
                features, per_feature=self.normalize_per_feature)

        # features = tf.expand_dims(features, axis=-1)

        return features

    def compute_log_mel_spectrogram(self, signal):
        spectrogram = self.stft(signal)
        if self.mel_filter is None:
            linear_to_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins=self.num_feature_bins,
                num_spectrogram_bins=spectrogram.shape[-1],
                sample_rate=self.sample_rate,
                lower_edge_hertz=0.0, upper_edge_hertz=(self.sample_rate / 2)
            )
        else:
            linear_to_weight_matrix = self.mel_filter

        mel_spectrogram = tf.tensordot(spectrogram, linear_to_weight_matrix, 1)
        return self.power_to_db(mel_spectrogram)

    def compute_spectrogram(self, signal):
        S = self.stft(signal)
        spectrogram = self.power_to_db(S)
        return spectrogram[:, :self.num_feature_bins]

    def compute_mfcc(self, signal):
        log_mel_spectrogram = self.compute_log_mel_spectrogram(signal)
        return tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrogram)

    def compute_log_gammatone_spectrogram(self, signal: np.ndarray) -> np.ndarray:
        S = self.stft(signal)

        gammatone = fft_weights(self.nfft, self.sample_rate,
                                self.num_feature_bins, width=1.0,
                                fmin=0, fmax=int(self.sample_rate / 2),
                                maxlen=(self.nfft / 2 + 1))

        gammatone_spectrogram = tf.tensordot(S, gammatone, 1)

        return self.power_to_db(gammatone_spectrogram)

    def set_mel_filter(self, librosa_mel_filter):
        """ Set librosa mel filter. """
        self.mel_filter = librosa_mel_filter
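Usage note: a small sketch of feature extraction with the TF featurizer, using the speech_config values from configs/config.yml and one second of synthetic noise as a stand-in for real audio:

    import numpy as np
    from featurizers.speech_featurizers import TFSpeechFeaturizer

    speech_config = {
        "sample_rate": 16000,
        "frame_ms": 25,          # 400-sample window -> nfft = 512
        "stride_ms": 10,         # 160-sample hop
        "num_feature_bins": 80,
        "feature_type": "log_mel_spectrogram",
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
    }
    featurizer = TFSpeechFeaturizer(speech_config)
    signal = np.random.uniform(-1.0, 1.0, 16000).astype(np.float32)  # 1 s stand-in signal
    features = featurizer.extract(signal)
    print(features.shape)  # about (101, 80): one 80-dim frame every 10 ms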
librosa_mel_filter.csv
ADDED
The diff for this file is too large to render.
models/__init__.py
ADDED
File without changes
models/layers/__init__.py
ADDED
File without changes
models/layers/attention.py
ADDED
@@ -0,0 +1,35 @@
import tensorflow as tf


class Attention(tf.keras.layers.Layer):
    def __init__(self, hidden_size,
                 attention_size=1,
                 name=None,
                 **kwargs):
        super().__init__(name=name, **kwargs)
        # add_weight replaces the deprecated add_variable alias
        self.w_kernel = self.add_weight('w_kernel', [hidden_size, attention_size])
        self.w_bias = self.add_weight('w_bias', [attention_size])
        self.bias = self.add_weight('bias', [attention_size])

    def call(self, inputs, inp_len, maxlen=150, mask=None, training=False, **kwargs):
        """
        inp_len: length of input audio
        maxlen: audio length after downsampling (cnn (twice downsample) and maxpool); in our
            experiments the input length is 1200 frames, so after downsampling the sequence
            length is 1200 // 8 = 150 (8 = 2*2*2, see the model parameters for details).
            If you change the input length or the number of downsampling steps,
            please reset the maxlen parameter!
        """
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        if isinstance(inputs, tuple):
            inputs = tf.concat(inputs, 2)
        v = tf.sigmoid(tf.tensordot(inputs, self.w_kernel, axes=1) + self.w_bias)
        vu = tf.tensordot(v, self.bias, axes=1)
        alphas = tf.nn.softmax(vu)  # (B, T)
        if mask is not None:
            alphas = alphas * tf.cast(tf.sequence_mask(inp_len, maxlen), dtype=tf.float32)
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

        return output
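Usage note: to make the pooling behaviour concrete, a hedged sketch of calling the layer on dummy GRU-shaped outputs; note that mask must be passed by keyword, because the third positional argument of call is maxlen:

    import tensorflow as tf
    from models.layers.attention import Attention

    att = Attention(hidden_size=256)                # must match the last dim of `inputs`
    x = tf.random.normal([2, 150, 256])             # (batch, time, hidden)
    lens = tf.constant([150, 120], dtype=tf.int32)  # valid frames per utterance
    pooled = att(x, lens, maxlen=150, mask=True)    # zero attention weights beyond each length
    print(pooled.shape)                             # (2, 256): one pooled vector per utterance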
models/model.py
ADDED
@@ -0,0 +1,98 @@
import tensorflow as tf
from featurizers.speech_featurizers import SpeechFeaturizer
from .layers.attention import Attention


L2 = tf.keras.regularizers.l2(1e-6)


def shape_list(x, out_type=tf.int32):
    """Deal with dynamic shape in tensorflow cleanly."""
    static = x.shape.as_list()
    dynamic = tf.shape(x, out_type=out_type)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]


def merge_two_last_dims(x):
    b, _, f, c = shape_list(x)
    return tf.reshape(x, shape=[b, -1, f * c])


class MulSpeechLR(tf.keras.Model):
    def __init__(self, name, filters, kernel_size, d_model, rnn_cell, seq_mask, vocab_size, dropout=0.5):
        super(MulSpeechLR, self).__init__()
        self.filters1 = filters[0]
        self.filters2 = filters[1]
        self.filters3 = filters[2]
        self.kernel_size1 = kernel_size[0]
        self.kernel_size2 = kernel_size[1]
        self.kernel_size3 = kernel_size[2]
        # during training, self.mask can be set true, but during inference it must be false
        self.mask = seq_mask
        self.conv1 = tf.keras.layers.Conv2D(filters=self.filters1, kernel_size=self.kernel_size1,
                                            strides=(2, 2), padding='same', activation='relu')
        self.maxpool1 = tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2))

        self.conv2 = tf.keras.layers.Conv2D(filters=self.filters2, kernel_size=self.kernel_size2,
                                            strides=(2, 2), padding='same', activation='relu')
        self.conv3 = tf.keras.layers.Conv2D(filters=self.filters3, kernel_size=self.kernel_size3,
                                            strides=(1, 1), padding='same', activation='relu')
        self.ln1 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_1")
        self.ln2 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_2")
        self.ln3 = tf.keras.layers.LayerNormalization(name=f"{name}_ln_3")
        # self.linear1 = tf.keras.layers.Dense(d_model*2, name=f"{name}_dense_1")
        self.linear2 = tf.keras.layers.Dense(d_model, name=f"{name}_dense_2")
        self.rnn = tf.keras.layers.GRU(rnn_cell, return_sequences=True, return_state=True, name=f"{name}_gru")
        self.attention = Attention(rnn_cell)
        self.class_layer = tf.keras.layers.Dense(vocab_size)
        self.res_add = tf.keras.layers.Add(name=f"{name}_add")

    def call(self, inputs):
        x, x_len = inputs
        # mask = tf.cast(tf.sequence_mask(x_len, maxlen=150), dtype=tf.float32)
        x = tf.expand_dims(x, axis=-1)
        x = self.conv1(x)
        x = self.ln1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.ln2(x)
        x = self.conv3(x)
        x = self.ln3(x)
        x = merge_two_last_dims(x)
        x, final_state = self.rnn(x)
        # pass the flag by keyword: the third positional argument of
        # Attention.call is maxlen, not mask
        x = self.attention(x, x_len, mask=self.mask)
        x = self.res_add([x, final_state])
        output = self.linear2(x)
        output = tf.nn.relu(output)
        output = self.class_layer(output)

        return output

    def init_build(self, input_shape):
        x = tf.keras.Input(shape=input_shape, dtype=tf.float32)
        l = tf.keras.Input(shape=[], dtype=tf.int32)
        self([x, l], training=False)

    def add_featurizers(self,
                        speech_featurizer: SpeechFeaturizer):
        """
        Function to add featurizer to model to convert to end2end tflite
        Args:
            speech_featurizer: SpeechFeaturizer instance
        """
        self.speech_featurizer = speech_featurizer

    @tf.function(input_signature=[tf.TensorSpec([None], dtype=tf.float32)])
    def predict_pb(self, signal):
        features = self.speech_featurizer.tf_extract(signal)
        input_len = tf.expand_dims(tf.shape(features)[0], axis=0)
        input = tf.expand_dims(features, axis=0)
        output = self([input, input_len], training=False)
        output = tf.nn.softmax(output)
        output1 = tf.squeeze(output)
        output = tf.argmax(output1, axis=-1)

        return output, tf.gather(output1, output)
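Usage note: predict_pb is a tf.function with a fixed input signature, so a built model can be exported as a SavedModel and then used exactly as predict_by_pb.py does below. A sketch of that export path (loading of trained weights is omitted and the output directory is illustrative):

    import tensorflow as tf
    from configs.config import Config
    from featurizers.speech_featurizers import TFSpeechFeaturizer
    from models.model import MulSpeechLR
    from vocab.vocab import Vocab

    config = Config('configs/config.yml')
    vocab = Vocab(config.dataset_config['vocabulary'])
    model = MulSpeechLR(**config.model_config, vocab_size=len(vocab.token_list))
    model.init_build([None, config.speech_config['num_feature_bins']])
    model.add_featurizers(TFSpeechFeaturizer(config.speech_config))
    # model.load_weights('./saved_weights/.../last/model') would go here
    tf.saved_model.save(model, 'saved_models/lang14/pb/2/')  # predict_pb is restored as a callable attribute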
optimizers/__init.py
ADDED
File without changes
optimizers/schedules.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright 2023 by zhongying

import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.keras.optimizers.schedules import ExponentialDecay


class TransformerLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """ Transformer learning rate schedule """

    def __init__(self, d_model, init_steps=0, warmup_steps=4000, max_lr=None):
        super(TransformerLRSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.max_lr = max_lr
        self.warmup_steps = warmup_steps
        self.init_steps = init_steps

    def __call__(self, step):
        # lr = (d_model ** -0.5) * min(step ** -0.5, step * (warmup_steps ** -1.5))
        # the cast guards against Keras passing an integer step counter
        step = tf.cast(step, tf.float32) + self.init_steps
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        lr = tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
        if self.max_lr is not None:
            return tf.math.minimum(self.max_lr, lr)
        return lr

    def get_config(self):
        return {
            "d_model": self.d_model,
            "warmup_steps": self.warmup_steps,
            "max_lr": self.max_lr
        }


class SANSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, lamb, d_model, warmup_steps=4000):
        super(SANSchedule, self).__init__()

        self.lamb = tf.cast(lamb, tf.float32)
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = step / (self.warmup_steps ** 1.5)
        arg2 = 1 / tf.math.sqrt(step)
        return (self.lamb / tf.math.sqrt(self.d_model)) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        return {
            "lamb": self.lamb,
            "d_model": self.d_model,
            "warmup_steps": self.warmup_steps
        }


class BoundExponentialDecay(ExponentialDecay):
    """ ExponentialDecay whose learning rate never drops below min_lr """

    def __init__(self, min_lr=0.0, **kwargs):
        super().__init__(**kwargs)
        self.min_lr = min_lr

    def __call__(self, step):
        with ops.name_scope_v2(self.name or "ExponentialDecay") as name:
            initial_learning_rate = ops.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate")
            dtype = initial_learning_rate.dtype
            decay_steps = math_ops.cast(self.decay_steps, dtype)
            decay_rate = math_ops.cast(self.decay_rate, dtype)

            global_step_recomp = math_ops.cast(step, dtype)
            p = global_step_recomp / decay_steps
            if self.staircase:
                p = math_ops.floor(p)
            new_lr = math_ops.multiply(
                initial_learning_rate, math_ops.pow(decay_rate, p), name=name)
            return math_ops.maximum(self.min_lr, new_lr)
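These schedules plug directly into a Keras optimizer. A minimal usage sketch for TransformerLRSchedule, with an assumed d_model of 256 (in practice the value would come from model_config):

import tensorflow as tf
from optimizers.schedules import TransformerLRSchedule

# warm up for 4000 steps, then decay roughly as step**-0.5, capped at max_lr
schedule = TransformerLRSchedule(d_model=256, warmup_steps=4000, max_lr=1e-3)
optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)
print(float(schedule(tf.constant(1.0))), float(schedule(tf.constant(4000.0))))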
predict_by_pb.py
ADDED
@@ -0,0 +1,30 @@
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[0:1], 'GPU')
from vocab.vocab import Vocab
import librosa
import sys


vocab = Vocab("vocab/vocab.txt")
model = tf.saved_model.load('saved_models/lang14/pb/2/')


def predict_wav(wav_path):
    signal, _ = librosa.load(wav_path, sr=16000)
    output, prob = model.predict_pb(signal)
    language = vocab.token_list[output.numpy()]
    print(language, prob.numpy() * 100)

    return output.numpy(), prob.numpy()


if __name__ == '__main__':
    wav_path = sys.argv[1]
    predict_wav(wav_path)
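The predict_pb call above works because tf.saved_model.save preserves tf.function-decorated methods on the saved object. A self-contained toy illustrating the mechanism (Toy and the /tmp path are illustrative, not the project's actual export script):

import tensorflow as tf

class Toy(tf.Module):
    @tf.function(input_signature=[tf.TensorSpec([None], tf.float32)])
    def predict_pb(self, signal):
        # stand-in for the real featurize-and-classify pipeline
        return tf.argmax(signal), tf.reduce_max(signal)

tf.saved_model.save(Toy(), '/tmp/toy_pb')
loaded = tf.saved_model.load('/tmp/toy_pb')
print(loaded.predict_pb(tf.constant([0.1, 0.9, 0.2])))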
predict_by_weights.py
ADDED
@@ -0,0 +1,36 @@
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[0:1], 'GPU')
from vocab.vocab import Vocab
from configs.config import Config
import sys
from featurizers.speech_featurizers import TFSpeechFeaturizer
from models.model import MulSpeechLR as Model
import librosa


weights_dir = './saved_weights/20230228-084356/'
config_file = weights_dir + 'config.yml'
model_file = weights_dir + 'last/model'
vocab_file = weights_dir + 'vocab.txt'
config = Config(config_file)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
lr_vocab = Vocab(vocab_file)
lr_model = Model(**config.model_config, vocab_size=len(lr_vocab.token_list))
lr_model.load_weights(model_file)
lr_model.add_featurizers(speech_featurizer)
lr_model.init_build([None, config.speech_config['num_feature_bins']])
lr_model.summary()


def predict_wav(wav_path):
    sample_rate = 16000
    signal, _ = librosa.load(wav_path, sr=sample_rate)
    predict, prob = lr_model.predict_pb(signal)
    language = lr_vocab.token_list[predict.numpy()]
    print("predict language={} prob={:.4f}".format(language, prob.numpy() * 100))


if __name__ == '__main__':
    wav_path = sys.argv[1]
    predict_wav(wav_path)
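Both prediction scripts take the wav path as a single positional argument; this one rebuilds the model from a training run directory instead of loading a frozen SavedModel. An invocation sketch (the sample path is a placeholder):

#   python predict_by_weights.py samples/test.wav
#   -> predict language=<label> prob=<score>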
train.py
ADDED
@@ -0,0 +1,275 @@
# coding=utf-8
# copyright by speechflow 2023/03/17

import argparse
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
# tf.config.set_visible_devices(gpus[0:1], 'GPU')
import datetime
import time
import os
from shutil import copyfile
import matplotlib.pyplot as plt
from vocab.vocab import Vocab
from configs.config import Config
from models.model import MulSpeechLR as Model
from termcolor import colored
from featurizers.speech_featurizers import NumpySpeechFeaturizer
from dataset import create_dataset
import tensorflow_addons as tfa
from sklearn.metrics import f1_score, recall_score, precision_score

mirrored_strategy = tf.distribute.MirroredStrategy()


def train(config_file):
    config = Config(config_file)
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    dir_log_root = "./saved_weights/"
    dir_current = dir_log_root + current_time
    if not os.path.isdir(dir_log_root):
        os.mkdir(dir_log_root)
    if not os.path.isdir(dir_current):
        os.mkdir(dir_current)
    copyfile(config_file, dir_current + '/config.yml')
    log_file = open(dir_current + '/log.txt', 'w')
    copyfile(config.dataset_config['vocabulary'], dir_current + '/vocab.txt')

    config.print()
    log_file.write(config.toString())
    log_file.flush()

    vocab = Vocab(config.dataset_config['vocabulary'])
    batch_size = config.running_config['batch_size']
    global_batch_size = batch_size * mirrored_strategy.num_replicas_in_sync
    speech_featurizer = NumpySpeechFeaturizer(config.speech_config)
    model = Model(**config.model_config, vocab_size=len(vocab.token_list))
    if config.running_config['load_weights'] is not None:
        model.load_weights(config.running_config['load_weights'])
    model.add_featurizers(speech_featurizer)
    model.init_build([None, config.speech_config['num_feature_bins']])
    model.summary()

    train_dataset = create_dataset(batch_size=global_batch_size,
                                   load_type=config.dataset_config['load_type'],
                                   data_type=config.dataset_config['train'],
                                   speech_featurizer=speech_featurizer,
                                   config=config,
                                   vocab=vocab)
    eval_dataset = create_dataset(batch_size=global_batch_size,
                                  load_type=config.dataset_config['load_type'],
                                  data_type=config.dataset_config['dev'],
                                  speech_featurizer=speech_featurizer,
                                  config=config,
                                  vocab=vocab)
    test_dataset = create_dataset(batch_size=global_batch_size,
                                  load_type=config.dataset_config['load_type'],
                                  data_type=config.dataset_config['test'],
                                  speech_featurizer=speech_featurizer,
                                  config=config,
                                  vocab=vocab)
    train_dist_batch = mirrored_strategy.experimental_distribute_dataset(train_dataset)
    dev_dist_batch = mirrored_strategy.experimental_distribute_dataset(eval_dataset)
    test_dist_batch = mirrored_strategy.experimental_distribute_dataset(test_dataset)

    init_steps = config.optimizer_config['init_steps']
    step = tf.Variable(init_steps)

    optimizer = tf.keras.optimizers.Adam(learning_rate=config.optimizer_config['max_lr'])
    ckpt = tf.train.Checkpoint(step=step, optimizer=optimizer, model=model)
    ckpt_manager = tf.train.CheckpointManager(ckpt, dir_current + '/ckpt', max_to_keep=5)
    loss_object = tfa.losses.SigmoidFocalCrossEntropy(
        from_logits=True,
        alpha=0.25,
        gamma=0,
        reduction=tf.keras.losses.Reduction.NONE)
    loss_object_label_smooth = tf.keras.losses.CategoricalCrossentropy(
        from_logits=True, label_smoothing=0.1, reduction=tf.keras.losses.Reduction.NONE)

    def compute_loss(real, pred, smooth=False):
        if smooth:
            loss_ = loss_object_label_smooth(tf.one_hot(real, len(vocab.token_list)), pred)
        else:
            real = tf.one_hot(real, len(vocab.token_list))
            loss_ = loss_object(real, pred)
        return tf.nn.compute_average_loss(loss_, global_batch_size=global_batch_size)

    def accuracy_function(real, pred):
        pred = tf.cast(pred, dtype=tf.int32)
        accuracies = tf.equal(real, pred)

        mask = tf.math.logical_not(tf.math.equal(real, 0))
        accuracies = tf.math.logical_and(mask, accuracies)

        accuracies = tf.cast(accuracies, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)

        return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)

    @tf.function
    def train_step(input, input_length, target):
        with tf.GradientTape() as tape:
            predictions = model([input, input_length], training=True)
            loss = compute_loss(target, predictions, smooth=True)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss

    @tf.function
    def dev_step(input, input_length, target):
        predictions = model([input, input_length], training=False)
        t_loss = compute_loss(target, predictions, smooth=True)

        return t_loss, predictions

    @tf.function
    def test_step(input, input_length, target):
        predictions = model([input, input_length], training=False)
        return predictions, target

    @tf.function(experimental_relax_shapes=True)
    def distributed_train_step(x, x_len, y):
        per_replica_losses = mirrored_strategy.run(train_step, args=(x, x_len, y))
        return mirrored_strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

    @tf.function(experimental_relax_shapes=True)
    def distributed_dev_step(x, x_len, y):
        per_replica_losses, per_replica_preds = mirrored_strategy.run(dev_step, args=(x, x_len, y))
        mean_loss = mirrored_strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
        return mean_loss, per_replica_preds

    @tf.function(experimental_relax_shapes=True)
    def distributed_test_step(x, x_len, y):
        return mirrored_strategy.run(test_step, args=(x, x_len, y))

    plot_train_loss = []
    plot_dev_loss = []
    plot_acc, plot_precision = [], []
    best_precision = 0
    train_iter = iter(train_dist_batch)
    dev_iter = iter(dev_dist_batch)
    test_iter = iter(test_dist_batch)

    for epoch in range(1, config.running_config['num_epochs'] + 1):
        if config.dataset_config['load_type'] == 'txt':
            train_iter = iter(train_dist_batch)
            dev_iter = iter(dev_dist_batch)
            test_iter = iter(test_dist_batch)
        start = time.time()
        # training loop
        train_loss = 0.0
        dev_loss = 0.0
        for train_batches in range(config.running_config['train_steps']):
            inp, inp_len, target = next(train_iter)
            train_loss += distributed_train_step(inp, inp_len, target)
            template = '\rEpoch {} Step {} Loss {:.4f}'
            print(colored(template.format(
                epoch, train_batches + 1, train_loss / (train_batches + 1),
            ), 'green'), end='', flush=True)
            step.assign_add(1)

        # validation loop
        # the seeded zero element is masked out inside accuracy_function (label 0 is ignored)
        pred_all = tf.zeros([1], dtype=tf.int32)
        true_all = tf.zeros([1], dtype=tf.int32)
        for dev_batches in range(config.running_config['dev_steps']):
            inp, inp_len, target = next(dev_iter)
            loss, predicted_result = distributed_dev_step(inp, inp_len, target)
            dev_loss += loss
            if mirrored_strategy.num_replicas_in_sync == 1:
                prediction = tf.nn.softmax(predicted_result)
                y_pred = tf.argmax(prediction, axis=-1)
                y_pred = tf.cast(y_pred, dtype=tf.int32)
                pred_all = tf.concat([pred_all, y_pred], axis=0)
                true_all = tf.concat([true_all, target], axis=0)
            else:
                for i in range(mirrored_strategy.num_replicas_in_sync):
                    predicted_result_per_replica = predicted_result.values[i]
                    y_true = target.values[i]
                    y_pred = tf.argmax(predicted_result_per_replica, axis=-1)
                    y_pred = tf.cast(y_pred, dtype=tf.int32)
                    pred_all = tf.concat([pred_all, y_pred], axis=0)
                    true_all = tf.concat([true_all, y_true], axis=0)
        dev_accuracy = accuracy_function(true_all, pred_all)

        # test loop
        pred_all = tf.zeros([1], dtype=tf.int32)
        true_all = tf.zeros([1], dtype=tf.int32)
        for test_batches in range(config.running_config['test_steps']):
            inp, inp_len, target = next(test_iter)
            predicted_result, target_result = distributed_test_step(inp, inp_len, target)
            if mirrored_strategy.num_replicas_in_sync == 1:
                prediction = tf.nn.softmax(predicted_result)
                y_pred = tf.argmax(prediction, axis=-1)
                y_pred = tf.cast(y_pred, dtype=tf.int32)
                pred_all = tf.concat([pred_all, y_pred], axis=0)
                true_all = tf.concat([true_all, target], axis=0)
            else:
                for i in range(mirrored_strategy.num_replicas_in_sync):
                    predicted_result_per_replica = predicted_result.values[i]
                    y_true = target.values[i]
                    y_pred = tf.argmax(predicted_result_per_replica, axis=-1)
                    y_pred = tf.cast(y_pred, dtype=tf.int32)
                    pred_all = tf.concat([pred_all, y_pred], axis=0)
                    true_all = tf.concat([true_all, y_true], axis=0)

        test_acc = accuracy_function(real=true_all, pred=pred_all)

        test_f1 = f1_score(y_true=true_all, y_pred=pred_all, average='macro')
        precision = precision_score(y_true=true_all, y_pred=pred_all, average='macro', zero_division=1)
        recall = recall_score(y_true=true_all, y_pred=pred_all, average='macro')
        if precision > best_precision:
            best_precision = precision
            model.save_weights(dir_current + '/best/' + 'model')
        model.save_weights(dir_current + '/last/' + 'model')
        template = ("\rEpoch {}, Loss: {:.4f}, Val Loss: {:.4f}, "
                    "Val Acc: {:.4f}, Test Acc: {:.4f}, F1: {:.4f}, Precision: {:.4f}, Recall: {:.4f}, Time Cost: {:.2f} sec")
        text = template.format(epoch, train_loss / config.running_config['train_steps'],
                               dev_loss / config.running_config['dev_steps'], dev_accuracy * 100,
                               test_acc * 100, test_f1 * 100, precision * 100, recall * 100, time.time() - start)
        print(colored(text, 'cyan'))
        log_file.write(text.strip() + '\n')
        log_file.flush()
        plot_train_loss.append(train_loss / config.running_config['train_steps'])
        plot_dev_loss.append(dev_loss / config.running_config['dev_steps'])
        plot_acc.append(test_acc)
        plot_precision.append(precision)
        ckpt_manager.save()

    # plot train loss
    plt.plot(plot_train_loss, '-r', label='train_loss')
    plt.title('Train Loss')
    plt.xlabel('Epochs')
    plt.savefig(dir_current + '/loss.png')
    # plot dev loss
    plt.clf()
    plt.plot(plot_dev_loss, '-g', label='dev_loss')
    plt.title('Dev Loss')
    plt.xlabel('Epochs')
    plt.savefig(dir_current + '/dev_loss.png')

    # plot accuracy curve
    plt.clf()
    plt.plot(plot_acc, 'b-', label='acc')
    plt.title('Accuracy')
    plt.xlabel('Epochs')
    plt.savefig(dir_current + '/acc.png')
    # plot precision curve
    plt.clf()
    plt.plot(plot_precision, 'y-', label='precision')
    plt.title('Precision')
    plt.xlabel('Epochs')
    plt.savefig(dir_current + '/precision.png')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Spoken_language_identification Model training")
    parser.add_argument("--config_file", type=str, default='./configs/config.yml', help="Config File Path")
    args = parser.parse_args()
    kwargs = vars(args)
    with mirrored_strategy.scope():
        train(**kwargs)
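One detail in the loop above is easy to miss: compute_loss scales each replica's loss by the global batch size via tf.nn.compute_average_loss, so summing per-replica losses with ReduceOp.SUM reproduces the true global mean. A self-contained illustration with made-up numbers:

import tensorflow as tf

# per-example losses on two replicas, global batch size 4
replica_a = tf.constant([1.0, 3.0])
replica_b = tf.constant([2.0, 2.0])

# tf.nn.compute_average_loss divides by the GLOBAL batch size ...
loss_a = tf.nn.compute_average_loss(replica_a, global_batch_size=4)  # (1+3)/4 = 1.0
loss_b = tf.nn.compute_average_loss(replica_b, global_batch_size=4)  # (2+2)/4 = 1.0
# ... so a SUM across replicas equals the mean over all 4 examples:
print(float(loss_a + loss_b))  # 2.0 == mean([1, 3, 2, 2])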
util/__init__.py
ADDED
File without changes
util/utils.py
ADDED
@@ -0,0 +1,78 @@
# coding=utf-8
# Copyright 2020 Beijing BluePulse Corp.
# Created by Zhang Guanqun on 2020/6/5


import matplotlib.pyplot as plt
import os
import tensorflow as tf
from typing import Union, List
import unicodedata


def preprocess_paths(paths: Union[List, str]):
    if isinstance(paths, list):
        return [os.path.abspath(os.path.expanduser(path)) for path in paths]
    return os.path.abspath(os.path.expanduser(paths)) if paths else None


def get_reduced_length(length, reduction_factor):
    return tf.cast(tf.math.ceil(tf.divide(length, tf.cast(reduction_factor, dtype=length.dtype))), dtype=tf.int32)


def merge_two_last_dims(x):
    b, _, f, c = shape_list(x)
    return tf.reshape(x, shape=[b, -1, f * c])


def shape_list(x):
    """Deal with dynamic shape in tensorflow cleanly."""
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]


# draw loss pic
def plot_metric(history, metric, pic_file_name):
    train_metrics = history.history[metric]
    val_metrics = history.history['val_' + metric]
    epochs = range(1, len(train_metrics) + 1)
    plt.plot(epochs, train_metrics, 'bo--')
    plt.plot(epochs, val_metrics, 'ro-')
    plt.title('Training and validation ' + metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(["train_" + metric, 'val_' + metric])
    plt.savefig(pic_file_name)


# guard against LAS loop decoding: strip a repeated trailing pattern
def text_no_repeat(s):
    repeat_times = 0
    repeat_pattern = ''
    for i in range(1, len(s) // 2):
        pos = i
        if s[0 - 2 * pos:0 - pos] == s[0 - i:]:
            tmp_repeat_pattern = s[0 - i:]
            tmp_repeat_times = 1
            while pos * (tmp_repeat_times + 2) <= len(s) \
                    and s[0 - pos * (tmp_repeat_times + 2):0 - pos * (tmp_repeat_times + 1)] == s[0 - i:]:
                tmp_repeat_times += 1
            if tmp_repeat_times * len(tmp_repeat_pattern) > repeat_times * len(repeat_pattern):
                repeat_times = tmp_repeat_times
                repeat_pattern = tmp_repeat_pattern
    if len(repeat_pattern) != 1:
        s = s[:0 - repeat_times * len(repeat_pattern)] if repeat_times > 0 else s
    return s


# Converts the unicode file to ascii
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')


def log10(x):
    numerator = tf.math.log(x)
    denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
    return numerator / denominator
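text_no_repeat is the least obvious helper here: it finds the longest pattern that repeats at the tail of a decoded string and keeps a single copy of it, which tames run-away loops in LAS-style autoregressive decoding (single-character repeats are deliberately left alone). A quick illustration with made-up decoder output:

from util.utils import text_no_repeat

print(text_no_repeat("hello world world world"))  # -> "hello world"
print(text_no_repeat("abcabcabc"))                # -> "abc"
print(text_no_repeat("aaaa"))                     # unchanged: 1-char patterns kept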
vocab/__init__.py
ADDED
File without changes
vocab/vocab.py
ADDED
@@ -0,0 +1,11 @@
class Vocab:
    def __init__(self, file_path):
        self.token_list = []
        self.load_vocab_from_file(file_path)

    def load_vocab_from_file(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for vocab in f.readlines():
                self.token_list.append(vocab.strip('\n'))
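A short usage sketch, reading the 14-label file committed below:

from vocab.vocab import Vocab

vocab = Vocab("vocab/vocab.txt")
print(len(vocab.token_list))  # 14 languages, including the catch-all "other"
print(vocab.token_list[0])    # "chinese"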
vocab/vocab.txt
ADDED
@@ -0,0 +1,14 @@
chinese
english
french
german
indonesian
italian
japanese
korean
portuguese
russian
spanish
turkish
vietnamese
other