File size: 4,275 Bytes
f831146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
This script extracts filter banks from audio files. Audio files are split
into frames of 25 ms and 64 filter banks are extracted from each frame.
64 such frames are grouped together to create a sample which is a
64 x 64 matrix. Each matrix is saved as a .npy file into the output folder.
Samples from different speakers are in different folders and can be easily read
by torchvision's DatasetFolder.
"""

import os
import re
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import librosa
import python_speech_features as psf

# Root directory of the LibriSpeech corpus; SPEAKERS.TXT and the per-subset
# speaker folders are looked up relative to this path.
BASE_PATH = 'LibriSpeech'
# Root directory for the generated .npy samples; one sub-folder per speaker
# index is created below it.
OUTPUT_PATH = 'fbanks'
# Fix NumPy's global RNG seed.
# NOTE(review): no np.random call is visible in this file - confirm the seed
# is still needed.
np.random.seed(42)


def read_metadata(base_path=None):
    """Load the speaker table and keep only the train-clean-100 speakers.

    Reads ``SPEAKERS.TXT`` from the corpus root, parses it as a
    pipe-separated table and adds a dense integer ``LABEL`` column.

    Args:
        base_path: Directory that contains ``SPEAKERS.TXT``. Defaults to the
            module-level ``BASE_PATH`` when omitted.

    Returns:
        A ``pandas.DataFrame`` holding the rows whose ``SUBSET`` equals
        ``'train-clean-100'``, with a fresh 0..n-1 index and an extra
        ``LABEL`` column containing the category code of each ``ID``.
    """
    if base_path is None:
        base_path = BASE_PATH

    with open(os.path.join(base_path, 'SPEAKERS.TXT'), 'r') as meta:
        lines = meta.readlines()

    # The first 11 lines are skipped so that the table starts at the header row.
    table = ''.join(lines[11:])
    # Drop the very first character of the header row (presumably the ';'
    # comment marker - verify against SPEAKERS.TXT).
    table = table[1:]
    # Remove every space so that the padded columns become clean values.
    # Note that this also strips spaces inside free-text fields such as names.
    table = table.replace(' ', '')

    # Rows with a wrong number of fields are skipped instead of aborting.
    # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines` replaces it
    # from pandas 1.3 on. Older versions reject the new keyword with a
    # TypeError before reading anything, so fall back to the legacy spelling.
    try:
        speakers = pd.read_csv(StringIO(table), sep='|', on_bad_lines='skip')
    except TypeError:
        speakers = pd.read_csv(StringIO(table), sep='|', error_bad_lines=False)

    # This is using just the train clean 100 part. Update this line to extract from
    # train clean 360 or include both 100 and 360
    speakers_filtered = speakers[(speakers['SUBSET'] == 'train-clean-100')]
    # Work on a copy so that adding a column does not touch the full table.
    speakers_filtered = speakers_filtered.copy()
    speakers_filtered['LABEL'] = speakers_filtered['ID'].astype('category').cat.codes
    speakers_filtered = speakers_filtered.reset_index(drop=True)
    return speakers_filtered


def get_fbanks(audio_file):
    """Compute per-frame normalised filter-bank features for one audio file.

    The file is loaded at its native sample rate, which must be 16 kHz.
    A quarter of a second is cut from both ends before 64 filter-bank values
    are extracted per frame with ``python_speech_features.fbank``.

    Args:
        audio_file: Path of the audio file to load.

    Returns:
        ``None`` when the recording holds fewer samples than one second of
        audio, otherwise an array of shape ``(num_frames, 64, 1)`` in which
        every frame has been shifted to zero mean and scaled by its own
        standard deviation.
    """
    samples, rate = librosa.load(audio_file, sr=None)
    assert rate == 16000

    # if less than 1 seconds, don't use that audio
    if samples.shape[0] < 1 * rate:
        return None

    # Discard 0.25 s at the start and at the end of the recording.
    edge = int(0.25 * rate)
    samples = samples[edge:-edge]

    # frame width of 25 ms with a stride of 10 ms. This will have an overlap of 15 ms
    features, _energies = psf.fbank(samples, samplerate=rate, nfilt=64, winlen=0.025, winstep=0.01)

    def _standardise(frame, epsilon=1e-12):
        # epsilon keeps the division finite for a constant frame (std == 0).
        return (frame - np.mean(frame)) / max(np.std(frame), epsilon)

    normalised = np.array([_standardise(frame) for frame in features])

    # Append a trailing singleton axis to every frame.
    return normalised.reshape((normalised.shape[0], 64, 1))


def assert_out_dir_exists(index):
    """Make sure the output folder for ``index`` exists and return its path.

    Args:
        index: Value used (via ``str``) as the folder name below
            ``OUTPUT_PATH``.

    Returns:
        The path ``OUTPUT_PATH/<index>`` as a string. The folder is created,
        including missing parents, when it is not there yet; either way a
        status line is printed.
    """
    target = '/'.join((OUTPUT_PATH, str(index)))

    if os.path.exists(target):
        print('dir {} already exists'.format(target))
        return target

    os.makedirs(target)
    print('crated dir {}'.format(target))
    return target


def main():
    """Extract 64 x 64 filter-bank samples for every selected speaker.

    For each row returned by ``read_metadata`` the speaker's ``.flac`` files
    are located below ``BASE_PATH/<SUBSET>/<ID>/``, converted with
    ``get_fbanks`` and cut into consecutive, non-overlapping groups of 64
    frames. Every complete group is stored as ``<counter>.npy`` inside the
    folder returned by ``assert_out_dir_exists`` for the row index; the
    counter runs across all files of one speaker. Trailing frames that do not
    fill a whole group are dropped, and files for which ``get_fbanks``
    returns ``None`` are skipped.
    """
    speakers = read_metadata()

    print('read metadata from file, number of rows in in are: {}'.format(speakers.shape))
    print('numer of unique labels in the dataset is: {}'.format(speakers['LABEL'].unique().shape))
    print('max label in the dataset is: {}'.format(speakers['LABEL'].max()))
    print('number of unique index: {}, max index: {}'.format(speakers.index.shape, max(speakers.index)))

    for index, row in speakers.iterrows():
        subset = row['SUBSET']
        id_ = row['ID']
        dir_ = BASE_PATH + '/' + subset + '/' + str(id_) + '/'

        print('working for id: {}, index: {}, at path: {}'.format(id_, index, dir_))

        # Sort the matches: glob order depends on the file system, and the
        # sample numbering below should be reproducible between runs.
        files_ = sorted(str(f) for f in Path(dir_).glob('**/*.flac'))

        index_target_dir = assert_out_dir_exists(index)

        sample_counter = 0

        for f in files_:
            fbanks = get_fbanks(f)
            if fbanks is None:
                # get_fbanks yields None for unusable audio; without this
                # guard the `.shape` access below raises AttributeError.
                print('skipping file: {}, no filter banks returned'.format(f))
                continue

            # Sanity check on the feature layout, done once per file.
            assert fbanks.shape[1:] == (64, 1)
            num_frames = fbanks.shape[0]

            # sample sets of 64 frames each; the range stops before the last
            # incomplete group, so every slice holds exactly 64 frames
            file_sample_counter = 0
            for start in range(0, num_frames - 64 + 1, 64):
                slice_ = fbanks[start:start + 64]
                np.save(index_target_dir + '/' + str(sample_counter) + '.npy', slice_)

                file_sample_counter += 1
                sample_counter += 1

            print('done for index: {}, Samples from this file: {}'.format(index, file_sample_counter))

        print('done for id: {}, index: {}, total number of samples for this id: {}'.format(id_, index, sample_counter))
        print('')

    print('All done, YAY!, look at the files')


# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()