|
|
|
"""*********************************************************************************************""" |
|
|
|
|
|
|
|
|
|
|
|
"""*********************************************************************************************""" |
|
|
|
|
|
|
|
|
|
import os |
|
import pickle |
|
import kaldi_io |
|
import numpy as np |
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
|
|
|
# Root of the local Kaldi checkout. main() refuses to run (and asks the user
# to change this value) when the directory does not exist.
KALDI_ROOT = '/media/andi611/1TBSSD/kaldi/'

# Directory holding the preprocessed TIMIT pickles ('<set>_x.pkl' and
# '<set>_id.pkl'); main() points the user at 'preprocess_timit.py' if missing.
INPUT_PATH = '../data/timit_mel160_phoneme63'

# Pickled sets that are loaded and merged into a single feature pool.
INPUT_SETS = ['train', 'test']

# Sets to write; each one needs '<SOURCE_DIR>/<set>/feats.scp' to list its keys.
OUTPUT_SETS = ['train', 'dev', 'test']

# Location of the TIMIT recipe inside the Kaldi tree.
TIMIT_PATH = os.path.join(KALDI_ROOT, 'egs/timit/s5/')

# Existing data directory whose per-set feats.scp files define which
# utterances belong to which output set.
SOURCE_DIR = os.path.join(TIMIT_PATH, 'data-kaldi-mel')

# Destination for the generated ark/scp files (created on demand by main()).
OUTPUT_PATH = os.path.join(TIMIT_PATH, 'timit_mel160_arked')
|
|
|
|
|
|
|
|
|
|
|
def _utterance_key(utt_id):
    """Build the lookup key for one dataset id.

    The id is converted to ``str``, a trailing ``.wav`` extension (any case)
    is removed, and the last two ``/``-separated components are upper-cased
    and joined with an underscore, e.g. ``'train/dr1/fcjf0/sa1.wav'`` becomes
    ``'FCJF0_SA1'``.

    Raises:
        IndexError: if the id has fewer than two ``/``-separated components.
    """
    path = str(utt_id)
    # Drop the extension as a suffix. str.strip('.wav') would instead remove
    # any of the characters '.', 'w', 'a', 'v' from BOTH ends of the string,
    # corrupting names that happen to start or end with those letters.
    if path.lower().endswith('.wav'):
        path = path[:-len('.wav')]
    parts = path.split('/')
    return parts[-2].upper() + '_' + parts[-1].upper()


def _load_pickled_sets(input_path, set_names):
    """Load and concatenate '<set>_x.pkl' / '<set>_id.pkl' for every set.

    Returns:
        A ``(features, ids)`` pair of lists, concatenated in the order of
        ``set_names``.
    """
    features, ids = [], []
    for name in set_names:
        # NOTE: pickle.load can execute arbitrary code. These files are
        # expected to be the local output of 'preprocess_timit.py'; do not
        # point INPUT_PATH at pickles from an untrusted source.
        with open(os.path.join(input_path, name + '_x.pkl'), 'rb') as fp:
            features += pickle.load(fp)
        with open(os.path.join(input_path, name + '_id.pkl'), 'rb') as fp:
            ids += pickle.load(fp)
    return features, ids


def _read_scp_keys(scp_path):
    """Return the first whitespace-separated field of every non-blank line."""
    with open(scp_path, 'r') as f:
        # Blank lines carry no key, so they are skipped rather than counted
        # as entries.
        return [line.split()[0] for line in f if line.strip()]


def main():
    """Repack the preprocessed TIMIT features into per-set Kaldi ark/scp files.

    Steps:
      1. Verify KALDI_ROOT, INPUT_PATH and SOURCE_DIR exist; otherwise print a
         hint and exit with status 1.
      2. Load the pickled features and ids of every set in INPUT_SETS and
         index the features by the key derived from each id.
      3. For every set in OUTPUT_SETS, read the keys listed in
         '<SOURCE_DIR>/<set>/feats.scp', pick the matching features and pipe
         them through 'copy-feats' into OUTPUT_PATH.

    Raises:
        AssertionError: if the numbers of loaded features and ids differ, or
            if a feats.scp lists keys that cannot be matched one-to-one with
            the loaded features.
    """
    # Local import so this function does not depend on the file's import block.
    # sys.exit() is used instead of the exit() builtin, which is injected by
    # the 'site' module for interactive use and may be absent.
    import sys

    if not os.path.isdir(KALDI_ROOT):
        print('CHANGE THIS TO YOUR OWN KALDI ROOT: ', KALDI_ROOT)
        sys.exit(1)

    if not os.path.isdir(INPUT_PATH):
        print('Invalid path for the preprocessed timit dataset: ', INPUT_PATH)
        print('Please run \'preprocess_timit.py\' first!')
        sys.exit(1)

    if not os.path.isdir(SOURCE_DIR):
        print('Invalid path for the source directory: ', SOURCE_DIR)
        print('Please read the Wiki page for instructions!')
        sys.exit(1)

    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Read all input sets into one pool.
    x, ids = _load_pickled_sets(INPUT_PATH, INPUT_SETS)
    if len(x) != len(ids):
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(
            'Feature/id count mismatch: {} features vs {} ids'.format(len(x), len(ids)))
    print('[TIMIT-to-ARK] - ', 'Total Dataset len:', len(x))

    # Index every utterance by the key used for lookup against feats.scp.
    all_inputs = {}
    for utt_id, feat in zip(ids, x):
        all_inputs[_utterance_key(utt_id)] = np.asarray(feat)

    for s in OUTPUT_SETS:
        os.makedirs(os.path.join(OUTPUT_PATH, str(s)), exist_ok=True)

        # Select the utterances that the source feats.scp assigns to this set.
        scp_path = os.path.join(SOURCE_DIR, str(s), 'feats.scp')
        keys = _read_scp_keys(scp_path)
        partial_outputs = {k: all_inputs[k] for k in keys if k in all_inputs}
        if len(partial_outputs) != len(keys):
            # Either some keys are unknown or the scp lists a key twice;
            # writing an incomplete set silently would be worse than stopping.
            missing = [k for k in keys if k not in all_inputs]
            raise AssertionError(
                '{}: {} entries but only {} unique matches in the pickled data; '
                'first unmatched keys: {}'.format(
                    scp_path, len(keys), len(partial_outputs), missing[:5]))

        # Stream the matrices into copy-feats, which writes the ark and scp.
        ark_scp_output = 'ark:| copy-feats --compress=true ark:- ark,scp:{}/raw_mel_{}.ark,{}/{}/feats.scp'.format(OUTPUT_PATH, str(s), OUTPUT_PATH, str(s))
        with kaldi_io.open_or_fd(ark_scp_output, 'wb') as f:
            for key, mat in tqdm(partial_outputs.items()):
                kaldi_io.write_mat(f, mat, key=key)

    print('[TIMIT-to-ARK] - All done, saved at \'' + str(OUTPUT_PATH) + '\' exit.')
|
|
|
if __name__ == '__main__':
    # Run the conversion only when executed as a script, not when imported.
    main()
|
|
|
|