# lmzjms's picture
# Upload 1162 files
# 0b32ad6 verified
# -*- coding: utf-8 -*- #
"""*********************************************************************************************"""
# FileName [ timit2ark.py ]
# Synopsis [ plug-in our own preprocessed features to .ark files for kaldi ]
# Author [ Andy T. Liu (Andi611) ]
# Copyright [ Copyleft(c), Speech Lab, NTU, Taiwan ]
# Reference [ https://github.com/mravanelli/pytorch-kaldi#how-can-i-use-my-own-dataset ]
"""*********************************************************************************************"""
###############
# IMPORTATION #
###############
import os
import pickle
import kaldi_io
import numpy as np
from tqdm import tqdm
############
# SETTINGS #
############
# Paths below are combined with os.path.join, so trailing slashes are optional.
KALDI_ROOT = '/media/andi611/1TBSSD/kaldi/' # !!!!!!!!!!!!!!!!! CHANGE THIS TO YOUR OWN KALDI ROOT !!!!!!!!!!!!!!!!! #
INPUT_PATH = '../data/timit_mel160_phoneme63' # preprocessed features/ids; this can be generated with 'preprocess_timit.py'
INPUT_SETS = ['train', 'test'] # names of the preprocessed pickle sets; you should not need to change this
OUTPUT_SETS = ['train', 'dev', 'test'] # kaldi recipe splits; you should not need to change this
TIMIT_PATH = os.path.join(KALDI_ROOT, 'egs/timit/s5/') # kaldi TIMIT recipe directory; you should not need to change this
SOURCE_DIR = os.path.join(TIMIT_PATH, 'data-kaldi-mel') # the data directory generated by kaldi script; must contain <set>/feats.scp
OUTPUT_PATH = os.path.join(TIMIT_PATH, 'timit_mel160_arked') # destination for the new .ark/.scp files; you should not need to change this
########
# MAIN #
########
def wav_id_to_key(wav_id):
    """Map a preprocessed utterance id (a '.wav' file path) to a kaldi utt-id.

    The key format is '<SPEAKER>_<UTTERANCE>', upper-cased, matching the
    first column of the feats.scp files produced by the kaldi TIMIT recipe.
    e.g. '.../dr1/fcjf0/sa1.wav' -> 'FCJF0_SA1'
    """
    path = str(wav_id)
    # BUG FIX: the original used str.strip('.wav'), which strips any of the
    # characters {'.', 'w', 'a', 'v'} from BOTH ends of the string rather
    # than removing the '.wav' extension — corrupting ids whose basename
    # ends in one of those characters. Remove the extension explicitly.
    if path.endswith('.wav'):
        path = path[:-len('.wav')]
    parts = path.split('/')
    return parts[-2].upper() + '_' + parts[-1].upper()


def main():
    """Plug preprocessed TIMIT features into kaldi .ark/.scp files.

    Loads the pickled feature matrices and utterance ids produced by
    'preprocess_timit.py', keeps only the utterances listed in the
    kaldi-generated feats.scp of each output set, and writes them out as
    compressed .ark files (with fresh feats.scp indexes) via kaldi_io.
    """
    # Validate the configured paths before doing any work.
    if not os.path.isdir(KALDI_ROOT):
        print('CHANGE THIS TO YOUR OWN KALDI ROOT: ', KALDI_ROOT)
        exit()
    if not os.path.isdir(INPUT_PATH):
        print('Invalid path for the preprocessed timit dataset: ', INPUT_PATH)
        print('Please run \'preprocess_timit.py\' first!')
        exit()
    if not os.path.isdir(SOURCE_DIR):
        print('Invalid path for the source directory: ', SOURCE_DIR)
        print('Please read the Wiki page for instructions!')
        exit()
    if not os.path.isdir(OUTPUT_PATH):
        os.mkdir(OUTPUT_PATH)

    # Read train and test from the preprocessed directory.
    x, ids = [], []
    for s in INPUT_SETS:
        with open(os.path.join(INPUT_PATH, s + '_x.pkl'), 'rb') as fp:
            x += pickle.load(fp)
        with open(os.path.join(INPUT_PATH, s + '_id.pkl'), 'rb') as fp:
            ids += pickle.load(fp)
    assert len(x) == len(ids)
    print('[TIMIT-to-ARK] - ', 'Total Dataset len:', len(x))

    # Build the mapping from kaldi utt-id to feature matrix.
    all_inputs = {}
    for feat, wav_id in zip(x, ids):
        all_inputs[wav_id_to_key(wav_id)] = np.asarray(feat)

    # Filter all inputs with the kaldi-generated feats.scp of each set.
    # (The redundant re-check of SOURCE_DIR that was here is covered by the
    # validation at the top of main().)
    for s in OUTPUT_SETS:
        set_dir = os.path.join(OUTPUT_PATH, str(s))
        if not os.path.isdir(set_dir):
            os.mkdir(set_dir)

        # Read train / dev / test utt-ids from the kaldi-generated directory.
        partial_outputs = {}
        with open(os.path.join(SOURCE_DIR, s, 'feats.scp'), 'r') as f:
            lines = f.readlines()
        for line in lines:
            key = line.split(' ')[0]
            if key in all_inputs:
                partial_outputs[key] = all_inputs[key]
        # Every utterance kaldi knows about must have a preprocessed feature.
        assert len(lines) == len(partial_outputs)

        # Writing output with kaldi_io (pipes through kaldi's copy-feats).
        ark_scp_output = 'ark:| copy-feats --compress=true ark:- ark,scp:{}/raw_mel_{}.ark,{}/{}/feats.scp'.format(OUTPUT_PATH, str(s), OUTPUT_PATH, str(s))
        with kaldi_io.open_or_fd(ark_scp_output, 'wb') as f:
            for key, mat in tqdm(partial_outputs.items()):
                kaldi_io.write_mat(f, mat, key=key)

    print('[TIMIT-to-ARK] - All done, saved at \'' + str(OUTPUT_PATH) + '\' exit.')


if __name__ == '__main__':
    main()