# -*- coding: utf-8 -*- #
"""*********************************************************************************************"""
#   FileName     [ ark2timit.py ]
#   Synopsis     [ process the .ark file preprocessed by kaldi for our dataloader ]
#   Author       [ Andy T. Liu (Andi611) ]
#   Copyright    [ Copyleft(c), Speech Lab, NTU, Taiwan ]
#   Reference    [ https://github.com/nttcslab-sp/kaldiio ]
"""*********************************************************************************************"""


###############
# IMPORTATION #
###############
import os
import sys
import pickle
import operator
import numpy as np
import pandas as pd
from tqdm import tqdm
from kaldiio import ReadHelper


############
# SETTINGS #
############
KALDI_ROOT = '/media/andi611/1TBSSD/kaldi/'  # change this to your own kaldi root
TIMIT_PATH = os.path.join(KALDI_ROOT, 'egs/timit/s5/data-fmllr-tri3/')  # this needs to be generated by kaldi scripts
OUTPUT_DIR = '../data/timit_fmllr_cmvn'
SETS = ['dev', 'test', 'train']  # you should not need to change this
NUM_SPLITS = 10  # each set is read from archives numbered 1..NUM_SPLITS


########
# MAIN #
########
def main():
    """Convert the kaldi fMLLR `.ark` archives into `.npy` files plus one CSV index per set.

    For every set in `SETS`, each `(key, array)` pair read from the archives is
    saved as `<OUTPUT_DIR>/<set>/<key>.npy` (cast to float32), and a table
    `<OUTPUT_DIR>/<set>.csv` with the columns `file_path`, `length` and `label`
    is written, sorted by `length` in descending order.

    Raises:
        SystemExit: with status 1 when `KALDI_ROOT` or `TIMIT_PATH` is not an
            existing directory; with the default (success) status once every
            set has been processed.
    """
    if not os.path.isdir(KALDI_ROOT):
        print('CHANGE THIS TO YOUR OWN KALDI ROOT: ', KALDI_ROOT)
        sys.exit(1)

    if not os.path.isdir(TIMIT_PATH):
        print('Invalid path for the kaldi TIMIT dataset: ', TIMIT_PATH)
        print('Please run the kaldi scripts first! More information are described in the README file and Wiki page.')
        # Stop here: without the input directory every read below would fail.
        sys.exit(1)

    # makedirs also creates missing parents (e.g. '../data') and tolerates an existing directory.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # read data from the preprocessed kaldi directory
    for s in SETS:
        print('Preprocessing', s, 'data...')
        set_name = s.replace('_', '-')  # output names use '-' where the set name has '_'
        cur_dir = os.path.join(OUTPUT_DIR, set_name)
        os.makedirs(cur_dir, exist_ok=True)

        lengths = {}  # relative .npy path -> len(array)
        for split in range(1, NUM_SPLITS + 1):
            ark_path = os.path.join(TIMIT_PATH, s, 'data', 'feats_fmllr_' + s + '.' + str(split) + '.ark')
            with ReadHelper('ark:' + ark_path) as reader:
                for key, array in tqdm(reader):
                    array = np.asarray(array).astype('float32')
                    np.save(os.path.join(cur_dir, key), array)  # np.save appends the '.npy' suffix
                    lengths[os.path.join(set_name, key + '.npy')] = len(array)

        # Longest entries first.
        ordered = sorted(lengths.items(), key=operator.itemgetter(1), reverse=True)
        df = pd.DataFrame(data={
            'file_path': [path for path, _ in ordered],
            'length': [length for _, length in ordered],
            'label': 'None',
        })
        df.to_csv(os.path.join(OUTPUT_DIR, set_name + '.csv'))

    print('[ARK-TO-TIMIT] - All done, saved at \'' + str(OUTPUT_DIR) + '\', exit.')
    sys.exit()


if __name__ == '__main__':
    main()