# File size: 3,872 Bytes

# 0b32ad6

# -*- coding: utf-8 -*- #
"""*********************************************************************************************"""
#   FileName     [ timit2ark.py ]
#   Synopsis     [ plug-in our own preprocessed features to .ark files for kaldi ]
#   Author       [ Andy T. Liu (Andi611) ]
#   Copyright    [ Copyleft(c), Speech Lab, NTU, Taiwan ]
#   Reference    [ https://github.com/mravanelli/pytorch-kaldi#how-can-i-use-my-own-dataset ]
"""*********************************************************************************************"""

###############
# IMPORTATION #
###############
import os
import pickle
import kaldi_io
import numpy as np
from tqdm import tqdm


############
# SETTINGS #
############
# !!!!!!!!!!!!!!!!! CHANGE THIS TO YOUR OWN KALDI ROOT !!!!!!!!!!!!!!!!! #
KALDI_ROOT = '/media/andi611/1TBSSD/kaldi/'

# Directory with the preprocessed features; this can be generated with 'preprocess_timit.py'.
INPUT_PATH = '../data/timit_mel160_phoneme63'

# Sets read from INPUT_PATH, and sets written out for kaldi.
# You should not need to change these.
INPUT_SETS = [
    'train',
    'test',
]
OUTPUT_SETS = [
    'train',
    'dev',
    'test',
]

# Paths derived from KALDI_ROOT; you should not need to change these.
TIMIT_PATH = os.path.join(KALDI_ROOT, 'egs/timit/s5/')
# The data directory generated by the kaldi script.
SOURCE_DIR = os.path.join(TIMIT_PATH, 'data-kaldi-mel')
# Directory that receives the generated .ark / feats.scp files.
OUTPUT_PATH = os.path.join(TIMIT_PATH, 'timit_mel160_arked')


########
# MAIN #
########
def _utterance_key(utt_id):
    """Build the kaldi utterance key 'SPEAKER_UTTERANCE' from a preprocessed wav id.

    The key is made of the last two '/'-separated components of `utt_id`,
    upper-cased and joined with '_', after a trailing '.wav' extension is removed.

    Raises:
        ValueError: if `utt_id` has fewer than two '/'-separated components.
    """
    path = str(utt_id)
    # Remove the extension as a *suffix*. `str.strip('.wav')` would instead strip any
    # leading / trailing '.', 'w', 'a' or 'v' characters and could eat into the name.
    if path.lower().endswith('.wav'):
        path = path[:-len('.wav')]
    parts = path.split('/')
    if len(parts) < 2:
        raise ValueError('Expected an id like <speaker>/<utterance>.wav, got: {}'.format(utt_id))
    return parts[-2].upper() + '_' + parts[-1].upper()


def main():
    """Convert the preprocessed TIMIT features into kaldi .ark / feats.scp files.

    Loads '<set>_x.pkl' and '<set>_id.pkl' from INPUT_PATH for every set in INPUT_SETS,
    re-keys each feature matrix by its kaldi utterance key, and then, for every set in
    OUTPUT_SETS, selects the utterances listed in SOURCE_DIR/<set>/feats.scp and pipes
    them through `copy-feats` into OUTPUT_PATH.

    Raises:
        SystemExit: with status 1 if KALDI_ROOT, INPUT_PATH or SOURCE_DIR is not a directory.
        ValueError: if the features and ids of a set differ in length, or if a kaldi
            feats.scp lists an utterance that is missing from the preprocessed data
            or lists the same utterance twice.
    """
    if not os.path.isdir(KALDI_ROOT):
        print('CHANGE THIS TO YOUR OWN KALDI ROOT: ', KALDI_ROOT)
        raise SystemExit(1)

    if not os.path.isdir(INPUT_PATH):
        print('Invalid path for the preprocessed timit dataset: ', INPUT_PATH)
        print('Please run \'preprocess_timit.py\' first!')
        raise SystemExit(1)

    if not os.path.isdir(SOURCE_DIR):
        print('Invalid path for the source directory: ', SOURCE_DIR)
        print('Please read the Wiki page for instructions!')
        raise SystemExit(1)

    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # read train and test from the preprocessed directory
    # NOTE: pickle can execute arbitrary code on load; only point INPUT_PATH at files
    # you generated yourself (e.g. with 'preprocess_timit.py').
    x, ids = [], []
    for s in INPUT_SETS:
        with open(os.path.join(INPUT_PATH, s + '_x.pkl'), 'rb') as fp:
            x += pickle.load(fp)
        with open(os.path.join(INPUT_PATH, s + '_id.pkl'), 'rb') as fp:
            ids += pickle.load(fp)
        # explicit check instead of `assert`, which is stripped under `python -O`
        if len(x) != len(ids):
            raise ValueError(
                'Feature / id count mismatch after loading set \'{}\': {} features vs {} ids'.format(
                    s, len(x), len(ids)))
    print('[TIMIT-to-ARK] - ', 'Total Dataset len:', len(x))

    # construct all input dict, keyed by the kaldi utterance key
    all_inputs = {}
    for utt_id, feat in zip(ids, x):
        all_inputs[_utterance_key(utt_id)] = np.asarray(feat)

    # filter all input with kaldi generated files
    for s in OUTPUT_SETS:
        os.makedirs(os.path.join(OUTPUT_PATH, str(s)), exist_ok=True)

        # read train / dev / test from the kaldi generated directory;
        # the utterance key is the first whitespace-separated field of each line
        scp_path = os.path.join(SOURCE_DIR, s, 'feats.scp')
        keys = []
        with open(scp_path, 'r') as f:
            for line in f:
                fields = line.split()
                if fields:  # tolerate blank lines
                    keys.append(fields[0])

        # every utterance kaldi knows about must have a preprocessed feature matrix
        missing = [key for key in keys if key not in all_inputs]
        if missing:
            raise ValueError(
                '{} of {} utterances in {} have no preprocessed features, e.g. {}'.format(
                    len(missing), len(keys), scp_path, missing[:5]))
        partial_outputs = {key: all_inputs[key] for key in keys}
        if len(partial_outputs) != len(keys):
            raise ValueError('Duplicate utterance ids found in {}'.format(scp_path))

        # writing output with kaldi_io
        ark_scp_output = 'ark:| copy-feats --compress=true ark:- ark,scp:{}/raw_mel_{}.ark,{}/{}/feats.scp'.format(OUTPUT_PATH, str(s), OUTPUT_PATH, str(s))
        with kaldi_io.open_or_fd(ark_scp_output, 'wb') as f:
            for key, mat in tqdm(partial_outputs.items()):
                kaldi_io.write_mat(f, mat, key=key)

    print('[TIMIT-to-ARK] - All done, saved at \'' + str(OUTPUT_PATH) + '\' exit.')

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()