# lmzjms's picture
# Upload 1162 files
# 0b32ad6 verified
# -*- coding: utf-8 -*- #
"""*********************************************************************************************"""
# FileName [ timit2ark.py ]
# Synopsis [ plug-in our own preprocessed features to .ark files for kaldi ]
# Author [ Andy T. Liu (Andi611) ]
# Copyright [ Copyleft(c), Speech Lab, NTU, Taiwan ]
# Reference [ https://github.com/mravanelli/pytorch-kaldi#how-can-i-use-my-own-dataset ]
"""*********************************************************************************************"""
###############
# IMPORTATION #
###############
import os
import pickle
import kaldi_io
import numpy as np
from tqdm import tqdm
############
# SETTINGS #
############
# Paths below are combined with os.path.join, so trailing slashes are optional.
KALDI_ROOT = '/media/andi611/1TBSSD/kaldi/' # !!!!!!!!!!!!!!!!! CHANGE THIS TO YOUR OWN KALDI ROOT !!!!!!!!!!!!!!!!! #
INPUT_PATH = '../data/timit_mel160_phoneme63' # preprocessed features/ids; this can be generated with 'preprocess_timit.py'
INPUT_SETS = ['train', 'test'] # names of the preprocessed pickle sets; you should not need to change this
OUTPUT_SETS = ['train', 'dev', 'test'] # kaldi recipe splits; you should not need to change this
TIMIT_PATH = os.path.join(KALDI_ROOT, 'egs/timit/s5/') # kaldi TIMIT recipe directory; you should not need to change this
SOURCE_DIR = os.path.join(TIMIT_PATH, 'data-kaldi-mel') # the data directory generated by kaldi script; must contain <set>/feats.scp
OUTPUT_PATH = os.path.join(TIMIT_PATH, 'timit_mel160_arked') # destination for the new .ark/.scp files; you should not need to change this
########
# MAIN #
########
def wav_id_to_key(wav_id):
    """Map a preprocessed utterance id (a '.wav' file path) to a kaldi utt-id.

    The key format is '<SPEAKER>_<UTTERANCE>', upper-cased, matching the
    first column of the feats.scp files produced by the kaldi TIMIT recipe.
    e.g. '.../dr1/fcjf0/sa1.wav' -> 'FCJF0_SA1'
    """
    path = str(wav_id)
    # BUG FIX: the original used str.strip('.wav'), which strips any of the
    # characters {'.', 'w', 'a', 'v'} from BOTH ends of the string rather
    # than removing the '.wav' extension — corrupting ids whose basename
    # ends in one of those characters. Remove the extension explicitly.
    if path.endswith('.wav'):
        path = path[:-len('.wav')]
    parts = path.split('/')
    return parts[-2].upper() + '_' + parts[-1].upper()


def main():
    """Plug preprocessed TIMIT features into kaldi .ark/.scp files.

    Loads the pickled feature matrices and utterance ids produced by
    'preprocess_timit.py', keeps only the utterances listed in the
    kaldi-generated feats.scp of each output set, and writes them out as
    compressed .ark files (with fresh feats.scp indexes) via kaldi_io.
    """
    # Validate the configured paths before doing any work.
    if not os.path.isdir(KALDI_ROOT):
        print('CHANGE THIS TO YOUR OWN KALDI ROOT: ', KALDI_ROOT)
        exit()
    if not os.path.isdir(INPUT_PATH):
        print('Invalid path for the preprocessed timit dataset: ', INPUT_PATH)
        print('Please run \'preprocess_timit.py\' first!')
        exit()
    if not os.path.isdir(SOURCE_DIR):
        print('Invalid path for the source directory: ', SOURCE_DIR)
        print('Please read the Wiki page for instructions!')
        exit()
    if not os.path.isdir(OUTPUT_PATH):
        os.mkdir(OUTPUT_PATH)

    # Read train and test from the preprocessed directory.
    x, ids = [], []
    for s in INPUT_SETS:
        with open(os.path.join(INPUT_PATH, s + '_x.pkl'), 'rb') as fp:
            x += pickle.load(fp)
        with open(os.path.join(INPUT_PATH, s + '_id.pkl'), 'rb') as fp:
            ids += pickle.load(fp)
    assert len(x) == len(ids)
    print('[TIMIT-to-ARK] - ', 'Total Dataset len:', len(x))

    # Build the mapping from kaldi utt-id to feature matrix.
    all_inputs = {}
    for feat, wav_id in zip(x, ids):
        all_inputs[wav_id_to_key(wav_id)] = np.asarray(feat)

    # Filter all inputs with the kaldi-generated feats.scp of each set.
    # (The redundant re-check of SOURCE_DIR that was here is covered by the
    # validation at the top of main().)
    for s in OUTPUT_SETS:
        set_dir = os.path.join(OUTPUT_PATH, str(s))
        if not os.path.isdir(set_dir):
            os.mkdir(set_dir)

        # Read train / dev / test utt-ids from the kaldi-generated directory.
        partial_outputs = {}
        with open(os.path.join(SOURCE_DIR, s, 'feats.scp'), 'r') as f:
            lines = f.readlines()
        for line in lines:
            key = line.split(' ')[0]
            if key in all_inputs:
                partial_outputs[key] = all_inputs[key]
        # Every utterance kaldi knows about must have a preprocessed feature.
        assert len(lines) == len(partial_outputs)

        # Writing output with kaldi_io (pipes through kaldi's copy-feats).
        ark_scp_output = 'ark:| copy-feats --compress=true ark:- ark,scp:{}/raw_mel_{}.ark,{}/{}/feats.scp'.format(OUTPUT_PATH, str(s), OUTPUT_PATH, str(s))
        with kaldi_io.open_or_fd(ark_scp_output, 'wb') as f:
            for key, mat in tqdm(partial_outputs.items()):
                kaldi_io.write_mat(f, mat, key=key)

    print('[TIMIT-to-ARK] - All done, saved at \'' + str(OUTPUT_PATH) + '\' exit.')


if __name__ == '__main__':
    main()