import argparse
import glob
import json
import os
import re

import kaldiio
import torchaudio
from num2words import num2words
from tqdm import tqdm

from melspec import mel_spectrogram
from text import _clean_text, symbols


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data', type=str, required=True, help='path to the emotional dataset')
    args = parser.parse_args()
    # Normalize to a trailing separator so the string-concatenated glob patterns
    # below resolve whether or not the user passes one.
    dataset_path = os.path.join(args.data, '')
    filelists_path = 'filelists/all_spks/'
    feats_scp_file = filelists_path + 'feats.scp'
    feats_ark_file = filelists_path + 'feats.ark'

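    # Gather train/eval wavs for the three speakers; the expected layout is
    # <dataset>/<speaker_id>/<train|eval>/<utterance>.wav.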
    spks = ['1263201035', '805570882', '399172782']
    train_files = []
    eval_files = []
    for spk in spks:
        train_files += glob.glob(dataset_path + spk + "/train/*.wav")
        eval_files += glob.glob(dataset_path + spk + "/eval/*.wav")

    os.makedirs(filelists_path, exist_ok=True)

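    # Write one utterance ID (wav basename without extension) per line for each split.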
    with open(filelists_path + 'train_utts.txt', 'w', encoding='utf-8') as f:
        for wav_path in train_files:
            wav_name = os.path.splitext(os.path.basename(wav_path))[0]
            f.write(wav_name + '\n')
    with open(filelists_path + 'eval_utts.txt', 'w', encoding='utf-8') as f:
        for wav_path in eval_files:
            wav_name = os.path.splitext(os.path.basename(wav_path))[0]
            f.write(wav_name + '\n')

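    # Compute a mel spectrogram for every wav and store it in a Kaldi ark/scp pair
    # keyed by utterance ID; WriteHelper creates both files itself. The positional
    # arguments are assumed to follow the HiFi-GAN-style signature of the local
    # melspec helper (n_fft=1024, num_mels=80, sampling_rate=22050, hop_size=256,
    # win_size=1024, fmin=0, fmax=8000), and the wavs are assumed to be at 22050 Hz.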
    with kaldiio.WriteHelper(f'ark,scp:{feats_ark_file},{feats_scp_file}') as writer:
        for root, dirs, files in os.walk(dataset_path):
            for file in tqdm(files):
                if file.endswith('.wav'):
                    wav_path = os.path.join(root, file)
                    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
                    signal, rate = torchaudio.load(wav_path)
                    spec = mel_spectrogram(signal, 1024, 80, 22050, 256,
                                           1024, 0, 8000, center=False).squeeze()
                    writer[wav_name] = spec

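    # Build the emotion inventory from the second "_"-separated field of each file
    # basename (files are assumed to be named <speaker>_<emotion>_<id>.<ext>).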
    emotions = [os.path.basename(x).split("_")[1] for x in glob.glob(dataset_path + '**/**/*')]
    emotions = sorted(set(emotions))

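    # Map each utterance ID to a speaker index (its position in spks) and an
    # emotion index (its position in the sorted emotion inventory).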
    utt2spk = {}
    utt2emo = {}
    wavs = glob.glob(dataset_path + '**/**/*.wav')
    for wav_path in tqdm(wavs):
        wav_name = os.path.splitext(os.path.basename(wav_path))[0]
        emotion = emotions.index(wav_name.split("_")[1])
        if wav_path.split('/')[-3] == '1263201035':
            spk = 0
        elif wav_path.split('/')[-3] == '805570882':
            spk = 1
        else:
            spk = 2
        utt2spk[wav_name] = str(spk)
        utt2emo[wav_name] = str(emotion)
    utt2spk = dict(sorted(utt2spk.items()))
    utt2emo = dict(sorted(utt2emo.items()))

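    # Persist the utterance-to-label maps alongside the filelists.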
    with open(filelists_path + 'utt2emo.json', 'w') as fp:
        json.dump(utt2emo, fp, indent=4)
    with open(filelists_path + 'utt2spk.json', 'w') as fp:
        json.dump(utt2spk, fp, indent=4)

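    # Clean each transcription with the Kazakh cleaners, spell out digits as Kazakh
    # words, and keep only characters from the model's symbol set.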
    txt_files = sorted(glob.glob(dataset_path + '**/**/*.txt'))
    txt = []
    basenames = []
    for txt_path in txt_files:
        basename = os.path.basename(txt_path).replace('.txt', '')
        with open(txt_path, 'r', encoding='utf-8') as f:
            txt.append(_clean_text(f.read().strip("\n"), cleaner_names=["kazakh_cleaners"]).replace("'", ""))
        basenames.append(basename)
    output_string = [re.sub(r'(\d+)', lambda m: num2words(m.group(), lang='kz'), sentence) for sentence in txt]
    cleaned_txt = []
    for t in output_string:
        cleaned_txt.append(''.join([s for s in t if s in symbols]))
    utt2text = {basenames[i]: cleaned_txt[i] for i in range(len(cleaned_txt))}
    utt2text = dict(sorted(utt2text.items()))

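    # Emit the Kaldi-style text file ("<utt_id> <transcription>") and collect the
    # character vocabulary of the cleaned transcriptions.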
    vocab = set()
    with open(filelists_path + 'text', 'w', encoding='utf-8') as f:
        for x, y in utt2text.items():
            for c in y:
                vocab.add(c)
            f.write(x + ' ' + y + '\n')