File size: 4,356 Bytes
ae8e1dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import kaldiio
import os
import librosa
from tqdm import tqdm
import glob
import json
from shutil import copyfile
import pandas as pd
import argparse
from text import _clean_text, symbols
from num2words import num2words
import re
from melspec import mel_spectrogram
import torchaudio
if __name__ == '__main__':
    # Prepare filelists, Kaldi-format mel features, speaker/emotion label
    # maps and cleaned transcripts for a 3-speaker emotional dataset.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data', type=str, required=True,
                        help='path to the emotional dataset')
    args = parser.parse_args()
    dataset_path = args.data

    filelists_path = 'filelists/all_spks/'
    feats_scp_file = filelists_path + 'feats.scp'
    feats_ark_file = filelists_path + 'feats.ark'

    # Speaker IDs; position in this list is the integer speaker label
    # (labels must start at 0).
    spks = ['1263201035', '805570882', '399172782']

    # Collect per-speaker train/eval wav paths.
    train_files = []
    eval_files = []
    for spk in spks:
        train_files += glob.glob(dataset_path + spk + "/train/*.wav")
        eval_files += glob.glob(dataset_path + spk + "/eval/*.wav")

    os.makedirs(filelists_path, exist_ok=True)

    # Write utterance-id lists (wav basenames without extension).
    with open(filelists_path + 'train_utts.txt', 'w', encoding='utf-8') as f:
        for wav_path in train_files:
            f.write(os.path.splitext(os.path.basename(wav_path))[0] + '\n')
    with open(filelists_path + 'eval_utts.txt', 'w', encoding='utf-8') as f:
        for wav_path in eval_files:
            f.write(os.path.splitext(os.path.basename(wav_path))[0] + '\n')

    # Extract mel-spectrograms and store them in Kaldi ark/scp format.
    # FIX: the original also held an unused plain `open(feats_scp_file, 'w')`
    # handle while WriteHelper created the very same scp file; WriteHelper
    # alone writes both ark and scp, so the redundant handle is removed.
    with kaldiio.WriteHelper(f'ark,scp:{feats_ark_file},{feats_scp_file}') as writer:
        for root, dirs, files in os.walk(dataset_path):
            for file in tqdm(files):
                if not file.endswith('.wav'):
                    continue
                wav_path = os.path.join(root, file)
                wav_name = os.path.splitext(os.path.basename(wav_path))[0]
                signal, rate = torchaudio.load(wav_path)
                spec = mel_spectrogram(signal, 1024, 80, 22050, 256,
                                       1024, 0, 8000, center=False).squeeze()
                # Write the features to feats.ark and feats.scp
                writer[wav_name] = spec

    # Emotion vocabulary: the second "_"-separated token of each file name,
    # sorted so indices are stable across runs.
    emotions = sorted(set(
        os.path.basename(x).split("_")[1]
        for x in glob.glob(dataset_path + '/**/**/*')
    ))

    # Map every utterance to its speaker and emotion indices.
    utt2spk = {}
    utt2emo = {}
    for wav_path in tqdm(glob.glob(dataset_path + '**/**/*.wav')):
        wav_name = os.path.splitext(os.path.basename(wav_path))[0]
        emotion = emotions.index(wav_name.split("_")[1])
        # Speaker directory is the third-from-last path component.
        # Unknown speakers fall back to label 2, matching the original
        # if/elif/else chain exactly.
        spk_dir = wav_path.split('/')[-3]
        spk = spks.index(spk_dir) if spk_dir in spks else 2
        utt2spk[wav_name] = str(spk)
        utt2emo[wav_name] = str(emotion)
    utt2spk = dict(sorted(utt2spk.items()))
    utt2emo = dict(sorted(utt2emo.items()))
    with open(filelists_path + 'utt2emo.json', 'w') as fp:
        json.dump(utt2emo, fp, indent=4)
    with open(filelists_path + 'utt2spk.json', 'w') as fp:
        json.dump(utt2spk, fp, indent=4)

    # Transcripts: clean, expand digits to Kazakh words, keep only known
    # symbols, then write "<utt-id> <text>" lines.
    # FIX: the original opened filelists_path + 'text' twice — once as an
    # unused handle around the read loop, once for the actual write; the
    # dead handle (and unused `count`/`flag` locals) are removed.
    txt_files = sorted(glob.glob(dataset_path + '/**/**/*.txt'))
    txt = []
    basenames = []
    for txt_path in txt_files:
        basenames.append(os.path.basename(txt_path).replace('.txt', ''))
        with open(txt_path, 'r', encoding='utf-8') as f:
            txt.append(_clean_text(f.read().strip("\n"),
                                   cleaner_names=["kazakh_cleaners"]).replace("'", ""))
    # FIX: raw string for the regex pattern (avoids invalid-escape warning).
    output_string = [re.sub(r'(\d+)',
                            lambda m: num2words(m.group(), lang='kz'),
                            sentence)
                     for sentence in txt]
    cleaned_txt = [''.join(s for s in t if s in symbols) for t in output_string]
    utt2text = dict(sorted(zip(basenames, cleaned_txt)))

    # NOTE(review): `vocab` is accumulated but never persisted in this view;
    # kept for behavioural parity — confirm whether it should be saved.
    vocab = set()
    with open(filelists_path + '/text', 'w', encoding='utf-8') as f:
        for x, y in utt2text.items():
            vocab.update(y)
            f.write(x + ' ' + y + '\n')
|