import argparse
import glob
import json
import os
import re

import kaldiio
import torchaudio
from num2words import num2words
from tqdm import tqdm

from melspec import mel_spectrogram
from text import _clean_text, symbols


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data', type=str, required=True, help='path to the emotional dataset')
    args = parser.parse_args()
    # Normalize to a trailing separator so the string-concatenated glob patterns
    # below resolve whether or not the user passes one.
    dataset_path = os.path.join(args.data, '')
    filelists_path = 'filelists/all_spks/'
    feats_scp_file = filelists_path + 'feats.scp'
    feats_ark_file = filelists_path + 'feats.ark'

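    # Gather train/eval wavs for the three speakers; the expected layout is
    # <dataset>/<speaker_id>/<train|eval>/<utterance>.wav.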
    spks = ['1263201035', '805570882', '399172782']
    train_files = []
    eval_files = []
    for spk in spks:
        train_files += glob.glob(dataset_path + spk + "/train/*.wav")
        eval_files += glob.glob(dataset_path + spk + "/eval/*.wav")

    os.makedirs(filelists_path, exist_ok=True)

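    # Write one utterance ID (wav basename without extension) per line for each split.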
    with open(filelists_path + 'train_utts.txt', 'w', encoding='utf-8') as f:
        for wav_path in train_files:
            wav_name = os.path.splitext(os.path.basename(wav_path))[0]
            f.write(wav_name + '\n')
    with open(filelists_path + 'eval_utts.txt', 'w', encoding='utf-8') as f:
        for wav_path in eval_files:
            wav_name = os.path.splitext(os.path.basename(wav_path))[0]
            f.write(wav_name + '\n')

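    # Compute a mel spectrogram for every wav and store it in a Kaldi ark/scp pair
    # keyed by utterance ID; WriteHelper creates both files itself. The positional
    # arguments are assumed to follow the HiFi-GAN-style signature of the local
    # melspec helper (n_fft=1024, num_mels=80, sampling_rate=22050, hop_size=256,
    # win_size=1024, fmin=0, fmax=8000), and the wavs are assumed to be at 22050 Hz.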
    with kaldiio.WriteHelper(f'ark,scp:{feats_ark_file},{feats_scp_file}') as writer:
        for root, dirs, files in os.walk(dataset_path):
            for file in tqdm(files):
                if file.endswith('.wav'):
                    wav_path = os.path.join(root, file)
                    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
                    signal, rate = torchaudio.load(wav_path)
                    spec = mel_spectrogram(signal, 1024, 80, 22050, 256,
                                           1024, 0, 8000, center=False).squeeze()
                    writer[wav_name] = spec

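    # Build the emotion inventory from the second "_"-separated field of each file
    # basename (files are assumed to be named <speaker>_<emotion>_<id>.<ext>).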
    emotions = [os.path.basename(x).split("_")[1] for x in glob.glob(dataset_path + '**/**/*')]
    emotions = sorted(set(emotions))

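    # Map each utterance ID to a speaker index (its position in spks) and an
    # emotion index (its position in the sorted emotion inventory).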
    utt2spk = {}
    utt2emo = {}
    wavs = glob.glob(dataset_path + '**/**/*.wav')
    for wav_path in tqdm(wavs):
        wav_name = os.path.splitext(os.path.basename(wav_path))[0]
        emotion = emotions.index(wav_name.split("_")[1])
        if wav_path.split('/')[-3] == '1263201035':
            spk = 0
        elif wav_path.split('/')[-3] == '805570882':
            spk = 1
        else:
            spk = 2
        utt2spk[wav_name] = str(spk)
        utt2emo[wav_name] = str(emotion)
    utt2spk = dict(sorted(utt2spk.items()))
    utt2emo = dict(sorted(utt2emo.items()))

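    # Persist the utterance-to-label maps alongside the filelists.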
    with open(filelists_path + 'utt2emo.json', 'w') as fp:
        json.dump(utt2emo, fp, indent=4)
    with open(filelists_path + 'utt2spk.json', 'w') as fp:
        json.dump(utt2spk, fp, indent=4)

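    # Clean each transcription with the Kazakh cleaners, spell out digits as Kazakh
    # words, and keep only characters from the model's symbol set.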
    txt_files = sorted(glob.glob(dataset_path + '**/**/*.txt'))
    txt = []
    basenames = []
    for txt_path in txt_files:
        basename = os.path.basename(txt_path).replace('.txt', '')
        with open(txt_path, 'r', encoding='utf-8') as f:
            txt.append(_clean_text(f.read().strip("\n"), cleaner_names=["kazakh_cleaners"]).replace("'", ""))
        basenames.append(basename)
    output_string = [re.sub(r'(\d+)', lambda m: num2words(m.group(), lang='kz'), sentence) for sentence in txt]
    cleaned_txt = []
    for t in output_string:
        cleaned_txt.append(''.join([s for s in t if s in symbols]))
    utt2text = {basenames[i]: cleaned_txt[i] for i in range(len(cleaned_txt))}
    utt2text = dict(sorted(utt2text.items()))

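    # Emit the Kaldi-style text file ("<utt_id> <transcription>") and collect the
    # character vocabulary of the cleaned transcriptions.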
    vocab = set()
    with open(filelists_path + 'text', 'w', encoding='utf-8') as f:
        for x, y in utt2text.items():
            for c in y:
                vocab.add(c)
            f.write(x + ' ' + y + '\n')