# Spaces: Running on Zero
import argparse
import csv
import json
import os
import re
import shutil
import sys
# Root under which save_files() builds the 'dataset/persian_data/...' tree:
# the process's working directory, captured once when the module is loaded.
main_path = os.getcwd()
def get_duration(row):
    """Return the summed duration encoded in a phoneme string.

    ``row`` is split on whitespace. A bare ``|`` token is skipped. Every
    other token is expected to look like ``<phone>[<number>]``: the piece
    following the first ``[``, minus its final character, is parsed with
    ``float`` and divided by 1000 (presumably milliseconds to seconds --
    confirm against the dataset description).

    Returns ``0`` when ``row`` holds no timed tokens.

    Raises:
        IndexError: a token other than ``|`` contains no ``[``.
        ValueError: the bracketed text is not a valid number.
    """
    total = 0
    # Plain left-to-right accumulation, so results match the historical
    # floating-point behaviour exactly.
    for token in row.split():
        if token == '|':
            continue
        bracketed = token.split('[')[1]
        total += float(bracketed[:-1]) / 1000
    return total
def prepare_data_for_model(path, duration_lim):
    """Read the CSV at ``path`` and return cleaned phoneme records.

    Each row must provide the columns ``phenome`` (sic -- this is the header
    spelling the code looks up), ``seg_id`` and ``speaker_id``. Rows whose
    ``get_duration`` value for the ``phenome`` column is greater than
    ``duration_lim`` are dropped.

    Args:
        path: path of the CSV file to read.
        duration_lim: upper bound (inclusive) on a row's total duration, in
            the same unit ``get_duration`` returns.

    Returns:
        A list of ``[phoneme, seg_id, speaker_id]`` lists. In ``phoneme``
        every ``[<digits>]`` annotation is removed and every ``|`` that is
        surrounded by whitespace is replaced by a single space.
    """
    # Raw strings keep the backslashes as regex escapes rather than (invalid)
    # string escapes; compiled once because they are reused for every row.
    duration_tag = re.compile(r"\[([0-9]+)\]")
    bar_separator = re.compile(r"\s+\|\s+")

    data_lines = []
    # `with` closes the file even if a row raises; newline='' is what the
    # csv module asks for so it can handle line endings itself.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            raw_phoneme = row['phenome']
            if get_duration(raw_phoneme) > duration_lim:
                continue
            phoneme = duration_tag.sub('', raw_phoneme)
            phoneme = bar_separator.sub(' ', phoneme)
            data_lines.append([phoneme, row['seg_id'], row['speaker_id']])
    return data_lines
def _save_split(records, data_path, split):
    """Copy one split's audio into the dataset tree and write its transcripts.

    For every record ``[phoneme, seg_id, speaker_id]`` the file
    ``<data_path>/<split>_wav/<seg_id>.wav`` is copied to
    ``<main_path>/dataset/persian_data/<split>_data/speaker-<speaker_id>/book-1/utterance-<seg_id>.wav``
    and the phoneme string is written next to it with a ``.txt`` suffix.

    Returns ``False`` (after printing the error) as soon as preparing or
    copying one audio file fails, ``True`` once every record is processed.
    """
    for line in records:
        try:
            original = os.path.join(data_path, '{0}_wav/{1}.wav'.format(split, line[1]))
            target = os.path.join(
                main_path,
                'dataset/persian_data/{0}_data/speaker-{1}/book-1/utterance-{2}.wav'.format(
                    split, line[2], line[1]),
            )
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copyfile(original, target)
        except Exception as e:
            # Deliberately broad: any failure while locating or copying the
            # audio aborts the whole export and is reported via the return
            # value rather than a traceback.
            print(e)
            return False
        # The transcript write sits outside the try block on purpose: an
        # error here propagates to the caller instead of returning False.
        path = os.path.join(
            main_path,
            'dataset/persian_data/{0}_data/speaker-{1}/book-1/utterance-{2}.txt'.format(
                split, line[2], line[1]),
        )
        with open(path, 'w') as fp:
            fp.write(line[0])
    return True


def save_files(train_data, test_data, data_path):
    """Materialise the train and test splits under ``main_path``.

    Args:
        train_data: records ``[phoneme, seg_id, speaker_id]`` whose audio
            lives in ``<data_path>/train_wav``.
        test_data: same record shape, audio in ``<data_path>/test_wav``.
        data_path: directory containing the ``train_wav`` / ``test_wav``
            folders.

    Returns:
        ``True`` when every file was written. ``False`` as soon as one audio
        copy fails; remaining records (and the test split, if the failure is
        in the train split) are then not processed.
    """
    if not _save_split(train_data, data_path, 'train'):
        return False
    return _save_split(test_data, data_path, 'test')
def main():
    """Command-line entry point: build the dataset tree from ``--data_path``.

    ``--data_path`` must name a directory containing ``train_info.csv`` and
    ``test_info.csv``. The two CSVs are filtered with duration limits of 12
    (train) and 15 (test), the record counts are printed, and the files are
    exported with ``save_files``.

    Returns:
        ``-1`` if either CSV is missing or the export fails, otherwise
        ``None``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', required=True)
    data_path = parser.parse_args().data_path

    train_data_path = os.path.join(data_path, 'train_info.csv')
    test_data_path = os.path.join(data_path, 'test_info.csv')
    # Train is checked first, then test; the first missing file stops the run.
    for info_path in (train_data_path, test_data_path):
        if not os.path.isfile(info_path):
            print('data_path is not correct!')
            return -1

    train_data = prepare_data_for_model(train_data_path, 12)
    test_data = prepare_data_for_model(test_data_path, 15)
    print('number of train data: ' + str(len(train_data)))
    print('number of test data: ' + str(len(test_data)))

    if not save_files(train_data, test_data, data_path):
        # save_files has already printed the underlying error; report the
        # failure with the same code used for a bad data_path.
        return -1
    print('Data is created.')
if __name__ == "__main__":
    # Hand main()'s result to the interpreter so a failure (-1) yields a
    # non-zero exit status; a None result exits with status 0.
    sys.exit(main())