"""Lay out a speech dataset on disk from its ``*_info.csv`` files.

Given a data directory holding ``train_info.csv`` / ``test_info.csv`` and the
matching ``train_wav`` / ``test_wav`` folders, copy every wav file to
``dataset/persian_data/<split>_data/book-1/speaker-<id>/utterance-<seg>.wav``
under the current working directory and write the cleaned phoneme string next
to it as a ``.txt`` file.
"""
import csv
import json
import os
import re
import shutil
import sys

# Output root: the directory the script was started from.
main_path = os.getcwd()

# Bracketed numeric tags such as "[12]" that are stripped from phoneme strings.
_BRACKET_NUMBER_RE = re.compile(r"\[([0-9]+)\]")
# A "|" surrounded by whitespace, collapsed to a single space.
_PIPE_SEPARATOR_RE = re.compile(r"\s+\|\s+")


def prepare_data_for_model(path):
    """Read an info CSV and return its rows with cleaned phoneme strings.

    The CSV must have the columns ``phenome`` (sic), ``seg_id`` and
    ``speaker_id``.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list of ``[phoneme, seg_id, speaker_id]`` lists, one per CSV row,
        where ``phoneme`` has bracketed numbers removed and whitespace-padded
        ``|`` separators replaced by a single space.

    Raises:
        KeyError: If one of the required columns is missing.
        OSError: If the file cannot be opened.
    """
    data_lines = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            phoneme = _BRACKET_NUMBER_RE.sub('', row['phenome'])
            phoneme = _PIPE_SEPARATOR_RE.sub(' ', phoneme)
            data_lines.append([phoneme, row['seg_id'], row['speaker_id']])
    return data_lines


def _save_split(rows, data_path, split):
    """Copy the wavs of one split ('train' or 'test') and write transcripts.

    Returns False as soon as a wav file cannot be copied, True otherwise.
    """
    for row in rows:
        phoneme, utterance_name, speaker_id = row[0], row[1], row[2]
        original = os.path.join(
            data_path, '{0}_wav/{1}.wav'.format(split, utterance_name))
        # Wav and transcript share the same path up to the extension.
        stem = os.path.join(
            main_path,
            'dataset/persian_data/{0}_data/book-1/speaker-{1}/utterance-{2}'
            .format(split, speaker_id, utterance_name))
        target = stem + '.wav'
        try:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copyfile(original, target)
        except OSError as e:
            # A missing/unreadable wav aborts the whole export.
            print(e)
            return False
        with open(stem + '.txt', 'w') as fp:
            fp.write(phoneme)
    return True


def save_files(train_data, test_data, data_path):
    """Write the train and test splits into the dataset directory tree.

    Args:
        train_data: Rows as returned by ``prepare_data_for_model`` for the
            training split.
        test_data: Same, for the test split.
        data_path: Directory containing the ``train_wav`` and ``test_wav``
            folders.

    Returns:
        True if every wav file was copied, False if a copy failed (the error
        is printed and processing stops at that file).
    """
    return (_save_split(train_data, data_path, 'train')
            and _save_split(test_data, data_path, 'test'))


def main(data_path):
    """Build the dataset tree from the CSVs and wav folders in ``data_path``.

    Args:
        data_path: Directory that must contain ``train_info.csv`` and
            ``test_info.csv``.

    Returns:
        -1 if either CSV file is missing, otherwise None.
    """
    info_paths = {}
    for split in ('train', 'test'):
        info_path = os.path.join(data_path, '{}_info.csv'.format(split))
        if not os.path.isfile(info_path):
            print('data_path is not correct!')
            return -1
        info_paths[split] = info_path

    train_data = prepare_data_for_model(info_paths['train'])
    test_data = prepare_data_for_model(info_paths['test'])

    print('number of train data: ' + str(len(train_data)))
    print('number of test data: ' + str(len(test_data)))

    if save_files(train_data, test_data, data_path):
        print('Data is created.')


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print('usage: {} <data_path>'.format(sys.argv[0]))
        sys.exit(2)
    # Propagate main()'s -1 so a bad data_path gives a non-zero exit status.
    sys.exit(main(sys.argv[1]))