# File size: 2,808 Bytes
# 10e72d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import json
import re
import csv
import shutil
import os
import sys

# Base directory for generated output: the working directory at import time.
# save_files() joins 'dataset/persian_data/...' onto this path.
main_path = os.getcwd()

def prepare_data_for_model(path):
    """Read a metadata CSV and return its rows with cleaned phoneme strings.

    The CSV must have a header row containing the columns ``phenome`` (the
    spelling used by the data files), ``seg_id`` and ``speaker_id``.

    Two clean-up steps are applied to every ``phenome`` value:

    * bracketed numeric tags such as ``[12]`` are removed;
    * a ``|`` surrounded by whitespace is collapsed to a single space.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A list with one ``[phoneme, seg_id, speaker_id]`` list per CSV row,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If one of the required columns is missing.
    """
    # Raw strings keep the backslashes for the regex engine and avoid the
    # invalid-escape-sequence warning; compiling once hoists the pattern
    # lookup out of the per-row loop.
    bracketed_number = re.compile(r"\[([0-9]+)\]")
    spaced_bar = re.compile(r"\s+\|\s+")

    data_lines = []
    # The context manager closes the file even if a row raises;
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            phoneme = row['phenome']
            utterance_name = row['seg_id']
            speaker_id = row['speaker_id']
            phoneme = bracketed_number.sub('', phoneme)
            phoneme = spaced_bar.sub(' ', phoneme)
            data_lines.append([phoneme, utterance_name, speaker_id])
    return data_lines
    

def _export_split(rows, data_path, split):
    """Copy the audio and write the transcript of every row of one split.

    Args:
        rows: Iterable of ``[phoneme, utterance_name, speaker_id]`` records.
        data_path: Directory containing the ``<split>_wav`` source folder.
        split: Split name, ``'train'`` or ``'test'``.

    Returns:
        True when every row was exported, False as soon as creating a
        directory or copying a wav file fails (the error is printed).
    """
    for row in rows:
        phoneme, utterance_name, speaker_id = row[0], row[1], row[2]
        original = os.path.join(
            data_path, '{0}_wav/{1}.wav'.format(split, utterance_name))
        # Audio and transcript share one path stem and differ only in suffix.
        stem = os.path.join(
            main_path,
            'dataset/persian_data/{0}_data/book-1/speaker-{1}/utterance-{2}'
            .format(split, speaker_id, utterance_name))
        target = stem + '.wav'
        try:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copyfile(original, target)
        except OSError as e:
            # A missing or unreadable wav aborts the whole export.
            print(e)
            return False

        with open(stem + '.txt', 'w') as fp:
            fp.write(phoneme)
    return True


def save_files(train_data, test_data, data_path):
    """Export the train and test splits into the dataset directory tree.

    For every record the source file ``<data_path>/<split>_wav/<seg_id>.wav``
    is copied to
    ``<main_path>/dataset/persian_data/<split>_data/book-1/speaker-<id>/utterance-<seg_id>.wav``
    and the phoneme string is written next to it with a ``.txt`` suffix.

    Args:
        train_data: Records of the training split, each indexable as
            ``[phoneme, utterance_name, speaker_id]``.
        test_data: Records of the test split, same layout.
        data_path: Directory holding the ``train_wav`` and ``test_wav``
            folders.

    Returns:
        True if both splits were exported completely, False if a directory
        could not be created or a wav file could not be copied. The train
        split is processed first; the test split is skipped when it fails.
    """
    # ``and`` short-circuits, so a failed train export stops before test.
    return (_export_split(train_data, data_path, 'train')
            and _export_split(test_data, data_path, 'test'))
    
def main(data_path):
    """Convert the dataset found under ``data_path`` into the model layout.

    ``data_path`` must contain ``train_info.csv`` and ``test_info.csv``.
    Both files are parsed, the record counts are printed, and the audio and
    transcripts are exported through ``save_files``.

    Args:
        data_path: Directory that holds the two info CSV files.

    Returns:
        -1 when either info file is missing (after printing a message);
        otherwise None.
    """
    info_paths = []
    # Train is checked before test; the first missing file ends the run.
    for info_name in ('train_info.csv', 'test_info.csv'):
        candidate = os.path.join(data_path, info_name)
        if not os.path.isfile(candidate):
            print('data_path is not correct!')
            return -1
        info_paths.append(candidate)

    train_csv, test_csv = info_paths
    train_data = prepare_data_for_model(train_csv)
    test_data = prepare_data_for_model(test_csv)

    train_count = str(len(train_data))
    test_count = str(len(test_data))
    print('number of train data: ' + train_count)
    print('number of test data: ' + test_count)

    # The success message is printed only when save_files reports success.
    if save_files(train_data, test_data, data_path):
        print('Data is created.')

if __name__ == "__main__":
    # Without a data directory argument, sys.argv[1] would raise a bare
    # IndexError; print the expected usage and exit non-zero instead.
    if len(sys.argv) < 2:
        sys.exit('usage: {} DATA_PATH'.format(sys.argv[0]))
    # main() returns -1 when an info CSV is missing and None otherwise;
    # passing that to sys.exit lets the calling shell see the failure.
    sys.exit(main(sys.argv[1]))