File size: 1,987 Bytes
f831146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
I didn't extract features from the test set of LibriSpeech; instead, the features
extracted from train-100 were split into a train set and a test set, stored in two
separate folders. This was again done so that they can be read easily using
torchvision's DatasetFolder.
"""

import os
import shutil
from pathlib import Path

import numpy as np


def assert_out_dir_exists(root, index):
    """Make sure the sub-directory ``root/index`` exists and return its path.

    Prints one line saying whether the directory was created by this call or
    was already present.

    Args:
        root: Parent directory path, as a string.
        index: Name of the sub-directory. It is converted with ``str()``, so
            both integers and strings are accepted.

    Returns:
        The path ``root + '/' + str(index)`` as a string.
    """
    dir_ = root + '/' + str(index)

    try:
        # EAFP: create first and react to "already there" afterwards, so there
        # is no window between an existence check and the creation in which
        # another process could create the same directory and make us crash.
        os.makedirs(dir_)
    except FileExistsError:
        print('dir {} already exists'.format(dir_))
    else:
        print('created dir {}'.format(dir_))

    return dir_


def _copy_files(paths, out_dir):
    """Copy every file in ``paths`` into ``out_dir``, keeping only its file name."""
    for src in paths:
        # Path.name is separator-agnostic, unlike splitting the string on '/'.
        dest = os.path.join(out_dir, src.name)
        print('copying file {} to {}'.format(src, dest))
        shutil.copyfile(str(src), dest)


def train_test_split(root, test_size=0.05):
    """Randomly split the per-label ``.npy`` files under ``root`` into two copies.

    For every sub-directory ``root/<label>``, all ``.npy`` files found
    recursively below it are copied into either ``<root>_train/<label>/`` or
    ``<root>_test/<label>/``. The source files are left in place. Each file is
    assigned independently (test with probability ``test_size``), so the
    realised test fraction only approximates ``test_size``.

    The draws come from NumPy's global random state (``np.random.choice``).
    Labels and files are processed in sorted order so that seeding that state
    yields the same split on every run.

    Note: the destination is flat. Files with the same name in different
    nested sub-directories of one label end up at the same destination path,
    and the later copy overwrites the earlier one.

    Args:
        root: Source directory containing one sub-directory per label.
        test_size: Probability, between 0 and 1, that a file goes to the
            test split.

    Raises:
        FileNotFoundError: If ``root`` does not exist (raised before any
            output directory is created).
        FileExistsError: If ``<root>_train`` or ``<root>_test`` already
            exists. This is intentional: copying a fresh random split on top
            of an old one could place the same file in both splits.
    """
    root = os.fspath(root)
    # With a trailing slash, root + '_train' would be "<root>/_train", i.e.
    # inside the source tree, and would then be picked up as a label.
    root = root.rstrip('/') or root

    # List the labels first so that a missing root fails before any output
    # directory has been created.
    labels = sorted(os.listdir(root))

    # make two folders, train and test
    train_dir = root + '_train'
    test_dir = root + '_test'

    os.makedirs(train_dir)
    os.makedirs(test_dir)

    for label in labels:
        label_dir = os.path.join(root, label)
        if not os.path.isdir(label_dir):
            # Stray files next to the label folders are not classes; skipping
            # them avoids creating empty class directories in the output.
            continue

        files = sorted(
            path for path in Path(label_dir).glob('**/*.npy') if path.is_file()
        )

        train_out = assert_out_dir_exists(train_dir, label)
        test_out = assert_out_dir_exists(test_dir, label)

        # One independent draw per file: 0 -> train, 1 -> test.
        choices = np.random.choice(
            [0, 1], size=len(files), p=(1 - test_size, test_size)
        )

        _copy_files([f for f, pick in zip(files, choices) if pick == 0], train_out)
        _copy_files([f for f, pick in zip(files, choices) if pick == 1], test_out)

        print('done for label: {}'.format(label))

    print('All done')


if __name__ == '__main__':
    # Script entry point: split the features stored under the relative
    # directory 'fbanks' (resolved against the current working directory)
    # using the default test_size.
    train_test_split('fbanks')