File size: 623 Bytes
9988244
7140c69
 
9988244
f9cfbca
9988244
3f8d76d
7140c69
 
 
3f8d76d
2466d7f
7140c69
dd353f6
fc96e58
3f8d76d
f9cfbca
 
2466d7f
f9cfbca
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import pandas as pd
import yaml
import os


def process_data(split='train'):
    """Subsample one raw dataset split and store it as a processed CSV.

    Reads ``data/raw/<split>.csv``, renames its columns to
    ``'Unnamed: 0'``, ``'input_text'`` and ``'output_text'``, draws a
    seeded random sample whose size is the fraction stored under the
    ``split`` key of ``params.yml``, writes the sample to
    ``data/processed/<split>.csv`` and then deletes the raw file.

    The raw file is removed only after the processed file has been written
    successfully, so a failed write never destroys the input data.

    Args:
        split: Base name (without extension) of the CSV file to process,
            e.g. ``'train'``.

    Raises:
        FileNotFoundError: If ``params.yml`` or the raw CSV does not exist.
        KeyError: If the loaded parameters have no ``split`` key.
        ValueError: If the raw CSV does not have exactly three columns.
    """
    raw_path = 'data/raw/{}.csv'.format(split)
    processed_path = 'data/processed/{}.csv'.format(split)

    with open("params.yml") as f:
        params = yaml.safe_load(f)

    df = pd.read_csv(raw_path)
    df.columns = ['Unnamed: 0', 'input_text', 'output_text']
    # NOTE(review): replace=True samples with replacement, so rows may be
    # duplicated in the output -- confirm this is intended.
    df = df.sample(frac=params['split'], replace=True, random_state=1)

    # Make sure the destination exists so the write cannot fail merely
    # because the output folder has not been created yet.
    os.makedirs(os.path.dirname(processed_path), exist_ok=True)
    df.to_csv(processed_path)

    # Delete the input last: if anything above raised, the raw data is
    # still on disk and the step can simply be re-run.
    try:
        os.remove(raw_path)
    except FileNotFoundError:
        # Already gone (e.g. removed concurrently); nothing left to clean up.
        pass


if __name__ == '__main__':
    # Process every dataset split in turn when run as a script.
    for split_name in ('train', 'test', 'validation'):
        process_data(split=split_name)