gagan3012 committed on
Commit
7140c69
·
1 Parent(s): f6b4508

split added

Browse files
Files changed (1) hide show
  1. src/data/process_data.py +8 -0
src/data/process_data.py CHANGED
@@ -1,10 +1,18 @@
1
  import pandas as pd
 
 
2
 
3
 
def process_data(split='train'):
    """Copy one raw CSV split into the processed directory with fixed headers.

    The file ``data/raw/<split>.csv`` is loaded, its columns are relabelled
    to ``'Unnamed: 0'``, ``'input_text'`` and ``'output_text'`` (in that
    order), and the result is written to ``data/processed/<split>.csv``.

    Args:
        split: Base name (without the ``.csv`` suffix) of the file to process.
    """
    source_template = 'data/raw/{}.csv'
    target_template = 'data/processed/{}.csv'

    frame = pd.read_csv(source_template.format(split))
    # Positional relabel: the raw file is expected to carry exactly three columns.
    frame.columns = ['Unnamed: 0', 'input_text', 'output_text']
    frame.to_csv(target_template.format(split))
9
 
10
 
 
1
  import pandas as pd
2
+ import yaml
3
+ import os
4
 
5
 
def process_data(split='train'):
    """Subsample one raw CSV split and write it to the processed directory.

    Reads ``data/raw/<split>.csv``, relabels its three columns to
    ``'Unnamed: 0'``, ``'input_text'`` and ``'output_text'``, draws a sample
    whose size is the fraction stored under the ``split`` key of
    ``params.yml``, writes the sample to ``data/processed/<split>.csv`` and
    then deletes the raw file.

    Args:
        split: Base name (without the ``.csv`` suffix) of the file to process.

    Raises:
        FileNotFoundError: If ``params.yml`` or the raw CSV cannot be opened.
        KeyError: If the loaded parameters have no ``split`` entry.
    """
    with open("params.yml") as f:
        params = yaml.safe_load(f)

    # Build the raw path once so the read and the clean-up below are
    # guaranteed to refer to the same file.
    raw_path = 'data/raw/{}.csv'.format(split)

    df = pd.read_csv(raw_path)
    df.columns = ['Unnamed: 0', 'input_text', 'output_text']
    # NOTE(review): replace=True samples with replacement, so rows may be
    # duplicated in the output -- confirm this is intended for a subset split.
    df = df.sample(frac=params['split'], replace=True, random_state=1)

    # Write the processed copy before touching the raw file, so a failed
    # write cannot leave us with neither file.
    df.to_csv('data/processed/{}.csv'.format(split))

    # The path must be formatted with `split`; an unformatted
    # "data/raw/{}.csv" literal would never match the real raw file.
    if os.path.exists(raw_path):
        os.remove(raw_path)
17
 
18