gagan3012 committed on
Commit
9988244
·
1 Parent(s): c420ca5
dvc.yaml CHANGED
@@ -3,6 +3,7 @@ stages:
3
  cmd: python src/models/train_model.py
4
  deps:
5
  - data/processed/train.csv
 
6
  - src/models/train_model.py
7
  outs:
8
  - models:
 
3
  cmd: python src/models/train_model.py
4
  deps:
5
  - data/processed/train.csv
6
+ - data/processed/validation.csv
7
  - src/models/train_model.py
8
  outs:
9
  - models:
src/data/make_dataset.py CHANGED
@@ -6,9 +6,9 @@ def make_dataset(dataset='cnn_dailymail', split='train'):
6
  """make dataset for summarisation"""
7
  dataset = load_dataset(dataset, '3.0.0', split=split)
8
  df = pd.DataFrame()
9
- df['input_text'] = dataset['article']
10
- df['output_text'] = dataset['highlights']
11
- df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/processed/{}.csv'.format(split, split))
12
 
13
 
14
  if __name__ == '__main__':
 
6
  """make dataset for summarisation"""
7
  dataset = load_dataset(dataset, '3.0.0', split=split)
8
  df = pd.DataFrame()
9
+ df['article'] = dataset['article']
10
+ df['highlights'] = dataset['highlights']
11
+ df.to_csv('summarization/data/raw/{}.csv'.format(split))
12
 
13
 
14
  if __name__ == '__main__':
src/data/process_data.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ def process_data(split='train'):
4
+ df= pd.DataFrame()
5
+ dataset = pd.load_csv('summarization/data/raw/{}.csv'.format(split))
6
+ df['article'] = dataset['article']
7
+ df['highlights'] = dataset['highlights']
8
+ df.to_csv('summarization/data/processed/{}.csv'.format(split))