gagan3012 committed on
Commit
da86775
·
1 Parent(s): f9cfbca
src/data/make_dataset.py CHANGED
@@ -8,7 +8,7 @@ def make_dataset(dataset='cnn_dailymail', split='train'):
8
  df = pd.DataFrame()
9
  df['article'] = dataset['article']
10
  df['highlights'] = dataset['highlights']
11
- df.to_csv('summarization/data/raw/{}.csv'.format(split))
12
 
13
 
14
  if __name__ == '__main__':
 
8
  df = pd.DataFrame()
9
  df['article'] = dataset['article']
10
  df['highlights'] = dataset['highlights']
11
+ df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/raw/{}.csv'.format(split))
12
 
13
 
14
  if __name__ == '__main__':
src/data/process_data.py CHANGED
@@ -2,11 +2,10 @@ import pandas as pd
2
 
3
 
4
  def process_data(split='train'):
5
- df = pd.DataFrame()
6
- dataset = pd.load_csv('summarization/data/raw/{}.csv'.format(split))
7
- df['article'] = dataset['article']
8
- df['highlights'] = dataset['highlights']
9
- df.to_csv('summarization/data/processed/{}.csv'.format(split))
10
 
11
 
12
  if __name__ == '__name__':
 
2
 
3
 
4
  def process_data(split='train'):
5
+ df = pd.read_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/raw/{}.csv'.format(split))
6
+ df.rename(columns={"article": "input_text", "highlights": "output_text"})
7
+ print(df.shape)
8
+ df.to_csv('C:/Users/gbhat/Documents/GitHub/summarization/data/processed/{}.csv'.format(split))
 
9
 
10
 
11
  if __name__ == '__name__':