summarization / src /data /make_dataset.py
gagan3012's picture
Pipeline updates
62828bf
raw
history blame
635 Bytes
from datasets import load_dataset
import pandas as pd
def make_dataset(dataset='cnn_dailymail', split='train', output_dir=None,
                 config='3.0.0'):
    """Export one split of a summarisation dataset to a CSV file.

    The split is loaded with ``datasets.load_dataset``; its ``article``
    column is written as ``input_text`` and its ``highlights`` column as
    ``output_text``. The file is saved as ``<output_dir>/<split>.csv``.

    Args:
        dataset: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load; also used as the CSV file stem.
        output_dir: Directory that receives the CSV file. When ``None``
            the original hard-coded location is used, so existing callers
            keep writing to the same place.
        config: Dataset configuration passed as the second argument to
            ``load_dataset`` (previously fixed to ``'3.0.0'``).

    Note:
        The output directory is not created here; it is expected to exist
        already.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import os

    if output_dir is None:
        # Legacy default, kept only for backward compatibility. It is an
        # absolute path on the original author's machine; pass
        # ``output_dir`` explicitly on any other setup.
        output_dir = (
            'C:/Users/gbhat/Documents/GitHub/summarization/data/processed'
        )

    # Use a separate name so the ``dataset`` argument is not shadowed by
    # the loaded object.
    records = load_dataset(dataset, config, split=split)

    df = pd.DataFrame()
    df['input_text'] = records['article']
    df['output_text'] = records['highlights']

    csv_path = os.path.join(output_dir, '{}.csv'.format(split))
    df.to_csv(csv_path)
if __name__ == '__main__':
    # Script entry point: export the three splits one after another, in
    # the order train, test, validation.
    for split_name in ('train', 'test', 'validation'):
        make_dataset(dataset='cnn_dailymail', split=split_name)