File size: 776 Bytes
3f8d76d
9bbcc22
322ebac
f6b4508
5d9f40a
f6b4508
9bbcc22
c6e4955
322ebac
c6e4955
 
 
322ebac
c6e4955
 
 
c015c4c
0ee5810
c6e4955
d5a6d18
3f8d76d
5d9f40a
c6e4955
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import yaml
from datasets import load_dataset
import pandas as pd
import os
import pprint


def make_dataset(
    dataset: str = "cnn_dailymail",
    split: str = "train",
    *,
    version: str = "3.0.0",
    output_dir: str = "data/raw",
) -> None:
    """Download one split of a summarisation dataset and save it as CSV.

    The split is loaded with ``load_dataset`` and its ``article`` and
    ``highlights`` columns are written to ``<output_dir>/<split>.csv``.

    Args:
        dataset: Name of the dataset passed to ``load_dataset``.
        split: Split to load; also used as the CSV file stem.
        version: Dataset configuration name passed as the second
            positional argument to ``load_dataset``.
        output_dir: Directory the CSV is written to. It is created,
            including missing parents, when it does not exist.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test and is a no-op when the directory is present.
    os.makedirs(output_dir, exist_ok=True)

    # Keep the loaded data under its own name instead of rebinding the
    # `dataset` argument, so the argument still holds the dataset name.
    records = load_dataset(dataset, version, split=split)

    df = pd.DataFrame()
    df["article"] = records["article"]
    df["highlights"] = records["highlights"]

    # to_csv keeps its defaults, so the row index is still written as the
    # first column, exactly as before.
    df.to_csv(os.path.join(output_dir, "{}.csv".format(split)))


if __name__ == "__main__":
    # Read the run configuration; its "data" entry names the dataset.
    with open("data_params.yml") as stream:
        params = yaml.safe_load(stream)

    # Echo the loaded configuration so the run is self-describing.
    pprint.pprint(params)

    # Export every split, in the same order as before.
    for split_name in ("train", "test", "validation"):
        make_dataset(dataset=params["data"], split=split_name)