summarization / src /data /process_data.py
Dean
Merge commit '8d1f074f67512e839e8d290ade59fc8fe73f7c9c' into fix-mlflow
c6912f8
raw
history blame
516 Bytes
import pandas as pd
import yaml
import os
def process_data(split="train"):
    """Subsample one raw dataset split and write it to ``data/processed``.

    Reads ``params.yml`` from the current working directory, loads
    ``data/raw/<split>.csv``, relabels its three columns, draws a random
    sample whose size is the ``split`` fraction taken from the params file,
    and saves the result as ``data/processed/<split>.csv``.

    Args:
        split: Base name (without the ``.csv`` suffix) of the file to
            read from ``data/raw`` and write to ``data/processed``.

    Raises:
        FileNotFoundError: If ``params.yml`` or the raw CSV is missing.
        KeyError: If ``params.yml`` has no ``split`` entry.
        ValueError: If the raw CSV does not have exactly three columns.
    """
    with open("params.yml") as f:
        params = yaml.safe_load(f)

    df = pd.read_csv("data/raw/{}.csv".format(split))
    # Relabel positionally; this assignment fails unless there are exactly
    # three columns in the raw file.
    df.columns = ["Unnamed: 0", "input_text", "output_text"]

    # Fixed random_state keeps the sample reproducible between runs.
    # NOTE(review): replace=True samples with replacement, so rows can be
    # duplicated in the output -- confirm this is intended.
    df = df.sample(frac=params["split"], replace=True, random_state=1)

    # pandas does not create missing parent directories, so make sure the
    # output folder exists before writing.
    os.makedirs("data/processed", exist_ok=True)
    # NOTE(review): the DataFrame index is written as an extra leading
    # column here; downstream readers presumably expect that -- verify
    # before changing.
    df.to_csv("data/processed/{}.csv".format(split))
if __name__ == "__main__":
    # Run the preprocessing step once per dataset split, in a fixed order.
    for split_name in ("train", "test", "validation"):
        process_data(split=split_name)