File size: 1,268 Bytes
d4515d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import os, json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
# Load every article JSON file into a dictionary keyed by the article's "ID".
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}
# sorted() makes the load order deterministic: os.listdir returns entries in
# arbitrary order, and a later file overwrites an earlier one with the same
# "ID", so without sorting the surviving record could differ between runs.
for filename in sorted(os.listdir(json_dir)):
    path = os.path.join(json_dir, filename)
    # os.listdir yields every directory entry; skip subdirectories and
    # non-JSON files instead of crashing in open()/json.load().
    if not (filename.endswith(".json") and os.path.isfile(path)):
        continue
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    id_to_article[data["ID"]] = data
#load TSV splits
def load_split(split_path):
    """Build a ``Dataset`` of labelled article texts for one TSV split.

    The file at ``split_path`` is read as two header-less, tab-separated
    columns named ``id`` and ``label``. Each id is looked up in the
    module-level ``id_to_article`` mapping; rows whose article is unknown,
    or whose ``"content"`` is missing or empty, are skipped.

    Args:
        split_path: Path to the tab-separated split file.

    Returns:
        A ``Dataset`` with the columns ``"text"`` (the article content) and
        ``"label"`` (the value from the TSV), one row per retained article.
    """
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
    articles = []
    # Walk the two columns in lockstep; only these two values are needed, so
    # there is no reason to build a Series per row with iterrows().
    for article_id, label in zip(df["id"], df["label"]):
        article = id_to_article.get(article_id)
        # .get() so an article without a "content" key is skipped just like
        # one with empty content, rather than raising KeyError.
        if article and article.get("content"):
            articles.append({"text": article["content"], "label": label})
    # Name the columns explicitly so that a split with no retained rows still
    # produces a dataset that has "text" and "label" columns.
    return Dataset.from_pandas(pd.DataFrame(articles, columns=["text", "label"]))
# Load the three "random" splits in the order train, valid, test and bind
# each resulting dataset to its own module-level name.
train, valid, test = (
    load_split(tsv_path)
    for tsv_path in (
        "../Article-Bias-Prediction/data/splits/random/train.tsv",
        "../Article-Bias-Prediction/data/splits/random/valid.tsv",
        "../Article-Bias-Prediction/data/splits/random/test.tsv",
    )
)
# Bundle the splits under their conventional keys; note that the validation
# split is stored under "validation", not "valid".
dataset = DatasetDict({"train": train, "test": test, "validation": valid})
|