|
import os, json |
|
import pandas as pd |
|
from datasets import Dataset, DatasetDict |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer |
|
import torch |
|
|
|
|
|
# Directory of per-article JSON dumps from the Article-Bias-Prediction repo.
json_dir = "../Article-Bias-Prediction/data/jsons"

# Map article ID -> full article record (dict parsed from its JSON file).
id_to_article = {}
# Sort for a deterministic load order; filter so stray non-JSON entries
# (e.g. hidden files, READMEs) don't crash json.load.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(json_dir, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)
    # NOTE(review): assumes every article JSON carries an "ID" key — holds
    # for this dataset's dumps.
    id_to_article[data["ID"]] = data
|
|
|
|
|
def load_split(split_path):
    """Load one split file and return it as a HuggingFace ``Dataset``.

    Parameters
    ----------
    split_path : str
        Path to a headerless, tab-separated file with two columns:
        article id and bias label.

    Returns
    -------
    Dataset
        Rows with ``"text"`` (article content) and ``"label"`` columns.
        Ids missing from ``id_to_article``, or articles with empty/missing
        content, are silently dropped — mirroring the split files against
        whatever JSONs were actually present on disk.
    """
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
    # zip over the two columns avoids the per-row overhead of iterrows;
    # the walrus in the filter both looks up and binds the article record.
    articles = [
        {"text": article["content"], "label": label}
        for article_id, label in zip(df["id"], df["label"])
        if (article := id_to_article.get(article_id)) and article.get("content")
    ]
    return Dataset.from_pandas(pd.DataFrame(articles))
|
|
|
# The "random" split assigns articles to train/valid/test uniformly at random
# (as opposed to the repo's media-based split). Hoist the shared directory so
# the path appears once.
splits_dir = "../Article-Bias-Prediction/data/splits/random"

train = load_split(os.path.join(splits_dir, "train.tsv"))
valid = load_split(os.path.join(splits_dir, "valid.tsv"))
test = load_split(os.path.join(splits_dir, "test.tsv"))

# Note: the file on disk is "valid.tsv" but the conventional DatasetDict key
# (expected by Trainer tooling) is "validation".
dataset = DatasetDict({
    "train": train,
    "test": test,
    "validation": valid,
})
|
|