import json
import os

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Load every article JSON file into a dictionary keyed by the article's "ID".
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}
# Sorted so that, should two files share an "ID", the one that wins is
# deterministic instead of depending on the OS directory order.
for filename in sorted(os.listdir(json_dir)):
    json_path = os.path.join(json_dir, filename)
    # Skip stray entries (hidden files, subdirectories, ...) that would
    # otherwise make json.load / open fail and abort the whole load.
    if not filename.endswith(".json") or not os.path.isfile(json_path):
        continue
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    id_to_article[data["ID"]] = data


# Load TSV splits.
def load_split(split_path: str) -> Dataset:
    """Build a ``Dataset`` of article texts and labels for one split file.

    The file at *split_path* is read as a tab-separated table with two
    columns, ``id`` and ``label``. Each id is looked up in the module-level
    ``id_to_article`` mapping; rows whose id is unknown, or whose article has
    a missing or empty ``"content"`` field, are skipped.

    Args:
        split_path: Path to the tab-separated split file.

    Returns:
        A ``Dataset`` built from one ``{"text": ..., "label": ...}`` record
        per kept row, in file order.
    """
    # NOTE(review): header=None makes pandas treat the first line as data.
    # If the split files start with a header row, that row is dropped below
    # (its id matches no article) but the label column may then be parsed as
    # strings rather than integers -- confirm the file format.
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])

    articles = []
    # Iterate the two columns directly; iterrows() would build a Series
    # for every row only to read these two values back out.
    for article_id, label in zip(df["id"], df["label"]):
        article = id_to_article.get(article_id)
        # .get() so an article without a "content" key is skipped like an
        # empty one instead of raising KeyError.
        if article and article.get("content"):
            articles.append({
                "text": article["content"],
                "label": label,
            })
    return Dataset.from_pandas(pd.DataFrame(articles))


train = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
valid = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
test = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")

dataset = DatasetDict({
    "train": train,
    "test": test,
    "validation": valid,
})