# bias-detector/training/cleanallsidesdata.py
# Last commit d4515d7 (mjwagerman): "new model, trained on 36000 articles from allsides"
import os, json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
# Load every article JSON file into a dictionary keyed by the article's "ID".
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}
# os.listdir() returns every directory entry in arbitrary order. Restrict to
# *.json so stray files or sub-directories do not crash json.load(), and sort
# so the load order is the same on every run.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
        data = json.load(f)
    id_to_article[data["ID"]] = data
# Load TSV splits
def load_split(split_path: str) -> "Dataset":
    """Build a ``Dataset`` of article text and labels for one split file.

    The file at ``split_path`` is read as a header-less, tab-separated table
    whose two columns are named ``id`` and ``label``. Each id is looked up in
    the module-level ``id_to_article`` mapping; rows whose id is unknown, or
    whose article has a missing or empty ``"content"``, are skipped.

    Args:
        split_path: Path to the tab-separated split file.

    Returns:
        A ``Dataset`` with a ``text`` column (the article's ``"content"``) and
        a ``label`` column (the value read from the split file), one row per
        article kept. The two columns are present even when no row is kept.
    """
    # NOTE(review): header=None makes pandas treat the first line as data. If
    # the split files start with a header row, that row is dropped below (its
    # id matches no article) but it would turn the whole label column into
    # strings -- confirm the files are header-less.
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])

    records = []
    # Iterate the two columns directly; iterrows() builds a Series per row.
    for article_id, label in zip(df["id"], df["label"]):
        article = id_to_article.get(article_id)
        # .get() so an article without a "content" key is skipped like an
        # empty one instead of raising KeyError.
        text = article.get("content") if article else None
        if text:
            records.append({"text": text, "label": label})

    # Name the columns explicitly so an empty split still has text/label.
    return Dataset.from_pandas(pd.DataFrame(records, columns=["text", "label"]))
# Load the three "random" splits (in train, valid, test order) and bundle them
# into a single DatasetDict under the keys "train", "test" and "validation".
train, valid, test = (
    load_split(split_file)
    for split_file in (
        "../Article-Bias-Prediction/data/splits/random/train.tsv",
        "../Article-Bias-Prediction/data/splits/random/valid.tsv",
        "../Article-Bias-Prediction/data/splits/random/test.tsv",
    )
)
dataset = DatasetDict(train=train, test=test, validation=valid)