import os
import json

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Load all JSON articles
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}

print("Loading JSON articles...")
for filename in os.listdir(json_dir):
    with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
        data = json.load(f)
        if data.get("content"):  # only use if content is not empty
            id_to_article[data["ID"]] = data

# Load a TSV split and match its ids to the JSON articles
def load_split(split_path):
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
    articles = []
    for _, row in df.iterrows():
        article = id_to_article.get(row["id"])
        if article:
            articles.append({
                "text": article["content"],
                "label": int(row["label"]),  # convert label to int
            })
    return Dataset.from_pandas(pd.DataFrame(articles))

print("Loading splits and building dataset...")
train_ds = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
val_ds = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
test_ds = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
})

# Tokenize
print("Tokenizing...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Load model (num_labels=3 for the dataset's three bias classes)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded and moved to device:", device)

# Sanity check: labels should come back as int64 torch tensors
print(tokenized_dataset["train"][0]["label"], type(tokenized_dataset["train"][0]["label"]))

# Training config
training_args = TrainingArguments(
    output_dir="./bert-allsides-bias-detector",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Accuracy metric
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    acc = (preds == labels).astype(float).mean().item()
    return {"accuracy": acc}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train
print("Training...")
trainer.train()

# Evaluate
print("Evaluating on test set...")
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test Results:", results)
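
# --- Optional follow-up: save the fine-tuned model and run a quick prediction ---
# A minimal sketch, not part of the training run above. The id-to-name mapping
# (0 = left, 1 = center, 2 = right) is an ASSUMPTION about the dataset's label
# scheme; verify it against the split files before relying on it.
trainer.save_model("./bert-allsides-bias-detector/final")
tokenizer.save_pretrained("./bert-allsides-bias-detector/final")

label_names = {0: "left", 1: "center", 2: "right"}  # assumed label scheme

def predict_bias(text):
    # Tokenize a single article the same way as training and return the
    # predicted label name.
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt").to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    return label_names[int(logits.argmax(dim=-1).item())]

print(predict_bias("Example article text to classify."))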