import os
import json

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)


# Index every article JSON from the Article-Bias-Prediction repo by its ID
# so the split files can be joined back to the article text.
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}

print("Loading JSON articles...")
for filename in os.listdir(json_dir):
    with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
        data = json.load(f)
    # Keep only articles that actually have body text.
    if data.get("content"):
        id_to_article[data["ID"]] = data


def load_split(split_path):
    """Read a TSV of (article id, label) pairs and return a Dataset of
    {text, label} rows for the ids that have article content."""
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
    articles = []
    for _, row in df.iterrows():
        article = id_to_article.get(row["id"])
        if article:
            articles.append({
                "text": article["content"],
                "label": int(row["label"]),
            })
    return Dataset.from_pandas(pd.DataFrame(articles))


print("Loading splits and building dataset...")
train_ds = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
val_ds = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
test_ds = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
})
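# Quick sanity check: printing the DatasetDict shows the row count per split.
print(dataset)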
print("Tokenizing...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    # Pad and truncate each article to BERT's 512-token maximum input length.
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)


tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Expose torch tensors to the Trainer; its default data collator renames the
# "label" column to "labels" for the model's loss computation.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


# Three output classes for the dataset's left / center / right bias labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded and moved to device:", device)

# Sanity check that labels come back as integer tensors after formatting.
print(tokenized_dataset["train"][0]["label"], type(tokenized_dataset["train"][0]["label"]))


training_args = TrainingArguments(
    output_dir="./bert-allsides-bias-detector",
    # NOTE: newer transformers releases rename this argument to `eval_strategy`.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    # Reload the checkpoint with the best validation accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


def compute_metrics(eval_pred):
    # eval_pred is a (logits, labels) pair of numpy arrays.
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    acc = (preds == labels).astype(float).mean().item()
    return {"accuracy": acc}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


print("Training...")
trainer.train()


print("Evaluating on test set...")
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test Results:", results)