import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import torch
# Load all JSON articles
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}
print("Loading JSON articles...")
for filename in os.listdir(json_dir):
    with open(os.path.join(json_dir, filename), "r", encoding="utf-8") as f:
        data = json.load(f)
    if data.get("content"):  # only keep articles with non-empty content
        id_to_article[data["ID"]] = data
# Load TSV split and match to JSON
def load_split(split_path):
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
    articles = []
    for _, row in df.iterrows():
        article = id_to_article.get(row["id"])
        if article:
            articles.append({
                "text": article["content"],
                "label": int(row["label"]),  # labels must be ints for the model
            })
    return Dataset.from_pandas(pd.DataFrame(articles))
print("Loading splits and building dataset...")
train_ds = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
val_ds = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
test_ds = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")
dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds,
})
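# Optional sanity check: confirm each split actually matched articles
print({split: len(ds) for split, ds in dataset.items()})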
# Tokenize
print("Tokenizing...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded and moved to device:", device)
# Sanity check: the label should now be an integer tensor
print(tokenized_dataset["train"][0]["label"], type(tokenized_dataset["train"][0]["label"]))
# Training config
training_args = TrainingArguments(
    output_dir="./bert-allsides-bias-detector",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
# Accuracy function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    acc = (preds == labels).astype(float).mean().item()
    return {"accuracy": acc}
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Train
print("Training...")
trainer.train()
# Evaluate
print("Evaluating on test set...")
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test Results:", results) |