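"""Fine-tune bert-base-uncased for three-way political bias classification
(left/center/right) on the Article-Bias-Prediction dataset, random split."""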
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer
)
import torch

# Load all JSON articles
json_dir = "../Article-Bias-Prediction/data/jsons"
id_to_article = {}

print("Loading JSON articles...")
for filename in os.listdir(json_dir):
    if not filename.endswith(".json"):
        continue  # skip stray non-JSON files (e.g. .DS_Store)
    with open(os.path.join(json_dir, filename), 'r', encoding='utf-8') as f:
        data = json.load(f)
        if data.get("content"):  # keep only articles with non-empty content
            id_to_article[data["ID"]] = data
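
# Quick sanity check on how many usable articles were found
print(f"Loaded {len(id_to_article)} articles with non-empty content")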

# Load a TSV split (article ID, bias label) and join it against the JSON articles
def load_split(split_path):
    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
    articles = []
    for _, row in df.iterrows():
        article = id_to_article.get(row["id"])
        if article:  # skip IDs whose JSON was missing or had empty content
            articles.append({
                "text": article["content"],
                "label": int(row["label"])  # ensure a plain Python int
            })
    return Dataset.from_pandas(pd.DataFrame(articles))
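
# Note: the label is the three-way bias class from the Article-Bias-Prediction
# splits (0/1/2, i.e. left/center/right in this dataset), which is what
# num_labels=3 below assumes.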


print("Loading splits and building dataset...")
train_ds = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
val_ds = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
test_ds = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")

dataset = DatasetDict({
    "train": train_ds,
    "validation": val_ds,
    "test": test_ds
})
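
# Report split sizes as a quick sanity check before tokenizing
print(dataset)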

# Tokenize
print("Tokenizing...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    # Truncate to BERT's 512-token limit; longer articles lose their tail
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
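
# Optional alternative (sketch): drop padding="max_length" in tokenize_function
# and pad per batch instead, which is faster when article lengths vary a lot:
#
#   from transformers import DataCollatorWithPadding
#   data_collator = DataCollatorWithPadding(tokenizer)
#   # ...then pass data_collator=data_collator to the Trainer below.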

# Load model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded and moved to device:", device)

# Sanity check: the label should now be a torch tensor thanks to set_format above
print(tokenized_dataset["train"][0]["label"], type(tokenized_dataset["train"][0]["label"]))

# Training config
training_args = TrainingArguments(
    output_dir="./bert-allsides-bias-detector",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
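
# learning_rate is left at the TrainingArguments default (5e-5); values in the
# 2e-5 to 5e-5 range are typical for BERT fine-tuning.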

# Accuracy computed from the (logits, labels) pair the Trainer passes in
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    acc = (preds == labels).astype(float).mean().item()
    return {"accuracy": acc}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
print("Training...")
trainer.train()

# Evaluate
print("Evaluating on test set...")
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("Test Results:", results)