mjwagerman committed on
Commit d4515d7 · 1 Parent(s): 849684c

New model, trained on 36,000 articles from AllSides

.gitignore CHANGED
Binary files a/.gitignore and b/.gitignore differ
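(A .gitignore that diffs as binary usually means it was saved with a UTF-16 encoding, e.g. by a PowerShell redirect; re-saving it as UTF-8 would let future diffs render as text.)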
 
inference.py CHANGED
@@ -1,7 +1,7 @@
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
 
-model_path = "./bert-bias-detector/checkpoint-4894"
+model_path = "./training/bert-allsides-bias-detector/checkpoint-10494"
 
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 model = AutoModelForSequenceClassification.from_pretrained(model_path)
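
Note: the change above only repoints model_path at the new checkpoint; inference.py still stops after loading. For reference, a minimal sketch of how the loaded tokenizer and model could be used for a single prediction — the classify helper and the left/center/right label order are assumptions inferred from the training scripts below, not part of this commit:

import torch

def classify(text: str) -> str:
    # Tokenize with the same 512-token truncation used at training time.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Label order assumed from the old script's {"left": 0, "center": 1, "right": 2} mapping.
    return ["left", "center", "right"][logits.argmax(dim=-1).item()]

print(classify("Senate passes sweeping climate bill over unified opposition."))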
training/bert-allsides-bias-detector/checkpoint-10494/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25691d8b332dba45dc84710c03e463f422b3c1e44b3a38d0c404c04ed3abe24b
+size 437961724
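
As a rough sanity check on the pointer above (the same size recurs for all three checkpoints in this commit): 437,961,724 bytes is about what a float32 bert-base-uncased sequence-classification checkpoint should weigh — roughly 109.5M parameters at 4 bytes each, plus a small safetensors header:

params = 437_961_724 / 4   # ≈ 109.5M, in line with bert-base-uncased (~110M)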
training/bert-allsides-bias-detector/checkpoint-10494/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6048d49ebb2c9ab388c031266dde0f631c475cd4b841900ce7d5bcacc56d044c
+size 14244
training/bert-allsides-bias-detector/checkpoint-10494/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0ae8261d8f9389fb1049f4819320deb00f3601aa96e7909934aae9620f13394
+size 5304
training/bert-allsides-bias-detector/checkpoint-10494/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
training/bert-allsides-bias-detector/checkpoint-3498/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0edeb86ac4e270604b2d79e14f0beeac75009a87e228a55a98eefd5a581471bb
+size 437961724
training/bert-allsides-bias-detector/checkpoint-3498/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ee7949f35878e7083f3115f072a31251b534b3a057989dfe232049bc65f85d6
+size 14244
training/bert-allsides-bias-detector/checkpoint-3498/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0ae8261d8f9389fb1049f4819320deb00f3601aa96e7909934aae9620f13394
+size 5304
training/bert-allsides-bias-detector/checkpoint-3498/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
training/bert-allsides-bias-detector/checkpoint-6996/model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd843685c0ce4fed68465cfe74a2878492732feda513d8ae10fc682263712cd0
+size 437961724
training/bert-allsides-bias-detector/checkpoint-6996/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d88a900976ce3868a15a753bcd9b50f45d11ad95326f47bfcae45724f9fe073
+size 14244
training/bert-allsides-bias-detector/checkpoint-6996/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0ae8261d8f9389fb1049f4819320deb00f3601aa96e7909934aae9620f13394
+size 5304
training/bert-allsides-bias-detector/checkpoint-6996/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
training/berttrainedonallsides.py CHANGED
@@ -1,40 +1,71 @@
-from datasets import load_dataset
-from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
+import os
+import json
+import pandas as pd
+from datasets import Dataset, DatasetDict
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification,
+    TrainingArguments,
+    Trainer
+)
 import torch
 
-model_name = "bert-base-uncased"
-tokenizer = BertTokenizer.from_pretrained(model_name)
-model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)
+# Load all JSON articles
+json_dir = "../Article-Bias-Prediction/data/jsons"
+id_to_article = {}
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-model.to(device)
-print("Model loaded and moved to device:", device)
+print("Loading JSON articles...")
+for filename in os.listdir(json_dir):
+    with open(os.path.join(json_dir, filename), 'r', encoding='utf-8') as f:
+        data = json.load(f)
+        if data.get("content"):  # only use if content is not empty
+            id_to_article[data["ID"]] = data
 
-# Load the dataset
-dataset = load_dataset(
-    "csv",
-    data_files="Qbias/allsides_balanced_news_headlines-texts.csv"
-)["train"]
+# Load TSV split and match to JSON
+def load_split(split_path):
+    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
+    articles = []
+    for _, row in df.iterrows():
+        article = id_to_article.get(row["id"])
+        if article:
+            articles.append({
+                "text": article["content"],
+                "label": int(row["label"])  # <-- convert label to int
+            })
+    return Dataset.from_pandas(pd.DataFrame(articles))
 
-# Map string labels to integers
-def label_map(example):
-    mapping = {"left": 0, "center": 1, "right": 2}
-    example["label"] = mapping[example["bias_rating"].strip().lower()]
-    return example
+
+print("Loading splits and building dataset...")
+train_ds = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
+val_ds = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
+test_ds = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")
 
-dataset = dataset.map(label_map)
+dataset = DatasetDict({
+    "train": train_ds,
+    "validation": val_ds,
+    "test": test_ds
+})
 
-# Train/test split
-dataset = dataset.train_test_split(test_size=0.1)
+# Tokenize
+print("Tokenizing...")
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 
-# Tokenization
 def tokenize_function(example):
     return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)
 
 tokenized_dataset = dataset.map(tokenize_function, batched=True)
 tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
 
-# Training setup
+# Load model
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+print("Model loaded and moved to device:", device)
+
+print(tokenized_dataset["train"][0]["label"], type(tokenized_dataset["train"][0]["label"]))
+
+# Training config
 training_args = TrainingArguments(
     output_dir="./bert-allsides-bias-detector",
     evaluation_strategy="epoch",
@@ -44,16 +75,33 @@ training_args = TrainingArguments(
     num_train_epochs=3,
     weight_decay=0.01,
     logging_dir="./logs",
-    logging_steps=500,
+    logging_steps=100,
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",
 )
 
+# Accuracy function
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    preds = predictions.argmax(axis=1)
+    acc = (preds == labels).astype(float).mean().item()
+    return {"accuracy": acc}
+
+# Trainer
 trainer = Trainer(
     model=model,
    args=training_args,
     train_dataset=tokenized_dataset["train"],
-    eval_dataset=tokenized_dataset["test"],
+    eval_dataset=tokenized_dataset["validation"],
     tokenizer=tokenizer,
+    compute_metrics=compute_metrics
 )
 
 # Train
-#trainer.train()
+print("Training...")
+trainer.train()
+
+# Evaluate
+print("Evaluating on test set...")
+results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
+print("Test Results:", results)
training/cleanallsidesdata.py ADDED
@@ -0,0 +1,36 @@
+import os, json
+import pandas as pd
+from datasets import Dataset, DatasetDict
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
+import torch
+
+# load json into a dictionary
+json_dir = "../Article-Bias-Prediction/data/jsons"
+id_to_article = {}
+for filename in os.listdir(json_dir):
+    with open(os.path.join(json_dir, filename), 'r', encoding='utf-8') as f:
+        data = json.load(f)
+        id_to_article[data["ID"]] = data
+
+# load TSV splits
+def load_split(split_path):
+    df = pd.read_csv(split_path, sep="\t", header=None, names=["id", "label"])
+    articles = []
+    for _, row in df.iterrows():
+        article = id_to_article.get(row["id"])
+        if article and article["content"]:  # Skip empty ones
+            articles.append({
+                "text": article["content"],
+                "label": row["label"]
+            })
+    return Dataset.from_pandas(pd.DataFrame(articles))
+
+train = load_split("../Article-Bias-Prediction/data/splits/random/train.tsv")
+valid = load_split("../Article-Bias-Prediction/data/splits/random/valid.tsv")
+test = load_split("../Article-Bias-Prediction/data/splits/random/test.tsv")
+
+dataset = DatasetDict({
+    "train": train,
+    "test": test,
+    "validation": valid
+})
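
This new script duplicates the loading logic in berttrainedonallsides.py but stops after building the DatasetDict (and, unlike the training script, keeps label as the raw TSV value rather than casting it with int(row["label"])). If the intent is to reuse the cleaned splits, one way to persist them with the datasets API — the allsides_cleaned path is hypothetical:

# Persist the cleaned splits so training runs can skip re-reading ~36k JSON files.
dataset.save_to_disk("allsides_cleaned")

# Later, e.g. in berttrainedonallsides.py:
from datasets import load_from_disk
dataset = load_from_disk("allsides_cleaned")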