# Installing dependencies

## Please make a copy of this notebook.

In [None]:
!pip install geopy > delete.txt
!pip install datasets > delete.txt
!pip install torch torchvision datasets > delete.txt
!pip install huggingface_hub > delete.txt
!pip install pyhocon > delete.txt
!pip install transformers > delete.txt
!pip install gensim > delete.txt
!rm delete.txt

# Huggingface login
You will require your personal token.

In [44]:
!huggingface-cli login

# Part 1: Load Data

## Downloading the train and test dataset

In [45]:
from datasets import load_dataset

dataset_train = load_dataset("CISProject/FOX_NBC", split="train")
dataset_test = load_dataset("CISProject/FOX_NBC", split="test")
# dataset_test = load_dataset("CISProject/FOX_NBC", split="test_data_random_subset")


In [46]:
import numpy as np
import torch
import re
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_data(data,
 mode="train",
 vectorizer=None,
 w2v_model=None,
 max_features=4096,
 max_seq_length=128,
 num_proc=4):
 if w2v_model is None:
 raise ValueError("w2v_model must be provided for Word2Vec embeddings.")

 # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
 # 1. Clean text once
 def clean_text(examples):
 import re
 cleaned = []
 for text in examples["title"]:
 text = text.lower()
 text = re.sub(r'[^\w\s]', '', text)
 text = text.strip()
 cleaned.append(text)
 return {"clean_title": cleaned}

 data = data.map(clean_text, batched=True, num_proc=num_proc)

 # 2. Fit CountVectorizer on training data if needed
 if mode == "train" and vectorizer is None:
 # Collect all cleaned titles to fit
 all_titles = data["clean_title"]
 #vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))
 vectorizer = TfidfVectorizer(max_features=max_features)
 vectorizer.fit(all_titles)
 print("vectorizer fitted on training data.")

 # 3. Transform titles with vectorizer once
 def vectorize_batch(examples):
 import numpy as np
 freq = vectorizer.transform(examples["clean_title"]).toarray().astype(np.float32)
 return {"freq_inputs": freq}

 data = data.map(vectorize_batch, batched=True, num_proc=num_proc)

 # 4. Tokenize with BERT once
 def tokenize_batch(examples):
 tokenized = tokenizer(
 examples["title"],
 padding="max_length",
 truncation=True,
 max_length=max_seq_length
 )
 return {
 "input_ids": tokenized["input_ids"],
 "attention_mask": tokenized["attention_mask"]
 }

 data = data.map(tokenize_batch, batched=True, num_proc=num_proc)

 # 5. Convert titles into tokens for W2V
 def split_tokens(examples):
 tokens_list = [t.split() for t in examples["clean_title"]]
 return {"tokens": tokens_list}

 data = data.map(split_tokens, batched=True, num_proc=num_proc)

 # Build an embedding dictionary for all unique tokens (do this once before embedding map)
 unique_tokens = set()
 for tokens in data["tokens"]:
 unique_tokens.update(tokens)

 embedding_dim = w2v_model.vector_size
 embedding_dict = {}
 for tk in unique_tokens:
 if tk in w2v_model:
 embedding_dict[tk] = w2v_model[tk].astype(np.float32)
 else:
 embedding_dict[tk] = np.zeros((embedding_dim,), dtype=np.float32)

 def w2v_embedding_batch(examples):
 import numpy as np
 batch_w2v = []
 for tokens in examples["tokens"]:
 vectors = [embedding_dict[tk] for tk in tokens[:max_seq_length]]
 if len(vectors) < max_seq_length:
 vectors += [np.zeros((embedding_dim,), dtype=np.float32)] * (max_seq_length - len(vectors))
 batch_w2v.append(vectors)
 return {"pos_inputs": batch_w2v}


 data = data.map(w2v_embedding_batch, batched=True, batch_size=32, num_proc=num_proc)

 # 7. Create labels
 def make_labels(examples):
 labels = examples["labels"]
 return {"labels": labels}

 data = data.map(make_labels, batched=True, num_proc=num_proc)

 # Convert freq_inputs and pos_inputs to torch tensors in a final map step
 def to_tensors(examples):
 import torch

 freq_inputs = torch.tensor(examples["freq_inputs"], dtype=torch.float32)
 input_ids = torch.tensor(examples["input_ids"])
 attention_mask = torch.tensor(examples["attention_mask"])
 pos_inputs = torch.tensor(examples["pos_inputs"], dtype=torch.float32)
 labels = torch.tensor(examples["labels"],dtype=torch.long)

 # seq_inputs shape: (batch_size, 2, seq_len)
 seq_inputs = torch.stack([input_ids, attention_mask], dim=1)

 return {
 "freq_inputs": freq_inputs,
 "seq_inputs": seq_inputs,
 "pos_inputs": pos_inputs,
 "labels": labels
 }

 # Apply final conversion to tensor
 processed_data = data.map(to_tensors, batched=True, num_proc=num_proc)

 return processed_data, vectorizer


In [47]:
from gensim.models import KeyedVectors
w2v_model = KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)

dataset_train,vectorizer = preprocess_data(
 data=dataset_train,
 mode="train",
 w2v_model=w2v_model,
 max_features=8192,
 max_seq_length=128
)

dataset_test, _ = preprocess_data(
 data=dataset_test,
 mode="test",
 vectorizer=vectorizer,
 w2v_model=w2v_model,
 max_features=8192,
 max_seq_length=128
)

vectorizer fitted on training data.


In [48]:
print(dataset_train)
print(dataset_test)

Dataset({
 features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],
 num_rows: 3044
})
Dataset({
 features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],
 num_rows: 761
})


# Part 2: Model

## Defining the Custom Model

In [49]:
# TODO: import all packages necessary for your custom model
import pandas as pd
import os
from torch.utils.data import DataLoader
from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaConfig,RobertaForSequenceClassification, BertModel
from model.network import Classifier
from model.frequential import FreqNetwork
from model.sequential import SeqNetwork
from model.positional import PosNetwork

class CustomConfig(PretrainedConfig):
 model_type = "headlineclassifier"

 def __init__(
 self,
 base_exp_dir="./exp/fox_nbc/",
 # dataset={"data_dir": "./data/CASE_NAME/data.csv", "transform": True},
 train={
 "learning_rate": 2e-5,
 "learning_rate_alpha": 0.05,
 "end_iter": 10,
 "batch_size": 32,
 "warm_up_end": 2,
 "anneal_end": 5,
 "save_freq": 1,
 "val_freq": 1,
 },
 model={
 "freq": {
 "tfidf_input_dim": 8145,
 "tfidf_output_dim": 128,
 "tfidf_hidden_dim": 512,
 "n_layers": 2,
 "skip_in": [80],
 "weight_norm": True,
 },
 "pos": {
 "input_dim": 300,
 "output_dim": 128,
 "hidden_dim": 256,
 "n_layers": 2,
 "skip_in": [80],
 "weight_norm": True,
 },
 "cls": {
 "combined_input": 1024, #1024
 "combined_dim": 128,
 "num_classes": 2,
 "n_layers": 2,
 "skip_in": [80],
 "weight_norm": True,
 },
 },
 **kwargs,
 ):
 super().__init__(**kwargs)

 self.base_exp_dir = base_exp_dir
 # self.dataset = dataset
 self.train = train
 self.model = model

# TODO: define all parameters needed for your model, as well as calling the model itself
class CustomModel(PreTrainedModel):
 config_class = CustomConfig

 def __init__(self, config):
 super().__init__(config)
 self.conf = config
 self.freq = FreqNetwork(**self.conf.model["freq"])
 self.pos = PosNetwork(**self.conf.model["pos"])
 self.cls = Classifier(**self.conf.model["cls"])
 self.fc = nn.Linear(self.conf.model["cls"]["combined_input"],2)
 self.seq = RobertaModel.from_pretrained("roberta-base")
 # self.seq = BertModel.from_pretrained("bert-base-uncased")
 #for param in self.roberta.parameters():
 # param.requires_grad = False
 self.dropout = nn.Dropout(0.2)

 def forward(self, x):
 freq_inputs = x["freq_inputs"]
 seq_inputs = x["seq_inputs"]
 pos_inputs = x["pos_inputs"]
 seq_feature = self.seq(
 input_ids=seq_inputs[:,0,:],
 attention_mask=seq_inputs[:,1,:]
 ).pooler_output # last_hidden_state[:, 0, :]
 lstm_out, (h_n, c_n) = self.lstm(seq_feature)
 seq_feature = h_n[-1] # Use the last hidden state
 freq_feature = self.freq(freq_inputs) # Shape: (batch_size, 128)

 pos_feature = self.pos(pos_inputs) #Shape: (batch_size, 128)
 inputs = torch.cat((seq_feature, freq_feature, pos_feature), dim=1) # Shape: (batch_size, 384)
 # inputs = torch.cat((seq_feature, freq_feature), dim=1) # Shape: (batch_size,256)
 # inputs = seq_feature

 x = inputs
 x = self.dropout(x)
 outputs = self.fc(x)

 return outputs

 def save_model(self, save_path):
 """Save the model locally using the Hugging Face format."""
 self.save_pretrained(save_path)

 def push_model(self, repo_name):
 """Push the model to the Hugging Face Hub."""
 self.push_to_hub(repo_name)

In [50]:
from huggingface_hub import hf_hub_download

AutoConfig.register("headlineclassifier", CustomConfig)
AutoModel.register(CustomConfig, CustomModel)
config = CustomConfig()
model = CustomModel(config)

REPO_NAME = "CISProject/News-Headline-Classifier-Notebook" # TODO: PROVIDE A STRING TO YOUR REPO ON HUGGINGFACE

 WeightNorm.apply(module, name, dim)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
import torch
from tqdm import tqdm
import os


class Trainer:
 def __init__(self, model, train_loader, val_loader, config, device="cuda"):
 self.model = model.to(device)
 self.train_loader = train_loader
 self.val_loader = val_loader
 self.device = device
 self.conf = config

 self.end_iter = self.conf.train["end_iter"]
 self.save_freq = self.conf.train["save_freq"]
 self.val_freq = self.conf.train["val_freq"]

 self.batch_size = self.conf.train['batch_size']
 self.learning_rate = self.conf.train['learning_rate']
 self.learning_rate_alpha = self.conf.train['learning_rate_alpha']
 self.warm_up_end = self.conf.train['warm_up_end']
 self.anneal_end = self.conf.train['anneal_end']

 self.optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
 #self.criterion = torch.nn.BCEWithLogitsLoss()
 self.criterion = torch.nn.CrossEntropyLoss()
 self.save_path = os.path.join(self.conf.base_exp_dir, "checkpoints")
 os.makedirs(self.save_path, exist_ok=True)

 self.iter_step = 0

 self.val_loss = None

 def get_cos_anneal_ratio(self):
 if self.anneal_end == 0.0:
 return 1.0
 else:
 return np.min([1.0, self.iter_step / self.anneal_end])

 def update_learning_rate(self):
 if self.iter_step < self.warm_up_end:
 learning_factor = self.iter_step / self.warm_up_end
 else:
 alpha = self.learning_rate_alpha
 progress = (self.iter_step - self.warm_up_end) / (self.end_iter - self.warm_up_end)
 learning_factor = (np.cos(np.pi * progress) + 1.0) * 0.5 * (1 - alpha) + alpha

 for g in self.optimizer.param_groups:
 g['lr'] = self.learning_rate * learning_factor

 def train(self):
 for epoch in range(self.end_iter):
 self.update_learning_rate()
 self.model.train()
 epoch_loss = 0.0
 correct = 0
 total = 0

 for batch_inputs, labels in tqdm(self.train_loader, desc=f"Epoch {epoch + 1}/{self.end_iter}"):
 # Extract features

 freq_inputs = batch_inputs["freq_inputs"].to(self.device)
 seq_inputs = batch_inputs["seq_inputs"].to(self.device)
 pos_inputs = batch_inputs["pos_inputs"].to(self.device)
 # y_train = labels.to(self.device)[:,None]
 y_train = labels.to(self.device)

 # Forward pass
 preds = self.model({"freq_inputs": freq_inputs, "seq_inputs": seq_inputs, "pos_inputs": pos_inputs})
 loss = self.criterion(preds, y_train)

 # preds = (torch.sigmoid(preds) > 0.5).int()
 # Backward pass
 self.optimizer.zero_grad()
 loss.backward()
 self.optimizer.step()
 _, preds = torch.max(preds, dim=1)
 # Metrics
 epoch_loss += loss.item()
 total += y_train.size(0)
 # print(preds.shape)
 correct += (preds == y_train).sum().item()

 # Log epoch metrics
 print(f"Train Loss: {epoch_loss / len(self.train_loader):.4f}")
 print(f"Train Accuracy: {correct / total:.4f}")

 # Validation and Save Checkpoints
 if (epoch + 1) % self.val_freq == 0:
 self.val()
 if (epoch + 1) % self.save_freq == 0:
 self.save_checkpoint(epoch + 1)

 # Update learning rate
 self.iter_step += 1
 self.update_learning_rate()


 def val(self):
 self.model.eval()
 val_loss = 0.0
 correct = 0
 total = 0

 with torch.no_grad():
 for batch_inputs, labels in tqdm(self.val_loader, desc="Validation", leave=False):
 freq_inputs = batch_inputs["freq_inputs"].to(self.device)
 seq_inputs = batch_inputs["seq_inputs"].to(self.device)
 pos_inputs = batch_inputs["pos_inputs"].to(self.device)
 y_val = labels.to(self.device)

 preds = self.model({"freq_inputs": freq_inputs, "seq_inputs": seq_inputs, "pos_inputs": pos_inputs})
 loss = self.criterion(preds, y_val)
 # preds = (torch.sigmoid(preds)>0.5).float()
 _, preds = torch.max(preds, dim=1)
 val_loss += loss.item()
 total += y_val.size(0)
 correct += (preds == y_val).sum().item()
 if self.val_loss is None or val_loss < self.val_loss:
 self.val_loss = val_loss
 self.save_checkpoint("best")
 # Log validation metrics
 print(f"Validation Loss: {val_loss / len(self.val_loader):.4f}")
 print(f"Validation Accuracy: {correct / total:.4f}")

 def save_checkpoint(self, epoch):
 """Save model in Hugging Face format."""
 checkpoint_dir = os.path.join(self.save_path, f"checkpoint_epoch_{epoch}")
 if epoch =="best":
 checkpoint_dir = os.path.join(self.save_path, "best")
 self.model.save_pretrained(checkpoint_dir)
 print(f"Checkpoint saved at {checkpoint_dir}")

In [52]:
from torch.utils.data import DataLoader

# Define a collate function to handle the batched data
def collate_fn(batch):
 freq_inputs = torch.stack([torch.tensor(item["freq_inputs"]) for item in batch])
 seq_inputs = torch.stack([torch.tensor(item["seq_inputs"]) for item in batch])
 pos_inputs = torch.stack([torch.tensor(item["pos_inputs"]) for item in batch])
 labels = torch.tensor([torch.tensor(item["labels"],dtype=torch.long) for item in batch])
 return {"freq_inputs": freq_inputs, "seq_inputs": seq_inputs, "pos_inputs": pos_inputs}, labels

train_loader = DataLoader(dataset_train, batch_size=config.train["batch_size"], shuffle=True,collate_fn=collate_fn)
test_loader = DataLoader(dataset_test, batch_size=config.train["batch_size"], shuffle=False,collate_fn=collate_fn)
trainer = Trainer(model, train_loader, test_loader, config)

# Train the model
trainer.train()
# Save the final model in Hugging Face format
final_save_path = os.path.join(config.base_exp_dir, "checkpoints")
model.save_pretrained(final_save_path)
print(f"Final model saved at {final_save_path}")


Epoch 1/10: 0%| | 0/96 [00:00 16[0m trainer[38;5;241m.[39mtrain()
[0;32m 17[0m [38;5;66;03m# Save the final model in Hugging Face format[39;00m
[0;32m 18[0m final_save_path [38;5;241m=[39m os[38;5;241m.[39mpath[38;5;241m.[39mjoin(config[38;5;241m.[39mbase_exp_dir, [38;5;124m"[39m[38;5;124mcheckpoints[39m[38;5;124m"[39m)
Cell [1;32mIn[51], line 69[0m, in [0;36mTrainer.train[1;34m(self)[0m
[0;32m 66[0m y_train [38;5;241m=[39m labels[38;5;241m.[39mto([38;5;28mself[39m[38;5;241m.[39mdevice)
[0;32m 68[0m [38;5;66;03m# Forward pass[39;00m
[1;32m---> 69[0m preds [38;5;241m=[39m [38;5;28mself[39m[38;5;241m.[39mmodel({[38;5;124m"[39m[38;5;124mfreq_inputs[39m[38;5;124m"[39m: freq_inputs, [38;5;124m"[39m[38;5;124mseq_inputs[39m[38;5;124m"[39m: seq_inputs, [38;5;124m"[39m[38;5;124mpos_inputs[39m[38;5;124m"[39m: pos_inputs})
[0;32m 70[0m loss [38;5;241m=[39m [38;5;28mself[39m[38;5;241m.[39mcriterion(preds, y_train)
[0;32m 

## Evaluate Model

In [None]:
from transformers import AutoConfig, AutoModel
from sklearn.metrics import accuracy_score, classification_report
def load_last_checkpoint(checkpoint_dir):
 # Find all checkpoints in the directory
 checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith("checkpoint_epoch_")]
 if not checkpoints:
 raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}!")
 # Sort checkpoints by epoch number
 checkpoints.sort(key=lambda x: int(x.split("_")[-1]))

 # Load the last checkpoint
 last_checkpoint = os.path.join(checkpoint_dir, checkpoints[-1])
 # print(f"Loading checkpoint from {last_checkpoint}")
 # Load the best checkpoint
 if os.path.join(checkpoint_dir, "best") is not None:
 last_checkpoint = os.path.join(checkpoint_dir, "best")
 print(f"Loading checkpoint from {last_checkpoint}")
 # Load model and config
 config = AutoConfig.from_pretrained(last_checkpoint)
 model = AutoModel.from_pretrained(last_checkpoint, config=config)
 return model

# Step 1: Define paths and setup
checkpoint_dir = os.path.join(config.base_exp_dir, "checkpoints") # Directory where checkpoints are stored
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_last_checkpoint(checkpoint_dir)
model.to(device)

# criterion = torch.nn.BCEWithLogitsLoss()

criterion = torch.nn.CrossEntropyLoss()

def evaluate_model(model, val_loader, criterion, device="cuda"):
 model.eval()
 val_loss = 0.0
 correct = 0
 total = 0
 all_preds = []
 all_labels = []
 with torch.no_grad():
 for batch_inputs, labels in tqdm(val_loader, desc="Testing", leave=False):
 freq_inputs = batch_inputs["freq_inputs"].to(device)
 seq_inputs = batch_inputs["seq_inputs"].to(device)
 pos_inputs = batch_inputs["pos_inputs"].to(device)
 labels = labels.to(device)

 preds= model({"freq_inputs": freq_inputs, "seq_inputs": seq_inputs, "pos_inputs": pos_inputs})
 loss = criterion(preds, labels)
 _, preds = torch.max(preds, dim=1)
 # preds = (torch.sigmoid(preds) > 0.5).float()
 val_loss += loss.item()
 total += labels.size(0)
 # preds = (torch.sigmoid(preds) > 0.5).int()
 correct += (preds == labels).sum().item()
 all_preds.extend(preds.cpu().numpy())
 all_labels.extend(labels.cpu().numpy())

 return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)


accuracy, report = evaluate_model(model, test_loader, criterion)
print(f"Accuracy: {accuracy:.4f}")
print(report)


# Part 3. Pushing the Model to the Hugging Face

In [None]:
model.push_model(REPO_NAME)

### NOTE: You need to ensure that your Hugging Face token has both read and write access to your repository and Hugging Face organization.

In [None]:
# Load model directly
from transformers import AutoModel, AutoConfig
config = AutoConfig.from_pretrained("CISProject/News-Headline-Classifier-Notebook")
model = AutoModel.from_pretrained("CISProject/News-Headline-Classifier-Notebook",config = config)

In [None]:
from transformers import AutoConfig, AutoModel
from sklearn.metrics import accuracy_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#criterion = torch.nn.BCEWithLogitsLoss()

criterion = torch.nn.CrossEntropyLoss()
def evaluate_model(model, val_loader, criterion, device="cuda"):
 model.eval()
 val_loss = 0.0
 correct = 0
 total = 0
 all_preds = []
 all_labels = []
 with torch.no_grad():
 for batch_inputs, labels in tqdm(val_loader, desc="Testing", leave=False):
 freq_inputs = batch_inputs["freq_inputs"].to(device)
 seq_inputs = batch_inputs["seq_inputs"].to(device)
 pos_inputs = batch_inputs["pos_inputs"].to(device)
 labels = labels.to(device)

 preds = model({"freq_inputs": freq_inputs, "seq_inputs": seq_inputs, "pos_inputs": pos_inputs})
 loss = criterion(preds, labels)
 _, preds = torch.max(preds, dim=1)
 # preds = (torch.sigmoid(preds) > 0.5).float()
 val_loss += loss.item()
 total += labels.size(0)
 correct += (preds == labels).sum().item()
 all_preds.extend(preds.cpu().numpy())
 all_labels.extend(labels.cpu().numpy())

 return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)


accuracy, report = evaluate_model(model, test_loader, criterion)
print(f"Accuracy: {accuracy:.4f}")
print(report)
