In [1]:

!pip install geopy > delete.txt
!pip install datasets > delete.txt
!pip install torch torchvision datasets > delete.txt
!pip install huggingface_hub > delete.txt
!pip install pyhocon > delete.txt
!pip install transformers > delete.txt
!pip install gensim > delete.txt
!rm delete.txt


'\n!pip install geopy > delete.txt\n!pip install datasets > delete.txt\n!pip install torch torchvision datasets > delete.txt\n!pip install huggingface_hub > delete.txt\n!pip install pyhocon > delete.txt\n!pip install transformers > delete.txt\n!pip install gensim > delete.txt\n!rm delete.txt\n'

In [2]:
# Log in to the Hugging Face Hub through the command-line client.
# NOTE(review): presumably needed to read the CISProject dataset/model
# repositories used below and to push models — confirm whether they are private.
!huggingface-cli login

In [3]:
from datasets import load_dataset

# "train" split of the CISProject/FOX_NBC dataset.
dataset_train = load_dataset("CISProject/FOX_NBC", split="train")

# "test" split, evaluated at the end of the notebook. To score a different
# test set, change the dataset id and/or split passed to this call.
dataset_test = load_dataset("CISProject/FOX_NBC", split="test")

In [4]:
import numpy as np
import torch
import re
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_data(data,
                    mode="train",
                    vectorizer=None,
                    w2v_model=None,
                    max_features=4096,
                    max_seq_length=128,
                    num_proc=4):
    """Turn raw headlines into the three feature sets consumed by the model.

    Every example's ``title`` is converted into:

    * ``freq_inputs`` - TF-IDF vector of the cleaned (lower-cased,
      punctuation-free) title;
    * ``seq_inputs``  - RoBERTa ``input_ids`` and ``attention_mask`` stacked on
      a new axis, so axis 1 of a batch selects ids (0) or mask (1);
    * ``pos_inputs``  - one Word2Vec vector per whitespace token of the cleaned
      title, truncated / zero-padded to ``max_seq_length`` rows.

    The existing ``labels`` column is carried through as integers.

    Args:
        data: Dataset as returned by ``load_dataset(..., split=...)``; must
            expose ``map`` and column access and contain ``title`` and
            ``labels`` columns.
        mode: ``"train"`` fits a new ``TfidfVectorizer`` when ``vectorizer`` is
            None. Any other value requires an already-fitted ``vectorizer``.
        vectorizer: Fitted vectorizer to reuse, or None to fit one (train only).
        w2v_model: Word-vector lookup supporting ``token in model``,
            ``model[token]`` and ``model.vector_size``. Required.
        max_features: Vocabulary cap handed to ``TfidfVectorizer``. Only used
            when a vectorizer is fitted inside this call.
        max_seq_length: Sequence length for both the RoBERTa tokenisation and
            the Word2Vec matrix.
        num_proc: Number of worker processes used by every ``map`` pass.

    Returns:
        Tuple ``(processed_data, vectorizer)`` where ``vectorizer`` is the one
        that was used (freshly fitted or passed in).

    Raises:
        ValueError: If ``w2v_model`` is None, or if ``vectorizer`` is None
            while ``mode`` is not ``"train"``.
    """
    if w2v_model is None:
        raise ValueError("w2v_model must be provided for Word2Vec embeddings.")
    # Fail fast: without this guard a missing vectorizer only surfaces later,
    # inside the map workers, as an attribute error on None.
    if vectorizer is None and mode != "train":
        raise ValueError(
            "vectorizer must be provided when mode is not 'train'; "
            "pass the vectorizer returned by the training call."
        )

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    # 1. Clean text once: lower-case, drop everything that is neither a word
    #    character nor whitespace, trim the ends.
    def clean_text(examples):
        # Imported locally, like in the other mapped helpers; presumably so the
        # function is self-contained inside map worker processes — confirm.
        import re
        cleaned = []
        for text in examples["title"]:
            text = text.lower()
            text = re.sub(r'[^\w\s]', '', text)
            text = text.strip()
            cleaned.append(text)
        return {"clean_title": cleaned}

    data = data.map(clean_text, batched=True, num_proc=num_proc)

    # 2. Fit the TF-IDF vectorizer on the cleaned titles (training only, and
    #    only when the caller did not supply one).
    if mode == "train" and vectorizer is None:
        vectorizer = TfidfVectorizer(max_features=max_features)
        vectorizer.fit(data["clean_title"])
        print("vectorizer fitted on training data.")

    # 3. Transform titles with the vectorizer once.
    def vectorize_batch(examples):
        import numpy as np
        freq = vectorizer.transform(examples["clean_title"]).toarray().astype(np.float32)
        return {"freq_inputs": freq}

    data = data.map(vectorize_batch, batched=True, num_proc=num_proc)

    # 4. Tokenize the raw (uncleaned) titles with the RoBERTa tokenizer once,
    #    padded / truncated to max_seq_length.
    def tokenize_batch(examples):
        tokenized = tokenizer(
            examples["title"],
            padding="max_length",
            truncation=True,
            max_length=max_seq_length
        )
        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"]
        }

    data = data.map(tokenize_batch, batched=True, num_proc=num_proc)

    # 5. Split the cleaned titles on whitespace for the Word2Vec lookup.
    def split_tokens(examples):
        tokens_list = [t.split() for t in examples["clean_title"]]
        return {"tokens": tokens_list}

    data = data.map(split_tokens, batched=True, num_proc=num_proc)

    # 6. Look every distinct token up exactly once; tokens missing from the
    #    Word2Vec model map to an all-zero vector, which doubles as padding.
    unique_tokens = set()
    for tokens in data["tokens"]:
        unique_tokens.update(tokens)

    embedding_dim = w2v_model.vector_size
    zero_vector = np.zeros((embedding_dim,), dtype=np.float32)
    embedding_dict = {}
    for tk in unique_tokens:
        if tk in w2v_model:
            embedding_dict[tk] = w2v_model[tk].astype(np.float32)
        else:
            embedding_dict[tk] = zero_vector

    def w2v_embedding_batch(examples):
        batch_w2v = []
        for tokens in examples["tokens"]:
            vectors = [embedding_dict[tk] for tk in tokens[:max_seq_length]]
            if len(vectors) < max_seq_length:
                vectors += [zero_vector] * (max_seq_length - len(vectors))
            batch_w2v.append(vectors)
        return {"pos_inputs": batch_w2v}

    # Smaller batches here: every example expands to a
    # (max_seq_length, embedding_dim) matrix.
    data = data.map(w2v_embedding_batch, batched=True, batch_size=32, num_proc=num_proc)

    # 7. Pass-through of the existing "labels" column (values are unchanged).
    def make_labels(examples):
        labels = examples["labels"]
        return {"labels": labels}

    data = data.map(make_labels, batched=True, num_proc=num_proc)

    # 8. Final pass: cast each feature to its target dtype and fuse ids + mask
    #    into a single "seq_inputs" field.
    # NOTE(review): the notebook's collate_fn converts every field to a tensor
    # again, which suggests map() hands values back as plain lists rather than
    # tensors — confirm.
    def to_tensors(examples):
        import torch

        freq_inputs = torch.tensor(examples["freq_inputs"], dtype=torch.float32)
        input_ids = torch.tensor(examples["input_ids"])
        attention_mask = torch.tensor(examples["attention_mask"])
        pos_inputs = torch.tensor(examples["pos_inputs"], dtype=torch.float32)
        labels = torch.tensor(examples["labels"], dtype=torch.long)

        # seq_inputs shape: (batch_size, 2, seq_len)
        seq_inputs = torch.stack([input_ids, attention_mask], dim=1)

        return {
            "freq_inputs": freq_inputs,
            "seq_inputs": seq_inputs,
            "pos_inputs": pos_inputs,
            "labels": labels
        }

    processed_data = data.map(to_tensors, batched=True, num_proc=num_proc)

    return processed_data, vectorizer


In [5]:
from gensim.models import KeyedVectors
# Load the pretrained word vectors from a local binary file in word2vec format.
w2v_model = KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True) #PATH TO PRETRAINED WORD2VEC MODEL

# Training pass: no vectorizer is supplied, so preprocess_data fits a
# TfidfVectorizer (capped at max_features) and returns it.
# NOTE: dataset_train is rebound to its processed version here.
# NOTE(review): CustomConfig's default tfidf_input_dim is 8145, not 8192 —
# presumably the size of the vocabulary actually fitted; confirm they match.
dataset_train,vectorizer = preprocess_data(
    data=dataset_train,
    mode="train",
    w2v_model=w2v_model,
    max_features=8192,
    max_seq_length=128
)

# Test pass: reuses the vectorizer fitted on the training split. max_features
# has no effect in this call because no new vectorizer is created.
dataset_test, _ = preprocess_data(
    data=dataset_test,
    mode="test",
    vectorizer=vectorizer,
    w2v_model=w2v_model,
    max_features=8192,
    max_seq_length=128
)

vectorizer fitted on training data.


Map (num_proc=4):   0%|          | 0/3044 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3044 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3044 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3044 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3044 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3044 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/761 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/761 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/761 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/761 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/761 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/761 [00:00<?, ? examples/s]

In [7]:
# TODO: import all packages necessary for your custom model
import pandas as pd
import os
from torch.utils.data import DataLoader
from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaConfig,RobertaForSequenceClassification, BertModel
from model.network import Classifier
from model.frequential import FreqNetwork
from model.sequential import SeqNetwork
from model.positional import PosNetwork

class CustomConfig(PretrainedConfig):
    """Configuration for the "headlineclassifier" model.

    Args:
        base_exp_dir: Experiment directory, stored as ``self.base_exp_dir``.
        train: Dict of training hyper-parameters. When omitted, a fresh copy
            of the defaults from ``_default_train`` is used.
        model: Dict with the ``"freq"``, ``"pos"`` and ``"cls"`` sub-network
            settings. When omitted, a fresh copy of the defaults from
            ``_default_model`` is used.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

    The dict defaults are built per instance rather than living in the
    signature, so mutating one config (e.g. ``config.train["batch_size"] = 64``)
    cannot leak into configs created afterwards.
    """

    model_type = "headlineclassifier"

    def __init__(
        self,
        base_exp_dir="./exp/fox_nbc/",
        train=None,
        model=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.base_exp_dir = base_exp_dir
        # Caller-supplied dicts are stored as given; only the defaults are rebuilt.
        self.train = self._default_train() if train is None else train
        self.model = self._default_model() if model is None else model

    @staticmethod
    def _default_train():
        """Return a new dict with the default training hyper-parameters."""
        return {
            "learning_rate": 2e-5,
            "learning_rate_alpha": 0.05,
            "end_iter": 10,
            "batch_size": 32,
            "warm_up_end": 2,
            "anneal_end": 5,
            "save_freq": 1,
            "val_freq": 1,
        }

    @staticmethod
    def _default_model():
        """Return a new dict with the default sub-network settings."""
        return {
            # Keyword arguments for FreqNetwork.
            "freq": {
                # NOTE(review): presumably the size of the fitted TF-IDF
                # vocabulary (the width of "freq_inputs") — confirm.
                "tfidf_input_dim": 8145,
                "tfidf_output_dim": 128,
                "tfidf_hidden_dim": 512,
                "n_layers": 2,
                "skip_in": [80],
                "weight_norm": True,
            },
            # Keyword arguments for PosNetwork.
            "pos": {
                "input_dim": 300,
                "output_dim": 128,
                "hidden_dim": 256,
                "n_layers": 2,
                "skip_in": [80],
                "weight_norm": True,
            },
            # Keyword arguments for Classifier; "combined_input" is also the
            # input width of CustomModel's final linear layer, so it has to
            # equal the width of the concatenated seq + freq + pos features.
            "cls": {
                "combined_input": 1024,
                "combined_dim": 128,
                "num_classes": 2,
                "n_layers": 2,
                "skip_in": [80],
                "weight_norm": True,
            },
        }

# TODO: define all parameters needed for your model, as well as calling the model itself
class CustomModel(PreTrainedModel):
    """Headline classifier fusing RoBERTa, frequency and positional features.

    ``forward`` concatenates the RoBERTa pooled output with the outputs of the
    frequency and positional sub-networks, applies dropout and maps the result
    to two logits through ``self.fc``.
    """

    config_class = CustomConfig

    def __init__(self, config):
        """Build all sub-modules from ``config.model``.

        Args:
            config: ``CustomConfig`` whose ``model`` dict holds the ``"freq"``,
                ``"pos"`` and ``"cls"`` keyword-argument sets.
        """
        super().__init__(config)
        self.conf = config
        net_settings = self.conf.model

        self.freq = FreqNetwork(**net_settings["freq"])
        self.pos = PosNetwork(**net_settings["pos"])
        # Instantiated (so its weights are part of the model) but not called
        # in forward(); the final projection below is self.fc.
        self.cls = Classifier(**net_settings["cls"])
        self.fc = nn.Linear(net_settings["cls"]["combined_input"], 2)
        self.seq = RobertaModel.from_pretrained("roberta-base")
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Compute class logits for one batch.

        Args:
            x: Dict with ``"freq_inputs"``, ``"seq_inputs"`` and
                ``"pos_inputs"``. ``seq_inputs`` is indexed as
                ``[:, 0, :]`` for token ids and ``[:, 1, :]`` for the
                attention mask.

        Returns:
            Output of ``self.fc``: one row of 2 logits per example.
        """
        tfidf_batch = x["freq_inputs"]
        token_batch = x["seq_inputs"]
        w2v_batch = x["pos_inputs"]

        encoded = self.seq(
            input_ids=token_batch[:, 0, :],
            attention_mask=token_batch[:, 1, :],
        )
        pooled = encoded.pooler_output
        tfidf_feature = self.freq(tfidf_batch)
        w2v_feature = self.pos(w2v_batch)

        # Feature axis is the concatenation of the three branches; its width
        # must match the in_features of self.fc (config "combined_input").
        fused = torch.cat((pooled, tfidf_feature, w2v_feature), dim=1)
        return self.fc(self.dropout(fused))

    def save_model(self, save_path):
        """Write the model to ``save_path`` via ``save_pretrained``."""
        self.save_pretrained(save_path)

    def push_model(self, repo_name):
        """Upload the model to the Hub repository ``repo_name`` via ``push_to_hub``."""
        self.push_to_hub(repo_name)

In [8]:
# Register the custom classes with the Auto* factories: the "headlineclassifier"
# model_type resolves to CustomConfig, and CustomConfig resolves to CustomModel.
AutoConfig.register("headlineclassifier", CustomConfig)
AutoModel.register(CustomConfig, CustomModel)
# Build a model from the default configuration.
# NOTE: `config` and `model` are both rebound by the from_pretrained calls in
# the next cell.
config = CustomConfig()
model = CustomModel(config)

  WeightNorm.apply(module, name, dim)
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import AutoModel, AutoConfig
# Load the published configuration, then the weights, from the
# CISProject/News-Headline-Classifier-Notebook repository.
config = AutoConfig.from_pretrained("CISProject/News-Headline-Classifier-Notebook")
# NOTE(review): the load log below reports checkpoint keys such as
# `cls.lin0.parametrizations.weight.original0` as unused when initializing
# CustomModel. This may mean the weight-normalised layers of the freq/pos/cls
# networks were not restored from the checkpoint — verify before trusting the
# reported metrics.
model = AutoModel.from_pretrained("CISProject/News-Headline-Classifier-Notebook",config = config)

model.safetensors:   0%|          | 0.00/518M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at CISProject/News-Headline-Classifier-Notebook were not used when initializing CustomModel: ['cls.lin0.parametrizations.weight.original0', 'cls.lin0.parametrizations.weight.original1', 'cls.lin1.parametrizations.weight.original0', 'cls.lin1.parametrizations.weight.original1', 'cls.lin2.parametrizations.weight.original0', 'cls.lin2.parametrizations.weight.original1', 'freq.lin0.parametrizations.weight.origin

In [12]:
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
# Define a collate function to handle the batched data
def collate_fn(batch):
    """Assemble a list of dataset rows into one batch of tensors.

    Args:
        batch: Sequence of rows, each a mapping with ``"freq_inputs"``,
            ``"seq_inputs"``, ``"pos_inputs"`` and ``"labels"``. Field values
            may be nested lists, numbers or tensors.

    Returns:
        Tuple ``(inputs, labels)``: ``inputs`` is a dict holding the three
        feature fields, each stacked along a new leading batch axis, and
        ``labels`` is the stacked ``torch.long`` label tensor.
    """
    def stack_field(name, dtype=None):
        # as_tensor converts lists and passes tensors through, so rows that
        # are already tensors do not trigger torch.tensor's copy-construct
        # warning.
        return torch.stack([torch.as_tensor(row[name], dtype=dtype) for row in batch])

    inputs = {
        "freq_inputs": stack_field("freq_inputs"),
        "seq_inputs": stack_field("seq_inputs"),
        "pos_inputs": stack_field("pos_inputs"),
    }
    labels = stack_field("labels", dtype=torch.long)
    return inputs, labels

# Batch size is read from the model configuration; only the training loader
# shuffles, and both loaders batch rows through collate_fn.
train_loader = DataLoader(dataset_train, batch_size=config.train["batch_size"], shuffle=True,collate_fn=collate_fn)
test_loader = DataLoader(dataset_test, batch_size=config.train["batch_size"], shuffle=False,collate_fn=collate_fn)


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss object handed to evaluate_model below.
criterion = torch.nn.CrossEntropyLoss()
def evaluate_model(model, val_loader, criterion, device=None):
    """Run the model over a loader and score its predictions.

    Args:
        model: Module called with a dict of ``"freq_inputs"``,
            ``"seq_inputs"`` and ``"pos_inputs"`` and returning per-class
            scores with classes on axis 1.
        val_loader: Iterable yielding ``(batch_inputs, labels)`` pairs, where
            ``batch_inputs`` is a dict with the three keys above.
        criterion: Accepted for interface compatibility; not evaluated, since
            only accuracy and the classification report are returned.
        device: Device the batches are moved to. When None, the device of the
            model's first parameter is used (CPU if the model has no
            parameters), so inputs always land where the model lives.

    Returns:
        Tuple ``(accuracy, report)`` from ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_inputs, labels in tqdm(val_loader, desc="Testing", leave=False):
            # Only the three known feature fields are forwarded to the model.
            model_inputs = {
                "freq_inputs": batch_inputs["freq_inputs"].to(device),
                "seq_inputs": batch_inputs["seq_inputs"].to(device),
                "pos_inputs": batch_inputs["pos_inputs"].to(device),
            }
            labels = labels.to(device)

            scores = model(model_inputs)
            # Predicted class = index of the highest score along axis 1.
            _, predicted = torch.max(scores, dim=1)

            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)


# Pass the device explicitly so the batches are moved to wherever
# `model.to(device)` placed the model, including the CPU fallback.
accuracy, report = evaluate_model(model, test_loader, criterion, device=device)
print(f"Accuracy: {accuracy:.4f}")
print(report)

                                                        

Accuracy: 0.8988
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       356
           1       0.90      0.91      0.91       405

    accuracy                           0.90       761
   macro avg       0.90      0.90      0.90       761
weighted avg       0.90      0.90      0.90       761



