{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "# Installing dependencies\n", "## Please make a copy of this notebook." ], "id": "13156d7ed48b282" }, { "metadata": {}, "cell_type": "markdown", "source": [ "# Huggingface login\n", "You will require your personal token." ], "id": "432a756039e6399" }, { "metadata": {}, "cell_type": "code", "source": "# !huggingface-cli login", "id": "2e73da09a7c6171e", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "markdown", "source": "# Part 1: Load Data", "id": "c731d9c1ebb477dc" }, { "metadata": {}, "cell_type": "markdown", "source": "## Downloading the train and test dataset", "id": "14070f20b547688f" }, { "metadata": {}, "cell_type": "markdown", "source": "", "id": "b8920847b7cc378d" }, { "metadata": {}, "cell_type": "code", "source": [ "from datasets import load_dataset\n", "\n", "dataset_train = load_dataset(\"CISProject/FOX_NBC\", split=\"train\")\n", "dataset_test = load_dataset(\"CISProject/FOX_NBC\", split=\"test\")\n", "# dataset_test = load_dataset(\"CISProject/FOX_NBC\", split=\"test_data_random_subset\")\n" ], "id": "877c90c978d62b7d", "outputs": [], "execution_count": 32 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-16T08:10:32.645990Z", "start_time": "2024-12-16T08:10:32.636717Z" } }, "cell_type": "code", "source": [ "import numpy as np\n", "import torch\n", "import re\n", "from transformers import BertTokenizer\n", "from transformers import RobertaTokenizer\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from gensim.models import KeyedVectors\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "def preprocess_data(data,\n", " mode=\"train\",\n", " vectorizer=None,\n", " w2v_model=None,\n", " max_features=4096,\n", " max_seq_length=128,\n", " num_proc=4):\n", " if w2v_model is None:\n", " raise ValueError(\"w2v_model must be provided for Word2Vec embeddings.\")\n", "\n", " # tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n", " tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n", " # 1. Clean text once\n", " def clean_text(examples):\n", " import re\n", " cleaned = []\n", " for text in examples[\"title\"]:\n", " text = text.lower()\n", " text = re.sub(r'[^\\w\\s]', '', text)\n", " text = text.strip()\n", " cleaned.append(text)\n", " return {\"clean_title\": cleaned}\n", "\n", " data = data.map(clean_text, batched=True, num_proc=num_proc)\n", "\n", " # 2. Fit CountVectorizer on training data if needed\n", " if mode == \"train\" and vectorizer is None:\n", " # Collect all cleaned titles to fit\n", " all_titles = data[\"clean_title\"]\n", " #vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n", " vectorizer = TfidfVectorizer(max_features=max_features)\n", " vectorizer.fit(all_titles)\n", " print(\"vectorizer fitted on training data.\")\n", "\n", " # 3. Transform titles with vectorizer once\n", " def vectorize_batch(examples):\n", " import numpy as np\n", " freq = vectorizer.transform(examples[\"clean_title\"]).toarray().astype(np.float32)\n", " return {\"freq_inputs\": freq}\n", "\n", " data = data.map(vectorize_batch, batched=True, num_proc=num_proc)\n", "\n", " # 4. 
, { "metadata": { "ExecuteTime": { "end_time": "2024-12-16T08:10:32.645990Z", "start_time": "2024-12-16T08:10:32.636717Z" } }, "cell_type": "code", "source": [
 "import numpy as np\n",
 "import torch\n",
 "import re\n",
 "from transformers import BertTokenizer\n",
 "from transformers import RobertaTokenizer\n",
 "from sklearn.feature_extraction.text import CountVectorizer\n",
 "from gensim.models import KeyedVectors\n",
 "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 "\n",
 "def preprocess_data(data,\n",
 "                    mode=\"train\",\n",
 "                    vectorizer=None,\n",
 "                    w2v_model=None,\n",
 "                    max_features=4096,\n",
 "                    max_seq_length=128,\n",
 "                    num_proc=4):\n",
 "    if w2v_model is None:\n",
 "        raise ValueError(\"w2v_model must be provided for Word2Vec embeddings.\")\n",
 "\n",
 "    # tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n",
 "    tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n",
 "\n",
 "    # 1. Clean the titles once\n",
 "    def clean_text(examples):\n",
 "        import re\n",
 "        cleaned = []\n",
 "        for text in examples[\"title\"]:\n",
 "            text = text.lower()\n",
 "            text = re.sub(r'[^\\w\\s]', '', text)\n",
 "            text = text.strip()\n",
 "            cleaned.append(text)\n",
 "        return {\"clean_title\": cleaned}\n",
 "\n",
 "    data = data.map(clean_text, batched=True, num_proc=num_proc)\n",
 "\n",
 "    # 2. Fit the TF-IDF vectorizer on the training data if needed\n",
 "    if mode == \"train\" and vectorizer is None:\n",
 "        # Collect all cleaned titles to fit\n",
 "        all_titles = data[\"clean_title\"]\n",
 "        # vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, 2))\n",
 "        vectorizer = TfidfVectorizer(max_features=max_features)\n",
 "        vectorizer.fit(all_titles)\n",
 "        print(\"vectorizer fitted on training data.\")\n",
 "\n",
 "    # 3. Transform titles with the fitted vectorizer once\n",
 "    def vectorize_batch(examples):\n",
 "        import numpy as np\n",
 "        freq = vectorizer.transform(examples[\"clean_title\"]).toarray().astype(np.float32)\n",
 "        return {\"freq_inputs\": freq}\n",
 "\n",
 "    data = data.map(vectorize_batch, batched=True, num_proc=num_proc)\n",
 "\n",
 "    # 4. Tokenize with the RoBERTa tokenizer once\n",
 "    def tokenize_batch(examples):\n",
 "        tokenized = tokenizer(\n",
 "            examples[\"title\"],\n",
 "            padding=\"max_length\",\n",
 "            truncation=True,\n",
 "            max_length=max_seq_length\n",
 "        )\n",
 "        return {\n",
 "            \"input_ids\": tokenized[\"input_ids\"],\n",
 "            \"attention_mask\": tokenized[\"attention_mask\"]\n",
 "        }\n",
 "\n",
 "    data = data.map(tokenize_batch, batched=True, num_proc=num_proc)\n",
 "\n",
 "    # 5. Convert titles into tokens for Word2Vec\n",
 "    def split_tokens(examples):\n",
 "        tokens_list = [t.split() for t in examples[\"clean_title\"]]\n",
 "        return {\"tokens\": tokens_list}\n",
 "\n",
 "    data = data.map(split_tokens, batched=True, num_proc=num_proc)\n",
 "\n",
 "    # 6. Build an embedding dictionary for all unique tokens (done once, before the embedding map)\n",
 "    unique_tokens = set()\n",
 "    for tokens in data[\"tokens\"]:\n",
 "        unique_tokens.update(tokens)\n",
 "\n",
 "    embedding_dim = w2v_model.vector_size\n",
 "    embedding_dict = {}\n",
 "    for tk in unique_tokens:\n",
 "        if tk in w2v_model:\n",
 "            embedding_dict[tk] = w2v_model[tk].astype(np.float32)\n",
 "        else:\n",
 "            embedding_dict[tk] = np.zeros((embedding_dim,), dtype=np.float32)\n",
 "\n",
 "    def w2v_embedding_batch(examples):\n",
 "        import numpy as np\n",
 "        batch_w2v = []\n",
 "        for tokens in examples[\"tokens\"]:\n",
 "            vectors = [embedding_dict[tk] for tk in tokens[:max_seq_length]]\n",
 "            if len(vectors) < max_seq_length:\n",
 "                vectors += [np.zeros((embedding_dim,), dtype=np.float32)] * (max_seq_length - len(vectors))\n",
 "            batch_w2v.append(vectors)\n",
 "        return {\"pos_inputs\": batch_w2v}\n",
 "\n",
 "    data = data.map(w2v_embedding_batch, batched=True, batch_size=32, num_proc=num_proc)\n",
 "\n",
 "    # 7. Create labels (1 = fox, 0 = nbc)\n",
 "    def make_labels(examples):\n",
 "        labels = [1.0 if agency == \"fox\" else 0.0 for agency in examples[\"news\"]]\n",
 "        return {\"labels\": labels}\n",
 "\n",
 "    data = data.map(make_labels, batched=True, num_proc=num_proc)\n",
 "\n",
 "    # 8. Convert freq_inputs and pos_inputs to torch tensors in a final map step\n",
 "    def to_tensors(examples):\n",
 "        import torch\n",
 "\n",
 "        freq_inputs = torch.tensor(examples[\"freq_inputs\"], dtype=torch.float32)\n",
 "        input_ids = torch.tensor(examples[\"input_ids\"])\n",
 "        attention_mask = torch.tensor(examples[\"attention_mask\"])\n",
 "        pos_inputs = torch.tensor(examples[\"pos_inputs\"], dtype=torch.float32)\n",
 "        labels = torch.tensor(examples[\"labels\"], dtype=torch.long)\n",
 "\n",
 "        # seq_inputs shape: (batch_size, 2, seq_len)\n",
 "        seq_inputs = torch.stack([input_ids, attention_mask], dim=1)\n",
 "\n",
 "        return {\n",
 "            \"freq_inputs\": freq_inputs,\n",
 "            \"seq_inputs\": seq_inputs,\n",
 "            \"pos_inputs\": pos_inputs,\n",
 "            \"labels\": labels\n",
 "        }\n",
 "\n",
 "    # Apply the final conversion to tensors\n",
 "    processed_data = data.map(to_tensors, batched=True, num_proc=num_proc)\n",
 "\n",
 "    return processed_data, vectorizer\n"
], "id": "dc2ba675ce880d6d", "outputs": [], "execution_count": 33 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-16T08:11:25.586667Z", "start_time": "2024-12-16T08:10:32.651505Z" } }, "cell_type": "code", "source": [
 "from gensim.models import KeyedVectors\n",
 "w2v_model = KeyedVectors.load_word2vec_format(\"./GoogleNews-vectors-negative300.bin\", binary=True)\n",
 "\n",
 "dataset_train, vectorizer = preprocess_data(\n",
 " data=dataset_train,\n",
 " mode=\"train\",\n",
 " w2v_model=w2v_model,\n",
 " max_features=8192,\n",
 " max_seq_length=128\n",
 ")\n",
 "\n",
 "dataset_test, _ = preprocess_data(\n",
 " data=dataset_test,\n",
 " mode=\"test\",\n",
 " 
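# The vectorizer fitted on the training split is reused here; no refitting happens in \"test\" mode.\n", " 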
vectorizer=vectorizer,\n", " w2v_model=w2v_model,\n", " max_features=8192,\n", " max_seq_length=128\n", ")" ], "id": "158b99950fb22d1", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "vectorizer fitted on training data.\n" ] }, { "data": { "text/plain": [ "Map (num_proc=4): 0%| | 0/3044 [00:00 0.5).int()\n", " # Backward pass\n", " self.optimizer.zero_grad()\n", " loss.backward()\n", " self.optimizer.step()\n", " _, preds = torch.max(preds, dim=1)\n", " # Metrics\n", " epoch_loss += loss.item()\n", " total += y_train.size(0)\n", " # print(preds.shape)\n", " correct += (preds == y_train).sum().item()\n", "\n", " # Log epoch metrics\n", " print(f\"Train Loss: {epoch_loss / len(self.train_loader):.4f}\")\n", " print(f\"Train Accuracy: {correct / total:.4f}\")\n", "\n", " # Validation and Save Checkpoints\n", " if (epoch + 1) % self.val_freq == 0:\n", " self.val()\n", " if (epoch + 1) % self.save_freq == 0:\n", " self.save_checkpoint(epoch + 1)\n", "\n", " # Update learning rate\n", " self.iter_step += 1\n", " self.update_learning_rate()\n", "\n", "\n", " def val(self):\n", " self.model.eval()\n", " val_loss = 0.0\n", " correct = 0\n", " total = 0\n", "\n", " with torch.no_grad():\n", " for batch_inputs, labels in tqdm(self.val_loader, desc=\"Validation\", leave=False):\n", " freq_inputs = batch_inputs[\"freq_inputs\"].to(self.device)\n", " seq_inputs = batch_inputs[\"seq_inputs\"].to(self.device)\n", " pos_inputs = batch_inputs[\"pos_inputs\"].to(self.device)\n", " y_val = labels.to(self.device)\n", "\n", " preds = self.model({\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs})\n", " loss = self.criterion(preds, y_val)\n", " # preds = (torch.sigmoid(preds)>0.5).float()\n", " _, preds = torch.max(preds, dim=1)\n", " val_loss += loss.item()\n", " total += y_val.size(0)\n", " correct += (preds == y_val).sum().item()\n", " if self.val_loss is None or val_loss < self.val_loss:\n", " self.val_loss = val_loss\n", " self.save_checkpoint(\"best\")\n", " # Log validation metrics\n", " print(f\"Validation Loss: {val_loss / len(self.val_loader):.4f}\")\n", " print(f\"Validation Accuracy: {correct / total:.4f}\")\n", "\n", " def save_checkpoint(self, epoch):\n", " \"\"\"Save model in Hugging Face format.\"\"\"\n", " checkpoint_dir = os.path.join(self.save_path, f\"checkpoint_epoch_{epoch}\")\n", " if epoch ==\"best\":\n", " checkpoint_dir = os.path.join(self.save_path, \"best\")\n", " self.model.save_pretrained(checkpoint_dir)\n", " print(f\"Checkpoint saved at {checkpoint_dir}\")" ], "id": "7be377251b81a25d", "outputs": [], "execution_count": 38 }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-16T08:23:59.280460Z", "start_time": "2024-12-16T08:11:25.911156Z" } }, "cell_type": "code", "source": [ "from torch.utils.data import DataLoader\n", "\n", "# Define a collate function to handle the batched data\n", "def collate_fn(batch):\n", " freq_inputs = torch.stack([torch.tensor(item[\"freq_inputs\"]) for item in batch])\n", " seq_inputs = torch.stack([torch.tensor(item[\"seq_inputs\"]) for item in batch])\n", " pos_inputs = torch.stack([torch.tensor(item[\"pos_inputs\"]) for item in batch])\n", " labels = torch.tensor([torch.tensor(item[\"labels\"],dtype=torch.long) for item in batch])\n", " return {\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs}, labels\n", "\n", "train_loader = DataLoader(dataset_train, batch_size=config.train[\"batch_size\"], shuffle=True,collate_fn=collate_fn)\n", "test_loader = 
DataLoader(dataset_test, batch_size=config.train[\"batch_size\"], shuffle=False,collate_fn=collate_fn)\n", "trainer = Trainer(model, train_loader, test_loader, config)\n", "\n", "# Train the model\n", "trainer.train()\n", "# Save the final model in Hugging Face format\n", "final_save_path = os.path.join(config.base_exp_dir, \"checkpoints\")\n", "model.save_pretrained(final_save_path)\n", "print(f\"Final model saved at {final_save_path}\")\n" ], "id": "dd1749c306f148eb", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Epoch 1/10: 100%|██████████| 96/96 [01:00<00:00, 1.60it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.6920\n", "Train Accuracy: 0.5217\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Checkpoint saved at ./exp/fox_nbc/checkpoints\\best\n", "Validation Loss: 0.6914\n", "Validation Accuracy: 0.5322\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 2/10: 100%|██████████| 96/96 [01:00<00:00, 1.59it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.6137\n", "Train Accuracy: 0.6390\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Checkpoint saved at ./exp/fox_nbc/checkpoints\\best\n", "Validation Loss: 0.4313\n", "Validation Accuracy: 0.7937\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 3/10: 100%|██████████| 96/96 [01:00<00:00, 1.59it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.3599\n", "Train Accuracy: 0.8466\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Validation Loss: 0.4837\n", "Validation Accuracy: 0.7911\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 4/10: 100%|██████████| 96/96 [01:00<00:00, 1.59it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.2002\n", "Train Accuracy: 0.9208\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Checkpoint saved at ./exp/fox_nbc/checkpoints\\best\n", "Validation Loss: 0.3466\n", "Validation Accuracy: 0.8647\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 5/10: 100%|██████████| 96/96 [01:00<00:00, 1.57it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.0939\n", "Train Accuracy: 0.9655\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Validation Loss: 0.5384\n", "Validation Accuracy: 0.8371\n", "Checkpoint saved at ./exp/fox_nbc/checkpoints\\checkpoint_epoch_5\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 6/10: 100%|██████████| 96/96 [01:00<00:00, 1.58it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.0555\n", "Train Accuracy: 0.9800\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Validation Loss: 0.4440\n", "Validation Accuracy: 0.8857\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 7/10: 100%|██████████| 96/96 [01:00<00:00, 1.58it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.0422\n", "Train Accuracy: 0.9855\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] 
}, { "name": "stdout", "output_type": "stream", "text": [ "Validation Loss: 0.5519\n", "Validation Accuracy: 0.8739\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 8/10: 100%|██████████| 96/96 [01:01<00:00, 1.56it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.0163\n", "Train Accuracy: 0.9944\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Validation Loss: 0.6396\n", "Validation Accuracy: 0.8581\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 9/10: 100%|██████████| 96/96 [01:01<00:00, 1.57it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.0125\n", "Train Accuracy: 0.9974\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Validation Loss: 0.4890\n", "Validation Accuracy: 0.8857\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Epoch 10/10: 100%|██████████| 96/96 [01:17<00:00, 1.24it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Train Loss: 0.0039\n", "Train Accuracy: 0.9997\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Validation Loss: 0.5024\n", "Validation Accuracy: 0.8883\n", "Checkpoint saved at ./exp/fox_nbc/checkpoints\\checkpoint_epoch_10\n", "Final model saved at ./exp/fox_nbc/checkpoints\n" ] } ], "execution_count": 39 }, { "metadata": {}, "cell_type": "markdown", "source": "## Evaluate Model", "id": "4af000263dd99bca" }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-16T08:24:40.129577Z", "start_time": "2024-12-16T08:24:26.080416Z" } }, "cell_type": "code", "source": [ "from transformers import AutoConfig, AutoModel\n", "from sklearn.metrics import accuracy_score, classification_report\n", "def load_last_checkpoint(checkpoint_dir):\n", " # Find all checkpoints in the directory\n", " checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith(\"checkpoint_epoch_\")]\n", " if not checkpoints:\n", " raise FileNotFoundError(f\"No checkpoints found in {checkpoint_dir}!\")\n", " # Sort checkpoints by epoch number\n", " checkpoints.sort(key=lambda x: int(x.split(\"_\")[-1]))\n", "\n", " # Load the last checkpoint\n", " last_checkpoint = os.path.join(checkpoint_dir, checkpoints[-1])\n", " # print(f\"Loading checkpoint from {last_checkpoint}\")\n", " # Load the best checkpoint\n", " #if os.path.join(checkpoint_dir, \"best\") is not None:\n", " # last_checkpoint = os.path.join(checkpoint_dir, \"best\")\n", " print(f\"Loading checkpoint from {last_checkpoint}\")\n", " # Load model and config\n", " config = AutoConfig.from_pretrained(last_checkpoint)\n", " model = AutoModel.from_pretrained(last_checkpoint, config=config)\n", " return model\n", "\n", "# Step 1: Define paths and setup\n", "checkpoint_dir = os.path.join(config.base_exp_dir, \"checkpoints\") # Directory where checkpoints are stored\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model = load_last_checkpoint(checkpoint_dir)\n", "model.to(device)\n", "\n", "# criterion = torch.nn.BCEWithLogitsLoss()\n", "\n", "criterion = torch.nn.CrossEntropyLoss()\n", "\n", "def evaluate_model(model, val_loader, criterion, device=\"cuda\"):\n", " model.eval()\n", " val_loss = 0.0\n", " correct = 0\n", " total = 0\n", " all_preds = []\n", " all_labels = []\n", " with torch.no_grad():\n", " for batch_inputs, 
labels in tqdm(val_loader, desc=\"Testing\", leave=False):\n", " freq_inputs = batch_inputs[\"freq_inputs\"].to(device)\n", " seq_inputs = batch_inputs[\"seq_inputs\"].to(device)\n", " pos_inputs = batch_inputs[\"pos_inputs\"].to(device)\n", " labels = labels.to(device)\n", "\n", " preds= model({\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs})\n", " loss = criterion(preds, labels)\n", " _, preds = torch.max(preds, dim=1)\n", " # preds = (torch.sigmoid(preds) > 0.5).float()\n", " val_loss += loss.item()\n", " total += labels.size(0)\n", " # preds = (torch.sigmoid(preds) > 0.5).int()\n", " correct += (preds == labels).sum().item()\n", " all_preds.extend(preds.cpu().numpy())\n", " all_labels.extend(labels.cpu().numpy())\n", "\n", " return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)\n", "\n", "\n", "accuracy, report = evaluate_model(model, test_loader, criterion)\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(report)\n" ], "id": "b75d2dc8a300cdf6", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", "Some weights of the model checkpoint at ./exp/fox_nbc/checkpoints\\checkpoint_epoch_10 were not used when initializing CustomModel: ['freq.lin0.parametrizations.weight.original0', 'freq.lin0.parametrizations.weight.original1', 'freq.lin1.parametrizations.weight.original0', 'freq.lin1.parametrizations.weight.original1', 'freq.lin2.parametrizations.weight.original0', 'freq.lin2.parametrizations.weight.original1', 'pos.lin0.parametrizations.weight.original0', 'pos.lin0.parametrizations.weight.original1', 'pos.lin1.parametrizations.weight.original0', 'pos.lin1.parametrizations.weight.original1', 'pos.lin2.parametrizations.weight.original0', 'pos.lin2.parametrizations.weight.original1']\n", "- This IS expected if you are initializing CustomModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing CustomModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading checkpoint from ./exp/fox_nbc/checkpoints\\checkpoint_epoch_10\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of CustomModel were not initialized from the model checkpoint at ./exp/fox_nbc/checkpoints\\checkpoint_epoch_10 and are newly initialized: ['freq.lin0.weight_g', 'freq.lin0.weight_v', 'freq.lin1.weight_g', 'freq.lin1.weight_v', 'freq.lin2.weight_g', 'freq.lin2.weight_v', 'pos.lin0.weight_g', 'pos.lin0.weight_v', 'pos.lin1.weight_g', 'pos.lin1.weight_v', 'pos.lin2.weight_g', 'pos.lin2.weight_v']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", " " ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8883\n", " precision recall f1-score support\n", "\n", " 0 0.86 0.91 0.88 356\n", " 1 0.92 0.87 0.89 405\n", "\n", " accuracy 0.89 761\n", " macro avg 0.89 0.89 0.89 761\n", "weighted avg 0.89 0.89 0.89 761\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r" ] } ], "execution_count": 41 }, { "metadata": {}, "cell_type": "markdown", "source": "# Part 3. Pushing the Model to the Hugging Face Hub", "id": "d2ffeb383ea00beb" }, { "metadata": { "ExecuteTime": { "end_time": "2024-12-16T08:25:19.716888Z", "start_time": "2024-12-16T08:24:56.833580Z" } }, "cell_type": "code", "source": "model.push_model(REPO_NAME)", "id": "f55c22b0a1b2a66b", "outputs": [ { "data": { "text/plain": [ "README.md: 0%| | 0.00/326 [00:00 0.5).float()\n", " val_loss += loss.item()\n", " total += labels.size(0)\n", " correct += (preds == labels).sum().item()\n", " all_preds.extend(preds.cpu().numpy())\n", " all_labels.extend(labels.cpu().numpy())\n", "\n", " return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)\n", "\n", "\n", "accuracy, report = evaluate_model(model, test_loader, criterion)\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(report)\n" ], "id": "cc313b4396f87690", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " " ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8883\n", " precision recall f1-score support\n", "\n", " 0 0.86 0.91 0.88 356\n", " 1 0.92 0.87 0.89 405\n", "\n", " accuracy 0.89 761\n", " macro avg 0.89 0.89 0.89 761\n", "weighted avg 0.89 0.89 0.89 761\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r" ] } ], "execution_count": 44 } ], "metadata": { "kernelspec": { "name": "python3", "language": "python", "display_name": "Python 3 (ipykernel)" } }, "nbformat": 4, "nbformat_minor": 5 }