CISProject
/

News-Headline-Classifier-Notebook

Safetensors

headlineclassifier

Model card Files Files and versions Community

TUEN-YUE committed on Dec 16, 2024

Commit

d6cf153

verified ·

1 Parent(s): 9e5bba0

Upload train+test.ipynb

Browse files

Files changed (1) hide show

train+test.ipynb +868 -0

train+test.ipynb ADDED Viewed

	@@ -0,0 +1,868 @@

+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# Installing dependencies\n",
+    "\n",
+    "## Please make a copy of this notebook."
+   ],
+   "id": "13156d7ed48b282"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "# Install every third-party package the notebook imports.\n",
+    "# `%pip` installs into the environment of the running kernel (plain `!pip` may\n",
+    "# target a different interpreter), and `-q` keeps the output short without the\n",
+    "# scratch file + POSIX-only `rm` that the previous version relied on.\n",
+    "# scikit-learn and tqdm are imported further down, so they are listed here too.\n",
+    "%pip install -q geopy datasets torch torchvision huggingface_hub pyhocon transformers gensim scikit-learn tqdm"
+   ],
+   "id": "5a596f2639253772"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "# Huggingface login\n",
+    "You will need your personal Hugging Face access token."
+   ],
+   "id": "432a756039e6399"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:48:43.216631Z",
+     "start_time": "2024-12-16T19:48:43.214630Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "!huggingface-cli login",
+   "id": "2e73da09a7c6171e",
+   "outputs": [],
+   "execution_count": 44
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Part 1: Load Data",
+   "id": "c731d9c1ebb477dc"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Downloading the train and test dataset",
+   "id": "14070f20b547688f"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "",
+   "id": "b8920847b7cc378d"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:48:45.272372Z",
+     "start_time": "2024-12-16T19:48:43.220140Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "# Fetch the train and test splits of the FOX/NBC headline dataset from the Hub.\n",
+    "dataset_train = load_dataset(path=\"CISProject/FOX_NBC\", split=\"train\")\n",
+    "dataset_test = load_dataset(path=\"CISProject/FOX_NBC\", split=\"test\")\n",
+    "# Alternative test split kept for reference:\n",
+    "# dataset_test = load_dataset(path=\"CISProject/FOX_NBC\", split=\"test_data_random_subset\")\n"
+   ],
+   "id": "877c90c978d62b7d",
+   "outputs": [],
+   "execution_count": 45
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:48:45.287939Z",
+     "start_time": "2024-12-16T19:48:45.278748Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import numpy as np\n",
+    "import torch\n",
+    "import re\n",
+    "from transformers import BertTokenizer\n",
+    "from transformers import RobertaTokenizer\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from gensim.models import KeyedVectors\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "def preprocess_data(data,\n",
+    "                    mode=\"train\",\n",
+    "                    vectorizer=None,\n",
+    "                    w2v_model=None,\n",
+    "                    max_features=4096,\n",
+    "                    max_seq_length=128,\n",
+    "                    num_proc=4):\n",
+    "    \"\"\"Build the TF-IDF, RoBERTa and Word2Vec input views for every headline.\n",
+    "\n",
+    "    The following columns are added to ``data`` by successive ``map`` passes:\n",
+    "\n",
+    "    * ``clean_title``: lower-cased title with punctuation removed.\n",
+    "    * ``freq_inputs``: dense float32 TF-IDF vector of ``clean_title``.\n",
+    "    * ``input_ids`` / ``attention_mask``: RoBERTa tokenization of the raw\n",
+    "      ``title``, padded/truncated to ``max_seq_length``.\n",
+    "    * ``tokens``: whitespace-split ``clean_title``.\n",
+    "    * ``pos_inputs``: ``max_seq_length`` Word2Vec vectors per example (zeros\n",
+    "      for out-of-vocabulary words and for padding).\n",
+    "    * ``seq_inputs``: ``input_ids`` and ``attention_mask`` stacked on axis 1.\n",
+    "\n",
+    "    Args:\n",
+    "        data: dataset exposing ``map`` with ``title`` and ``labels`` columns.\n",
+    "        mode: ``\"train\"`` fits a new vectorizer when none is supplied; any\n",
+    "            other value requires an already fitted ``vectorizer``.\n",
+    "        vectorizer: fitted vectorizer to reuse, or ``None``.\n",
+    "        w2v_model: word-vector lookup supporting ``in``, ``[]`` and ``vector_size``.\n",
+    "        max_features: ``max_features`` of a newly fitted ``TfidfVectorizer``.\n",
+    "        max_seq_length: length used for tokenization and for the Word2Vec rows.\n",
+    "        num_proc: ``num_proc`` forwarded to every ``map`` call.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple ``(processed_data, vectorizer)``.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if ``w2v_model`` is ``None``, or if ``vectorizer`` is\n",
+    "            ``None`` while ``mode`` is not ``\"train\"``.\n",
+    "    \"\"\"\n",
+    "    if w2v_model is None:\n",
+    "        raise ValueError(\"w2v_model must be provided for Word2Vec embeddings.\")\n",
+    "    if vectorizer is None and mode != \"train\":\n",
+    "        # Fail early: otherwise this only surfaces later as an AttributeError on\n",
+    "        # None raised from inside the vectorizing map pass.\n",
+    "        raise ValueError(\"A fitted vectorizer must be provided when mode is not 'train'.\")\n",
+    "\n",
+    "    tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n",
+    "\n",
+    "    # 1. Normalise titles: lower-case, strip punctuation, trim whitespace.\n",
+    "    # The mapped helpers keep their own imports, presumably so they stay\n",
+    "    # self-contained when executed in worker processes (TODO confirm).\n",
+    "    def clean_text(examples):\n",
+    "        import re\n",
+    "        cleaned = []\n",
+    "        for text in examples[\"title\"]:\n",
+    "            text = text.lower()\n",
+    "            text = re.sub(r'[^\\w\\s]', '', text)\n",
+    "            text = text.strip()\n",
+    "            cleaned.append(text)\n",
+    "        return {\"clean_title\": cleaned}\n",
+    "\n",
+    "    data = data.map(clean_text, batched=True, num_proc=num_proc)\n",
+    "\n",
+    "    # 2. Fit a TF-IDF vectorizer on the training titles when none was supplied.\n",
+    "    if mode == \"train\" and vectorizer is None:\n",
+    "        vectorizer = TfidfVectorizer(max_features=max_features)\n",
+    "        vectorizer.fit(data[\"clean_title\"])\n",
+    "        print(\"vectorizer fitted on training data.\")\n",
+    "\n",
+    "    # 3. TF-IDF features for every cleaned title.\n",
+    "    def vectorize_batch(examples):\n",
+    "        import numpy as np\n",
+    "        freq = vectorizer.transform(examples[\"clean_title\"]).toarray().astype(np.float32)\n",
+    "        return {\"freq_inputs\": freq}\n",
+    "\n",
+    "    data = data.map(vectorize_batch, batched=True, num_proc=num_proc)\n",
+    "\n",
+    "    # 4. RoBERTa tokenization of the raw (uncleaned) titles.\n",
+    "    def tokenize_batch(examples):\n",
+    "        tokenized = tokenizer(\n",
+    "            examples[\"title\"],\n",
+    "            padding=\"max_length\",\n",
+    "            truncation=True,\n",
+    "            max_length=max_seq_length\n",
+    "        )\n",
+    "        return {\n",
+    "            \"input_ids\": tokenized[\"input_ids\"],\n",
+    "            \"attention_mask\": tokenized[\"attention_mask\"]\n",
+    "        }\n",
+    "\n",
+    "    data = data.map(tokenize_batch, batched=True, num_proc=num_proc)\n",
+    "\n",
+    "    # 5. Whitespace tokens of the cleaned titles for the Word2Vec lookup.\n",
+    "    def split_tokens(examples):\n",
+    "        tokens_list = [t.split() for t in examples[\"clean_title\"]]\n",
+    "        return {\"tokens\": tokens_list}\n",
+    "\n",
+    "    data = data.map(split_tokens, batched=True, num_proc=num_proc)\n",
+    "\n",
+    "    # 6. Word2Vec vectors. The lookup table is built once over the unique\n",
+    "    # tokens; unknown words and padding positions share one zero vector.\n",
+    "    embedding_dim = w2v_model.vector_size\n",
+    "    zero_vector = np.zeros((embedding_dim,), dtype=np.float32)\n",
+    "    unique_tokens = set()\n",
+    "    for tokens in data[\"tokens\"]:\n",
+    "        unique_tokens.update(tokens)\n",
+    "    embedding_dict = {\n",
+    "        tk: (w2v_model[tk].astype(np.float32) if tk in w2v_model else zero_vector)\n",
+    "        for tk in unique_tokens\n",
+    "    }\n",
+    "\n",
+    "    def w2v_embedding_batch(examples):\n",
+    "        batch_w2v = []\n",
+    "        for tokens in examples[\"tokens\"]:\n",
+    "            vectors = [embedding_dict[tk] for tk in tokens[:max_seq_length]]\n",
+    "            # Pad up to max_seq_length (a non-positive count adds nothing).\n",
+    "            vectors += [zero_vector] * (max_seq_length - len(vectors))\n",
+    "            batch_w2v.append(vectors)\n",
+    "        return {\"pos_inputs\": batch_w2v}\n",
+    "\n",
+    "    data = data.map(w2v_embedding_batch, batched=True, batch_size=32, num_proc=num_proc)\n",
+    "\n",
+    "    # 7. Final pass: cast the columns and stack ids + mask into seq_inputs.\n",
+    "    # (The former separate label pass returned the labels column unchanged,\n",
+    "    # so it was dropped; labels are still cast here.)\n",
+    "    def to_tensors(examples):\n",
+    "        import torch\n",
+    "\n",
+    "        freq_inputs = torch.tensor(examples[\"freq_inputs\"], dtype=torch.float32)\n",
+    "        input_ids = torch.tensor(examples[\"input_ids\"])\n",
+    "        attention_mask = torch.tensor(examples[\"attention_mask\"])\n",
+    "        pos_inputs = torch.tensor(examples[\"pos_inputs\"], dtype=torch.float32)\n",
+    "        labels = torch.tensor(examples[\"labels\"], dtype=torch.long)\n",
+    "\n",
+    "        # seq_inputs shape: (batch_size, 2, seq_len)\n",
+    "        seq_inputs = torch.stack([input_ids, attention_mask], dim=1)\n",
+    "\n",
+    "        return {\n",
+    "            \"freq_inputs\": freq_inputs,\n",
+    "            \"seq_inputs\": seq_inputs,\n",
+    "            \"pos_inputs\": pos_inputs,\n",
+    "            \"labels\": labels\n",
+    "        }\n",
+    "\n",
+    "    processed_data = data.map(to_tensors, batched=True, num_proc=num_proc)\n",
+    "\n",
+    "    return processed_data, vectorizer\n"
+   ],
+   "id": "dc2ba675ce880d6d",
+   "outputs": [],
+   "execution_count": 46
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:49:01.529651Z",
+     "start_time": "2024-12-16T19:48:45.294290Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from gensim.models import KeyedVectors\n",
+    "\n",
+    "# Settings shared by both splits, defined once so train and test cannot drift apart.\n",
+    "W2V_PATH = Path(\"./GoogleNews-vectors-negative300.bin\")\n",
+    "MAX_FEATURES = 8192\n",
+    "MAX_SEQ_LENGTH = 128\n",
+    "\n",
+    "# This notebook does not download the pretrained vectors, so fail with an\n",
+    "# actionable message instead of a bare loader error when the file is missing.\n",
+    "if not W2V_PATH.is_file():\n",
+    "    raise FileNotFoundError(\n",
+    "        f\"Word2Vec vectors not found at {W2V_PATH}; download GoogleNews-vectors-negative300.bin and place it there.\"\n",
+    "    )\n",
+    "w2v_model = KeyedVectors.load_word2vec_format(str(W2V_PATH), binary=True)\n",
+    "\n",
+    "# The TF-IDF vectorizer is fitted on the training split only ...\n",
+    "dataset_train, vectorizer = preprocess_data(\n",
+    "    data=dataset_train,\n",
+    "    mode=\"train\",\n",
+    "    w2v_model=w2v_model,\n",
+    "    max_features=MAX_FEATURES,\n",
+    "    max_seq_length=MAX_SEQ_LENGTH,\n",
+    ")\n",
+    "\n",
+    "# ... and reused unchanged for the test split.\n",
+    "dataset_test, _ = preprocess_data(\n",
+    "    data=dataset_test,\n",
+    "    mode=\"test\",\n",
+    "    vectorizer=vectorizer,\n",
+    "    w2v_model=w2v_model,\n",
+    "    max_features=MAX_FEATURES,\n",
+    "    max_seq_length=MAX_SEQ_LENGTH,\n",
+    ")"
+   ],
+   "id": "158b99950fb22d1",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "vectorizer fitted on training data.\n"
+     ]
+    }
+   ],
+   "execution_count": 47
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:49:01.538067Z",
+     "start_time": "2024-12-16T19:49:01.535063Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "print(dataset_train)\n",
+    "print(dataset_test)"
+   ],
+   "id": "edd80d33175c96a0",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset({\n",
+      "    features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],\n",
+      "    num_rows: 3044\n",
+      "})\n",
+      "Dataset({\n",
+      "    features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],\n",
+      "    num_rows: 761\n",
+      "})\n"
+     ]
+    }
+   ],
+   "execution_count": 48
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Part 2: Model",
+   "id": "c9a49fc1fbca29d7"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Defining the Custom Model",
+   "id": "aebe5e51f0e611cc"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "",
+   "id": "f0eae08a025b6ed9"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:49:01.554769Z",
+     "start_time": "2024-12-16T19:49:01.543575Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "# TODO: import all packages necessary for your custom model\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "from torch.utils.data import DataLoader\n",
+    "from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "from transformers import RobertaModel, RobertaConfig,RobertaForSequenceClassification, BertModel\n",
+    "from model.network import Classifier\n",
+    "from model.frequential import FreqNetwork\n",
+    "from model.sequential import SeqNetwork\n",
+    "from model.positional import PosNetwork\n",
+    "\n",
+    "class CustomConfig(PretrainedConfig):\n",
+    "    \"\"\"Configuration for the ``headlineclassifier`` model.\n",
+    "\n",
+    "    Attributes:\n",
+    "        base_exp_dir: root directory for experiment outputs (the Trainer\n",
+    "            writes its checkpoints below it).\n",
+    "        train: optimisation settings read by the Trainer.\n",
+    "        model: constructor kwargs for the ``freq``, ``pos`` and ``cls``\n",
+    "            sub-networks.\n",
+    "\n",
+    "    ``train`` and ``model`` default to ``None`` and are then replaced by a\n",
+    "    freshly built dict, so two configs never share (and cannot accidentally\n",
+    "    mutate) one default dict object. Dicts passed in explicitly are stored\n",
+    "    as given.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    model_type = \"headlineclassifier\"\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def _default_train():\n",
+    "        \"\"\"Return a new dict holding the default training settings.\"\"\"\n",
+    "        return {\n",
+    "            \"learning_rate\": 2e-5,\n",
+    "            \"learning_rate_alpha\": 0.05,\n",
+    "            \"end_iter\": 10,  # number of epochs the Trainer runs\n",
+    "            \"batch_size\": 32,\n",
+    "            \"warm_up_end\": 2,\n",
+    "            \"anneal_end\": 5,\n",
+    "            \"save_freq\": 1,  # save a checkpoint every N epochs\n",
+    "            \"val_freq\": 1,  # validate every N epochs\n",
+    "        }\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def _default_model():\n",
+    "        \"\"\"Return a new dict holding the default sub-network settings.\"\"\"\n",
+    "        return {\n",
+    "            \"freq\": {\n",
+    "                # NOTE(review): presumably the size of the fitted TF-IDF\n",
+    "                # vocabulary - verify against the vectorizer.\n",
+    "                \"tfidf_input_dim\": 8145,\n",
+    "                \"tfidf_output_dim\": 128,\n",
+    "                \"tfidf_hidden_dim\": 512,\n",
+    "                \"n_layers\": 2,\n",
+    "                \"skip_in\": [80],\n",
+    "                \"weight_norm\": True,\n",
+    "            },\n",
+    "            \"pos\": {\n",
+    "                \"input_dim\": 300,\n",
+    "                \"output_dim\": 128,\n",
+    "                \"hidden_dim\": 256,\n",
+    "                \"n_layers\": 2,\n",
+    "                \"skip_in\": [80],\n",
+    "                \"weight_norm\": True,\n",
+    "            },\n",
+    "            \"cls\": {\n",
+    "                # Width of the concatenated features fed to the final linear layer.\n",
+    "                \"combined_input\": 1024,\n",
+    "                \"combined_dim\": 128,\n",
+    "                \"num_classes\": 2,\n",
+    "                \"n_layers\": 2,\n",
+    "                \"skip_in\": [80],\n",
+    "                \"weight_norm\": True,\n",
+    "            },\n",
+    "        }\n",
+    "\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        base_exp_dir=\"./exp/fox_nbc/\",\n",
+    "        train=None,\n",
+    "        model=None,\n",
+    "        **kwargs,\n",
+    "    ):\n",
+    "        super().__init__(**kwargs)\n",
+    "\n",
+    "        self.base_exp_dir = base_exp_dir\n",
+    "        self.train = self._default_train() if train is None else train\n",
+    "        self.model = self._default_model() if model is None else model\n",
+    "\n",
+    "# TODO: define all parameters needed for your model, as well as calling the model itself\n",
+    "class CustomModel(PreTrainedModel):\n",
+    "    \"\"\"Headline classifier fusing RoBERTa, TF-IDF and Word2Vec features.\n",
+    "\n",
+    "    ``forward`` concatenates the RoBERTa pooled output with the outputs of\n",
+    "    the ``freq`` and ``pos`` sub-networks, applies dropout and maps the\n",
+    "    result to class logits with a single linear layer.\n",
+    "    \"\"\"\n",
+    "    config_class = CustomConfig\n",
+    "\n",
+    "    def __init__(self, config):\n",
+    "        \"\"\"Build all sub-modules from ``config.model``.\"\"\"\n",
+    "        super().__init__(config)\n",
+    "        self.conf = config\n",
+    "        cls_conf = self.conf.model[\"cls\"]\n",
+    "        self.freq = FreqNetwork(**self.conf.model[\"freq\"])\n",
+    "        self.pos = PosNetwork(**self.conf.model[\"pos\"])\n",
+    "        # Not used by forward(); kept so the module/parameter set is unchanged.\n",
+    "        self.cls = Classifier(**cls_conf)\n",
+    "        # The number of logits follows the config instead of a hard-coded 2.\n",
+    "        self.fc = nn.Linear(cls_conf[\"combined_input\"], cls_conf.get(\"num_classes\", 2))\n",
+    "        self.seq = RobertaModel.from_pretrained(\"roberta-base\")\n",
+    "        # To freeze the encoder:\n",
+    "        # for param in self.seq.parameters():\n",
+    "        #     param.requires_grad = False\n",
+    "        self.dropout = nn.Dropout(0.2)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        \"\"\"Return class logits for one batch.\n",
+    "\n",
+    "        Args:\n",
+    "            x: dict with ``freq_inputs``, ``seq_inputs`` and ``pos_inputs``.\n",
+    "                ``seq_inputs[:, 0, :]`` is used as token ids and\n",
+    "                ``seq_inputs[:, 1, :]`` as the attention mask.\n",
+    "\n",
+    "        Returns:\n",
+    "            Output of the final linear layer, one row per example.\n",
+    "        \"\"\"\n",
+    "        freq_inputs = x[\"freq_inputs\"]\n",
+    "        seq_inputs = x[\"seq_inputs\"]\n",
+    "        pos_inputs = x[\"pos_inputs\"]\n",
+    "\n",
+    "        # Pooled sentence representation, (batch_size, hidden_size); 768 for\n",
+    "        # roberta-base. It is used directly: the former call to self.lstm\n",
+    "        # referenced a module that was never created.\n",
+    "        seq_feature = self.seq(\n",
+    "            input_ids=seq_inputs[:, 0, :],\n",
+    "            attention_mask=seq_inputs[:, 1, :],\n",
+    "        ).pooler_output\n",
+    "        freq_feature = self.freq(freq_inputs)  # (batch_size, 128) with the default config\n",
+    "        pos_feature = self.pos(pos_inputs)  # (batch_size, 128) with the default config\n",
+    "\n",
+    "        # 768 + 128 + 128 = 1024, matching the default cls.combined_input.\n",
+    "        inputs = torch.cat((seq_feature, freq_feature, pos_feature), dim=1)\n",
+    "\n",
+    "        outputs = self.fc(self.dropout(inputs))\n",
+    "        return outputs\n",
+    "\n",
+    "    def save_model(self, save_path):\n",
+    "        \"\"\"Save the model locally using the Hugging Face format.\"\"\"\n",
+    "        self.save_pretrained(save_path)\n",
+    "\n",
+    "    def push_model(self, repo_name):\n",
+    "        \"\"\"Push the model to the Hugging Face Hub.\"\"\"\n",
+    "        self.push_to_hub(repo_name)"
+   ],
+   "id": "21f079d0c52d7d",
+   "outputs": [],
+   "execution_count": 49
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:49:01.791918Z",
+     "start_time": "2024-12-16T19:49:01.561338Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from huggingface_hub import hf_hub_download\n",
+    "\n",
+    "AutoConfig.register(\"headlineclassifier\", CustomConfig)\n",
+    "AutoModel.register(CustomConfig, CustomModel)\n",
+    "config = CustomConfig()\n",
+    "model = CustomModel(config)\n",
+    "\n",
+    "REPO_NAME = \"CISProject/News-Headline-Classifier-Notebook\" # TODO: PROVIDE A STRING TO YOUR REPO ON HUGGINGFACE"
+   ],
+   "id": "b6ba3f96d3ce21",
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\swall\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
+      "  WeightNorm.apply(module, name, dim)\n",
+      "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+     ]
+    }
+   ],
+   "execution_count": 50
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:49:01.808079Z",
+     "start_time": "2024-12-16T19:49:01.798760Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "import os\n",
+    "\n",
+    "\n",
+    "class Trainer:\n",
+    "    \"\"\"Epoch-based training loop with validation and checkpointing.\n",
+    "\n",
+    "    Args:\n",
+    "        model: module called with a dict of ``freq_inputs``, ``seq_inputs``\n",
+    "            and ``pos_inputs`` and returning class logits; must provide\n",
+    "            ``save_pretrained``.\n",
+    "        train_loader: iterable of ``(inputs_dict, labels)`` training batches.\n",
+    "        val_loader: iterable of ``(inputs_dict, labels)`` validation batches.\n",
+    "        config: object exposing the ``train`` dict and ``base_exp_dir``.\n",
+    "        device: device to train on; a CUDA request falls back to CPU when\n",
+    "            CUDA is not available.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, model, train_loader, val_loader, config, device=\"cuda\"):\n",
+    "        # Fall back to CPU instead of crashing on machines without a GPU.\n",
+    "        if str(device).startswith(\"cuda\") and not torch.cuda.is_available():\n",
+    "            print(\"CUDA is not available; falling back to CPU.\")\n",
+    "            device = \"cpu\"\n",
+    "\n",
+    "        self.model = model.to(device)\n",
+    "        self.train_loader = train_loader\n",
+    "        self.val_loader = val_loader\n",
+    "        self.device = device\n",
+    "        self.conf = config\n",
+    "\n",
+    "        self.end_iter = self.conf.train[\"end_iter\"]\n",
+    "        self.save_freq = self.conf.train[\"save_freq\"]\n",
+    "        self.val_freq = self.conf.train[\"val_freq\"]\n",
+    "\n",
+    "        self.batch_size = self.conf.train['batch_size']\n",
+    "        self.learning_rate = self.conf.train['learning_rate']\n",
+    "        self.learning_rate_alpha = self.conf.train['learning_rate_alpha']\n",
+    "        self.warm_up_end = self.conf.train['warm_up_end']\n",
+    "        self.anneal_end = self.conf.train['anneal_end']\n",
+    "\n",
+    "        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)\n",
+    "        self.criterion = torch.nn.CrossEntropyLoss()\n",
+    "        self.save_path = os.path.join(self.conf.base_exp_dir, \"checkpoints\")\n",
+    "        os.makedirs(self.save_path, exist_ok=True)\n",
+    "\n",
+    "        # Number of completed epochs; drives the learning-rate schedule.\n",
+    "        self.iter_step = 0\n",
+    "\n",
+    "        # Lowest summed validation loss seen so far (None before the first validation).\n",
+    "        self.val_loss = None\n",
+    "\n",
+    "    def get_cos_anneal_ratio(self):\n",
+    "        \"\"\"Return ``iter_step / anneal_end`` capped at 1.0 (1.0 when ``anneal_end`` is 0).\"\"\"\n",
+    "        if self.anneal_end == 0.0:\n",
+    "            return 1.0\n",
+    "        else:\n",
+    "            return np.min([1.0, self.iter_step / self.anneal_end])\n",
+    "\n",
+    "    def update_learning_rate(self):\n",
+    "        \"\"\"Set the optimizer learning rate for the current epoch.\n",
+    "\n",
+    "        Linear warm-up for the first ``warm_up_end`` epochs, then cosine decay\n",
+    "        from the base rate down to ``learning_rate_alpha`` times the base rate.\n",
+    "        \"\"\"\n",
+    "        if self.iter_step < self.warm_up_end:\n",
+    "            # iter_step counts completed epochs, so (iter_step + 1) is used:\n",
+    "            # with the plain ratio the whole first epoch ran with lr == 0.\n",
+    "            learning_factor = (self.iter_step + 1) / self.warm_up_end\n",
+    "        else:\n",
+    "            alpha = self.learning_rate_alpha\n",
+    "            # Guard against division by zero when end_iter == warm_up_end.\n",
+    "            decay_span = max(self.end_iter - self.warm_up_end, 1)\n",
+    "            progress = (self.iter_step - self.warm_up_end) / decay_span\n",
+    "            learning_factor = (np.cos(np.pi * progress) + 1.0) * 0.5 * (1 - alpha) + alpha\n",
+    "\n",
+    "        for g in self.optimizer.param_groups:\n",
+    "            g['lr'] = self.learning_rate * learning_factor\n",
+    "\n",
+    "    def _forward_batch(self, batch_inputs, labels):\n",
+    "        \"\"\"Move one batch to the device and return ``(logits, targets)``.\"\"\"\n",
+    "        model_inputs = {\n",
+    "            key: batch_inputs[key].to(self.device)\n",
+    "            for key in (\"freq_inputs\", \"seq_inputs\", \"pos_inputs\")\n",
+    "        }\n",
+    "        targets = labels.to(self.device)\n",
+    "        return self.model(model_inputs), targets\n",
+    "\n",
+    "    def train(self):\n",
+    "        \"\"\"Train for ``end_iter`` epochs, validating and saving as configured.\"\"\"\n",
+    "        for epoch in range(self.end_iter):\n",
+    "            self.update_learning_rate()\n",
+    "            self.model.train()\n",
+    "            epoch_loss = 0.0\n",
+    "            correct = 0\n",
+    "            total = 0\n",
+    "\n",
+    "            for batch_inputs, labels in tqdm(self.train_loader, desc=f\"Epoch {epoch + 1}/{self.end_iter}\"):\n",
+    "                logits, y_train = self._forward_batch(batch_inputs, labels)\n",
+    "                loss = self.criterion(logits, y_train)\n",
+    "\n",
+    "                self.optimizer.zero_grad()\n",
+    "                loss.backward()\n",
+    "                self.optimizer.step()\n",
+    "\n",
+    "                epoch_loss += loss.item()\n",
+    "                total += y_train.size(0)\n",
+    "                correct += (logits.argmax(dim=1) == y_train).sum().item()\n",
+    "\n",
+    "            print(f\"Train Loss: {epoch_loss / len(self.train_loader):.4f}\")\n",
+    "            print(f\"Train Accuracy: {correct / total:.4f}\")\n",
+    "\n",
+    "            if (epoch + 1) % self.val_freq == 0:\n",
+    "                self.val()\n",
+    "            if (epoch + 1) % self.save_freq == 0:\n",
+    "                self.save_checkpoint(epoch + 1)\n",
+    "\n",
+    "            # The rate itself is refreshed at the top of the next epoch.\n",
+    "            self.iter_step += 1\n",
+    "\n",
+    "    def val(self):\n",
+    "        \"\"\"Evaluate on ``val_loader`` and save a ``best`` checkpoint on improvement.\"\"\"\n",
+    "        self.model.eval()\n",
+    "        val_loss = 0.0\n",
+    "        correct = 0\n",
+    "        total = 0\n",
+    "\n",
+    "        with torch.no_grad():\n",
+    "            for batch_inputs, labels in tqdm(self.val_loader, desc=\"Validation\", leave=False):\n",
+    "                logits, y_val = self._forward_batch(batch_inputs, labels)\n",
+    "                val_loss += self.criterion(logits, y_val).item()\n",
+    "                total += y_val.size(0)\n",
+    "                correct += (logits.argmax(dim=1) == y_val).sum().item()\n",
+    "\n",
+    "        # The comparison uses the summed loss, which is consistent because the\n",
+    "        # same loader is evaluated every time.\n",
+    "        if self.val_loss is None or val_loss < self.val_loss:\n",
+    "            self.val_loss = val_loss\n",
+    "            self.save_checkpoint(\"best\")\n",
+    "\n",
+    "        print(f\"Validation Loss: {val_loss / len(self.val_loader):.4f}\")\n",
+    "        print(f\"Validation Accuracy: {correct / total:.4f}\")\n",
+    "\n",
+    "    def save_checkpoint(self, epoch):\n",
+    "        \"\"\"Save the model in Hugging Face format.\n",
+    "\n",
+    "        ``epoch`` is an epoch number (saved as ``checkpoint_epoch_<epoch>``) or\n",
+    "        the string ``\"best\"`` (saved as ``best``).\n",
+    "        \"\"\"\n",
+    "        dir_name = \"best\" if epoch == \"best\" else f\"checkpoint_epoch_{epoch}\"\n",
+    "        checkpoint_dir = os.path.join(self.save_path, dir_name)\n",
+    "        self.model.save_pretrained(checkpoint_dir)\n",
+    "        print(f\"Checkpoint saved at {checkpoint_dir}\")"
+   ],
+   "id": "7be377251b81a25d",
+   "outputs": [],
+   "execution_count": 51
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-12-16T19:49:03.149673Z",
+     "start_time": "2024-12-16T19:49:01.812943Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "from torch.utils.data import DataLoader\n",
+    "\n",
+    "# Define a collate function to handle the batched data\n",
+    "def collate_fn(batch):\n",
+    "    \"\"\"Collate dataset rows into ``(model_inputs, labels)``.\n",
+    "\n",
+    "    Args:\n",
+    "        batch: list of rows, each with ``freq_inputs``, ``seq_inputs``,\n",
+    "            ``pos_inputs`` and ``labels``.\n",
+    "\n",
+    "    Returns:\n",
+    "        A dict of stacked ``freq_inputs`` (float32), ``seq_inputs`` (int64)\n",
+    "        and ``pos_inputs`` (float32) tensors, and an int64 label tensor with\n",
+    "        one entry per row.\n",
+    "    \"\"\"\n",
+    "    def stack(key, dtype):\n",
+    "        # as_tensor accepts lists, arrays and tensors alike and does not\n",
+    "        # re-wrap values that already are tensors.\n",
+    "        return torch.stack([torch.as_tensor(item[key], dtype=dtype) for item in batch])\n",
+    "\n",
+    "    model_inputs = {\n",
+    "        \"freq_inputs\": stack(\"freq_inputs\", torch.float32),\n",
+    "        \"seq_inputs\": stack(\"seq_inputs\", torch.long),\n",
+    "        \"pos_inputs\": stack(\"pos_inputs\", torch.float32),\n",
+    "    }\n",
+    "    # Build the label vector directly instead of a tensor of 0-d tensors.\n",
+    "    labels = torch.as_tensor([int(item[\"labels\"]) for item in batch], dtype=torch.long)\n",
+    "    return model_inputs, labels\n",
+    "\n",
+    "train_loader = DataLoader(\n",
+    "    dataset=dataset_train,\n",
+    "    batch_size=config.train[\"batch_size\"],\n",
+    "    shuffle=True,\n",
+    "    collate_fn=collate_fn,\n",
+    ")\n",
+    "test_loader = DataLoader(\n",
+    "    dataset=dataset_test,\n",
+    "    batch_size=config.train[\"batch_size\"],\n",
+    "    shuffle=False,\n",
+    "    collate_fn=collate_fn,\n",
+    ")\n",
+    "# NOTE(review): the test split is passed as the validation loader, so the\n",
+    "# \"best\" checkpoint is selected on test data.\n",
+    "trainer = Trainer(model=model, train_loader=train_loader, val_loader=test_loader, config=config)\n",
+    "\n",
+    "# Run the full training loop.\n",
+    "trainer.train()\n",
+    "# Store the final weights in Hugging Face format next to the per-epoch checkpoints.\n",
+    "final_save_path = os.path.join(config.base_exp_dir, \"checkpoints\")\n",
+    "model.save_pretrained(final_save_path)\n",
+    "print(f\"Final model saved at {final_save_path}\")\n"
+   ],
+   "id": "dd1749c306f148eb",
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/10:   0%|          | 0/96 [00:00<?, ?it/s]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([1, 768]) torch.Size([32, 128]) torch.Size([32, 128])\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    },
+    {
+     "ename": "RuntimeError",
+     "evalue": "Sizes of tensors must match except in dimension 1. Expected size 1 but got size 32 for tensor number 1 in the list.",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[1;31mRuntimeError\u001B[0m                              Traceback (most recent call last)",
+      "Cell \u001B[1;32mIn[52], line 16\u001B[0m\n\u001B[0;32m     13\u001B[0m trainer \u001B[38;5;241m=\u001B[39m Trainer(model, train_loader, test_loader, config)\n\u001B[0;32m     15\u001B[0m \u001B[38;5;66;03m# Train the model\u001B[39;00m\n\u001B[1;32m---> 16\u001B[0m trainer\u001B[38;5;241m.\u001B[39mtrain()\n\u001B[0;32m     17\u001B[0m \u001B[38;5;66;03m# Save the final model in Hugging Face format\u001B[39;00m\n\u001B[0;32m     18\u001B[0m final_save_path \u001B[38;5;241m=\u001B[39m os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mjoin(config\u001B[38;5;241m.\u001B[39mbase_exp_dir, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcheckpoints\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
+      "Cell \u001B[1;32mIn[51], line 69\u001B[0m, in \u001B[0;36mTrainer.train\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m     66\u001B[0m y_train \u001B[38;5;241m=\u001B[39m labels\u001B[38;5;241m.\u001B[39mto(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdevice)\n\u001B[0;32m     68\u001B[0m \u001B[38;5;66;03m# Forward pass\u001B[39;00m\n\u001B[1;32m---> 69\u001B[0m preds \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel({\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mfreq_inputs\u001B[39m\u001B[38;5;124m\"\u001B[39m: freq_inputs, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mseq_inputs\u001B[39m\u001B[38;5;124m\"\u001B[39m: seq_inputs, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpos_inputs\u001B[39m\u001B[38;5;124m\"\u001B[39m: pos_inputs})\n\u001B[0;32m     70\u001B[0m loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcriterion(preds, y_train)\n\u001B[0;32m     72\u001B[0m \u001B[38;5;66;03m# preds = (torch.sigmoid(preds) > 0.5).int()\u001B[39;00m\n\u001B[0;32m     73\u001B[0m \u001B[38;5;66;03m# Backward pass\u001B[39;00m\n",
+      "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1736\u001B[0m, in \u001B[0;36mModule._wrapped_call_impl\u001B[1;34m(self, *args, **kwargs)\u001B[0m\n\u001B[0;32m   1734\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compiled_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)  \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[0;32m   1735\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m-> 1736\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n",
+      "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1747\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[1;34m(self, *args, **kwargs)\u001B[0m\n\u001B[0;32m   1742\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[0;32m   1743\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[0;32m   1744\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks\n\u001B[0;32m   1745\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[0;32m   1746\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[1;32m-> 1747\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m forward_call(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m   1749\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m   1750\u001B[0m called_always_called_hooks \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mset\u001B[39m()\n",
+      "Cell \u001B[1;32mIn[49], line 99\u001B[0m, in \u001B[0;36mCustomModel.forward\u001B[1;34m(self, x)\u001B[0m\n\u001B[0;32m     97\u001B[0m pos_feature \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mpos(pos_inputs) \u001B[38;5;66;03m#Shape: (batch_size, 128)\u001B[39;00m\n\u001B[0;32m     98\u001B[0m \u001B[38;5;28mprint\u001B[39m(seq_feature\u001B[38;5;241m.\u001B[39mshape,pos_feature\u001B[38;5;241m.\u001B[39mshape,freq_feature\u001B[38;5;241m.\u001B[39mshape)\n\u001B[1;32m---> 99\u001B[0m inputs \u001B[38;5;241m=\u001B[39m torch\u001B[38;5;241m.\u001B[39mcat((seq_feature, freq_feature, pos_feature), dim\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m1\u001B[39m)  \u001B[38;5;66;03m# Shape: (batch_size, 384)\u001B[39;00m\n\u001B[0;32m    100\u001B[0m \u001B[38;5;66;03m# inputs = torch.cat((seq_feature, freq_feature), dim=1)  # Shape: (batch_size,256)\u001B[39;00m\n\u001B[0;32m    101\u001B[0m \u001B[38;5;66;03m# inputs = seq_feature\u001B[39;00m\n\u001B[0;32m    103\u001B[0m x \u001B[38;5;241m=\u001B[39m inputs\n",
+      "\u001B[1;31mRuntimeError\u001B[0m: Sizes of tensors must match except in dimension 1. Expected size 1 but got size 32 for tensor number 1 in the list."
+     ]
+    }
+   ],
+   "execution_count": 52
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "## Evaluate Model",
+   "id": "4af000263dd99bca"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "from transformers import AutoConfig, AutoModel\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "def load_last_checkpoint(checkpoint_dir):\n",
+    "    \"\"\"Load the model stored under ``checkpoint_dir``.\n",
+    "\n",
+    "    The ``best`` checkpoint is preferred when it exists on disk; otherwise the\n",
+    "    ``checkpoint_epoch_<N>`` entry with the highest epoch number is used.\n",
+    "\n",
+    "    Args:\n",
+    "        checkpoint_dir: Directory that holds the saved checkpoints.\n",
+    "\n",
+    "    Returns:\n",
+    "        The model returned by ``AutoModel.from_pretrained`` for the chosen checkpoint.\n",
+    "\n",
+    "    Raises:\n",
+    "        FileNotFoundError: If neither ``best`` nor any ``checkpoint_epoch_*`` entry exists.\n",
+    "    \"\"\"\n",
+    "    best_checkpoint = os.path.join(checkpoint_dir, \"best\")\n",
+    "    # os.path.join() always returns a string, so comparing its result with None\n",
+    "    # can never detect a missing checkpoint; ask the filesystem instead.\n",
+    "    if os.path.exists(best_checkpoint):\n",
+    "        selected_checkpoint = best_checkpoint\n",
+    "    else:\n",
+    "        epoch_checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith(\"checkpoint_epoch_\")]\n",
+    "        if not epoch_checkpoints:\n",
+    "            raise FileNotFoundError(f\"No checkpoints found in {checkpoint_dir}!\")\n",
+    "        # Names end in the epoch number, so order numerically (epoch 10 after epoch 9).\n",
+    "        epoch_checkpoints.sort(key=lambda name: int(name.split(\"_\")[-1]))\n",
+    "        selected_checkpoint = os.path.join(checkpoint_dir, epoch_checkpoints[-1])\n",
+    "    print(f\"Loading checkpoint from {selected_checkpoint}\")\n",
+    "    # Load the stored configuration first, then the weights that go with it.\n",
+    "    model_config = AutoConfig.from_pretrained(selected_checkpoint)\n",
+    "    model = AutoModel.from_pretrained(selected_checkpoint, config=model_config)\n",
+    "    return model\n",
+    "\n",
+    "# Step 1: Locate the checkpoints, pick the compute device and load the model onto it\n",
+    "checkpoint_dir = os.path.join(config.base_exp_dir, \"checkpoints\")\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "model = load_last_checkpoint(checkpoint_dir)\n",
+    "model.to(device)\n",
+    "\n",
+    "# Loss object handed to evaluate_model below\n",
+    "criterion = torch.nn.CrossEntropyLoss()\n",
+    "\n",
+    "def evaluate_model(model, val_loader, criterion, device=\"cuda\"):\n",
+    "    \"\"\"Run the model over ``val_loader`` and score its predictions.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Module called with a dict holding ``freq_inputs``, ``seq_inputs`` and ``pos_inputs``.\n",
+    "        val_loader: Iterable yielding ``(batch_inputs, labels)`` pairs.\n",
+    "        criterion: Unused; kept so existing call sites keep working.\n",
+    "        device: Device the batches are moved to. A CUDA request falls back to the\n",
+    "            CPU when CUDA is not available.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple ``(accuracy, report)`` built by ``accuracy_score`` and ``classification_report``.\n",
+    "    \"\"\"\n",
+    "    device = torch.device(device)\n",
+    "    if device.type == \"cuda\" and not torch.cuda.is_available():\n",
+    "        # The default is \"cuda\"; without a GPU the tensors must stay on the CPU.\n",
+    "        device = torch.device(\"cpu\")\n",
+    "\n",
+    "    model.eval()\n",
+    "    all_preds = []\n",
+    "    all_labels = []\n",
+    "    with torch.no_grad():\n",
+    "        for batch_inputs, labels in tqdm(val_loader, desc=\"Testing\", leave=False):\n",
+    "            model_inputs = {\n",
+    "                name: batch_inputs[name].to(device)\n",
+    "                for name in (\"freq_inputs\", \"seq_inputs\", \"pos_inputs\")\n",
+    "            }\n",
+    "            labels = labels.to(device)\n",
+    "\n",
+    "            logits = model(model_inputs)\n",
+    "            # Predicted class = index of the largest score along dim 1.\n",
+    "            predicted = logits.argmax(dim=1)\n",
+    "            all_preds.extend(predicted.cpu().numpy())\n",
+    "            all_labels.extend(labels.cpu().numpy())\n",
+    "\n",
+    "    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)\n",
+    "\n",
+    "\n",
+    "# Evaluate on the device the model was moved to; evaluate_model defaults to \"cuda\" otherwise.\n",
+    "accuracy, report = evaluate_model(model, test_loader, criterion, device=device)\n",
+    "print(f\"Accuracy: {accuracy:.4f}\")\n",
+    "print(report)\n"
+   ],
+   "id": "b75d2dc8a300cdf6",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "# Part 3: Pushing the Model to the Hugging Face Hub",
+   "id": "d2ffeb383ea00beb"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "model.push_model(REPO_NAME)",
+   "id": "f55c22b0a1b2a66b",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "### NOTE: You need to ensure that your Hugging Face token has both read and write access to your repository and Hugging Face organization.",
+   "id": "3826c0b6195a8fd5"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# Load the published model directly from the Hugging Face Hub.\n",
+    "from transformers import AutoModel, AutoConfig\n",
+    "# Use a dedicated name so the notebook-level `config` (read as `config.base_exp_dir`\n",
+    "# by the checkpoint evaluation cell) is not overwritten when this cell runs.\n",
+    "hub_config = AutoConfig.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\")\n",
+    "model = AutoModel.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\", config=hub_config)"
+   ],
+   "id": "33a0ca269c24d700",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "from transformers import AutoConfig, AutoModel\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "\n",
+    "# Move the model loaded above to the GPU when one is available, else the CPU\n",
+    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
+    "model.to(device)\n",
+    "\n",
+    "# Loss object handed to evaluate_model below\n",
+    "criterion = torch.nn.CrossEntropyLoss()\n",
+    "\n",
+    "def evaluate_model(model, val_loader, criterion, device=\"cuda\"):\n",
+    "    \"\"\"Run the model over ``val_loader`` and score its predictions.\n",
+    "\n",
+    "    Args:\n",
+    "        model: Module called with a dict holding ``freq_inputs``, ``seq_inputs`` and ``pos_inputs``.\n",
+    "        val_loader: Iterable yielding ``(batch_inputs, labels)`` pairs.\n",
+    "        criterion: Unused; kept so existing call sites keep working.\n",
+    "        device: Device the batches are moved to. A CUDA request falls back to the\n",
+    "            CPU when CUDA is not available.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple ``(accuracy, report)`` built by ``accuracy_score`` and ``classification_report``.\n",
+    "    \"\"\"\n",
+    "    device = torch.device(device)\n",
+    "    if device.type == \"cuda\" and not torch.cuda.is_available():\n",
+    "        # The default is \"cuda\"; without a GPU the tensors must stay on the CPU.\n",
+    "        device = torch.device(\"cpu\")\n",
+    "\n",
+    "    model.eval()\n",
+    "    all_preds = []\n",
+    "    all_labels = []\n",
+    "    with torch.no_grad():\n",
+    "        for batch_inputs, labels in tqdm(val_loader, desc=\"Testing\", leave=False):\n",
+    "            model_inputs = {\n",
+    "                name: batch_inputs[name].to(device)\n",
+    "                for name in (\"freq_inputs\", \"seq_inputs\", \"pos_inputs\")\n",
+    "            }\n",
+    "            labels = labels.to(device)\n",
+    "\n",
+    "            logits = model(model_inputs)\n",
+    "            # Predicted class = index of the largest score along dim 1.\n",
+    "            predicted = logits.argmax(dim=1)\n",
+    "            all_preds.extend(predicted.cpu().numpy())\n",
+    "            all_labels.extend(labels.cpu().numpy())\n",
+    "\n",
+    "    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)\n",
+    "\n",
+    "\n",
+    "# Evaluate on the device the model was moved to; evaluate_model defaults to \"cuda\" otherwise.\n",
+    "accuracy, report = evaluate_model(model, test_loader, criterion, device=device)\n",
+    "print(f\"Accuracy: {accuracy:.4f}\")\n",
+    "print(report)\n"
+   ],
+   "id": "cc313b4396f87690",
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "name": "python3",
+   "language": "python",
+   "display_name": "Python 3 (ipykernel)"
+  }
+ },
+ "nbformat": 5,
+ "nbformat_minor": 9
+}