{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NewsClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "mtVYEQSYsswc",
    "outputId": "6f16c0c1-ef25-406c-dd14-edd1a72dc760",
    "trusted": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     C:\\Users\\manis\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Imports\n",
    "import os\n",
    "import gc\n",
    "import time\n",
    "from pathlib import Path\n",
    "import json\n",
    "from typing import Tuple, Dict\n",
    "from warnings import filterwarnings\n",
    "\n",
    "filterwarnings(\"ignore\")\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import ipywidgets as widgets\n",
    "from wordcloud import WordCloud, STOPWORDS\n",
    "\n",
    "from tqdm.auto import tqdm\n",
    "from dataclasses import dataclass\n",
    "\n",
    "import re\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from torch.utils.data import DataLoader, Dataset\n",
    "\n",
    "from transformers import RobertaTokenizer, RobertaModel\n",
    "\n",
    "import wandb\n",
    "\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "nltk.download(\"stopwords\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "trusted": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmanishdrw1\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wandb.login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "fGW_WYn31JHT",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "@dataclass\n",
    "class Cfg:\n",
    "    \"\"\"Notebook-wide settings: data location, tokenizer options and training hyperparameters.\n",
    "\n",
    "    None of the attributes is annotated, so ``@dataclass`` generates no fields here;\n",
    "    each value is a plain class attribute and is read as ``Cfg.<name>``.\n",
    "    \"\"\"\n",
    "\n",
    "    # Data\n",
    "    STOPWORDS = stopwords.words(\"english\")  # NLTK English stop words, removed by clean_text\n",
    "    dataset_loc = \"../dataset/raw/news_dataset.csv\"  # CSV read with pd.read_csv\n",
    "    test_size = 0.2  # fraction passed to train_test_split\n",
    "\n",
    "    # Tokenizer options forwarded to tokenizer.encode_plus in prepare_input\n",
    "    add_special_tokens = True\n",
    "    max_len = 50\n",
    "    pad_to_max_length = True\n",
    "    truncation = True\n",
    "\n",
    "    # NOTE(review): not read anywhere in the visible notebook -- confirm it is still needed\n",
    "    change_config = False\n",
    "\n",
    "    # Training hyperparameters (defaults used by train_loop when no sweep is running)\n",
    "    dropout_pb = 0.5  # dropout probability handed to CustomModel\n",
    "    lr = 1e-4  # Adam learning rate\n",
    "    lr_redfactor = 0.7  # ReduceLROnPlateau factor\n",
    "    lr_redpatience = 4  # ReduceLROnPlateau patience\n",
    "    epochs = 10\n",
    "    batch_size = 128\n",
    "\n",
    "    # True: run a W&B sweep through wandb.agent; False: call train_loop() once\n",
    "    wandb_sweep = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "id": "7V5OJWw4sswg",
    "outputId": "8eb13263-d31a-4d49-f1f6-3c2dc0595c78",
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Matthew McConaughey Gives Joy Behar A Foot Massage On ‘The View’\n",
      "Entertainment\n"
     ]
    }
   ],
   "source": [
    "# Load the raw dataset and print the title and category stored under index label 10040\n",
    "df = pd.read_csv(Cfg.dataset_loc)\n",
    "print(df[\"Title\"][10040])\n",
    "print(df[\"Category\"][10040])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "w05pkO5RN1H2"
   },
   "source": [
    "## Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "id": "l8Z3Hhk3sswg",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:\n",
    "    \"\"\"Separate headlines instance and feature selection.\n",
    "\n",
    "    Args:\n",
    "        df: original dataframe.\n",
    "\n",
    "    Returns:\n",
    "        df: new dataframe with appropriate features.\n",
    "        headlines_df: dataframe cintaining \"headlines\" category instances.\n",
    "    \"\"\"\n",
    "    df = df[[\"Title\", \"Category\"]]\n",
    "    df.rename(columns={\"Title\": \"Text\"}, inplace=True)\n",
    "    df, headlines_df = df[df[\"Category\"] != \"Headlines\"].reset_index(drop=True), df[df[\"Category\"] == \"Headlines\"].reset_index(drop=True)\n",
    "\n",
    "    return df, headlines_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "id": "d4t7JjIEsswg",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def clean_text(text: str) -> str:\n",
    "    \"\"\"Clean text (lower-casing, stop-word removal, punctuation removal, whitespace trimming).\"\"\"\n",
    "    # lower case the text\n",
    "    text = text.lower()  # necessary to do before as stopwords are in lower case\n",
    "\n",
    "    # remove stopwords\n",
    "    stp_pattern = re.compile(r\"\\b(\" + r\"|\".join(Cfg.STOPWORDS) + r\")\\b\\s*\")\n",
    "    text = stp_pattern.sub(\"\", text)\n",
    "\n",
    "    # custom cleaning\n",
    "    # Every run of non-alphanumeric characters (spaces included) becomes one space, so no\n",
    "    # separate space-collapsing pass is needed afterwards.\n",
    "    text = re.sub(\"[^A-Za-z0-9]+\", \" \", text)\n",
    "    # Strip last: the substitution above can leave a space at either end (\"deal!\" -> \"deal \").\n",
    "    text = text.strip()\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "id": "NokmvVFusswh",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict, Dict]:\n",
    "    \"\"\"Preprocess the data.\n",
    "\n",
    "    Args:\n",
    "        df: Dataframe on which the preprocessing steps need to be performed.\n",
    "\n",
    "    Returns:\n",
    "        df: Preprocessed data with a cleaned \"Text\" column and an integer \"Category\" column.\n",
    "        class_to_index: class labels to indices mapping\n",
    "        index_to_class: indices to class labels mapping\n",
    "    \"\"\"\n",
    "    df, _ = prepare_data(df)  # the \"Headlines\" rows are not part of the training data\n",
    "\n",
    "    # Indices follow the order in which the categories first appear in the data\n",
    "    cats = df[\"Category\"].unique().tolist()\n",
    "    class_to_index = {tag: i for i, tag in enumerate(cats)}\n",
    "    index_to_class = {i: tag for tag, i in class_to_index.items()}\n",
    "\n",
    "    # Build the result as a new frame rather than assigning columns in place\n",
    "    df = df.assign(\n",
    "        Text=df[\"Text\"].apply(clean_text),  # clean text\n",
    "        Category=df[\"Category\"].map(class_to_index),  # label encoding\n",
    "    )[[\"Text\", \"Category\"]]\n",
    "    return df, class_to_index, index_to_class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "id": "f45cNikCsswh",
    "outputId": "880e338e-11a3-4048-ccf7-d30bf13e996b",
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "      <th>Category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>chainlink link falters hedera hbar wobbles yet...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>funds punished owning nvidia shares stunning 2...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>crude oil prices stalled hedge funds sold kemp</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>grayscale bitcoin win still half battle</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>home shopping editor miss labor day deals eyeing</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44142</th>\n",
       "      <td>slovakia election could echo ukraine expect</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44143</th>\n",
       "      <td>things know nobel prizes washington post</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44144</th>\n",
       "      <td>brief calm protests killing 2 students rock im...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44145</th>\n",
       "      <td>one safe france vows action bedbugs sweep paris</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44146</th>\n",
       "      <td>slovakia election polls open knife edge vote u...</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>44147 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    Text  Category\n",
       "0      chainlink link falters hedera hbar wobbles yet...         0\n",
       "1      funds punished owning nvidia shares stunning 2...         0\n",
       "2         crude oil prices stalled hedge funds sold kemp         0\n",
       "3                grayscale bitcoin win still half battle         0\n",
       "4       home shopping editor miss labor day deals eyeing         0\n",
       "...                                                  ...       ...\n",
       "44142       slovakia election could echo ukraine expect          6\n",
       "44143           things know nobel prizes washington post         6\n",
       "44144  brief calm protests killing 2 students rock im...         6\n",
       "44145    one safe france vows action bedbugs sweep paris         6\n",
       "44146  slovakia election polls open knife edge vote u...         6\n",
       "\n",
       "[44147 rows x 2 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Cleaned, label-encoded dataset plus the label <-> index lookups returned by preprocess\n",
    "ds, class_to_index, index_to_class = preprocess(df)\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index_to_class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "id": "zGlMz2UJsswi",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Data splits\n",
    "# Fixed seed so that Restart & Run All reproduces the same stratified split\n",
    "train_ds, val_ds = train_test_split(ds, test_size=Cfg.test_size, stratify=ds[\"Category\"], random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "id": "zTeAsruMsswi",
    "outputId": "bffed91d-04c6-490e-d682-03537d3182dd",
    "trusted": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': tensor([    0,   462, 25744,  7188,   155,    23,   462, 11485,   112,     2,\n",
       "            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,\n",
       "            1,     1,     1,     1,     1,     1,     1,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0])}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:\n",
    "    \"\"\"Tokenize and prepare the input text using the provided tokenizer.\n",
    "\n",
    "    Args:\n",
    "        tokenizer (RobertaTokenizer): The Roberta tokenizer to encode the input.\n",
    "        text (str): The input text to be tokenized.\n",
    "\n",
    "    Returns:\n",
    "        inputs (dict): The tokenizer output (keys such as 'input_ids' and 'attention_mask'),\n",
    "            with every value converted to a ``torch.long`` tensor.\n",
    "    \"\"\"\n",
    "    inputs = tokenizer.encode_plus(\n",
    "        text,\n",
    "        return_tensors=None,\n",
    "        add_special_tokens=Cfg.add_special_tokens,\n",
    "        max_length=Cfg.max_len,\n",
    "        # `pad_to_max_length` is deprecated in transformers; `padding=\"max_length\"` is the\n",
    "        # supported way to pad every sequence up to `max_length`.\n",
    "        padding=\"max_length\" if Cfg.pad_to_max_length else False,\n",
    "        truncation=Cfg.truncation,\n",
    "    )\n",
    "    # The tokenizer returned plain Python lists (return_tensors=None); convert them in place\n",
    "    for k, v in inputs.items():\n",
    "        inputs[k] = torch.tensor(v, dtype=torch.long)\n",
    "    return inputs\n",
    "\n",
    "\n",
    "class NewsDataset(Dataset):\n",
    "    def __init__(self, ds):\n",
    "        self.texts = ds[\"Text\"].values\n",
    "        self.labels = ds[\"Category\"].values\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.texts)\n",
    "\n",
    "    def __getitem__(self, item):\n",
    "        inputs = prepare_input(tokenizer, self.texts[item])\n",
    "        labels = torch.tensor(self.labels[item], dtype=torch.float)\n",
    "        return inputs, labels\n",
    "\n",
    "\n",
    "def collate(inputs: Dict) -> Dict:\n",
    "    \"\"\"Collate and modify the input dictionary to have the same sequence length for a particular input batch.\n",
    "\n",
    "    Args:\n",
    "        inputs (dict): A dictionary containing input tensors with varying sequence lengths.\n",
    "\n",
    "    Returns:\n",
    "        modified_inputs (dict): A modified dictionary with input tensors trimmed to have the same sequence length.\n",
    "    \"\"\"\n",
    "    max_len = int(inputs[\"input_ids\"].sum(axis=1).max())\n",
    "    for k, v in inputs.items():\n",
    "        inputs[k] = inputs[k][:, :max_len]\n",
    "    return inputs\n",
    "\n",
    "\n",
    "# Pretrained tokenizer; NewsDataset.__getitem__ reads this module-level name\n",
    "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n",
    "\n",
    "# Sanity check: tokenize one training headline and display the result\n",
    "sample_input = prepare_input(tokenizer, train_ds[\"Text\"].values[10])\n",
    "sample_input"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "-qp-4d-aN503"
   },
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "id": "XIJ6ARJfsswj",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "class CustomModel(nn.Module):\n",
    "    \"\"\"Pretrained ``roberta-base`` encoder followed by dropout and a linear classification layer.\"\"\"\n",
    "\n",
    "    def __init__(self, num_classes, change_config=False, dropout_pb=0.0):\n",
    "        \"\"\"Build the encoder and the classification head.\n",
    "\n",
    "        Args:\n",
    "            num_classes: size of the output layer.\n",
    "            change_config: accepted but currently has no effect.\n",
    "            dropout_pb: dropout probability applied before the output layer.\n",
    "        \"\"\"\n",
    "        super(CustomModel, self).__init__()\n",
    "        if change_config:\n",
    "            pass\n",
    "        self.model = RobertaModel.from_pretrained(\"roberta-base\")\n",
    "        self.hidden_size = self.model.config.hidden_size\n",
    "        self.num_classes = num_classes\n",
    "        self.dropout_pb = dropout_pb\n",
    "        self.dropout = torch.nn.Dropout(self.dropout_pb)\n",
    "        self.fc = nn.Linear(self.hidden_size, self.num_classes)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        \"\"\"Return the raw class logits for a batch of tokenized inputs.\"\"\"\n",
    "        output = self.model(**inputs)\n",
    "        z = self.dropout(output[1])  # second element of the encoder output (pooled output)\n",
    "        z = self.fc(z)\n",
    "        return z\n",
    "\n",
    "    @torch.inference_mode()\n",
    "    def predict(self, inputs):\n",
    "        \"\"\"Return the arg-max class index per row as a numpy array (switches to eval mode).\"\"\"\n",
    "        self.eval()\n",
    "        z = self(inputs)\n",
    "        y_pred = torch.argmax(z, dim=1).cpu().numpy()\n",
    "        return y_pred\n",
    "\n",
    "    @torch.inference_mode()\n",
    "    def predict_proba(self, inputs):\n",
    "        \"\"\"Return the softmax probabilities per row as a numpy array (switches to eval mode).\"\"\"\n",
    "        self.eval()\n",
    "        z = self(inputs)\n",
    "        y_probs = F.softmax(z, dim=1).cpu().numpy()\n",
    "        return y_probs\n",
    "\n",
    "    def save(self, dp):\n",
    "        \"\"\"Write ``args.json`` (constructor metadata) and ``model.pt`` (state dict) into ``dp``.\"\"\"\n",
    "        with open(Path(dp, \"args.json\"), \"w\") as fp:\n",
    "            contents = {\n",
    "                \"dropout_pb\": self.dropout_pb,\n",
    "                \"hidden_size\": self.hidden_size,\n",
    "                \"num_classes\": self.num_classes,\n",
    "            }\n",
    "            json.dump(contents, fp, indent=4, sort_keys=False)\n",
    "        torch.save(self.state_dict(), os.path.join(dp, \"model.pt\"))\n",
    "\n",
    "    @classmethod\n",
    "    def load(cls, args_fp, state_dict_fp):\n",
    "        \"\"\"Rebuild a model from the files written by ``save`` (weights are mapped to CPU).\"\"\"\n",
    "        with open(args_fp, \"r\") as fp:\n",
    "            kwargs = json.load(fp=fp)\n",
    "        # `save` also records `hidden_size`, which is derived from the encoder and is not a\n",
    "        # constructor argument; `__init__` creates the encoder itself, so there is no `llm`\n",
    "        # argument either. Pass only what `__init__` accepts.\n",
    "        model = cls(num_classes=kwargs[\"num_classes\"], dropout_pb=kwargs[\"dropout_pb\"])\n",
    "        model.load_state_dict(torch.load(state_dict_fp, map_location=torch.device(\"cpu\")))\n",
    "        return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "YZEM0lIlsswj",
    "outputId": "c05d70cf-e75d-4514-b730-3070484ceee3",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Initialize model check\n",
    "num_classes = len(ds[\"Category\"].unique())  # number of distinct encoded labels\n",
    "model = CustomModel(num_classes=num_classes, dropout_pb=Cfg.dropout_pb)\n",
    "# `named_parameters` is not called here, so this prints the bound method's repr\n",
    "print(model.named_parameters)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ztUd4m9CN8qM"
   },
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "a3VPiwjqsswk",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def train_step(train_loader: DataLoader, model, num_classes: int, loss_fn, optimizer, epoch: int) -> float:\n",
    "    \"\"\"Run one training epoch and return the running mean of the batch losses.\"\"\"\n",
    "    model.train()\n",
    "    running_loss = 0.0\n",
    "    progress = tqdm(enumerate(train_loader), total=len(train_loader), desc=f\"Training - Epoch {epoch+1}\")\n",
    "    for step, (inputs, labels) in progress:\n",
    "        inputs = collate(inputs)\n",
    "        for key, tensor in inputs.items():\n",
    "            inputs[key] = tensor.to(device)\n",
    "        labels = labels.to(device)\n",
    "\n",
    "        optimizer.zero_grad()  # clear gradients left over from the previous batch\n",
    "        logits = model(inputs)\n",
    "        # the loss function is fed one-hot float targets\n",
    "        targets = F.one_hot(labels.long(), num_classes=num_classes).float()\n",
    "        batch_loss = loss_fn(logits, targets)\n",
    "        batch_loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        # incremental mean over the batches seen so far\n",
    "        running_loss += (batch_loss.detach().item() - running_loss) / (step + 1)\n",
    "    return running_loss\n",
    "\n",
    "\n",
    "def eval_step(val_loader: DataLoader, model, num_classes: int, loss_fn, epoch: int) -> Tuple[float, np.ndarray, np.ndarray]:\n",
    "    \"\"\"Evaluate the model on ``val_loader`` under ``torch.inference_mode``.\n",
    "\n",
    "    Returns:\n",
    "        The running mean of the batch losses, the stacked one-hot targets, and the stacked\n",
    "        arg-max class indices. Note that the two arrays use different encodings.\n",
    "    \"\"\"\n",
    "    model.eval()\n",
    "    running_loss = 0.0\n",
    "    true_rows, pred_rows = [], []\n",
    "    n_batches = len(val_loader)\n",
    "    with torch.inference_mode():\n",
    "        progress = tqdm(enumerate(val_loader), total=n_batches, desc=f\"Validation - Epoch {epoch+1}\")\n",
    "        for step, (inputs, labels) in progress:\n",
    "            inputs = collate(inputs)\n",
    "            for key, tensor in inputs.items():\n",
    "                inputs[key] = tensor.to(device)\n",
    "            labels = labels.to(device)\n",
    "\n",
    "            logits = model(inputs)\n",
    "            # the loss function is fed one-hot float targets\n",
    "            targets = F.one_hot(labels.long(), num_classes=num_classes).float()\n",
    "            batch_loss = loss_fn(logits, targets).item()\n",
    "            running_loss += (batch_loss - running_loss) / (step + 1)  # incremental mean\n",
    "\n",
    "            true_rows.extend(targets.cpu().numpy())\n",
    "            pred_rows.extend(torch.argmax(logits, dim=1).cpu().numpy())\n",
    "    return running_loss, np.vstack(true_rows), np.vstack(pred_rows)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sweep config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "sweep_config = {\"method\": \"random\"}\n",
    "\n",
    "metric = {\"name\": \"val_loss\", \"goal\": \"minimize\"}\n",
    "\n",
    "sweep_config[\"metric\"] = metric\n",
    "\n",
    "parameters_dict = {\n",
    "    \"dropout_pb\": {\n",
    "        \"values\": [0.3, 0.4, 0.5],\n",
    "    },\n",
    "    \"learning_rate\": {\n",
    "        \"values\": [0.0001, 0.001, 0.01],\n",
    "    },\n",
    "    \"batch_size\": {\n",
    "        \"values\": [32, 64, 128],\n",
    "    },\n",
    "    \"lr_reduce_factor\": {\n",
    "        \"values\": [0.5, 0.6, 0.7, 0.8],\n",
    "    },\n",
    "    \"lr_reduce_patience\": {\n",
    "        \"values\": [2, 3, 4, 5],\n",
    "    },\n",
    "    \"epochs\": {\"value\": 1},\n",
    "}\n",
    "\n",
    "sweep_config[\"parameters\"] = parameters_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# create sweep\n",
    "# `sweep_id` is only defined when Cfg.wandb_sweep is True; the Train/Tune cell passes it to wandb.agent\n",
    "if Cfg.wandb_sweep:\n",
    "    sweep_id = wandb.sweep(sweep_config, project=\"NewsClassifier\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "oG-4tz-Lsswk",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def train_loop(config=None):\n",
    "    \"\"\"Train the classifier once, log the run to Weights & Biases and save the best weights.\n",
    "\n",
    "    Args:\n",
    "        config: run configuration. When ``Cfg.wandb_sweep`` is False it is replaced by the\n",
    "            defaults from ``Cfg``; otherwise the hyperparameters are read from ``wandb.config``.\n",
    "    \"\"\"\n",
    "    # Derive the label count from the encoded data instead of hard-coding it\n",
    "    num_classes = int(ds[\"Category\"].nunique())\n",
    "\n",
    "    if not Cfg.wandb_sweep:\n",
    "        config = dict(\n",
    "            batch_size=Cfg.batch_size,\n",
    "            num_classes=num_classes,\n",
    "            epochs=Cfg.epochs,\n",
    "            dropout_pb=Cfg.dropout_pb,\n",
    "            learning_rate=Cfg.lr,\n",
    "            lr_reduce_factor=Cfg.lr_redfactor,\n",
    "            lr_reduce_patience=Cfg.lr_redpatience,\n",
    "        )\n",
    "\n",
    "    with wandb.init(project=\"NewsClassifier\", config=config):\n",
    "        config = wandb.config\n",
    "\n",
    "        # ====================================================\n",
    "        # loader\n",
    "        # ====================================================\n",
    "        # Use the notebook-level `train_ds` / `val_ds`. Re-splitting `ds` here without a seed\n",
    "        # gave a different partition from the global `val_ds` that the inference cells evaluate\n",
    "        # on, so that \"held-out\" data overlapped with the rows trained on in this function.\n",
    "        train_dataset = NewsDataset(train_ds)\n",
    "        valid_dataset = NewsDataset(val_ds)\n",
    "\n",
    "        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)\n",
    "        valid_loader = DataLoader(valid_dataset, batch_size=config.batch_size, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)\n",
    "\n",
    "        # ====================================================\n",
    "        # model\n",
    "        # ====================================================\n",
    "        device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "        model = CustomModel(num_classes=num_classes, dropout_pb=config.dropout_pb)\n",
    "        model.to(device)\n",
    "\n",
    "        # ====================================================\n",
    "        # Training components\n",
    "        # ====================================================\n",
    "        criterion = nn.BCEWithLogitsLoss()\n",
    "        optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)\n",
    "        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(\n",
    "            optimizer, mode=\"min\", factor=config.lr_reduce_factor, patience=config.lr_reduce_patience\n",
    "        )\n",
    "\n",
    "        # ====================================================\n",
    "        # loop\n",
    "        # ====================================================\n",
    "        wandb.watch(model, criterion, log=\"all\", log_freq=10)\n",
    "\n",
    "        min_loss = np.inf\n",
    "\n",
    "        for epoch in range(config.epochs):\n",
    "            start_time = time.time()\n",
    "\n",
    "            # Step\n",
    "            train_loss = train_step(train_loader, model, num_classes, criterion, optimizer, epoch)\n",
    "            val_loss, _, _ = eval_step(valid_loader, model, num_classes, criterion, epoch)\n",
    "            scheduler.step(val_loss)  # lower the learning rate when the validation loss plateaus\n",
    "\n",
    "            # scoring\n",
    "            elapsed = time.time() - start_time\n",
    "            wandb.log({\"epoch\": epoch + 1, \"train_loss\": train_loss, \"val_loss\": val_loss})\n",
    "            print(f\"Epoch {epoch+1} - avg_train_loss: {train_loss:.4f}  avg_val_loss: {val_loss:.4f}  time: {elapsed:.0f}s\")\n",
    "\n",
    "            # Keep only the weights with the lowest validation loss seen so far\n",
    "            if min_loss > val_loss:\n",
    "                min_loss = val_loss\n",
    "                print(\"Best Score : saving model.\")\n",
    "                os.makedirs(\"../artifacts\", exist_ok=True)\n",
    "                model.save(\"../artifacts\")\n",
    "            print(f\"\\nSaved Best Model Score: {min_loss:.4f}\\n\\n\")\n",
    "\n",
    "        wandb.save(\"../artifacts/model.pt\")\n",
    "        torch.cuda.empty_cache()\n",
    "        gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "tIBl_kvssswk",
    "outputId": "4bff057f-a3a7-45ca-f3c2-5b5fbd15bab5",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Train/Tune\n",
    "# Either one run with the Cfg defaults, or 10 sweep runs driven by the W&B agent\n",
    "if not Cfg.wandb_sweep:\n",
    "    train_loop()\n",
    "else:\n",
    "    wandb.agent(sweep_id, train_loop, count=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qxXv-FaNNtKJ"
   },
   "source": [
    "## Inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "id": "SHCGJBhABesw",
    "outputId": "a62f9ff6-d47d-46d0-f971-cfeb76adc6d5",
    "trusted": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "CustomModel(\n",
       "  (model): RobertaModel(\n",
       "    (embeddings): RobertaEmbeddings(\n",
       "      (word_embeddings): Embedding(50265, 768, padding_idx=1)\n",
       "      (position_embeddings): Embedding(514, 768, padding_idx=1)\n",
       "      (token_type_embeddings): Embedding(1, 768)\n",
       "      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "      (dropout): Dropout(p=0.1, inplace=False)\n",
       "    )\n",
       "    (encoder): RobertaEncoder(\n",
       "      (layer): ModuleList(\n",
       "        (0-11): 12 x RobertaLayer(\n",
       "          (attention): RobertaAttention(\n",
       "            (self): RobertaSelfAttention(\n",
       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "            (output): RobertaSelfOutput(\n",
       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "              (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "              (dropout): Dropout(p=0.1, inplace=False)\n",
       "            )\n",
       "          )\n",
       "          (intermediate): RobertaIntermediate(\n",
       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (intermediate_act_fn): GELUActivation()\n",
       "          )\n",
       "          (output): RobertaOutput(\n",
       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
       "            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "          )\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "    (pooler): RobertaPooler(\n",
       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
       "      (activation): Tanh()\n",
       "    )\n",
       "  )\n",
       "  (dropout): Dropout(p=0.0, inplace=False)\n",
       "  (fc): Linear(in_features=768, out_features=7, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebuild the architecture with the label count of the encoded dataset (instead of a\n",
    "# hard-coded 7), then load the weights saved during training\n",
    "model = CustomModel(num_classes=int(ds[\"Category\"].nunique()))\n",
    "model.load_state_dict(torch.load(\"../artifacts/model.pt\", map_location=torch.device(\"cpu\")))\n",
    "model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BjupBkbOCI22",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def test_step(test_loader: DataLoader, model, num_classes: int) -> Tuple[np.ndarray, np.ndarray]:\n",
    "    \"\"\"Collect labels and arg-max predictions over ``test_loader``.\n",
    "\n",
    "    ``num_classes`` is accepted but not used in the body.\n",
    "\n",
    "    Returns:\n",
    "        The stacked labels as yielded by the loader and the stacked predicted class indices.\n",
    "    \"\"\"\n",
    "    model.eval()\n",
    "    true_rows, pred_rows = [], []\n",
    "    with torch.inference_mode():\n",
    "        for _, (inputs, labels) in tqdm(enumerate(test_loader)):\n",
    "            inputs = collate(inputs)\n",
    "            for key, tensor in inputs.items():\n",
    "                inputs[key] = tensor.to(device)\n",
    "            labels = labels.to(device)\n",
    "\n",
    "            logits = model(inputs)\n",
    "            true_rows.extend(labels.cpu().numpy())\n",
    "            pred_rows.extend(torch.argmax(logits, dim=1).cpu().numpy())\n",
    "    return np.vstack(true_rows), np.vstack(pred_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "QimlSstFDsbJ",
    "outputId": "8c903f7f-eddd-417c-c85e-4d57a4206501",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Evaluate on the notebook-level validation split\n",
    "test_dataset = NewsDataset(val_ds)\n",
    "test_loader = DataLoader(test_dataset, batch_size=Cfg.batch_size, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)\n",
    "\n",
    "# The third argument (num_classes) is accepted by test_step but not used in its body\n",
    "y_true, y_pred = test_step(test_loader, model, 7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "CLz_GuoeEEgz",
    "outputId": "8870b27c-46a6-4695-e526-e5c1e778f96a",
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Weighted-average precision / recall / F1 and accuracy of the predictions from test_step\n",
    "print(\n",
    "    f'Precision: {precision_score(y_true, y_pred, average=\"weighted\")} \\n Recall: {recall_score(y_true, y_pred, average=\"weighted\")} \\n F1: {f1_score(y_true, y_pred, average=\"weighted\")} \\n Accuracy: {accuracy_score(y_true, y_pred)}'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "j_D8B0aNOBiI"
   },
   "source": [
    "## Prediction on single sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "id": "-wU3xnKkH0Tt",
    "outputId": "171245e5-4844-4e71-82b7-a0f3e97879e7",
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ground Truth: 5, Sports\n",
      "Predicted: 5, Sports\n",
      "Predicted Probabilities: [9.8119999e-05 1.0613000e-04 7.7200002e-06 3.2520002e-05 8.3100003e-06\n",
      " 9.9973667e-01 1.0560000e-05]\n"
     ]
    }
   ],
   "source": [
    "sample = 2\n",
    "sample_input = prepare_input(tokenizer, val_ds[\"Text\"].values[sample])\n",
    "\n",
    "# `index_to_class` is the mapping returned by `preprocess`. It is deliberately not rebuilt from\n",
    "# the raw `df` here: that frame still holds the \"Headlines\" category that `prepare_data` drops\n",
    "# before label encoding, so a mapping built from it can disagree with the trained labels.\n",
    "\n",
    "label = val_ds[\"Category\"].values[sample]\n",
    "# Add the batch dimension expected by the model\n",
    "input_ids = torch.unsqueeze(sample_input[\"input_ids\"], 0).to(device)\n",
    "attention_masks = torch.unsqueeze(sample_input[\"attention_mask\"], 0).to(device)\n",
    "test_sample = dict(input_ids=input_ids, attention_mask=attention_masks)\n",
    "\n",
    "with torch.no_grad():\n",
    "    y_pred_test_sample = model.predict_proba(test_sample)\n",
    "    print(f\"Ground Truth: {label}, {index_to_class[int(label)]}\")\n",
    "    print(f\"Predicted: {np.argmax(y_pred_test_sample)}, {index_to_class[int(np.argmax(y_pred_test_sample))]}\")\n",
    "    print(f\"Predicted Probabilities: {np.round(y_pred_test_sample, 8)[0]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}