CISProject
/

News-Headline-Classifier-Notebook

Safetensors

headlineclassifier

Model card Files and versions Community

TUEN-YUE committed on Dec 16, 2024

Commit

21d6445

verified ·

1 Parent(s): 8e86e93

Upload train+test.ipynb

Browse files

Files changed (1) hide show

train+test.ipynb +25 -143

train+test.ipynb CHANGED Viewed

@@ -13,8 +13,6 @@
   {
    "metadata": {},
    "cell_type": "code",
-   "outputs": [],
-   "execution_count": null,
    "source": [
     "!pip install geopy > delete.txt\n",
     "!pip install datasets > delete.txt\n",
@@ -25,7 +23,9 @@
     "!pip install gensim > delete.txt\n",
     "!rm delete.txt"
    ],
-   "id": "5a596f2639253772"
   },
   {
    "metadata": {},
@@ -37,17 +37,12 @@
    "id": "432a756039e6399"
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:48:43.216631Z",
-     "start_time": "2024-12-16T19:48:43.214630Z"
-    }
-   },
    "cell_type": "code",
    "source": "!huggingface-cli login",
    "id": "2e73da09a7c6171e",
    "outputs": [],
-   "execution_count": 44
   },
   {
    "metadata": {},
@@ -68,12 +63,7 @@
    "id": "b8920847b7cc378d"
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:48:45.272372Z",
-     "start_time": "2024-12-16T19:48:43.220140Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "from datasets import load_dataset\n",
@@ -84,15 +74,10 @@
    ],
    "id": "877c90c978d62b7d",
    "outputs": [],
-   "execution_count": 45
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:48:45.287939Z",
-     "start_time": "2024-12-16T19:48:45.278748Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "import numpy as np\n",
@@ -228,15 +213,10 @@
    ],
    "id": "dc2ba675ce880d6d",
    "outputs": [],
-   "execution_count": 46
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:49:01.529651Z",
-     "start_time": "2024-12-16T19:48:45.294290Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "from gensim.models import KeyedVectors\n",
@@ -260,47 +240,19 @@
     ")"
    ],
    "id": "158b99950fb22d1",
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "vectorizer fitted on training data.\n"
-     ]
-    }
-   ],
-   "execution_count": 47
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:49:01.538067Z",
-     "start_time": "2024-12-16T19:49:01.535063Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "print(dataset_train)\n",
     "print(dataset_test)"
    ],
    "id": "edd80d33175c96a0",
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset({\n",
-      "    features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],\n",
-      "    num_rows: 3044\n",
-      "})\n",
-      "Dataset({\n",
-      "    features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],\n",
-      "    num_rows: 761\n",
-      "})\n"
-     ]
-    }
-   ],
-   "execution_count": 48
   },
   {
    "metadata": {},
@@ -321,12 +273,7 @@
    "id": "f0eae08a025b6ed9"
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:49:01.554769Z",
-     "start_time": "2024-12-16T19:49:01.543575Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "# TODO: import all packages necessary for your custom model\n",
@@ -419,8 +366,6 @@
     "            input_ids=seq_inputs[:,0,:],\n",
     "            attention_mask=seq_inputs[:,1,:]\n",
     "        ).pooler_output # last_hidden_state[:, 0, :]\n",
-    "        lstm_out, (h_n, c_n) = self.lstm(seq_feature)\n",
-    "        seq_feature = h_n[-1]  # Use the last hidden state\n",
     "        freq_feature = self.freq(freq_inputs) # Shape: (batch_size, 128)\n",
     "\n",
     "        pos_feature = self.pos(pos_inputs) #Shape: (batch_size, 128)\n",
@@ -444,15 +389,10 @@
    ],
    "id": "21f079d0c52d7d",
    "outputs": [],
-   "execution_count": 49
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:49:01.791918Z",
-     "start_time": "2024-12-16T19:49:01.561338Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "from huggingface_hub import hf_hub_download\n",
@@ -465,27 +405,11 @@
     "REPO_NAME = \"CISProject/News-Headline-Classifier-Notebook\" # TODO: PROVIDE A STRING TO YOUR REPO ON HUGGINGFACE"
    ],
    "id": "b6ba3f96d3ce21",
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "C:\\Users\\swall\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
-      "  WeightNorm.apply(module, name, dim)\n",
-      "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    }
-   ],
-   "execution_count": 50
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:49:01.808079Z",
-     "start_time": "2024-12-16T19:49:01.798760Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "import torch\n",
@@ -623,15 +547,10 @@
    ],
    "id": "7be377251b81a25d",
    "outputs": [],
-   "execution_count": 51
   },
   {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-12-16T19:49:03.149673Z",
-     "start_time": "2024-12-16T19:49:01.812943Z"
-    }
-   },
    "cell_type": "code",
    "source": [
     "from torch.utils.data import DataLoader\n",
@@ -656,45 +575,8 @@
     "print(f\"Final model saved at {final_save_path}\")\n"
    ],
    "id": "dd1749c306f148eb",
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Epoch 1/10:   0%|          | 0/96 [00:00<?, ?it/s]"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "torch.Size([1, 768]) torch.Size([32, 128]) torch.Size([32, 128])\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "ename": "RuntimeError",
-     "evalue": "Sizes of tensors must match except in dimension 1. Expected size 1 but got size 32 for tensor number 1 in the list.",
-     "output_type": "error",
-     "traceback": [
-      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
-      "\u001B[1;31mRuntimeError\u001B[0m                              Traceback (most recent call last)",
-      "Cell \u001B[1;32mIn[52], line 16\u001B[0m\n\u001B[0;32m     13\u001B[0m trainer \u001B[38;5;241m=\u001B[39m Trainer(model, train_loader, test_loader, config)\n\u001B[0;32m     15\u001B[0m \u001B[38;5;66;03m# Train the model\u001B[39;00m\n\u001B[1;32m---> 16\u001B[0m trainer\u001B[38;5;241m.\u001B[39mtrain()\n\u001B[0;32m     17\u001B[0m \u001B[38;5;66;03m# Save the final model in Hugging Face format\u001B[39;00m\n\u001B[0;32m     18\u001B[0m final_save_path \u001B[38;5;241m=\u001B[39m os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39mjoin(config\u001B[38;5;241m.\u001B[39mbase_exp_dir, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcheckpoints\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
-      "Cell \u001B[1;32mIn[51], line 69\u001B[0m, in \u001B[0;36mTrainer.train\u001B[1;34m(self)\u001B[0m\n\u001B[0;32m     66\u001B[0m y_train \u001B[38;5;241m=\u001B[39m labels\u001B[38;5;241m.\u001B[39mto(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdevice)\n\u001B[0;32m     68\u001B[0m \u001B[38;5;66;03m# Forward pass\u001B[39;00m\n\u001B[1;32m---> 69\u001B[0m preds \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mmodel({\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mfreq_inputs\u001B[39m\u001B[38;5;124m\"\u001B[39m: freq_inputs, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mseq_inputs\u001B[39m\u001B[38;5;124m\"\u001B[39m: seq_inputs, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mpos_inputs\u001B[39m\u001B[38;5;124m\"\u001B[39m: pos_inputs})\n\u001B[0;32m     70\u001B[0m loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcriterion(preds, y_train)\n\u001B[0;32m     72\u001B[0m \u001B[38;5;66;03m# preds = (torch.sigmoid(preds) > 0.5).int()\u001B[39;00m\n\u001B[0;32m     73\u001B[0m \u001B[38;5;66;03m# Backward pass\u001B[39;00m\n",
-      "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1736\u001B[0m, in \u001B[0;36mModule._wrapped_call_impl\u001B[1;34m(self, *args, **kwargs)\u001B[0m\n\u001B[0;32m   1734\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_compiled_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)  \u001B[38;5;66;03m# type: ignore[misc]\u001B[39;00m\n\u001B[0;32m   1735\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m-> 1736\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_call_impl(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n",
-      "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1747\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[1;34m(self, *args, **kwargs)\u001B[0m\n\u001B[0;32m   1742\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[0;32m   1743\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[0;32m   1744\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks\n\u001B[0;32m   1745\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_backward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[0;32m   1746\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[1;32m-> 1747\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m forward_call(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[0;32m   1749\u001B[0m result \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m   1750\u001B[0m called_always_called_hooks \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mset\u001B[39m()\n",
-      "Cell \u001B[1;32mIn[49], line 99\u001B[0m, in \u001B[0;36mCustomModel.forward\u001B[1;34m(self, x)\u001B[0m\n\u001B[0;32m     97\u001B[0m pos_feature \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mpos(pos_inputs) \u001B[38;5;66;03m#Shape: (batch_size, 128)\u001B[39;00m\n\u001B[0;32m     98\u001B[0m \u001B[38;5;28mprint\u001B[39m(seq_feature\u001B[38;5;241m.\u001B[39mshape,pos_feature\u001B[38;5;241m.\u001B[39mshape,freq_feature\u001B[38;5;241m.\u001B[39mshape)\n\u001B[1;32m---> 99\u001B[0m inputs \u001B[38;5;241m=\u001B[39m torch\u001B[38;5;241m.\u001B[39mcat((seq_feature, freq_feature, pos_feature), dim\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m1\u001B[39m)  \u001B[38;5;66;03m# Shape: (batch_size, 384)\u001B[39;00m\n\u001B[0;32m    100\u001B[0m \u001B[38;5;66;03m# inputs = torch.cat((seq_feature, freq_feature), dim=1)  # Shape: (batch_size,256)\u001B[39;00m\n\u001B[0;32m    101\u001B[0m \u001B[38;5;66;03m# inputs = seq_feature\u001B[39;00m\n\u001B[0;32m    103\u001B[0m x \u001B[38;5;241m=\u001B[39m inputs\n",
-      "\u001B[1;31mRuntimeError\u001B[0m: Sizes of tensors must match except in dimension 1. Expected size 1 but got size 32 for tensor number 1 in the list."
-     ]
-    }
-   ],
-   "execution_count": 52
   },
   {
    "metadata": {},

   {
    "metadata": {},
    "cell_type": "code",
    "source": [
     "!pip install geopy > delete.txt\n",
     "!pip install datasets > delete.txt\n",
     "!pip install gensim > delete.txt\n",
     "!rm delete.txt"
    ],
+   "id": "5a596f2639253772",
+   "outputs": [],
+   "execution_count": null
   },
   {
    "metadata": {},
    "id": "432a756039e6399"
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": "!huggingface-cli login",
    "id": "2e73da09a7c6171e",
    "outputs": [],
+   "execution_count": null
   },
   {
    "metadata": {},
    "id": "b8920847b7cc378d"
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "from datasets import load_dataset\n",
    ],
    "id": "877c90c978d62b7d",
    "outputs": [],
+   "execution_count": null
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "import numpy as np\n",
    ],
    "id": "dc2ba675ce880d6d",
    "outputs": [],
+   "execution_count": null
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "from gensim.models import KeyedVectors\n",
     ")"
    ],
    "id": "158b99950fb22d1",
+   "outputs": [],
+   "execution_count": null
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "print(dataset_train)\n",
     "print(dataset_test)"
    ],
    "id": "edd80d33175c96a0",
+   "outputs": [],
+   "execution_count": null
   },
   {
    "metadata": {},
    "id": "f0eae08a025b6ed9"
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "# TODO: import all packages necessary for your custom model\n",
     "            input_ids=seq_inputs[:,0,:],\n",
     "            attention_mask=seq_inputs[:,1,:]\n",
     "        ).pooler_output # last_hidden_state[:, 0, :]\n",
     "        freq_feature = self.freq(freq_inputs) # Shape: (batch_size, 128)\n",
     "\n",
     "        pos_feature = self.pos(pos_inputs) #Shape: (batch_size, 128)\n",
    ],
    "id": "21f079d0c52d7d",
    "outputs": [],
+   "execution_count": null
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "from huggingface_hub import hf_hub_download\n",
     "REPO_NAME = \"CISProject/News-Headline-Classifier-Notebook\" # TODO: PROVIDE A STRING TO YOUR REPO ON HUGGINGFACE"
    ],
    "id": "b6ba3f96d3ce21",
+   "outputs": [],
+   "execution_count": null
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "import torch\n",
    ],
    "id": "7be377251b81a25d",
    "outputs": [],
+   "execution_count": null
   },
   {
+   "metadata": {},
    "cell_type": "code",
    "source": [
     "from torch.utils.data import DataLoader\n",
     "print(f\"Final model saved at {final_save_path}\")\n"
    ],
    "id": "dd1749c306f148eb",
+   "outputs": [],
+   "execution_count": null
   },
   {
    "metadata": {},