Upload eval_py.ipynb

eval_py.ipynb  CHANGED  (+60 -20)
@@ -102,7 +102,11 @@
     ]
    },
    "id": "a4aa3b759defc904",
-   "outputId": "b1868c23-e675-41db-aa26-5eed9de60d9f"
+   "outputId": "b1868c23-e675-41db-aa26-5eed9de60d9f",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:26:09.513376Z",
+    "start_time": "2024-12-16T08:26:05.978557Z"
+   }
   },
   "cell_type": "code",
   "source": [
@@ -115,7 +119,7 @@
   ],
   "id": "a4aa3b759defc904",
   "outputs": [],
-  "execution_count":
+  "execution_count": 1
  },
  {
   "metadata": {
@@ -137,7 +141,11 @@
     ]
    },
    "id": "ce6e6b982e22e9fe",
-   "outputId": "f38ef6b3-35ac-41dc-a8ae-f0dd28b1f84d"
+   "outputId": "f38ef6b3-35ac-41dc-a8ae-f0dd28b1f84d",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:26:54.306779Z",
+    "start_time": "2024-12-16T08:26:54.298397Z"
+   }
   },
   "cell_type": "code",
   "source": [
@@ -148,6 +156,7 @@
   "from transformers import RobertaTokenizer\n",
   "from sklearn.feature_extraction.text import CountVectorizer\n",
   "from gensim.models import KeyedVectors\n",
+  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
   "\n",
   "def preprocess_data(data,\n",
   "                    mode=\"train\",\n",
@@ -178,9 +187,10 @@
   "    if mode == \"train\" and vectorizer is None:\n",
   "        # Collect all cleaned titles to fit\n",
   "        all_titles = data[\"clean_title\"]\n",
-  "        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n",
+  "        #vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n",
+  "        vectorizer = TfidfVectorizer(max_features=max_features)\n",
   "        vectorizer.fit(all_titles)\n",
-  "        print(\"
+  "        print(\"vectorizer fitted on training data.\")\n",
   "\n",
   "    # 3. Transform titles with vectorizer once\n",
   "    def vectorize_batch(examples):\n",
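Note: this hunk swaps the CountVectorizer (unigram + bigram counts) for a TfidfVectorizer and, in passing, drops the ngram_range=(1,2) setting, so titles are now featurized as unigram TF-IDF weights. A minimal sketch of the difference on a toy corpus; the 8000 below only stands in for the notebook's max_features argument, which this hunk does not show:

    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    titles = ["fed signals rate cut", "stocks rally on fed news", "stocks slide again"]

    # Old featurizer: raw unigram + bigram counts, capped at max_features.
    counts = CountVectorizer(max_features=8000, ngram_range=(1, 2)).fit_transform(titles)

    # New featurizer: unigram TF-IDF weights under the same cap; common but
    # uninformative words are down-weighted rather than counted verbatim.
    tfidf = TfidfVectorizer(max_features=8000).fit_transform(titles)

    print(counts.shape, tfidf.shape)  # both (n_titles, vocab_size) sparse matrices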
@@ -193,7 +203,7 @@
   "    # 4. Tokenize with BERT once\n",
   "    def tokenize_batch(examples):\n",
   "        tokenized = tokenizer(\n",
-  "            examples[\"
+  "            examples[\"title\"],\n",
   "            padding=\"max_length\",\n",
   "            truncation=True,\n",
   "            max_length=max_seq_length\n",
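Note: the batch fed to the tokenizer is now taken from the title column. For reference, a self-contained sketch of what this padding/truncation setup produces; "roberta-base" and max_length=128 are stand-ins, since the actual checkpoint and max_seq_length are defined outside this hunk:

    from transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    tokenized = tokenizer(
        ["Fed signals rate cut", "Stocks slide again"],  # a batch of titles
        padding="max_length",  # pad every title out to max_length
        truncation=True,       # clip titles longer than max_length
        max_length=128,
    )
    # Fixed-length ids plus a mask marking real tokens (1) vs. padding (0).
    print(len(tokenized["input_ids"][0]), tokenized["attention_mask"][0][:8])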
@@ -240,11 +250,8 @@
   "\n",
   "    # 7. Create labels\n",
   "    def make_labels(examples):\n",
-  "
-  "
-  "        else:\n",
-  "            labels = [1.0 if agency == \"fox\" else 0.0 for agency in examples[\"news\"]]\n",
-  "            return {\"labels\": labels}\n",
+  "        labels = examples[\"labels\"]\n",
+  "        return {\"labels\": labels}\n",
   "\n",
   "    data = data.map(make_labels, batched=True, num_proc=num_proc)\n",
   "\n",
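Note: make_labels no longer derives a float label from the news-agency column (1.0 for "fox", else 0.0); it now passes through a labels column that is expected to be precomputed in the dataset. Both behaviours, reconstructed from the two sides of this hunk:

    def make_labels_old(examples):
        # old: derive a binary float label from the source agency
        return {"labels": [1.0 if agency == "fox" else 0.0 for agency in examples["news"]]}

    def make_labels_new(examples):
        # new: trust the dataset's precomputed labels column as-is
        return {"labels": examples["labels"]}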
@@ -256,7 +263,7 @@
   "        input_ids = torch.tensor(examples[\"input_ids\"])\n",
   "        attention_mask = torch.tensor(examples[\"attention_mask\"])\n",
   "        pos_inputs = torch.tensor(examples[\"pos_inputs\"], dtype=torch.float32)\n",
-  "        labels = torch.tensor(examples[\"labels\"])\n",
+  "        labels = torch.tensor(examples[\"labels\"],dtype=torch.long)\n",
   "\n",
   "        # seq_inputs shape: (batch_size, 2, seq_len)\n",
   "        seq_inputs = torch.stack([input_ids, attention_mask], dim=1)\n",
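Note: the added dtype=torch.long turns the labels into integer class indices, which is what a loss such as nn.CrossEntropyLoss requires (the old float labels fit nn.BCEWithLogitsLoss instead). The hunk does not show which loss the model uses, so this pairing is illustrative:

    import torch
    import torch.nn as nn

    logits = torch.randn(4, 2)  # (batch, num_classes)
    labels = torch.tensor([1, 0, 1, 0], dtype=torch.long)

    loss = nn.CrossEntropyLoss()(logits, labels)  # wants long class indices
    # A 1-D float label tensor here would raise a RuntimeError instead.
    print(loss.item())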
@@ -275,7 +282,7 @@
   ],
   "id": "ce6e6b982e22e9fe",
   "outputs": [],
-  "execution_count":
+  "execution_count": 4
  },
  {
   "metadata": {
@@ -352,7 +359,11 @@
     ]
    },
    "id": "b605d3b4f5ff547a",
-   "outputId": "f365a98e-c181-4754-9fac-77aa1e8639db"
+   "outputId": "f365a98e-c181-4754-9fac-77aa1e8639db",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:27:16.788714Z",
+    "start_time": "2024-12-16T08:27:01.757035Z"
+   }
   },
   "cell_type": "code",
   "source": [
@@ -377,8 +388,16 @@
   ")"
   ],
   "id": "b605d3b4f5ff547a",
-  "outputs": [
-
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "vectorizer fitted on training data.\n"
+    ]
+   }
+  ],
+  "execution_count": 5
  },
  {
   "metadata": {
@@ -400,18 +419,39 @@
     ]
    },
    "id": "b20d11caa1d25445",
-   "outputId": "986c82fd-014b-432a-8174-857b2b866cb8"
+   "outputId": "986c82fd-014b-432a-8174-857b2b866cb8",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:27:32.874705Z",
+    "start_time": "2024-12-16T08:27:32.787248Z"
+   }
   },
   "cell_type": "code",
   "source": [
-   "# Load model directly\n",
    "from transformers import AutoModel, AutoConfig\n",
    "config = AutoConfig.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\")\n",
    "model = AutoModel.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\",config = config)"
   ],
   "id": "b20d11caa1d25445",
-  "outputs": [
-
+  "outputs": [
+   {
+    "ename": "ValueError",
+    "evalue": "The checkpoint you are trying to load has model type `headlineclassifier` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.",
+    "output_type": "error",
+    "traceback": [
+     "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
+     "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)",
+     "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:1038\u001B[0m, in \u001B[0;36mAutoConfig.from_pretrained\u001B[1;34m(cls, pretrained_model_name_or_path, **kwargs)\u001B[0m\n\u001B[0;32m 1037\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 1038\u001B[0m config_class \u001B[38;5;241m=\u001B[39m CONFIG_MAPPING[config_dict[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmodel_type\u001B[39m\u001B[38;5;124m\"\u001B[39m]]\n\u001B[0;32m 1039\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n",
+     "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:740\u001B[0m, in \u001B[0;36m_LazyConfigMapping.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 739\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m key \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_mapping:\n\u001B[1;32m--> 740\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key)\n\u001B[0;32m 741\u001B[0m value \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_mapping[key]\n",
+     "\u001B[1;31mKeyError\u001B[0m: 'headlineclassifier'",
+     "\nDuring handling of the above exception, another exception occurred:\n",
+     "\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)",
+     "Cell \u001B[1;32mIn[15], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtransformers\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m AutoModel, AutoConfig\n\u001B[1;32m----> 2\u001B[0m config \u001B[38;5;241m=\u001B[39m AutoConfig\u001B[38;5;241m.\u001B[39mfrom_pretrained(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCISProject/News-Headline-Classifier-Notebook\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 3\u001B[0m model \u001B[38;5;241m=\u001B[39m AutoModel\u001B[38;5;241m.\u001B[39mfrom_pretrained(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCISProject/News-Headline-Classifier-Notebook\u001B[39m\u001B[38;5;124m\"\u001B[39m,config \u001B[38;5;241m=\u001B[39m config)\n",
+     "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:1040\u001B[0m, in \u001B[0;36mAutoConfig.from_pretrained\u001B[1;34m(cls, pretrained_model_name_or_path, **kwargs)\u001B[0m\n\u001B[0;32m 1038\u001B[0m config_class \u001B[38;5;241m=\u001B[39m CONFIG_MAPPING[config_dict[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmodel_type\u001B[39m\u001B[38;5;124m\"\u001B[39m]]\n\u001B[0;32m 1039\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n\u001B[1;32m-> 1040\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m 1041\u001B[0m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe checkpoint you are trying to load has model type `\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mconfig_dict[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmodel_type\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m` \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 1042\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mbut Transformers does not recognize this architecture. This could be because of an \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 1043\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124missue with the checkpoint, or because your version of Transformers is out of date.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 1044\u001B[0m )\n\u001B[0;32m 1045\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m config_class\u001B[38;5;241m.\u001B[39mfrom_dict(config_dict, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39munused_kwargs)\n\u001B[0;32m 1046\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 1047\u001B[0m \u001B[38;5;66;03m# Fallback: use pattern matching on the string.\u001B[39;00m\n\u001B[0;32m 1048\u001B[0m \u001B[38;5;66;03m# We go from longer names to shorter names to catch roberta before bert (for instance)\u001B[39;00m\n",
+     "\u001B[1;31mValueError\u001B[0m: The checkpoint you are trying to load has model type `headlineclassifier` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date."
+    ]
+   }
+  ],
+  "execution_count": 15
  },
  {
   "metadata": {
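Note: the committed output shows the load failing because `headlineclassifier` is a custom model type that stock Transformers cannot map to a class. If the Hub repo ships its own configuration/modeling code, the usual remedy is trust_remote_code=True; this is a sketch of that fix, not verified against the repo, and since it executes code downloaded from the repo it should only be enabled for trusted sources:

    from transformers import AutoConfig, AutoModel

    REPO = "CISProject/News-Headline-Classifier-Notebook"

    # Let Transformers import the repo's custom config/model classes.
    config = AutoConfig.from_pretrained(REPO, trust_remote_code=True)
    model = AutoModel.from_pretrained(REPO, config=config, trust_remote_code=True)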
|