TUEN-YUE committed on
Commit 1ee6252 · verified · Parent(s): e6b7fc7

Upload eval_py.ipynb

Files changed (1):
  1. eval_py.ipynb +60 -20
eval_py.ipynb CHANGED
@@ -102,7 +102,11 @@
   ]
   },
   "id": "a4aa3b759defc904",
-  "outputId": "b1868c23-e675-41db-aa26-5eed9de60d9f"
+  "outputId": "b1868c23-e675-41db-aa26-5eed9de60d9f",
+  "ExecuteTime": {
+   "end_time": "2024-12-16T08:26:09.513376Z",
+   "start_time": "2024-12-16T08:26:05.978557Z"
+  }
   },
   "cell_type": "code",
   "source": [
@@ -115,7 +119,7 @@
   ],
   "id": "a4aa3b759defc904",
   "outputs": [],
-  "execution_count": null
+  "execution_count": 1
   },
   {
   "metadata": {
@@ -137,7 +141,11 @@
   ]
   },
   "id": "ce6e6b982e22e9fe",
-  "outputId": "f38ef6b3-35ac-41dc-a8ae-f0dd28b1f84d"
+  "outputId": "f38ef6b3-35ac-41dc-a8ae-f0dd28b1f84d",
+  "ExecuteTime": {
+   "end_time": "2024-12-16T08:26:54.306779Z",
+   "start_time": "2024-12-16T08:26:54.298397Z"
+  }
   },
   "cell_type": "code",
   "source": [
@@ -148,6 +156,7 @@
   "from transformers import RobertaTokenizer\n",
   "from sklearn.feature_extraction.text import CountVectorizer\n",
   "from gensim.models import KeyedVectors\n",
+  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
   "\n",
   "def preprocess_data(data,\n",
   " mode=\"train\",\n",
@@ -178,9 +187,10 @@
   " if mode == \"train\" and vectorizer is None:\n",
   " # Collect all cleaned titles to fit\n",
   " all_titles = data[\"clean_title\"]\n",
-  " vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n",
+  " #vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n",
+  " vectorizer = TfidfVectorizer(max_features=max_features)\n",
   " vectorizer.fit(all_titles)\n",
-  " print(\"N-gram vectorizer fitted on training data.\")\n",
+  " print(\"vectorizer fitted on training data.\")\n",
   "\n",
   " # 3. Transform titles with vectorizer once\n",
   " def vectorize_batch(examples):\n",
@@ -193,7 +203,7 @@
   " # 4. Tokenize with BERT once\n",
   " def tokenize_batch(examples):\n",
   " tokenized = tokenizer(\n",
-  " examples[\"clean_title\"],\n",
+  " examples[\"title\"],\n",
   " padding=\"max_length\",\n",
   " truncation=True,\n",
   " max_length=max_seq_length\n",
@@ -240,11 +250,8 @@
   "\n",
   " # 7. Create labels\n",
   " def make_labels(examples):\n",
-  " if examples[\"labels\"] is not None:\n",
-  " return {\"labels\": examples[\"labels\"]}\n",
-  " else:\n",
-  " labels = [1.0 if agency == \"fox\" else 0.0 for agency in examples[\"news\"]]\n",
-  " return {\"labels\": labels}\n",
+  " labels = examples[\"labels\"]\n",
+  " return {\"labels\": labels}\n",
   "\n",
   " data = data.map(make_labels, batched=True, num_proc=num_proc)\n",
   "\n",
@@ -256,7 +263,7 @@
   " input_ids = torch.tensor(examples[\"input_ids\"])\n",
   " attention_mask = torch.tensor(examples[\"attention_mask\"])\n",
   " pos_inputs = torch.tensor(examples[\"pos_inputs\"], dtype=torch.float32)\n",
-  " labels = torch.tensor(examples[\"labels\"])\n",
+  " labels = torch.tensor(examples[\"labels\"],dtype=torch.long)\n",
   "\n",
   " # seq_inputs shape: (batch_size, 2, seq_len)\n",
   " seq_inputs = torch.stack([input_ids, attention_mask], dim=1)\n",
@@ -275,7 +282,7 @@
   ],
   "id": "ce6e6b982e22e9fe",
   "outputs": [],
-  "execution_count": null
+  "execution_count": 4
   },
   {
   "metadata": {
@@ -352,7 +359,11 @@
   ]
   },
   "id": "b605d3b4f5ff547a",
-  "outputId": "f365a98e-c181-4754-9fac-77aa1e8639db"
+  "outputId": "f365a98e-c181-4754-9fac-77aa1e8639db",
+  "ExecuteTime": {
+   "end_time": "2024-12-16T08:27:16.788714Z",
+   "start_time": "2024-12-16T08:27:01.757035Z"
+  }
   },
   "cell_type": "code",
   "source": [
@@ -377,8 +388,16 @@
   ")"
   ],
   "id": "b605d3b4f5ff547a",
-  "outputs": [],
-  "execution_count": null
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "vectorizer fitted on training data.\n"
+    ]
+   }
+  ],
+  "execution_count": 5
   },
   {
   "metadata": {
@@ -400,18 +419,39 @@
   ]
   },
   "id": "b20d11caa1d25445",
-  "outputId": "986c82fd-014b-432a-8174-857b2b866cb8"
+  "outputId": "986c82fd-014b-432a-8174-857b2b866cb8",
+  "ExecuteTime": {
+   "end_time": "2024-12-16T08:27:32.874705Z",
+   "start_time": "2024-12-16T08:27:32.787248Z"
+  }
   },
   "cell_type": "code",
   "source": [
-  "# Load model directly\n",
   "from transformers import AutoModel, AutoConfig\n",
   "config = AutoConfig.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\")\n",
   "model = AutoModel.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\",config = config)"
   ],
   "id": "b20d11caa1d25445",
-  "outputs": [],
-  "execution_count": null
+  "outputs": [
+   {
+    "ename": "ValueError",
+    "evalue": "The checkpoint you are trying to load has model type `headlineclassifier` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.",
+    "output_type": "error",
+    "traceback": [
+     "---------------------------------------------------------------------------",
+     "KeyError                                  Traceback (most recent call last)",
+     "File ~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:1038, in AutoConfig.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)\n   1037 try:\n-> 1038     config_class = CONFIG_MAPPING[config_dict[\"model_type\"]]\n   1039 except KeyError:",
+     "File ~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:740, in _LazyConfigMapping.__getitem__(self, key)\n   739 if key not in self._mapping:\n-> 740     raise KeyError(key)\n   741 value = self._mapping[key]",
+     "KeyError: 'headlineclassifier'",
+     "\nDuring handling of the above exception, another exception occurred:\n",
+     "ValueError                                Traceback (most recent call last)",
+     "Cell In[15], line 2\n   1 from transformers import AutoModel, AutoConfig\n-> 2 config = AutoConfig.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\")\n   3 model = AutoModel.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\",config = config)",
+     "File ~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:1040, in AutoConfig.from_pretrained(cls, pretrained_model_name_or_path, **kwargs)\n   1038     config_class = CONFIG_MAPPING[config_dict[\"model_type\"]]\n   1039 except KeyError:\n-> 1040     raise ValueError(",
+     "ValueError: The checkpoint you are trying to load has model type `headlineclassifier` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date."
+    ]
+   }
+  ],
+  "execution_count": 15
   },
   {
   "metadata": {
 