Upload eval_py.ipynb

eval_py.ipynb  CHANGED  (+60 -20)
@@ -102,7 +102,11 @@
     ]
    },
    "id": "a4aa3b759defc904",
-   "outputId": "b1868c23-e675-41db-aa26-5eed9de60d9f"
+   "outputId": "b1868c23-e675-41db-aa26-5eed9de60d9f",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:26:09.513376Z",
+    "start_time": "2024-12-16T08:26:05.978557Z"
+   }
   },
   "cell_type": "code",
   "source": [
@@ -115,7 +119,7 @@
   ],
   "id": "a4aa3b759defc904",
   "outputs": [],
-  "execution_count":
+  "execution_count": 1
  },
  {
   "metadata": {
@@ -137,7 +141,11 @@
     ]
    },
    "id": "ce6e6b982e22e9fe",
-   "outputId": "f38ef6b3-35ac-41dc-a8ae-f0dd28b1f84d"
+   "outputId": "f38ef6b3-35ac-41dc-a8ae-f0dd28b1f84d",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:26:54.306779Z",
+    "start_time": "2024-12-16T08:26:54.298397Z"
+   }
   },
   "cell_type": "code",
   "source": [
@@ -148,6 +156,7 @@
   "from transformers import RobertaTokenizer\n",
   "from sklearn.feature_extraction.text import CountVectorizer\n",
   "from gensim.models import KeyedVectors\n",
+  "from sklearn.feature_extraction.text import TfidfVectorizer\n",
   "\n",
   "def preprocess_data(data,\n",
   "                    mode=\"train\",\n",
@@ -178,9 +187,10 @@
   "    if mode == \"train\" and vectorizer is None:\n",
   "        # Collect all cleaned titles to fit\n",
   "        all_titles = data[\"clean_title\"]\n",
-  "        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n",
+  "        #vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n",
+  "        vectorizer = TfidfVectorizer(max_features=max_features)\n",
   "        vectorizer.fit(all_titles)\n",
-  "        print(\"
+  "        print(\"vectorizer fitted on training data.\")\n",
   "\n",
   "    # 3. Transform titles with vectorizer once\n",
   "    def vectorize_batch(examples):\n",
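Note: this hunk swaps the CountVectorizer (unigram + bigram counts) for a TfidfVectorizer and, in passing, drops the ngram_range=(1,2) setting, so titles are now featurized as unigram TF-IDF weights. A minimal sketch of the difference on a toy corpus; the 8000 below only stands in for the notebook's max_features argument, which this hunk does not show:

    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    titles = ["fed signals rate cut", "stocks rally on fed news", "stocks slide again"]

    # Old featurizer: raw unigram + bigram counts, capped at max_features.
    counts = CountVectorizer(max_features=8000, ngram_range=(1, 2)).fit_transform(titles)

    # New featurizer: unigram TF-IDF weights under the same cap; common but
    # uninformative words are down-weighted rather than counted verbatim.
    tfidf = TfidfVectorizer(max_features=8000).fit_transform(titles)

    print(counts.shape, tfidf.shape)  # both (n_titles, vocab_size) sparse matrices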
@@ -193,7 +203,7 @@
   "    # 4. Tokenize with BERT once\n",
   "    def tokenize_batch(examples):\n",
   "        tokenized = tokenizer(\n",
-  "            examples[\"
+  "            examples[\"title\"],\n",
   "            padding=\"max_length\",\n",
   "            truncation=True,\n",
   "            max_length=max_seq_length\n",
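Note: the batch fed to the tokenizer is now taken from the title column. For reference, a self-contained sketch of what this padding/truncation setup produces; "roberta-base" and max_length=128 are stand-ins, since the actual checkpoint and max_seq_length are defined outside this hunk:

    from transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    tokenized = tokenizer(
        ["Fed signals rate cut", "Stocks slide again"],  # a batch of titles
        padding="max_length",  # pad every title out to max_length
        truncation=True,       # clip titles longer than max_length
        max_length=128,
    )
    # Fixed-length ids plus a mask marking real tokens (1) vs. padding (0).
    print(len(tokenized["input_ids"][0]), tokenized["attention_mask"][0][:8])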
@@ -240,11 +250,8 @@
   "\n",
   "    # 7. Create labels\n",
   "    def make_labels(examples):\n",
-  "
-  "
-  "        else:\n",
-  "            labels = [1.0 if agency == \"fox\" else 0.0 for agency in examples[\"news\"]]\n",
-  "            return {\"labels\": labels}\n",
+  "        labels = examples[\"labels\"]\n",
+  "        return {\"labels\": labels}\n",
   "\n",
   "    data = data.map(make_labels, batched=True, num_proc=num_proc)\n",
   "\n",
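Note: make_labels no longer derives a float label from the news-agency column (1.0 for "fox", else 0.0); it now passes through a labels column that is expected to be precomputed in the dataset. Both behaviours, reconstructed from the two sides of this hunk:

    def make_labels_old(examples):
        # old: derive a binary float label from the source agency
        return {"labels": [1.0 if agency == "fox" else 0.0 for agency in examples["news"]]}

    def make_labels_new(examples):
        # new: trust the dataset's precomputed labels column as-is
        return {"labels": examples["labels"]}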
@@ -256,7 +263,7 @@
   "        input_ids = torch.tensor(examples[\"input_ids\"])\n",
   "        attention_mask = torch.tensor(examples[\"attention_mask\"])\n",
   "        pos_inputs = torch.tensor(examples[\"pos_inputs\"], dtype=torch.float32)\n",
-  "        labels = torch.tensor(examples[\"labels\"])\n",
+  "        labels = torch.tensor(examples[\"labels\"],dtype=torch.long)\n",
   "\n",
   "        # seq_inputs shape: (batch_size, 2, seq_len)\n",
   "        seq_inputs = torch.stack([input_ids, attention_mask], dim=1)\n",
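Note: the added dtype=torch.long turns the labels into integer class indices, which is what a loss such as nn.CrossEntropyLoss requires (the old float labels fit nn.BCEWithLogitsLoss instead). The hunk does not show which loss the model uses, so this pairing is illustrative:

    import torch
    import torch.nn as nn

    logits = torch.randn(4, 2)  # (batch, num_classes)
    labels = torch.tensor([1, 0, 1, 0], dtype=torch.long)

    loss = nn.CrossEntropyLoss()(logits, labels)  # wants long class indices
    # A 1-D float label tensor here would raise a RuntimeError instead.
    print(loss.item())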
@@ -275,7 +282,7 @@
   ],
   "id": "ce6e6b982e22e9fe",
   "outputs": [],
-  "execution_count":
+  "execution_count": 4
  },
  {
   "metadata": {
@@ -352,7 +359,11 @@
     ]
    },
    "id": "b605d3b4f5ff547a",
-   "outputId": "f365a98e-c181-4754-9fac-77aa1e8639db"
+   "outputId": "f365a98e-c181-4754-9fac-77aa1e8639db",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:27:16.788714Z",
+    "start_time": "2024-12-16T08:27:01.757035Z"
+   }
   },
   "cell_type": "code",
   "source": [
@@ -377,8 +388,16 @@
   ")"
   ],
   "id": "b605d3b4f5ff547a",
-  "outputs": [
-
+  "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "vectorizer fitted on training data.\n"
+    ]
+   }
+  ],
+  "execution_count": 5
  },
  {
   "metadata": {
@@ -400,18 +419,39 @@
     ]
    },
    "id": "b20d11caa1d25445",
-   "outputId": "986c82fd-014b-432a-8174-857b2b866cb8"
+   "outputId": "986c82fd-014b-432a-8174-857b2b866cb8",
+   "ExecuteTime": {
+    "end_time": "2024-12-16T08:27:32.874705Z",
+    "start_time": "2024-12-16T08:27:32.787248Z"
+   }
   },
   "cell_type": "code",
   "source": [
-   "# Load model directly\n",
    "from transformers import AutoModel, AutoConfig\n",
    "config = AutoConfig.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\")\n",
    "model = AutoModel.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\",config = config)"
   ],
   "id": "b20d11caa1d25445",
-  "outputs": [
-
+  "outputs": [
+   {
+    "ename": "ValueError",
+    "evalue": "The checkpoint you are trying to load has model type `headlineclassifier` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.",
+    "output_type": "error",
+    "traceback": [
+     "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
+     "\u001B[1;31mKeyError\u001B[0m Traceback (most recent call last)",
+     "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:1038\u001B[0m, in \u001B[0;36mAutoConfig.from_pretrained\u001B[1;34m(cls, pretrained_model_name_or_path, **kwargs)\u001B[0m\n\u001B[0;32m 1037\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 1038\u001B[0m config_class \u001B[38;5;241m=\u001B[39m CONFIG_MAPPING[config_dict[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmodel_type\u001B[39m\u001B[38;5;124m\"\u001B[39m]]\n\u001B[0;32m 1039\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n",
+     "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:740\u001B[0m, in \u001B[0;36m_LazyConfigMapping.__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m 739\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m key \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_mapping:\n\u001B[1;32m--> 740\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m(key)\n\u001B[0;32m 741\u001B[0m value \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_mapping[key]\n",
+     "\u001B[1;31mKeyError\u001B[0m: 'headlineclassifier'",
+     "\nDuring handling of the above exception, another exception occurred:\n",
+     "\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)",
+     "Cell \u001B[1;32mIn[15], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mtransformers\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m AutoModel, AutoConfig\n\u001B[1;32m----> 2\u001B[0m config \u001B[38;5;241m=\u001B[39m AutoConfig\u001B[38;5;241m.\u001B[39mfrom_pretrained(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCISProject/News-Headline-Classifier-Notebook\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 3\u001B[0m model \u001B[38;5;241m=\u001B[39m AutoModel\u001B[38;5;241m.\u001B[39mfrom_pretrained(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCISProject/News-Headline-Classifier-Notebook\u001B[39m\u001B[38;5;124m\"\u001B[39m,config \u001B[38;5;241m=\u001B[39m config)\n",
+     "File \u001B[1;32m~\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\transformers\\models\\auto\\configuration_auto.py:1040\u001B[0m, in \u001B[0;36mAutoConfig.from_pretrained\u001B[1;34m(cls, pretrained_model_name_or_path, **kwargs)\u001B[0m\n\u001B[0;32m 1038\u001B[0m config_class \u001B[38;5;241m=\u001B[39m CONFIG_MAPPING[config_dict[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mmodel_type\u001B[39m\u001B[38;5;124m\"\u001B[39m]]\n\u001B[0;32m 1039\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n\u001B[1;32m-> 1040\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m 1041\u001B[0m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe checkpoint you are trying to load has model type `\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mconfig_dict[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmodel_type\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m` \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 1042\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mbut Transformers does not recognize this architecture. This could be because of an \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 1043\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124missue with the checkpoint, or because your version of Transformers is out of date.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m 1044\u001B[0m )\n\u001B[0;32m 1045\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m config_class\u001B[38;5;241m.\u001B[39mfrom_dict(config_dict, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39munused_kwargs)\n\u001B[0;32m 1046\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 1047\u001B[0m \u001B[38;5;66;03m# Fallback: use pattern matching on the string.\u001B[39;00m\n\u001B[0;32m 1048\u001B[0m \u001B[38;5;66;03m# We go from longer names to shorter names to catch roberta before bert (for instance)\u001B[39;00m\n",
+     "\u001B[1;31mValueError\u001B[0m: The checkpoint you are trying to load has model type `headlineclassifier` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date."
+    ]
+   }
+  ],
+  "execution_count": 15
  },
  {
   "metadata": {
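Note: the committed output shows the load failing because `headlineclassifier` is a custom model type that stock Transformers cannot map to a class. If the Hub repo ships its own configuration/modeling code, the usual remedy is trust_remote_code=True; this is a sketch of that fix, not verified against the repo, and since it executes code downloaded from the repo it should only be enabled for trusted sources:

    from transformers import AutoConfig, AutoModel

    REPO = "CISProject/News-Headline-Classifier-Notebook"

    # Let Transformers import the repo's custom config/model classes.
    config = AutoConfig.from_pretrained(REPO, trust_remote_code=True)
    model = AutoModel.from_pretrained(REPO, config=config, trust_remote_code=True)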
|