TUEN-YUE committed
Commit d6cf153 · verified · 1 Parent(s): 9e5bba0

Upload train+test.ipynb

Files changed (1): train+test.ipynb (+868 -0)
train+test.ipynb ADDED
@@ -0,0 +1,868 @@
+ {
+ "cells": [
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": [
+ "# Installing dependencies\n",
+ "\n",
+ "## Please make a copy of this notebook."
+ ],
+ "id": "13156d7ed48b282"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
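+ "# Pipe each install's output to a scratch file so the cell stays quiet, then remove it.\n",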
+ "!pip install geopy > delete.txt\n",
+ "!pip install datasets > delete.txt\n",
+ "!pip install torch torchvision datasets > delete.txt\n",
+ "!pip install huggingface_hub > delete.txt\n",
+ "!pip install pyhocon > delete.txt\n",
+ "!pip install transformers > delete.txt\n",
+ "!pip install gensim > delete.txt\n",
+ "!rm delete.txt"
+ ],
+ "id": "5a596f2639253772"
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": [
+ "# Hugging Face login\n",
+ "You will need your personal access token."
+ ],
+ "id": "432a756039e6399"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:48:43.216631Z",
+ "start_time": "2024-12-16T19:48:43.214630Z"
+ }
+ },
+ "cell_type": "code",
+ "source": "!huggingface-cli login",
+ "id": "2e73da09a7c6171e",
+ "outputs": [],
+ "execution_count": 44
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "# Part 1: Load Data",
+ "id": "c731d9c1ebb477dc"
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "## Downloading the train and test datasets",
+ "id": "14070f20b547688f"
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "",
+ "id": "b8920847b7cc378d"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:48:45.272372Z",
+ "start_time": "2024-12-16T19:48:43.220140Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from datasets import load_dataset\n",
+ "\n",
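+ "# FOX_NBC: news headlines from Fox and NBC with outlet labels (features are printed below).\n",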
+ "dataset_train = load_dataset(\"CISProject/FOX_NBC\", split=\"train\")\n",
+ "dataset_test = load_dataset(\"CISProject/FOX_NBC\", split=\"test\")\n",
+ "# dataset_test = load_dataset(\"CISProject/FOX_NBC\", split=\"test_data_random_subset\")\n"
+ ],
+ "id": "877c90c978d62b7d",
+ "outputs": [],
+ "execution_count": 45
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:48:45.287939Z",
+ "start_time": "2024-12-16T19:48:45.278748Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import torch\n",
+ "import re\n",
+ "from transformers import BertTokenizer\n",
+ "from transformers import RobertaTokenizer\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from gensim.models import KeyedVectors\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "\n",
+ "def preprocess_data(data,\n",
+ "                    mode=\"train\",\n",
+ "                    vectorizer=None,\n",
+ "                    w2v_model=None,\n",
+ "                    max_features=4096,\n",
+ "                    max_seq_length=128,\n",
+ "                    num_proc=4):\n",
+ "    if w2v_model is None:\n",
+ "        raise ValueError(\"w2v_model must be provided for Word2Vec embeddings.\")\n",
+ "\n",
+ "    # tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n",
+ "    tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n",
+ "    # 1. Clean text once\n",
+ "    def clean_text(examples):\n",
+ "        import re\n",
+ "        cleaned = []\n",
+ "        for text in examples[\"title\"]:\n",
+ "            text = text.lower()\n",
+ "            text = re.sub(r'[^\\w\\s]', '', text)\n",
+ "            text = text.strip()\n",
+ "            cleaned.append(text)\n",
+ "        return {\"clean_title\": cleaned}\n",
+ "\n",
+ "    data = data.map(clean_text, batched=True, num_proc=num_proc)\n",
+ "\n",
+ "    # 2. Fit the TF-IDF vectorizer on training data if needed\n",
+ "    if mode == \"train\" and vectorizer is None:\n",
+ "        # Collect all cleaned titles to fit\n",
+ "        all_titles = data[\"clean_title\"]\n",
+ "        # vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1,2))\n",
+ "        vectorizer = TfidfVectorizer(max_features=max_features)\n",
+ "        vectorizer.fit(all_titles)\n",
+ "        print(\"vectorizer fitted on training data.\")\n",
+ "\n",
+ "    # 3. Transform titles with vectorizer once\n",
+ "    def vectorize_batch(examples):\n",
+ "        import numpy as np\n",
+ "        freq = vectorizer.transform(examples[\"clean_title\"]).toarray().astype(np.float32)\n",
+ "        return {\"freq_inputs\": freq}\n",
+ "\n",
+ "    data = data.map(vectorize_batch, batched=True, num_proc=num_proc)\n",
+ "\n",
+ "    # 4. Tokenize with RoBERTa once\n",
+ "    def tokenize_batch(examples):\n",
+ "        tokenized = tokenizer(\n",
+ "            examples[\"title\"],\n",
+ "            padding=\"max_length\",\n",
+ "            truncation=True,\n",
+ "            max_length=max_seq_length\n",
+ "        )\n",
+ "        return {\n",
+ "            \"input_ids\": tokenized[\"input_ids\"],\n",
+ "            \"attention_mask\": tokenized[\"attention_mask\"]\n",
+ "        }\n",
+ "\n",
+ "    data = data.map(tokenize_batch, batched=True, num_proc=num_proc)\n",
+ "\n",
+ "    # 5. Convert titles into tokens for W2V\n",
+ "    def split_tokens(examples):\n",
+ "        tokens_list = [t.split() for t in examples[\"clean_title\"]]\n",
+ "        return {\"tokens\": tokens_list}\n",
+ "\n",
+ "    data = data.map(split_tokens, batched=True, num_proc=num_proc)\n",
+ "\n",
+ "    # 6. Build an embedding dictionary for all unique tokens (do this once before the embedding map)\n",
+ "    unique_tokens = set()\n",
+ "    for tokens in data[\"tokens\"]:\n",
+ "        unique_tokens.update(tokens)\n",
+ "\n",
+ "    embedding_dim = w2v_model.vector_size\n",
+ "    embedding_dict = {}\n",
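+ "    # Tokens missing from the Word2Vec vocabulary fall back to zero vectors.\n",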
+ "    for tk in unique_tokens:\n",
+ "        if tk in w2v_model:\n",
+ "            embedding_dict[tk] = w2v_model[tk].astype(np.float32)\n",
+ "        else:\n",
+ "            embedding_dict[tk] = np.zeros((embedding_dim,), dtype=np.float32)\n",
+ "\n",
+ "    def w2v_embedding_batch(examples):\n",
+ "        import numpy as np\n",
+ "        batch_w2v = []\n",
+ "        for tokens in examples[\"tokens\"]:\n",
+ "            vectors = [embedding_dict[tk] for tk in tokens[:max_seq_length]]\n",
+ "            if len(vectors) < max_seq_length:\n",
+ "                vectors += [np.zeros((embedding_dim,), dtype=np.float32)] * (max_seq_length - len(vectors))\n",
+ "            batch_w2v.append(vectors)\n",
+ "        return {\"pos_inputs\": batch_w2v}\n",
+ "\n",
+ "\n",
+ "    data = data.map(w2v_embedding_batch, batched=True, batch_size=32, num_proc=num_proc)\n",
+ "\n",
+ "    # 7. Create labels\n",
+ "    def make_labels(examples):\n",
+ "        labels = examples[\"labels\"]\n",
+ "        return {\"labels\": labels}\n",
+ "\n",
+ "    data = data.map(make_labels, batched=True, num_proc=num_proc)\n",
+ "\n",
+ "    # Convert freq_inputs and pos_inputs to torch tensors in a final map step\n",
+ "    def to_tensors(examples):\n",
+ "        import torch\n",
+ "\n",
+ "        freq_inputs = torch.tensor(examples[\"freq_inputs\"], dtype=torch.float32)\n",
+ "        input_ids = torch.tensor(examples[\"input_ids\"])\n",
+ "        attention_mask = torch.tensor(examples[\"attention_mask\"])\n",
+ "        pos_inputs = torch.tensor(examples[\"pos_inputs\"], dtype=torch.float32)\n",
+ "        labels = torch.tensor(examples[\"labels\"], dtype=torch.long)\n",
+ "\n",
+ "        # seq_inputs shape: (batch_size, 2, seq_len)\n",
+ "        seq_inputs = torch.stack([input_ids, attention_mask], dim=1)\n",
+ "\n",
+ "        return {\n",
+ "            \"freq_inputs\": freq_inputs,\n",
+ "            \"seq_inputs\": seq_inputs,\n",
+ "            \"pos_inputs\": pos_inputs,\n",
+ "            \"labels\": labels\n",
+ "        }\n",
+ "\n",
+ "    # Apply final conversion to tensor\n",
+ "    processed_data = data.map(to_tensors, batched=True, num_proc=num_proc)\n",
+ "\n",
+ "    return processed_data, vectorizer\n"
+ ],
+ "id": "dc2ba675ce880d6d",
+ "outputs": [],
+ "execution_count": 46
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:49:01.529651Z",
+ "start_time": "2024-12-16T19:48:45.294290Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from gensim.models import KeyedVectors\n",
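+ "# Assumes the pretrained GoogleNews-vectors-negative300.bin file has already been downloaded into the working directory.\n",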
+ "w2v_model = KeyedVectors.load_word2vec_format(\"./GoogleNews-vectors-negative300.bin\", binary=True)\n",
+ "\n",
+ "dataset_train, vectorizer = preprocess_data(\n",
+ "    data=dataset_train,\n",
+ "    mode=\"train\",\n",
+ "    w2v_model=w2v_model,\n",
+ "    max_features=8192,\n",
+ "    max_seq_length=128\n",
+ ")\n",
+ "\n",
+ "dataset_test, _ = preprocess_data(\n",
+ "    data=dataset_test,\n",
+ "    mode=\"test\",\n",
+ "    vectorizer=vectorizer,\n",
+ "    w2v_model=w2v_model,\n",
+ "    max_features=8192,\n",
+ "    max_seq_length=128\n",
+ ")"
+ ],
+ "id": "158b99950fb22d1",
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "vectorizer fitted on training data.\n"
+ ]
+ }
+ ],
+ "execution_count": 47
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:49:01.538067Z",
+ "start_time": "2024-12-16T19:49:01.535063Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "print(dataset_train)\n",
+ "print(dataset_test)"
+ ],
+ "id": "edd80d33175c96a0",
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Dataset({\n",
+ "    features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],\n",
+ "    num_rows: 3044\n",
+ "})\n",
+ "Dataset({\n",
+ "    features: ['title', 'outlet', 'index', 'url', 'labels', 'clean_title', 'freq_inputs', 'input_ids', 'attention_mask', 'tokens', 'pos_inputs', 'seq_inputs'],\n",
+ "    num_rows: 761\n",
+ "})\n"
+ ]
+ }
+ ],
+ "execution_count": 48
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "# Part 2: Model",
+ "id": "c9a49fc1fbca29d7"
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "## Defining the Custom Model",
+ "id": "aebe5e51f0e611cc"
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "",
+ "id": "f0eae08a025b6ed9"
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:49:01.554769Z",
+ "start_time": "2024-12-16T19:49:01.543575Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "# TODO: import all packages necessary for your custom model\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "from torch.utils.data import DataLoader\n",
+ "from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "from transformers import RobertaModel, RobertaConfig, RobertaForSequenceClassification, BertModel\n",
+ "from model.network import Classifier\n",
+ "from model.frequential import FreqNetwork\n",
+ "from model.sequential import SeqNetwork\n",
+ "from model.positional import PosNetwork\n",
+ "\n",
+ "class CustomConfig(PretrainedConfig):\n",
+ "    model_type = \"headlineclassifier\"\n",
+ "\n",
+ "    def __init__(\n",
+ "        self,\n",
+ "        base_exp_dir=\"./exp/fox_nbc/\",\n",
+ "        # dataset={\"data_dir\": \"./data/CASE_NAME/data.csv\", \"transform\": True},\n",
+ "        train={\n",
+ "            \"learning_rate\": 2e-5,\n",
+ "            \"learning_rate_alpha\": 0.05,\n",
+ "            \"end_iter\": 10,\n",
+ "            \"batch_size\": 32,\n",
+ "            \"warm_up_end\": 2,\n",
+ "            \"anneal_end\": 5,\n",
+ "            \"save_freq\": 1,\n",
+ "            \"val_freq\": 1,\n",
+ "        },\n",
+ "        model={\n",
+ "            \"freq\": {\n",
+ "                \"tfidf_input_dim\": 8145,\n",
+ "                \"tfidf_output_dim\": 128,\n",
+ "                \"tfidf_hidden_dim\": 512,\n",
+ "                \"n_layers\": 2,\n",
+ "                \"skip_in\": [80],\n",
+ "                \"weight_norm\": True,\n",
+ "            },\n",
+ "            \"pos\": {\n",
+ "                \"input_dim\": 300,\n",
+ "                \"output_dim\": 128,\n",
+ "                \"hidden_dim\": 256,\n",
+ "                \"n_layers\": 2,\n",
+ "                \"skip_in\": [80],\n",
+ "                \"weight_norm\": True,\n",
+ "            },\n",
+ "            \"cls\": {\n",
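+ "                # 1024 = 768 (RoBERTa pooled output) + 128 (freq) + 128 (pos)\n",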
+ "                \"combined_input\": 1024, #1024\n",
+ "                \"combined_dim\": 128,\n",
+ "                \"num_classes\": 2,\n",
+ "                \"n_layers\": 2,\n",
+ "                \"skip_in\": [80],\n",
+ "                \"weight_norm\": True,\n",
+ "            },\n",
+ "        },\n",
+ "        **kwargs,\n",
+ "    ):\n",
+ "        super().__init__(**kwargs)\n",
+ "\n",
+ "        self.base_exp_dir = base_exp_dir\n",
+ "        # self.dataset = dataset\n",
+ "        self.train = train\n",
+ "        self.model = model\n",
+ "\n",
+ "# TODO: define all parameters needed for your model, as well as calling the model itself\n",
+ "class CustomModel(PreTrainedModel):\n",
+ "    config_class = CustomConfig\n",
+ "\n",
+ "    def __init__(self, config):\n",
+ "        super().__init__(config)\n",
+ "        self.conf = config\n",
+ "        self.freq = FreqNetwork(**self.conf.model[\"freq\"])\n",
+ "        self.pos = PosNetwork(**self.conf.model[\"pos\"])\n",
+ "        self.cls = Classifier(**self.conf.model[\"cls\"])\n",
+ "        self.fc = nn.Linear(self.conf.model[\"cls\"][\"combined_input\"], 2)\n",
+ "        self.seq = RobertaModel.from_pretrained(\"roberta-base\")\n",
+ "        # self.seq = BertModel.from_pretrained(\"bert-base-uncased\")\n",
+ "        # for param in self.seq.parameters():\n",
+ "        #     param.requires_grad = False\n",
+ "        self.dropout = nn.Dropout(0.2)\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        freq_inputs = x[\"freq_inputs\"]\n",
+ "        seq_inputs = x[\"seq_inputs\"]\n",
+ "        pos_inputs = x[\"pos_inputs\"]\n",
+ "        seq_feature = self.seq(\n",
+ "            input_ids=seq_inputs[:, 0, :],\n",
+ "            attention_mask=seq_inputs[:, 1, :]\n",
+ "        ).pooler_output  # last_hidden_state[:, 0, :]\n",
+ "        freq_feature = self.freq(freq_inputs)  # Shape: (batch_size, 128)\n",
+ "\n",
+ "        pos_feature = self.pos(pos_inputs)  # Shape: (batch_size, 128)\n",
+ "        inputs = torch.cat((seq_feature, freq_feature, pos_feature), dim=1)  # Shape: (batch_size, 1024)\n",
+ "        # inputs = torch.cat((seq_feature, freq_feature), dim=1)  # Shape: (batch_size, 256)\n",
+ "        # inputs = seq_feature\n",
+ "\n",
+ "        x = inputs\n",
+ "        x = self.dropout(x)\n",
+ "        outputs = self.fc(x)\n",
+ "\n",
+ "        return outputs\n",
+ "\n",
+ "    def save_model(self, save_path):\n",
+ "        \"\"\"Save the model locally using the Hugging Face format.\"\"\"\n",
+ "        self.save_pretrained(save_path)\n",
+ "\n",
+ "    def push_model(self, repo_name):\n",
+ "        \"\"\"Push the model to the Hugging Face Hub.\"\"\"\n",
+ "        self.push_to_hub(repo_name)"
+ ],
+ "id": "21f079d0c52d7d",
+ "outputs": [],
+ "execution_count": 49
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:49:01.791918Z",
+ "start_time": "2024-12-16T19:49:01.561338Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from huggingface_hub import hf_hub_download\n",
+ "\n",
+ "AutoConfig.register(\"headlineclassifier\", CustomConfig)\n",
+ "AutoModel.register(CustomConfig, CustomModel)\n",
+ "config = CustomConfig()\n",
+ "model = CustomModel(config)\n",
+ "\n",
+ "REPO_NAME = \"CISProject/News-Headline-Classifier-Notebook\" # TODO: PROVIDE A STRING TO YOUR REPO ON HUGGINGFACE"
+ ],
+ "id": "b6ba3f96d3ce21",
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\swall\\anaconda3\\envs\\newsCLS\\Lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
+ "  WeightNorm.apply(module, name, dim)\n",
+ "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ }
+ ],
+ "execution_count": 50
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:49:01.808079Z",
+ "start_time": "2024-12-16T19:49:01.798760Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "import torch\n",
+ "from tqdm import tqdm\n",
+ "import os\n",
+ "\n",
+ "\n",
+ "class Trainer:\n",
+ "    def __init__(self, model, train_loader, val_loader, config, device=\"cuda\"):\n",
+ "        self.model = model.to(device)\n",
+ "        self.train_loader = train_loader\n",
+ "        self.val_loader = val_loader\n",
+ "        self.device = device\n",
+ "        self.conf = config\n",
+ "\n",
+ "        self.end_iter = self.conf.train[\"end_iter\"]\n",
+ "        self.save_freq = self.conf.train[\"save_freq\"]\n",
+ "        self.val_freq = self.conf.train[\"val_freq\"]\n",
+ "\n",
+ "        self.batch_size = self.conf.train['batch_size']\n",
+ "        self.learning_rate = self.conf.train['learning_rate']\n",
+ "        self.learning_rate_alpha = self.conf.train['learning_rate_alpha']\n",
+ "        self.warm_up_end = self.conf.train['warm_up_end']\n",
+ "        self.anneal_end = self.conf.train['anneal_end']\n",
+ "\n",
+ "        self.optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)\n",
+ "        # self.criterion = torch.nn.BCEWithLogitsLoss()\n",
+ "        self.criterion = torch.nn.CrossEntropyLoss()\n",
+ "        self.save_path = os.path.join(self.conf.base_exp_dir, \"checkpoints\")\n",
+ "        os.makedirs(self.save_path, exist_ok=True)\n",
+ "\n",
+ "        self.iter_step = 0\n",
+ "\n",
+ "        self.val_loss = None\n",
+ "\n",
+ "    def get_cos_anneal_ratio(self):\n",
+ "        if self.anneal_end == 0.0:\n",
+ "            return 1.0\n",
+ "        else:\n",
+ "            return np.min([1.0, self.iter_step / self.anneal_end])\n",
+ "\n",
+ "    def update_learning_rate(self):\n",
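+ "        # Linear warm-up for warm_up_end epochs, then cosine decay toward learning_rate * learning_rate_alpha\n",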
+ "        if self.iter_step < self.warm_up_end:\n",
+ "            learning_factor = self.iter_step / self.warm_up_end\n",
+ "        else:\n",
+ "            alpha = self.learning_rate_alpha\n",
+ "            progress = (self.iter_step - self.warm_up_end) / (self.end_iter - self.warm_up_end)\n",
+ "            learning_factor = (np.cos(np.pi * progress) + 1.0) * 0.5 * (1 - alpha) + alpha\n",
+ "\n",
+ "        for g in self.optimizer.param_groups:\n",
+ "            g['lr'] = self.learning_rate * learning_factor\n",
+ "\n",
+ "    def train(self):\n",
+ "        for epoch in range(self.end_iter):\n",
+ "            self.update_learning_rate()\n",
+ "            self.model.train()\n",
+ "            epoch_loss = 0.0\n",
+ "            correct = 0\n",
+ "            total = 0\n",
+ "\n",
+ "            for batch_inputs, labels in tqdm(self.train_loader, desc=f\"Epoch {epoch + 1}/{self.end_iter}\"):\n",
+ "                # Extract features\n",
+ "\n",
+ "                freq_inputs = batch_inputs[\"freq_inputs\"].to(self.device)\n",
+ "                seq_inputs = batch_inputs[\"seq_inputs\"].to(self.device)\n",
+ "                pos_inputs = batch_inputs[\"pos_inputs\"].to(self.device)\n",
+ "                # y_train = labels.to(self.device)[:,None]\n",
+ "                y_train = labels.to(self.device)\n",
+ "\n",
+ "                # Forward pass\n",
+ "                preds = self.model({\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs})\n",
+ "                loss = self.criterion(preds, y_train)\n",
+ "\n",
+ "                # preds = (torch.sigmoid(preds) > 0.5).int()\n",
+ "                # Backward pass\n",
+ "                self.optimizer.zero_grad()\n",
+ "                loss.backward()\n",
+ "                self.optimizer.step()\n",
+ "                _, preds = torch.max(preds, dim=1)\n",
+ "                # Metrics\n",
+ "                epoch_loss += loss.item()\n",
+ "                total += y_train.size(0)\n",
+ "                # print(preds.shape)\n",
+ "                correct += (preds == y_train).sum().item()\n",
+ "\n",
+ "            # Log epoch metrics\n",
+ "            print(f\"Train Loss: {epoch_loss / len(self.train_loader):.4f}\")\n",
+ "            print(f\"Train Accuracy: {correct / total:.4f}\")\n",
+ "\n",
+ "            # Validation and Save Checkpoints\n",
+ "            if (epoch + 1) % self.val_freq == 0:\n",
+ "                self.val()\n",
+ "            if (epoch + 1) % self.save_freq == 0:\n",
+ "                self.save_checkpoint(epoch + 1)\n",
+ "\n",
+ "            # Update learning rate\n",
+ "            self.iter_step += 1\n",
+ "            self.update_learning_rate()\n",
+ "\n",
+ "\n",
+ "    def val(self):\n",
+ "        self.model.eval()\n",
+ "        val_loss = 0.0\n",
+ "        correct = 0\n",
+ "        total = 0\n",
+ "\n",
+ "        with torch.no_grad():\n",
+ "            for batch_inputs, labels in tqdm(self.val_loader, desc=\"Validation\", leave=False):\n",
+ "                freq_inputs = batch_inputs[\"freq_inputs\"].to(self.device)\n",
+ "                seq_inputs = batch_inputs[\"seq_inputs\"].to(self.device)\n",
+ "                pos_inputs = batch_inputs[\"pos_inputs\"].to(self.device)\n",
+ "                y_val = labels.to(self.device)\n",
+ "\n",
+ "                preds = self.model({\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs})\n",
+ "                loss = self.criterion(preds, y_val)\n",
+ "                # preds = (torch.sigmoid(preds)>0.5).float()\n",
+ "                _, preds = torch.max(preds, dim=1)\n",
+ "                val_loss += loss.item()\n",
+ "                total += y_val.size(0)\n",
+ "                correct += (preds == y_val).sum().item()\n",
+ "        if self.val_loss is None or val_loss < self.val_loss:\n",
+ "            self.val_loss = val_loss\n",
+ "            self.save_checkpoint(\"best\")\n",
+ "        # Log validation metrics\n",
+ "        print(f\"Validation Loss: {val_loss / len(self.val_loader):.4f}\")\n",
+ "        print(f\"Validation Accuracy: {correct / total:.4f}\")\n",
+ "\n",
+ "    def save_checkpoint(self, epoch):\n",
+ "        \"\"\"Save model in Hugging Face format.\"\"\"\n",
+ "        checkpoint_dir = os.path.join(self.save_path, f\"checkpoint_epoch_{epoch}\")\n",
+ "        if epoch == \"best\":\n",
+ "            checkpoint_dir = os.path.join(self.save_path, \"best\")\n",
+ "        self.model.save_pretrained(checkpoint_dir)\n",
+ "        print(f\"Checkpoint saved at {checkpoint_dir}\")"
+ ],
+ "id": "7be377251b81a25d",
+ "outputs": [],
+ "execution_count": 51
+ },
+ {
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2024-12-16T19:49:03.149673Z",
+ "start_time": "2024-12-16T19:49:01.812943Z"
+ }
+ },
+ "cell_type": "code",
+ "source": [
+ "from torch.utils.data import DataLoader\n",
+ "\n",
+ "# Define a collate function to handle the batched data\n",
+ "def collate_fn(batch):\n",
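+ "    # datasets returns plain Python lists after map, so rebuild batched tensors here\n",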
+ "    freq_inputs = torch.stack([torch.tensor(item[\"freq_inputs\"]) for item in batch])\n",
+ "    seq_inputs = torch.stack([torch.tensor(item[\"seq_inputs\"]) for item in batch])\n",
+ "    pos_inputs = torch.stack([torch.tensor(item[\"pos_inputs\"]) for item in batch])\n",
+ "    labels = torch.tensor([item[\"labels\"] for item in batch], dtype=torch.long)\n",
+ "    return {\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs}, labels\n",
+ "\n",
+ "train_loader = DataLoader(dataset_train, batch_size=config.train[\"batch_size\"], shuffle=True, collate_fn=collate_fn)\n",
+ "test_loader = DataLoader(dataset_test, batch_size=config.train[\"batch_size\"], shuffle=False, collate_fn=collate_fn)\n",
+ "trainer = Trainer(model, train_loader, test_loader, config)\n",
+ "\n",
+ "# Train the model\n",
+ "trainer.train()\n",
+ "# Save the final model in Hugging Face format\n",
+ "final_save_path = os.path.join(config.base_exp_dir, \"checkpoints\")\n",
+ "model.save_pretrained(final_save_path)\n",
+ "print(f\"Final model saved at {final_save_path}\")\n"
+ ],
+ "id": "dd1749c306f148eb",
+ "outputs": [],
+ "execution_count": 52
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "## Evaluate Model",
+ "id": "4af000263dd99bca"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+ "from transformers import AutoConfig, AutoModel\n",
+ "from sklearn.metrics import accuracy_score, classification_report\n",
+ "def load_last_checkpoint(checkpoint_dir):\n",
+ "    # Find all checkpoints in the directory\n",
+ "    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith(\"checkpoint_epoch_\")]\n",
+ "    if not checkpoints:\n",
+ "        raise FileNotFoundError(f\"No checkpoints found in {checkpoint_dir}!\")\n",
+ "    # Sort checkpoints by epoch number\n",
+ "    checkpoints.sort(key=lambda x: int(x.split(\"_\")[-1]))\n",
+ "\n",
+ "    # Load the last checkpoint\n",
+ "    last_checkpoint = os.path.join(checkpoint_dir, checkpoints[-1])\n",
+ "    # print(f\"Loading checkpoint from {last_checkpoint}\")\n",
+ "    # Load the best checkpoint\n",
+ "    if os.path.exists(os.path.join(checkpoint_dir, \"best\")):\n",
+ "        last_checkpoint = os.path.join(checkpoint_dir, \"best\")\n",
+ "    print(f\"Loading checkpoint from {last_checkpoint}\")\n",
+ "    # Load model and config\n",
+ "    config = AutoConfig.from_pretrained(last_checkpoint)\n",
+ "    model = AutoModel.from_pretrained(last_checkpoint, config=config)\n",
+ "    return model\n",
+ "\n",
+ "# Step 1: Define paths and setup\n",
+ "checkpoint_dir = os.path.join(config.base_exp_dir, \"checkpoints\")  # Directory where checkpoints are stored\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "model = load_last_checkpoint(checkpoint_dir)\n",
+ "model.to(device)\n",
+ "\n",
+ "# criterion = torch.nn.BCEWithLogitsLoss()\n",
+ "\n",
+ "criterion = torch.nn.CrossEntropyLoss()\n",
+ "\n",
+ "def evaluate_model(model, val_loader, criterion, device=\"cuda\"):\n",
+ "    model.eval()\n",
+ "    val_loss = 0.0\n",
+ "    correct = 0\n",
+ "    total = 0\n",
+ "    all_preds = []\n",
+ "    all_labels = []\n",
+ "    with torch.no_grad():\n",
+ "        for batch_inputs, labels in tqdm(val_loader, desc=\"Testing\", leave=False):\n",
+ "            freq_inputs = batch_inputs[\"freq_inputs\"].to(device)\n",
+ "            seq_inputs = batch_inputs[\"seq_inputs\"].to(device)\n",
+ "            pos_inputs = batch_inputs[\"pos_inputs\"].to(device)\n",
+ "            labels = labels.to(device)\n",
+ "\n",
+ "            preds = model({\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs})\n",
+ "            loss = criterion(preds, labels)\n",
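+ "            # Class prediction = argmax over the two output logits\n",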
+ "            _, preds = torch.max(preds, dim=1)\n",
+ "            # preds = (torch.sigmoid(preds) > 0.5).float()\n",
+ "            val_loss += loss.item()\n",
+ "            total += labels.size(0)\n",
+ "            # preds = (torch.sigmoid(preds) > 0.5).int()\n",
+ "            correct += (preds == labels).sum().item()\n",
+ "            all_preds.extend(preds.cpu().numpy())\n",
+ "            all_labels.extend(labels.cpu().numpy())\n",
+ "\n",
+ "    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)\n",
+ "\n",
+ "\n",
+ "accuracy, report = evaluate_model(model, test_loader, criterion)\n",
+ "print(f\"Accuracy: {accuracy:.4f}\")\n",
+ "print(report)\n"
+ ],
+ "id": "b75d2dc8a300cdf6",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "# Part 3: Pushing the Model to the Hugging Face Hub",
+ "id": "d2ffeb383ea00beb"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": "model.push_model(REPO_NAME)",
+ "id": "f55c22b0a1b2a66b",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "metadata": {},
+ "cell_type": "markdown",
+ "source": "### NOTE: You need to ensure that your Hugging Face token has both read and write access to your repository and Hugging Face organization.",
+ "id": "3826c0b6195a8fd5"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+ "# Load model directly\n",
+ "from transformers import AutoModel, AutoConfig\n",
+ "config = AutoConfig.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\")\n",
+ "model = AutoModel.from_pretrained(\"CISProject/News-Headline-Classifier-Notebook\", config=config)"
+ ],
+ "id": "33a0ca269c24d700",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "source": [
+ "from transformers import AutoConfig, AutoModel\n",
+ "from sklearn.metrics import accuracy_score, classification_report\n",
+ "\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "model.to(device)\n",
+ "\n",
+ "# criterion = torch.nn.BCEWithLogitsLoss()\n",
+ "\n",
+ "criterion = torch.nn.CrossEntropyLoss()\n",
+ "def evaluate_model(model, val_loader, criterion, device=\"cuda\"):\n",
+ "    model.eval()\n",
+ "    val_loss = 0.0\n",
+ "    correct = 0\n",
+ "    total = 0\n",
+ "    all_preds = []\n",
+ "    all_labels = []\n",
+ "    with torch.no_grad():\n",
+ "        for batch_inputs, labels in tqdm(val_loader, desc=\"Testing\", leave=False):\n",
+ "            freq_inputs = batch_inputs[\"freq_inputs\"].to(device)\n",
+ "            seq_inputs = batch_inputs[\"seq_inputs\"].to(device)\n",
+ "            pos_inputs = batch_inputs[\"pos_inputs\"].to(device)\n",
+ "            labels = labels.to(device)\n",
+ "\n",
+ "            preds = model({\"freq_inputs\": freq_inputs, \"seq_inputs\": seq_inputs, \"pos_inputs\": pos_inputs})\n",
+ "            loss = criterion(preds, labels)\n",
+ "            _, preds = torch.max(preds, dim=1)\n",
+ "            # preds = (torch.sigmoid(preds) > 0.5).float()\n",
+ "            val_loss += loss.item()\n",
+ "            total += labels.size(0)\n",
+ "            correct += (preds == labels).sum().item()\n",
+ "            all_preds.extend(preds.cpu().numpy())\n",
+ "            all_labels.extend(labels.cpu().numpy())\n",
+ "\n",
+ "    return accuracy_score(all_labels, all_preds), classification_report(all_labels, all_preds)\n",
+ "\n",
+ "\n",
+ "accuracy, report = evaluate_model(model, test_loader, criterion)\n",
+ "print(f\"Accuracy: {accuracy:.4f}\")\n",
+ "print(report)\n"
+ ],
+ "id": "cc313b4396f87690",
+ "outputs": [],
+ "execution_count": null
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "name": "python3",
+ "language": "python",
+ "display_name": "Python 3 (ipykernel)"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }