riyageorge committed on
Commit
e1bb13a
·
1 Parent(s): 37bc6d8

Upload 3 files

Browse files
dnn_smsspam_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9194959640c90b0579ead07653f3ebc4ddea231a1acd28a18a5b6a7b96b5b821
3
+ size 5890160
dnn_smsspam_model.ipynb ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "### SMS SPAM DETECTION USING DNN"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import pandas as pd\n",
17
+ "import matplotlib.pyplot as plt\n",
18
+ "import seaborn as sns\n",
19
+ "from sklearn.model_selection import train_test_split\n",
20
+ "import tensorflow as tf\n",
21
+ "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
22
+ "import pickle"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "# Load the dataset from a local tab-separated file\n",
32
+ "dataset = pd.read_csv(r'SMSSpamCollection.txt', sep='\\t', names=['label', 'message'])"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 3,
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ " label message\n",
45
+ "0 ham Go until jurong point, crazy.. Available only ...\n",
46
+ "1 ham Ok lar... Joking wif u oni...\n",
47
+ "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n",
48
+ "3 ham U dun say so early hor... U c already then say...\n",
49
+ "4 ham Nah I don't think he goes to usf, he lives aro...\n",
50
+ "---------------------- -------------------------\n",
51
+ " message \n",
52
+ " count unique top freq\n",
53
+ "label \n",
54
+ "ham 4825 4516 Sorry, I'll call later 30\n",
55
+ "spam 747 653 Please call our customer service representativ... 4\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "print(dataset.head())\n",
61
+ "print(\"---------------------- -------------------------\")\n",
62
+ "print(dataset.groupby('label').describe())"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 4,
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "# Preprocessing\n",
72
+ "dataset['label'] = dataset['label'].map({'spam': 1, 'ham': 0})\n",
73
+ "X = dataset['message'].values\n",
74
+ "y = dataset['label'].values"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 5,
80
+ "metadata": {},
81
+ "outputs": [
82
+ {
83
+ "name": "stdout",
84
+ "output_type": "stream",
85
+ "text": [
86
+ "[[387, 245, 325, 450, 917, 432, 1, 1323, 169, 2377], [19, 4, 1021, 112, 93, 6, 40, 358]]\n"
87
+ ]
88
+ }
89
+ ],
90
+ "source": [
91
+ "# Train Test Split\n",
92
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
93
+ "\n",
94
+ "tokeniser = tf.keras.preprocessing.text.Tokenizer()\n",
95
+ "tokeniser.fit_on_texts(X_train)\n",
96
+ "\n",
97
+ "# Save the tokenizer using pickle\n",
98
+ "with open('dnn_smsspam_tokenizer.pickle', 'wb') as handle:\n",
99
+ " pickle.dump(tokeniser, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
100
+ "\n",
101
+ "encoded_train = tokeniser.texts_to_sequences(X_train)\n",
102
+ "encoded_test = tokeniser.texts_to_sequences(X_test)\n",
103
+ "print(encoded_train[0:2])"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 6,
109
+ "metadata": {},
110
+ "outputs": [
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "[[ 14 61 388 540 3557 23 3558 0 0 0 0 0 0 0\n",
116
+ " 0 0 0 0 0 0]\n",
117
+ " [ 474 59 35 10 61 22 63 75 76 0 0 0 0 0\n",
118
+ " 0 0 0 0 0 0]\n",
119
+ " [ 36 727 180 26 3559 2396 452 41 9 1850 0 0 0 0\n",
120
+ " 0 0 0 0 0 0]\n",
121
+ " [ 518 2397 158 73 243 10 48 92 0 0 0 0 0 0\n",
122
+ " 0 0 0 0 0 0]]\n"
123
+ ]
124
+ }
125
+ ],
126
+ "source": [
127
+ "# Padding\n",
128
+ "max_length = 20\n",
129
+ "padded_train = tf.keras.preprocessing.sequence.pad_sequences(encoded_train, maxlen=max_length, padding='post')\n",
130
+ "padded_test = tf.keras.preprocessing.sequence.pad_sequences(encoded_test, maxlen=max_length, padding='post')\n",
131
+ "print(padded_train[30:34])"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": 7,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "vocab_size = len(tokeniser.word_index) + 1"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 8,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "# Model definition\n",
150
+ "model=tf.keras.models.Sequential([\n",
151
+ " tf.keras.layers.Embedding(input_dim=vocab_size,output_dim= 64, input_length=max_length),\n",
152
+ " tf.keras.layers.GlobalAveragePooling1D(),\n",
153
+ " tf.keras.layers.Dense(64, activation='relu'),\n",
154
+ " tf.keras.layers.Dense(32, activation='relu'),\n",
155
+ " tf.keras.layers.Dense(16, activation='relu'),\n",
156
+ " tf.keras.layers.Dense(1, activation='sigmoid')\n",
157
+ "])"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": 9,
163
+ "metadata": {},
164
+ "outputs": [
165
+ {
166
+ "name": "stdout",
167
+ "output_type": "stream",
168
+ "text": [
169
+ "Model: \"sequential\"\n",
170
+ "_________________________________________________________________\n",
171
+ " Layer (type) Output Shape Param # \n",
172
+ "=================================================================\n",
173
+ " embedding (Embedding) (None, 20, 64) 480128 \n",
174
+ " \n",
175
+ " global_average_pooling1d ( (None, 64) 0 \n",
176
+ " GlobalAveragePooling1D) \n",
177
+ " \n",
178
+ " dense (Dense) (None, 64) 4160 \n",
179
+ " \n",
180
+ " dense_1 (Dense) (None, 32) 2080 \n",
181
+ " \n",
182
+ " dense_2 (Dense) (None, 16) 528 \n",
183
+ " \n",
184
+ " dense_3 (Dense) (None, 1) 17 \n",
185
+ " \n",
186
+ "=================================================================\n",
187
+ "Total params: 486913 (1.86 MB)\n",
188
+ "Trainable params: 486913 (1.86 MB)\n",
189
+ "Non-trainable params: 0 (0.00 Byte)\n",
190
+ "_________________________________________________________________\n",
191
+ "None\n"
192
+ ]
193
+ }
194
+ ],
195
+ "source": [
196
+ "# compile the model\n",
197
+ "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n",
198
+ "\n",
199
+ "# summarize the model\n",
200
+ "print(model.summary())\n",
201
+ "\n",
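202
+ "# Early stopping callback. NOTE(review): monitor='accuracy' with mode='min' stops once training accuracy has not decreased for 10 epochs (the run below halts at epoch 11); monitor='val_loss' was probably intended.\n",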
203
+ "early_stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', mode='min', patience=10)"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": 10,
209
+ "metadata": {},
210
+ "outputs": [
211
+ {
212
+ "name": "stdout",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "Epoch 1/50\n",
216
+ "122/122 [==============================] - 2s 6ms/step - loss: 0.3687 - accuracy: 0.8895 - val_loss: 0.0994 - val_accuracy: 0.9767\n",
217
+ "Epoch 2/50\n",
218
+ "122/122 [==============================] - 1s 4ms/step - loss: 0.0500 - accuracy: 0.9864 - val_loss: 0.0381 - val_accuracy: 0.9904\n",
219
+ "Epoch 3/50\n",
220
+ "122/122 [==============================] - 1s 5ms/step - loss: 0.0163 - accuracy: 0.9959 - val_loss: 0.0373 - val_accuracy: 0.9910\n",
221
+ "Epoch 4/50\n",
222
+ "122/122 [==============================] - 1s 5ms/step - loss: 0.0069 - accuracy: 0.9985 - val_loss: 0.0399 - val_accuracy: 0.9886\n",
223
+ "Epoch 5/50\n",
224
+ "122/122 [==============================] - 1s 5ms/step - loss: 0.0043 - accuracy: 0.9992 - val_loss: 0.0416 - val_accuracy: 0.9910\n",
225
+ "Epoch 6/50\n",
226
+ "122/122 [==============================] - 1s 6ms/step - loss: 0.0026 - accuracy: 0.9995 - val_loss: 0.0439 - val_accuracy: 0.9910\n",
227
+ "Epoch 7/50\n",
228
+ "122/122 [==============================] - 1s 5ms/step - loss: 0.0018 - accuracy: 0.9997 - val_loss: 0.0454 - val_accuracy: 0.9910\n",
229
+ "Epoch 8/50\n",
230
+ "122/122 [==============================] - 1s 5ms/step - loss: 0.0011 - accuracy: 0.9997 - val_loss: 0.0476 - val_accuracy: 0.9916\n",
231
+ "Epoch 9/50\n",
232
+ "122/122 [==============================] - 1s 6ms/step - loss: 0.0015 - accuracy: 0.9992 - val_loss: 0.0533 - val_accuracy: 0.9904\n",
233
+ "Epoch 10/50\n",
234
+ "122/122 [==============================] - 1s 5ms/step - loss: 2.8591e-04 - accuracy: 1.0000 - val_loss: 0.0531 - val_accuracy: 0.9910\n",
235
+ "Epoch 11/50\n",
236
+ "122/122 [==============================] - 1s 5ms/step - loss: 3.3040e-04 - accuracy: 1.0000 - val_loss: 0.0553 - val_accuracy: 0.9904\n"
237
+ ]
238
+ },
239
+ {
240
+ "data": {
241
+ "text/plain": [
242
+ "<keras.src.callbacks.History at 0x252ee469930>"
243
+ ]
244
+ },
245
+ "execution_count": 10,
246
+ "metadata": {},
247
+ "output_type": "execute_result"
248
+ }
249
+ ],
250
+ "source": [
251
+ "# Model training\n",
252
+ "model.fit(x=padded_train,\n",
253
+ " y=y_train,\n",
254
+ " epochs=50,\n",
255
+ " validation_data=(padded_test, y_test),\n",
256
+ " callbacks=[early_stop]\n",
257
+ " )"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": 11,
263
+ "metadata": {},
264
+ "outputs": [
265
+ {
266
+ "name": "stdout",
267
+ "output_type": "stream",
268
+ "text": [
269
+ "53/53 [==============================] - 0s 886us/step\n"
270
+ ]
271
+ }
272
+ ],
273
+ "source": [
274
+ "# Generate predictions after model training\n",
275
+ "preds = (model.predict(padded_test) > 0.5).astype(\"int32\")"
276
+ ]
277
+ },
278
+ {
279
+ "cell_type": "code",
280
+ "execution_count": 12,
281
+ "metadata": {},
282
+ "outputs": [
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "Classification Report\n",
288
+ " precision recall f1-score support\n",
289
+ "\n",
290
+ " 0 0.99 1.00 0.99 1448\n",
291
+ " 1 1.00 0.93 0.96 224\n",
292
+ "\n",
293
+ " accuracy 0.99 1672\n",
294
+ " macro avg 0.99 0.97 0.98 1672\n",
295
+ "weighted avg 0.99 0.99 0.99 1672\n",
296
+ "\n",
297
+ "Accuracy : 99.04\n"
298
+ ]
299
+ }
300
+ ],
301
+ "source": [
302
+ "# Classification report\n",
303
+ "print(\"Classification Report\")\n",
304
+ "print(classification_report(y_test, preds))\n",
305
+ "\n",
306
+ "# Accuracy score\n",
307
+ "acc_sc = accuracy_score(y_test, preds)\n",
308
+ "print(f\"Accuracy : {round(acc_sc * 100, 2)}\")"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": 13,
314
+ "metadata": {},
315
+ "outputs": [
316
+ {
317
+ "data": {
318
+ "image/png": "",
319
+ "text/plain": [
320
+ "<Figure size 640x480 with 1 Axes>"
321
+ ]
322
+ },
323
+ "metadata": {},
324
+ "output_type": "display_data"
325
+ }
326
+ ],
327
+ "source": [
328
+ "# Confusion matrix plotting\n",
329
+ "mtx = confusion_matrix(y_test, preds)\n",
330
+ "sns.heatmap(mtx, annot=True, fmt='d', linewidths=.5, cmap=\"Blues\", cbar=False)\n",
331
+ "plt.ylabel('True label')\n",
332
+ "plt.xlabel('Predicted label')\n",
333
+ "plt.show() # Display the plot"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": 14,
339
+ "metadata": {},
340
+ "outputs": [
341
+ {
342
+ "name": "stderr",
343
+ "output_type": "stream",
344
+ "text": [
345
+ "d:\\STUDY\\Sem3\\deeplearning\\DLENV\\lib\\site-packages\\keras\\src\\engine\\training.py:3079: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.\n",
346
+ " saving_api.save_model(\n"
347
+ ]
348
+ }
349
+ ],
350
+ "source": [
351
+ "# Save the trained model\n",
352
+ "model.save(\"dnn_smsspam_model.h5\")\n",
353
+ "dnn_smsspam_model = tf.keras.models.load_model('dnn_smsspam_model.h5')"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 15,
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "def predict_message(input_text):\n",
363
+ " # Process input text similarly to training data\n",
364
+ " encoded_input = tokeniser.texts_to_sequences([input_text])\n",
365
+ " padded_input = tf.keras.preprocessing.sequence.pad_sequences(encoded_input, maxlen=max_length, padding='post')\n",
366
+ " \n",
367
+ " # Get the probabilities of being classified as \"Spam\" for each input\n",
368
+ " predictions = dnn_smsspam_model.predict(padded_input)\n",
369
+ " \n",
370
+ " # Define a threshold (e.g., 0.5) for classification\n",
371
+ " threshold = 0.5\n",
372
+ "\n",
373
+ " # Make the predictions based on the threshold for each input\n",
374
+ " results = []\n",
375
+ " for prediction in predictions:\n",
376
+ " if prediction > threshold:\n",
377
+ " results.append(\"Spam\")\n",
378
+ " else:\n",
379
+ " results.append(\"Not spam\")\n",
380
+ " \n",
381
+ " return results\n"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": 16,
387
+ "metadata": {},
388
+ "outputs": [
389
+ {
390
+ "name": "stdout",
391
+ "output_type": "stream",
392
+ "text": [
393
+ "1/1 [==============================] - 0s 57ms/step\n",
394
+ "Message: Your free ringtone is waiting to be collected. Simply text the password \"MIX\" to 85069 to verify. Get Usher and Britney. FML, PO Box 5249, MK17 92H. 450Ppw 16 haWatching telugu movie..wat abt u? \n",
395
+ "The message is classified as: ['Spam']\n"
396
+ ]
397
+ }
398
+ ],
399
+ "source": [
400
+ "# Classify a hard-coded sample message\n",
401
+ "user_input =('Your free ringtone is waiting to be collected. Simply text the password \"MIX\" to 85069 to verify. Get Usher and Britney. FML, PO Box 5249, MK17 92H. 450Ppw 16 haWatching telugu movie..wat abt u?')\n",
402
+ "prediction_result = predict_message(user_input)\n",
403
+ "print(f\"Message: {user_input} \\nThe message is classified as: {prediction_result}\")"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": 17,
409
+ "metadata": {},
410
+ "outputs": [
411
+ {
412
+ "name": "stdout",
413
+ "output_type": "stream",
414
+ "text": [
415
+ "1/1 [==============================] - 0s 23ms/step\n",
416
+ "Message: XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL \n",
417
+ "The message is classified as: ['Spam']\n"
418
+ ]
419
+ }
420
+ ],
421
+ "source": [
422
+ "\n",
423
+ "user_input_1 = ('XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL')\n",
424
+ "\n",
425
+ "\n",
426
+ "prediction_result_1 = predict_message(user_input_1)\n",
427
+ "print(f\"Message: {user_input_1} \\nThe message is classified as: {prediction_result_1}\")\n",
428
+ " "
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": 18,
434
+ "metadata": {},
435
+ "outputs": [
436
+ {
437
+ "name": "stdout",
438
+ "output_type": "stream",
439
+ "text": [
440
+ "1/1 [==============================] - 0s 18ms/step\n",
441
+ "Message: Hi i want to speak to you \n",
442
+ "The message is classified as: ['Not spam']\n"
443
+ ]
444
+ }
445
+ ],
446
+ "source": [
447
+ "user_input= ('Hi i want to speak to you')\n",
448
+ "\n",
449
+ "\n",
450
+ "prediction_result= predict_message(user_input)\n",
451
+ "print(f\"Message: {user_input} \\nThe message is classified as: {prediction_result}\")"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": null,
457
+ "metadata": {},
458
+ "outputs": [],
459
+ "source": []
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": null,
464
+ "metadata": {},
465
+ "outputs": [],
466
+ "source": []
467
+ }
468
+ ],
469
+ "metadata": {
470
+ "kernelspec": {
471
+ "display_name": "DLENV",
472
+ "language": "python",
473
+ "name": "python3"
474
+ },
475
+ "language_info": {
476
+ "codemirror_mode": {
477
+ "name": "ipython",
478
+ "version": 3
479
+ },
480
+ "file_extension": ".py",
481
+ "mimetype": "text/x-python",
482
+ "name": "python",
483
+ "nbconvert_exporter": "python",
484
+ "pygments_lexer": "ipython3",
485
+ "version": "3.10.11"
486
+ }
487
+ },
488
+ "nbformat": 4,
489
+ "nbformat_minor": 2
490
+ }
dnn_smsspam_tokenizer.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9797f71f6e298d22ad16c8e17256351a5124192d536e452cf4192de8731c110f
3
+ size 290462