riyageorge committed on
Commit
51649e7
·
1 Parent(s): a82bbda

Upload 3 files

Browse files
gru_movie_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b0fa91a9f80f4388c147f3b4a638fbbb30a6f99834dec25060420fde323afac
3
+ size 43126776
gru_movie_model.ipynb ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "#### Movie sentiment model - GRU"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import pandas as pd\n",
17
+ "import matplotlib.pyplot as plt\n",
18
+ "import seaborn as sns\n",
19
+ "from numpy import asarray\n",
20
+ "from numpy import zeros\n",
21
+ "import tensorflow as tf\n",
22
+ "from sklearn.model_selection import train_test_split\n",
23
+ "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
24
+ "import pickle"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 2,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "def getGloveEmbeddings(glovefolderpath):\n",
34
+ " print(\"---------------------- Getting Glove Embeddings -------------------------\\n\")\n",
35
+ " embeddings_dictionary = dict()\n",
36
+ " glove_file = open(f\"{glovefolderpath}\", encoding=\"utf8\")\n",
37
+ " for line in glove_file:\n",
38
+ " records = line.split()\n",
39
+ " word = records[0]\n",
40
+ " vector_dimensions = asarray(records[1:], dtype='float32')\n",
41
+ " embeddings_dictionary [word] = vector_dimensions\n",
42
+ " glove_file.close()\n",
43
+ " print(\"---------------------- -------------------------\\n\")\n",
44
+ " return embeddings_dictionary\n",
45
+ "\n",
46
+ "\n",
47
+ "glove_folder=r'D:/STUDY/Sem3/deeplearning/glove.6B/glove.6B.100d.txt'"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 3,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "maxlen = 100"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 4,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "dataset = pd.read_csv('movie_data.csv')\n",
66
+ "\n",
67
+ "X = dataset['review'].values\n",
68
+ "y = dataset['sentiment'].values"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 5,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 6,
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "tokeniser = tf.keras.preprocessing.text.Tokenizer()\n",
87
+ "tokeniser.fit_on_texts(X_train)\n",
88
+ "\n",
89
+ "\n",
90
+ "# Save the tokenizer using pickle\n",
91
+ "with open('tokenizer_movie_gru.pickle', 'wb') as handle:\n",
92
+ " pickle.dump(tokeniser, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
93
+ "\n",
94
+ "\n",
95
+ "X_train = tokeniser.texts_to_sequences(X_train)\n",
96
+ "X_test = tokeniser.texts_to_sequences(X_test)\n",
97
+ "vocab_size = len(tokeniser.word_index) + 1\n",
98
+ "\n",
99
+ "X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='post', maxlen=maxlen)\n",
100
+ "X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding='post', maxlen=maxlen)"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 7,
106
+ "metadata": {},
107
+ "outputs": [
108
+ {
109
+ "name": "stdout",
110
+ "output_type": "stream",
111
+ "text": [
112
+ "---------------------- Getting Glove Embeddings -------------------------\n",
113
+ "\n",
114
+ "---------------------- -------------------------\n",
115
+ "\n"
116
+ ]
117
+ }
118
+ ],
119
+ "source": [
120
+ "embeddings_dictionary=getGloveEmbeddings(glove_folder)\n",
121
+ "embedding_matrix = zeros((vocab_size, maxlen))\n",
122
+ "for word, index in tokeniser.word_index.items():\n",
123
+ " embedding_vector = embeddings_dictionary.get(word)\n",
124
+ " if embedding_vector is not None:\n",
125
+ " embedding_matrix[index] = embedding_vector"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 8,
131
+ "metadata": {},
132
+ "outputs": [
133
+ {
134
+ "name": "stdout",
135
+ "output_type": "stream",
136
+ "text": [
137
+ "Model: \"sequential\"\n",
138
+ "_________________________________________________________________\n",
139
+ " Layer (type) Output Shape Param # \n",
140
+ "=================================================================\n",
141
+ " embedding (Embedding) (None, 100, 100) 10591700 \n",
142
+ " \n",
143
+ " gru (GRU) (None, 100) 60600 \n",
144
+ " \n",
145
+ " dense (Dense) (None, 1) 101 \n",
146
+ " \n",
147
+ "=================================================================\n",
148
+ "Total params: 10652401 (40.64 MB)\n",
149
+ "Trainable params: 60701 (237.11 KB)\n",
150
+ "Non-trainable params: 10591700 (40.40 MB)\n",
151
+ "_________________________________________________________________\n",
152
+ "None\n"
153
+ ]
154
+ }
155
+ ],
156
+ "source": [
157
+ "model=tf.keras.models.Sequential([\n",
158
+ " tf.keras.layers.Embedding(input_dim=vocab_size,output_dim= maxlen, weights=[embedding_matrix], input_length=maxlen , trainable=False),\n",
159
+ " tf.keras.layers.GRU(maxlen),\n",
160
+ " tf.keras.layers.Dense(1, activation='sigmoid')\n",
161
+ " ]) \n",
162
+ " \n",
163
+ "print(model.summary())\n",
164
+ "\n",
165
+ "early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='auto', patience=10)\n",
166
+ "\n",
167
+ "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 9,
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "Epoch 1/100\n",
180
+ "766/766 [==============================] - 41s 51ms/step - loss: 0.5112 - accuracy: 0.7408 - val_loss: 0.4424 - val_accuracy: 0.7906\n",
181
+ "Epoch 2/100\n",
182
+ "766/766 [==============================] - 39s 51ms/step - loss: 0.3865 - accuracy: 0.8234 - val_loss: 0.3741 - val_accuracy: 0.8330\n",
183
+ "Epoch 3/100\n",
184
+ "766/766 [==============================] - 38s 49ms/step - loss: 0.3418 - accuracy: 0.8473 - val_loss: 0.3578 - val_accuracy: 0.8444\n",
185
+ "Epoch 4/100\n",
186
+ "766/766 [==============================] - 39s 51ms/step - loss: 0.3104 - accuracy: 0.8653 - val_loss: 0.3519 - val_accuracy: 0.8446\n",
187
+ "Epoch 5/100\n",
188
+ "766/766 [==============================] - 40s 52ms/step - loss: 0.2721 - accuracy: 0.8819 - val_loss: 0.3361 - val_accuracy: 0.8510\n",
189
+ "Epoch 6/100\n",
190
+ "766/766 [==============================] - 40s 52ms/step - loss: 0.2412 - accuracy: 0.8972 - val_loss: 0.3429 - val_accuracy: 0.8540\n",
191
+ "Epoch 7/100\n",
192
+ "766/766 [==============================] - 41s 54ms/step - loss: 0.2082 - accuracy: 0.9143 - val_loss: 0.3459 - val_accuracy: 0.8570\n",
193
+ "Epoch 8/100\n",
194
+ "766/766 [==============================] - 61s 79ms/step - loss: 0.1683 - accuracy: 0.9329 - val_loss: 0.4076 - val_accuracy: 0.8528\n",
195
+ "Epoch 9/100\n",
196
+ "766/766 [==============================] - 59s 78ms/step - loss: 0.1338 - accuracy: 0.9495 - val_loss: 0.4233 - val_accuracy: 0.8490\n",
197
+ "Epoch 10/100\n",
198
+ "766/766 [==============================] - 56s 73ms/step - loss: 0.0984 - accuracy: 0.9636 - val_loss: 0.4878 - val_accuracy: 0.8514\n",
199
+ "Epoch 11/100\n",
200
+ "766/766 [==============================] - 56s 73ms/step - loss: 0.0725 - accuracy: 0.9753 - val_loss: 0.5296 - val_accuracy: 0.8413\n",
201
+ "Epoch 12/100\n",
202
+ "766/766 [==============================] - 56s 73ms/step - loss: 0.0513 - accuracy: 0.9831 - val_loss: 0.5957 - val_accuracy: 0.8437\n",
203
+ "Epoch 13/100\n",
204
+ "766/766 [==============================] - 56s 73ms/step - loss: 0.0390 - accuracy: 0.9879 - val_loss: 0.6976 - val_accuracy: 0.8336\n",
205
+ "Epoch 14/100\n",
206
+ "766/766 [==============================] - 61s 80ms/step - loss: 0.0334 - accuracy: 0.9895 - val_loss: 0.7144 - val_accuracy: 0.8468\n",
207
+ "Epoch 15/100\n",
208
+ "766/766 [==============================] - 63s 82ms/step - loss: 0.0264 - accuracy: 0.9913 - val_loss: 0.7993 - val_accuracy: 0.8417\n",
209
+ "Epoch 16/100\n",
210
+ "766/766 [==============================] - 61s 79ms/step - loss: 0.0285 - accuracy: 0.9907 - val_loss: 0.8220 - val_accuracy: 0.8445\n",
211
+ "Epoch 17/100\n",
212
+ "766/766 [==============================] - 57s 74ms/step - loss: 0.0257 - accuracy: 0.9915 - val_loss: 0.8161 - val_accuracy: 0.8396\n"
213
+ ]
214
+ }
215
+ ],
216
+ "source": [
217
+ "history=model.fit(x=X_train,\n",
218
+ " y=y_train,\n",
219
+ " epochs=100,\n",
220
+ " callbacks=[early_stop],\n",
221
+ " validation_split=0.3\n",
222
+ " )"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 10,
228
+ "metadata": {},
229
+ "outputs": [],
230
+ "source": [
231
+ "def c_report(y_true, y_pred):\n",
232
+ " print(\"Classification Report\")\n",
233
+ " print(classification_report(y_true, y_pred))\n",
234
+ " acc_sc = accuracy_score(y_true, y_pred)\n",
235
+ " print(f\"Accuracy : {str(round(acc_sc,2)*100)}\")\n",
236
+ " return acc_sc\n",
237
+ "\n",
238
+ "def plot_confusion_matrix(y_true, y_pred):\n",
239
+ " mtx = confusion_matrix(y_true, y_pred)\n",
240
+ " sns.heatmap(mtx, annot=True, fmt='d', linewidths=.5, cmap=\"Blues\", cbar=False)\n",
241
+ " plt.ylabel('True label')\n",
242
+ " plt.xlabel('Predicted label')\n",
243
+ " plt.show()"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 11,
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "name": "stdout",
253
+ "output_type": "stream",
254
+ "text": [
255
+ "469/469 [==============================] - 9s 18ms/step\n",
256
+ "Classification Report\n",
257
+ " precision recall f1-score support\n",
258
+ "\n",
259
+ " 0 0.82 0.87 0.85 7443\n",
260
+ " 1 0.86 0.81 0.84 7557\n",
261
+ "\n",
262
+ " accuracy 0.84 15000\n",
263
+ " macro avg 0.84 0.84 0.84 15000\n",
264
+ "weighted avg 0.84 0.84 0.84 15000\n",
265
+ "\n",
266
+ "Accuracy : 84.0\n"
267
+ ]
268
+ },
269
+ {
270
+ "data": {
271
+ "image/png": "",
272
+ "text/plain": [
273
+ "<Figure size 640x480 with 1 Axes>"
274
+ ]
275
+ },
276
+ "metadata": {},
277
+ "output_type": "display_data"
278
+ }
279
+ ],
280
+ "source": [
281
+ "preds = (model.predict(X_test) > 0.5).astype(\"int32\")\n",
282
+ "c_report(y_test, preds)\n",
283
+ "plot_confusion_matrix(y_test, preds)"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": 12,
289
+ "metadata": {},
290
+ "outputs": [
291
+ {
292
+ "name": "stderr",
293
+ "output_type": "stream",
294
+ "text": [
295
+ "d:\\STUDY\\Sem3\\deeplearning\\DLENV\\lib\\site-packages\\keras\\src\\engine\\training.py:3079: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.\n",
296
+ " saving_api.save_model(\n"
297
+ ]
298
+ }
299
+ ],
300
+ "source": [
301
+ "# Save the model\n",
302
+ "model.save(\"gru_movie_model.h5\")"
303
+ ]
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 13,
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "# Load the saved model\n",
312
+ "gru_movie_model = tf.keras.models.load_model('gru_movie_model.h5')\n",
313
+ "\n",
314
+ "# Function to predict sentiment for a given review\n",
315
+ "def gru_predict_sentiment(review):\n",
316
+ " sequence = tokeniser.texts_to_sequences([review])\n",
317
+ " sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post', maxlen=maxlen)\n",
318
+ " prediction = gru_movie_model.predict(sequence)\n",
319
+ " if prediction > 0.5:\n",
320
+ " return \"Positive\"\n",
321
+ " else:\n",
322
+ " return \"Negative\""
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": 14,
328
+ "metadata": {},
329
+ "outputs": [
330
+ {
331
+ "name": "stdout",
332
+ "output_type": "stream",
333
+ "text": [
334
+ "1/1 [==============================] - 1s 571ms/step\n",
335
+ "Review: This movie was fantastic! I loved every bit of it. \n",
336
+ "The sentiment is predicted as: Positive\n"
337
+ ]
338
+ }
339
+ ],
340
+ "source": [
341
+ "# Test the model prediction\n",
342
+ "example_review = \"This movie was fantastic! I loved every bit of it.\"\n",
343
+ "prediction_result = gru_predict_sentiment(example_review)\n",
344
+ "print(f\"Review: {example_review} \\nThe sentiment is predicted as: {prediction_result}\")"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": 15,
350
+ "metadata": {},
351
+ "outputs": [
352
+ {
353
+ "name": "stdout",
354
+ "output_type": "stream",
355
+ "text": [
356
+ "1/1 [==============================] - 0s 51ms/step\n",
357
+ "Review: This movie was very bad! I hated every bit of it. \n",
358
+ "The sentiment is predicted as: Negative\n"
359
+ ]
360
+ }
361
+ ],
362
+ "source": [
363
+ "# Test the model prediction\n",
364
+ "example_review = \"This movie was very bad! I hated every bit of it.\"\n",
365
+ "prediction_result = gru_predict_sentiment(example_review)\n",
366
+ "print(f\"Review: {example_review} \\nThe sentiment is predicted as: {prediction_result}\")"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": null,
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": []
375
+ }
376
+ ],
377
+ "metadata": {
378
+ "kernelspec": {
379
+ "display_name": "DLENV",
380
+ "language": "python",
381
+ "name": "python3"
382
+ },
383
+ "language_info": {
384
+ "codemirror_mode": {
385
+ "name": "ipython",
386
+ "version": 3
387
+ },
388
+ "file_extension": ".py",
389
+ "mimetype": "text/x-python",
390
+ "name": "python",
391
+ "nbconvert_exporter": "python",
392
+ "pygments_lexer": "ipython3",
393
+ "version": "3.10.11"
394
+ }
395
+ },
396
+ "nbformat": 4,
397
+ "nbformat_minor": 2
398
+ }
tokenizer_movie_gru.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce600da938552049e3985141044107bf41171365090270f3405d9f17c7d822f
3
+ size 4549072