Lokesh1024 committed
Commit 5601d1b · verified · 1 Parent(s): c927a02

Delete model.py

Files changed (1)
  1. model.py +0 -331
model.py DELETED
@@ -1,331 +0,0 @@
import pickle
import tensorflow as tf
import pandas as pd
import numpy as np

import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'


# CONSTANTS
MAX_LENGTH = 40
# VOCABULARY_SIZE = 10000
BATCH_SIZE = 32
BUFFER_SIZE = 1000
EMBEDDING_DIM = 512
UNITS = 512


# LOADING DATA
vocab = pickle.load(open('saved_vocabulary/vocab_coco.file', 'rb'))

tokenizer = tf.keras.layers.TextVectorization(
    # max_tokens=VOCABULARY_SIZE,
    standardize=None,
    output_sequence_length=MAX_LENGTH,
    vocabulary=vocab
)

idx2word = tf.keras.layers.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary(),
    invert=True
)


# MODEL
def CNN_Encoder():
    inception_v3 = tf.keras.applications.InceptionV3(
        include_top=False,
        weights='imagenet'
    )

    output = inception_v3.output
    output = tf.keras.layers.Reshape(
        (-1, output.shape[-1]))(output)

    cnn_model = tf.keras.models.Model(inception_v3.input, output)
    return cnn_model


class TransformerEncoderLayer(tf.keras.layers.Layer):

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.layer_norm_1 = tf.keras.layers.LayerNormalization()
        self.layer_norm_2 = tf.keras.layers.LayerNormalization()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense = tf.keras.layers.Dense(embed_dim, activation="relu")

    def call(self, x, training):
        x = self.layer_norm_1(x)
        x = self.dense(x)

        attn_output = self.attention(
            query=x,
            value=x,
            key=x,
            attention_mask=None,
            training=training
        )

        x = self.layer_norm_2(x + attn_output)
        return x


class Embeddings(tf.keras.layers.Layer):

    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        self.token_embeddings = tf.keras.layers.Embedding(
            vocab_size, embed_dim)
        self.position_embeddings = tf.keras.layers.Embedding(
            max_len, embed_dim, input_shape=(None, max_len))

    def call(self, input_ids):
        length = tf.shape(input_ids)[-1]
        position_ids = tf.range(start=0, limit=length, delta=1)
        position_ids = tf.expand_dims(position_ids, axis=0)

        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)

        return token_embeddings + position_embeddings


class TransformerDecoderLayer(tf.keras.layers.Layer):

    def __init__(self, embed_dim, units, num_heads):
        super().__init__()
        self.embedding = Embeddings(
            tokenizer.vocabulary_size(), embed_dim, MAX_LENGTH)

        self.attention_1 = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )

        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.layernorm_3 = tf.keras.layers.LayerNormalization()

        self.ffn_layer_1 = tf.keras.layers.Dense(units, activation="relu")
        self.ffn_layer_2 = tf.keras.layers.Dense(embed_dim)

        self.out = tf.keras.layers.Dense(
            tokenizer.vocabulary_size(), activation="softmax")

        self.dropout_1 = tf.keras.layers.Dropout(0.3)
        self.dropout_2 = tf.keras.layers.Dropout(0.5)

    def call(self, input_ids, encoder_output, training, mask=None):
        embeddings = self.embedding(input_ids)

        combined_mask = None
        padding_mask = None

        if mask is not None:
            causal_mask = self.get_causal_attention_mask(embeddings)
            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            combined_mask = tf.minimum(combined_mask, causal_mask)

        attn_output_1 = self.attention_1(
            query=embeddings,
            value=embeddings,
            key=embeddings,
            attention_mask=combined_mask,
            training=training
        )

        out_1 = self.layernorm_1(embeddings + attn_output_1)

        attn_output_2 = self.attention_2(
            query=out_1,
            value=encoder_output,
            key=encoder_output,
            attention_mask=padding_mask,
            training=training
        )

        out_2 = self.layernorm_2(out_1 + attn_output_2)

        ffn_out = self.ffn_layer_1(out_2)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)

        ffn_out = self.layernorm_3(ffn_out + out_2)
        ffn_out = self.dropout_2(ffn_out, training=training)
        preds = self.out(ffn_out)
        return preds

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0
        )
        return tf.tile(mask, mult)


class ImageCaptioningModel(tf.keras.Model):

    def __init__(self, cnn_model, encoder, decoder, image_aug=None):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.image_aug = image_aug
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")
        self.acc_tracker = tf.keras.metrics.Mean(name="accuracy")

    def calculate_loss(self, y_true, y_pred, mask):
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_true, y_pred, mask):
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def compute_loss_and_acc(self, img_embed, captions, training=True):
        # Propagate the training flag instead of hard-coding True, so dropout
        # is disabled when this is called from test_step.
        encoder_output = self.encoder(img_embed, training=training)
        y_input = captions[:, :-1]
        y_true = captions[:, 1:]
        mask = (y_true != 0)
        y_pred = self.decoder(
            y_input, encoder_output, training=training, mask=mask
        )
        loss = self.calculate_loss(y_true, y_pred, mask)
        acc = self.calculate_accuracy(y_true, y_pred, mask)
        return loss, acc

    def train_step(self, batch):
        imgs, captions = batch

        if self.image_aug:
            imgs = self.image_aug(imgs)

        img_embed = self.cnn_model(imgs)

        with tf.GradientTape() as tape:
            loss, acc = self.compute_loss_and_acc(
                img_embed, captions
            )

        train_vars = (
            self.encoder.trainable_variables + self.decoder.trainable_variables
        )
        grads = tape.gradient(loss, train_vars)
        self.optimizer.apply_gradients(zip(grads, train_vars))
        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)

        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    def test_step(self, batch):
        imgs, captions = batch

        img_embed = self.cnn_model(imgs)

        loss, acc = self.compute_loss_and_acc(
            img_embed, captions, training=False
        )

        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)

        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    @property
    def metrics(self):
        return [self.loss_tracker, self.acc_tracker]


def load_image_from_path(img_path):
    img = tf.io.read_file(img_path)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.keras.layers.Resizing(299, 299)(img)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img


def generate_caption(img, caption_model, add_noise=False):
    if isinstance(img, str):
        img = load_image_from_path(img)

    if add_noise:
        noise = tf.random.normal(img.shape) * 0.1
        img = img + noise
        img = (img - tf.reduce_min(img)) / (tf.reduce_max(img) - tf.reduce_min(img))

    img = tf.expand_dims(img, axis=0)
    img_embed = caption_model.cnn_model(img)
    img_encoded = caption_model.encoder(img_embed, training=False)

    y_inp = '[start]'
    for i in range(MAX_LENGTH - 1):
        tokenized = tokenizer([y_inp])[:, :-1]
        mask = tf.cast(tokenized != 0, tf.int32)
        pred = caption_model.decoder(
            tokenized, img_encoded, training=False, mask=mask)

        pred_idx = np.argmax(pred[0, i, :])
        pred_word = idx2word(pred_idx).numpy().decode('utf-8')
        if pred_word == '[end]':
            break

        y_inp += ' ' + pred_word

    y_inp = y_inp.replace('[start] ', '')
    return y_inp


def get_caption_model():
    encoder = TransformerEncoderLayer(EMBEDDING_DIM, 1)
    decoder = TransformerDecoderLayer(EMBEDDING_DIM, UNITS, 8)

    cnn_model = CNN_Encoder()

    caption_model = ImageCaptioningModel(
        cnn_model=cnn_model, encoder=encoder, decoder=decoder, image_aug=None,
    )

    def call_fn(batch, training):
        return batch

    caption_model.call = call_fn
    sample_x, sample_y = tf.random.normal((1, 299, 299, 3)), tf.zeros((1, 40))

    # Run dummy data through the model and its encoder/decoder once so that
    # every layer creates its variables before the saved weights are loaded.
    caption_model((sample_x, sample_y))

    sample_img_embed = caption_model.cnn_model(sample_x)
    sample_enc_out = caption_model.encoder(sample_img_embed, training=False)
    caption_model.decoder(sample_y, sample_enc_out, training=False)

    try:
        caption_model.load_weights('saved_models/image_captioning_coco_weights.h5')
    except FileNotFoundError:
        caption_model.load_weights('Image-Captioning/saved_models/image_captioning_coco_weights.h5')

    return caption_model