import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
import warnings
warnings.filterwarnings('ignore')
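
# Image-captioning inference module: InceptionV3 feature maps feed a dense
# Encoder, and a GRU Decoder with Bahdanau (additive) attention generates the
# caption one token at a time (the standard "Show, Attend and Tell" setup).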

class Encoder(Model):
    """Projects the pre-extracted CNN feature vectors into the embedding space."""

    def __init__(self, embed_dim):
        super(Encoder, self).__init__()
        self.dense = tf.keras.layers.Dense(embed_dim)

    def call(self, features):
        # features: (batch, 64, 2048) InceptionV3 activations
        features = self.dense(features)
        features = tf.keras.activations.relu(features)
        return features


class Attention_model(Model):
    """Bahdanau (additive) attention over the encoder output."""

    def __init__(self, units):
        super(Attention_model, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        self.units = units

    def call(self, features, hidden):
        # features: (batch, 64, embed_dim); hidden: (batch, units)
        hidden_with_time_axis = hidden[:, tf.newaxis]

        # Additive score: tanh(W1*features + W2*hidden) -> (batch, 64, units)
        score = tf.keras.activations.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # Normalise across the 64 spatial locations -> (batch, 64, 1)
        attention_weights = tf.keras.activations.softmax(self.V(score), axis=1)

        # Weighted sum of the features -> (batch, embed_dim)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


class Decoder(Model):
    """Single-step GRU decoder that attends over the image features."""

    def __init__(self, embed_dim, units, vocab_size):
        super(Decoder, self).__init__()
        self.units = units
        self.attention = Attention_model(self.units)
        self.embed = tf.keras.layers.Embedding(vocab_size, embed_dim)
        self.gru = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.d1 = tf.keras.layers.Dense(self.units)
        self.d2 = tf.keras.layers.Dense(vocab_size)

    def call(self, x, features, hidden):
        # x: (batch, 1) token ids for the current decoding step
        context_vector, attention_weights = self.attention(features, hidden)
        embed = self.embed(x)
        # Prepend the attention context to the token embedding
        embed = tf.concat([tf.expand_dims(context_vector, 1), embed], axis=-1)
        output, state = self.gru(embed)
        output = self.d1(output)
        output = tf.reshape(output, (-1, output.shape[2]))
        output = self.d2(output)  # (batch, vocab_size) logits

        return output, state, attention_weights

    def init_state(self, batch_size):
        return tf.zeros((batch_size, self.units))

    # Alias for init_state; both names are kept because both are used below.
    def reset_state(self, batch_size):
        return self.init_state(batch_size)


# Loading the tokenizer
with open("efb-requirements/tokenizer.json", 'r', encoding='utf-8') as f:
    loaded_tokenizer_json = f.read()

tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(loaded_tokenizer_json)
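# (tokenizer.json is assumed to be the output of Tokenizer.to_json(), saved
#  after fitting the tokenizer on the training captions.)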


def load_and_process_image(image, target_size=(299, 299)):
    # Assumes a uint8 RGB array in [0, 255]; resizes to InceptionV3's expected
    # 299x299 input and rescales pixels to [-1, 1].
    img = tf.convert_to_tensor(image)
    img = tf.cast(img, tf.float32)
    img = tf.image.resize(img, target_size)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img


# InceptionV3 feature extractor (convolutional feature maps, no classifier head)
image_features_extract_model = tf.keras.models.load_model("efb-requirements/inception_v3.h5")

# Hyperparameters; these must match the configuration the weights were trained with
embedding_dim = 256
units = 512
vocab_size = 5001

encoder = Encoder(embedding_dim)
decoder = Decoder(embedding_dim, units, vocab_size)

# One dummy forward pass so the subclassed models build their variables
# before load_weights() is called
dummy_img_input = tf.ones((32, 64, 2048))
features = encoder(dummy_img_input)
hidden = decoder.init_state(32)
dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * 32, 1)
dec = decoder(dec_input, features, hidden)

# Loading the trained weights
encoder.load_weights("efb-requirements/encoder_50epoch_weights.h5")
decoder.load_weights("efb-requirements/decoder_50epoch_weights.h5")


def evaluate(image):
    """Greedily decodes a caption for `image`; returns the list of predicted words."""
    max_length = 39
    attention_plot = np.zeros((max_length, 64))

    hidden = decoder.reset_state(batch_size=1)
    # Processing the input image to the desired format before extracting features
    temp_input = tf.expand_dims(load_and_process_image(image), 0)

    img_tensor_val = image_features_extract_model(temp_input)
    # Flatten the 8x8 spatial grid to 64 locations for the attention module
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

    cnn_features = encoder(img_tensor_val)

    decoder_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)

    result = []
    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(decoder_input, cnn_features, hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1,)).numpy()

        # Greedy decoding: take the highest-scoring token at each step
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = tokenizer.index_word[predicted_id]
        result.append(word)

        if word == '<end>':
            # return result, attention_plot, predictions
            return result

        # Feed the prediction back in as the next decoder input
        decoder_input = tf.expand_dims([predicted_id], 0)

    attention_plot = attention_plot[:len(result), :]
    # return result, attention_plot, predictions
    return result
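

# Minimal usage sketch: "example.jpg" is a placeholder path; evaluate() accepts
# any RGB image array with values in [0, 255].
if __name__ == '__main__':
    from PIL import Image

    img = np.array(Image.open("example.jpg").convert("RGB"))
    caption = ' '.join(evaluate(img))
    print(caption)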