# Image captioning on COCO 2017: InceptionV3 feature extractor + Transformer encoder/decoder.
import tensorflow as tf
import os
import json
import pandas as pd
import re
import numpy as np
import time
import matplotlib.pyplot as plt
import collections
import random
import requests
import pickle
from math import sqrt
from PIL import Image
from tqdm.auto import tqdm

# Hyperparameters and paths.
DATASET_PATH = './coco2017/'
MAX_LENGTH = 40          # maximum caption length in tokens
MAX_VOCABULARY = 12000   # vocabulary size for the TextVectorization layer
BATCH_SIZE = 64
BUFFER_SIZE = 1000
EMBEDDING_DIM = 512
UNITS = 512
EPOCHS = 5

# Load the COCO 2017 training annotations and build (image file, caption) pairs.
with open(f'{DATASET_PATH}/annotations/captions_train2017.json', 'r') as f:
    data = json.load(f)
    data = data['annotations']

img_cap_pairs = []
for sample in data:
    img_name = '%012d.jpg' % sample['image_id']   # COCO file names are zero-padded image ids
    img_cap_pairs.append([img_name, sample['caption']])

captions = pd.DataFrame(img_cap_pairs, columns=['image', 'caption'])
captions['image'] = captions['image'].apply(
    lambda x: f'{DATASET_PATH}/train2017/{x}'
)
# Work with a random subset of 70,000 image-caption pairs.
captions = captions.sample(70000)
captions = captions.reset_index(drop=True)
captions.head()

def preprocessing(text):
    """Lowercase, strip punctuation and extra whitespace, and add start/end tokens."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = '[start] ' + text + ' [end]'
    return text

captions['caption'] = captions['caption'].apply(preprocessing)
captions.head()

# Fit the tokenizer on the preprocessed captions.
tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=MAX_VOCABULARY,
    standardize=None,
    output_sequence_length=MAX_LENGTH)
tokenizer.adapt(captions['caption'])

# Persist the vocabulary so the same token-index mapping can be reused at inference time.
os.makedirs('./image-caption-generator/vocabulary', exist_ok=True)
with open('./image-caption-generator/vocabulary/vocab_coco.file', 'wb') as f:
    pickle.dump(tokenizer.get_vocabulary(), f)

# Lookup layers for converting between words and token indices.
word2idx = tf.keras.layers.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary()
)
idx2word = tf.keras.layers.StringLookup(
    mask_token="",
    vocabulary=tokenizer.get_vocabulary(),
    invert=True
)

# Group captions by image so an image's captions never straddle the train/test split.
img_to_cap_vector = collections.defaultdict(list)
for img, cap in zip(captions['image'], captions['caption']):
    img_to_cap_vector[img].append(cap)

img_keys = list(img_to_cap_vector.keys())
random.shuffle(img_keys)

# 80/20 split on image keys.
slice_index = int(len(img_keys) * 0.8)
img_name_train_keys, img_name_test_keys = (img_keys[:slice_index],
                                           img_keys[slice_index:])

train_img = []
train_caption = []
for imgt in img_name_train_keys:
    capt_len = len(img_to_cap_vector[imgt])
    train_img.extend([imgt] * capt_len)
    train_caption.extend(img_to_cap_vector[imgt])

test_img = []
test_caption = []
for imgtest in img_name_test_keys:
    capv_len = len(img_to_cap_vector[imgtest])
    test_img.extend([imgtest] * capv_len)
    test_caption.extend(img_to_cap_vector[imgtest])

len(train_img), len(train_caption), len(test_img), len(test_caption)

def load_data(img_path, caption):
    """Read and preprocess one image for InceptionV3, and tokenize its caption."""
    img = tf.io.read_file(img_path)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.keras.layers.Resizing(299, 299)(img)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    caption = tokenizer(caption)
    return img, caption

# tf.data pipelines for training and evaluation.
train_dataset = tf.data.Dataset.from_tensor_slices((train_img, train_caption))
train_dataset = train_dataset.map(
    load_data, num_parallel_calls=tf.data.AUTOTUNE).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_img, test_caption))
test_dataset = test_dataset.map(
    load_data, num_parallel_calls=tf.data.AUTOTUNE).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Light image augmentation applied only during training.
image_augmentation = tf.keras.Sequential(
    [
        tf.keras.layers.RandomFlip("horizontal"),
        tf.keras.layers.RandomRotation(0.2),
        tf.keras.layers.RandomContrast(0.3),
    ]
)

def CNN_Encoder():
    """InceptionV3 backbone whose 8x8x2048 feature map is flattened to a sequence of 64 vectors."""
    inception_v3 = tf.keras.applications.InceptionV3(
        include_top=False,
        weights='imagenet'
    )
    output = inception_v3.output
    output = tf.keras.layers.Reshape(
        (-1, output.shape[-1]))(output)
    cnn_model = tf.keras.models.Model(inception_v3.input, output)
    return cnn_model
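# --- Optional sanity check (illustrative sketch; not part of the original pipeline) ---
# Pulling one batch eagerly confirms the tf.data pipeline yields images of shape
# (BATCH_SIZE, 299, 299, 3) and tokenized captions of shape (BATCH_SIZE, MAX_LENGTH)
# before any model is built. The variable names below are assumptions for this check only.
sample_imgs, sample_caps = next(iter(train_dataset))
print('image batch:  ', sample_imgs.shape)   # expected (64, 299, 299, 3)
print('caption batch:', sample_caps.shape)   # expected (64, 40)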
class TransformerEncoderLayer(tf.keras.layers.Layer):
    """A single Transformer encoder block applied to the CNN image features."""

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.layer_norm_1 = tf.keras.layers.LayerNormalization()
        self.layer_norm_2 = tf.keras.layers.LayerNormalization()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense = tf.keras.layers.Dense(embed_dim, activation="relu")

    def call(self, x, training):
        # Project the 2048-dim CNN features down to embed_dim, then self-attend.
        x = self.layer_norm_1(x)
        x = self.dense(x)

        attn_output = self.attention(
            query=x,
            value=x,
            key=x,
            attention_mask=None,
            training=training
        )
        x = self.layer_norm_2(x + attn_output)
        return x


class Embeddings(tf.keras.layers.Layer):
    """Token embeddings plus learned position embeddings."""

    def __init__(self, vocab_size, embed_dim, max_len):
        super().__init__()
        self.token_embeddings = tf.keras.layers.Embedding(
            vocab_size, embed_dim)
        self.position_embeddings = tf.keras.layers.Embedding(
            max_len, embed_dim, input_shape=(None, max_len))

    def call(self, input_ids):
        length = tf.shape(input_ids)[-1]
        position_ids = tf.range(start=0, limit=length, delta=1)
        position_ids = tf.expand_dims(position_ids, axis=0)

        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        return token_embeddings + position_embeddings


class TransformerDecoderLayer(tf.keras.layers.Layer):
    """Transformer decoder block: masked self-attention over the caption,
    cross-attention over the encoded image, and a position-wise feed-forward network."""

    def __init__(self, embed_dim, units, num_heads):
        super().__init__()
        self.embedding = Embeddings(
            tokenizer.vocabulary_size(), embed_dim, MAX_LENGTH)

        self.attention_1 = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )

        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.layernorm_3 = tf.keras.layers.LayerNormalization()

        self.ffn_layer_1 = tf.keras.layers.Dense(units, activation="relu")
        self.ffn_layer_2 = tf.keras.layers.Dense(embed_dim)

        self.out = tf.keras.layers.Dense(
            tokenizer.vocabulary_size(), activation="softmax")

        self.dropout_1 = tf.keras.layers.Dropout(0.3)
        self.dropout_2 = tf.keras.layers.Dropout(0.5)

    def call(self, input_ids, encoder_output, training, mask=None):
        embeddings = self.embedding(input_ids)

        # Combine the padding mask with a causal mask so each position can only
        # attend to earlier, non-padded caption tokens.
        combined_mask = None
        padding_mask = None
        if mask is not None:
            causal_mask = self.get_causal_attention_mask(embeddings)
            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
            combined_mask = tf.minimum(combined_mask, causal_mask)

        # Masked self-attention over the caption tokens.
        attn_output_1 = self.attention_1(
            query=embeddings,
            value=embeddings,
            key=embeddings,
            attention_mask=combined_mask,
            training=training
        )
        out_1 = self.layernorm_1(embeddings + attn_output_1)

        # Cross-attention: caption tokens attend to the encoded image features.
        attn_output_2 = self.attention_2(
            query=out_1,
            value=encoder_output,
            key=encoder_output,
            attention_mask=padding_mask,
            training=training
        )
        out_2 = self.layernorm_2(out_1 + attn_output_2)

        # Position-wise feed-forward network with residual connection.
        ffn_out = self.ffn_layer_1(out_2)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)

        ffn_out = self.layernorm_3(ffn_out + out_2)
        ffn_out = self.dropout_2(ffn_out, training=training)
        preds = self.out(ffn_out)
        return preds

    def get_causal_attention_mask(self, inputs):
        # Lower-triangular (batch, seq_len, seq_len) mask: position i attends to j <= i.
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0
        )
        return tf.tile(mask, mult)
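# --- Illustrative check of the causal mask (a standalone sketch; not used by training) ---
# get_causal_attention_mask builds a lower-triangular matrix tiled over the batch, so
# caption position t can attend only to positions <= t. The toy sequence length of 4
# below is an assumption chosen purely for demonstration.
_i = tf.range(4)[:, tf.newaxis]
_j = tf.range(4)
print(tf.cast(_i >= _j, tf.int32).numpy())
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]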
class ImageCaptioningModel(tf.keras.Model):
    """Wraps the CNN feature extractor, Transformer encoder and decoder, and
    implements masked loss/accuracy plus custom train/test steps."""

    def __init__(self, cnn_model, encoder, decoder, image_aug=None):
        super().__init__()
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.image_aug = image_aug
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")
        self.acc_tracker = tf.keras.metrics.Mean(name="accuracy")

    def calculate_loss(self, y_true, y_pred, mask):
        # Average the per-token loss over non-padding positions only.
        loss = self.loss(y_true, y_pred)
        mask = tf.cast(mask, dtype=loss.dtype)
        loss *= mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

    def calculate_accuracy(self, y_true, y_pred, mask):
        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
        accuracy = tf.math.logical_and(mask, accuracy)
        accuracy = tf.cast(accuracy, dtype=tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

    def compute_loss_and_acc(self, img_embed, captions, training=True):
        encoder_output = self.encoder(img_embed, training=training)
        # Teacher forcing: the decoder sees tokens [0..T-1] and predicts tokens [1..T].
        y_input = captions[:, :-1]
        y_true = captions[:, 1:]
        mask = (y_true != 0)
        y_pred = self.decoder(
            y_input, encoder_output, training=training, mask=mask
        )
        loss = self.calculate_loss(y_true, y_pred, mask)
        acc = self.calculate_accuracy(y_true, y_pred, mask)
        return loss, acc

    def train_step(self, batch):
        imgs, captions = batch

        if self.image_aug:
            imgs = self.image_aug(imgs)

        img_embed = self.cnn_model(imgs)

        with tf.GradientTape() as tape:
            loss, acc = self.compute_loss_and_acc(
                img_embed, captions
            )

        # Only the encoder and decoder are trained; the CNN backbone stays frozen.
        train_vars = (
            self.encoder.trainable_variables + self.decoder.trainable_variables
        )
        grads = tape.gradient(loss, train_vars)
        self.optimizer.apply_gradients(zip(grads, train_vars))
        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)

        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    def test_step(self, batch):
        imgs, captions = batch

        img_embed = self.cnn_model(imgs)

        loss, acc = self.compute_loss_and_acc(
            img_embed, captions, training=False
        )

        self.loss_tracker.update_state(loss)
        self.acc_tracker.update_state(acc)

        return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

    @property
    def metrics(self):
        return [self.loss_tracker, self.acc_tracker]


encoder = TransformerEncoderLayer(EMBEDDING_DIM, 1)
decoder = TransformerDecoderLayer(EMBEDDING_DIM, UNITS, 8)

cnn_model = CNN_Encoder()

caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)

# The decoder ends in a softmax, so from_logits=False; reduction="none" keeps the
# per-token losses so padding can be masked out in calculate_loss().
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False,
    reduction="none"
)

early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=3, restore_best_weights=True)

caption_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=cross_entropy
)

# The held-out 20% split (test_dataset) doubles as the validation set monitored
# by early stopping.
history = caption_model.fit(
    train_dataset,
    epochs=EPOCHS,
    validation_data=test_dataset,
    callbacks=[early_stopping]
)

os.makedirs('./image-caption-generator/models', exist_ok=True)
caption_model.save_weights('./image-caption-generator/models/trained_coco_weights.h5')
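# --- Hedged usage sketch: greedy decoding at inference time ---
# This is not part of the training script above. It sketches how a caption could be
# generated with the trained model: encode the image once, then repeatedly feed the
# tokens predicted so far back into the decoder until '[end]' (or MAX_LENGTH) is
# reached. The function name `generate_caption` and the example path are illustrative
# assumptions only.
def generate_caption(img_path):
    # Preprocess the image exactly as in load_data().
    img = tf.io.read_file(img_path)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.keras.layers.Resizing(299, 299)(img)
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    img = tf.expand_dims(img, axis=0)                 # add a batch dimension

    # Encode the image once; the decoder is then run autoregressively.
    img_embed = caption_model.cnn_model(img)
    img_encoded = caption_model.encoder(img_embed, training=False)

    y_inp = '[start]'
    for i in range(MAX_LENGTH - 1):
        tokenized = tokenizer([y_inp])[:, :-1]
        mask = tf.cast(tokenized != 0, tf.int32)
        pred = caption_model.decoder(
            tokenized, img_encoded, training=False, mask=mask)

        pred_idx = np.argmax(pred[0, i, :])           # greedy: pick the most likely next token
        pred_word = idx2word(pred_idx).numpy().decode('utf-8')
        if pred_word == '[end]':
            break
        y_inp += ' ' + pred_word

    return y_inp.replace('[start] ', '')

# Example call (the image path is a placeholder):
# print(generate_caption(f'{DATASET_PATH}/train2017/000000000009.jpg'))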