import math
import json

import numpy as np
import tensorflow as tf
import tfimm
import efficientnet.tfkeras as efnv1
import keras_efficientnet_v2 as efnv2
import tensorflow_hub as hub

embedding_size = 1024
n_images = 51033 + 27956  # train + test images in the competition


class DotDict(dict):
    """dot.notation access to dictionary attributes

    Reference: https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary/23689767#23689767
    """
    __getattr__ = dict.get  # returns None for missing keys, don't use getattr() with default!
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


def get_cfg(json_file):
    "Load a training config from a JSON file into a DotDict."
    with open(str(json_file)) as f:
        config_dict = json.load(f)
    return DotDict(config_dict)


def get_embeddings(img, embed_model):
    "Return the embedding for a single image, shape [1, embedding_size]."
    inp = img[None, ...]  # add batch dimension
    embeddings = embed_model.predict(inp, verbose=1, batch_size=1, workers=4, use_multiprocessing=True)
    return embeddings


# Train embeddings have to be re-ordered: embeddings were concatenated (train, valid)
# in the training notebook, and the valid fold is different for each ensemble model.
FOLDS = 10
shards, n_total = [], 0
for fold in range(FOLDS):
    n_img = 5104 if fold <= 2 else 5103
    shards.append(list(range(n_total, n_total + n_img)))
    n_total += n_img
assert n_total == 51033


def get_train_idx(use_fold):
    "Return an embedding index that restores the order of images in the tfrec files."
    train_folds = [i for i in range(FOLDS) if i != use_fold]
    valid_folds = [i for i in range(FOLDS) if i == use_fold]
    folds = train_folds + valid_folds  # order of saved embeddings (train + valid)

    train_idx = [shards[fold] for fold in folds]
    train_idx = np.concatenate(train_idx)

    return np.argsort(train_idx)


def get_comp_embeddings(emb_files, use_folds):
    "Load embeddings for all competition images, shape [n_images, n_models * embedding_size]."
    comp_embeddings = []

    for npz_file, use_fold in zip(emb_files, use_folds):
        # Get embeddings for all competition images
        d = np.load(str(npz_file))
        comp_train_emb = d['train']
        comp_test_emb = d['test']

        # Restore original order of comp_train_emb, targets (use targets as fingerprint-check)
        comp_train_idx = get_train_idx(use_fold)
        comp_train_emb = comp_train_emb[comp_train_idx, :]
        comp_embs = np.concatenate([comp_train_emb, comp_test_emb], axis=0)
        assert comp_embs.shape == (n_images, embedding_size)

        # L2-normalize embeddings
        comp_embs_norms = np.linalg.norm(comp_embs, axis=1)
        print("comp_embs norm:", comp_embs_norms.min(), "...", comp_embs_norms.max())
        comp_embs /= comp_embs_norms[:, None]

        comp_embeddings.append(comp_embs)

    return np.concatenate(comp_embeddings, axis=1)


def get_test_embedding(image, embed_models, sizes):
    "Embed one test image with each model, shape [1, n_models * embedding_size]."
    test_embedding = []

    for embed_model, size in zip(embed_models, sizes):
        # Scale the image to the model's input size
        scaled_image = tf.image.resize(image, size)
        scaled_image = tf.cast(scaled_image, tf.float32) / 255.0

        # Get embedding for the test image
        test_emb = get_embeddings(scaled_image, embed_model)  # shape: [1, embedding_size]
        assert test_emb.shape == (1, embedding_size)

        # L2-normalize embedding
        test_emb_norm = np.linalg.norm(test_emb, axis=1)
        test_emb /= test_emb_norm[:, None]

        test_embedding.append(test_emb)

    return np.concatenate(test_embedding, axis=1)


def p2logit(x):
    "Map a probability/similarity in (0, 1) to logit space."
    return np.log(x / (1 - x))


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def get_confidence(similarity, threshold):
    "Calculate confidence in the known/unknown prediction."
    if similarity <= 0:
        return 0
    logit_sim = p2logit(similarity)
    logit_threshold = p2logit(threshold)
    return sigmoid(abs(logit_sim - logit_threshold))
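
# Worked example (illustrative only, not used by the pipeline): at the decision
# threshold, get_confidence() returns sigmoid(0) = 0.5, and the confidence grows
# toward 1.0 as the similarity moves away from the threshold in either direction
# in logit space. The threshold and similarity values below are made up.
def _demo_confidence():
    threshold = 0.6
    for similarity in [0.55, 0.6, 0.65, 0.9]:
        print(f"sim={similarity:.2f} -> confidence={get_confidence(similarity, threshold):.3f}")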
class ArcMarginProductSubCenter(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    References:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
        https://github.com/haqishen/Google-Landmark-Recognition-2020-3rd-Place-Solution/

    Sub-center version: for k > 1, the embedding layer can learn k sub-centers per class.
    '''
    def __init__(self, n_classes, s=30, m=0.50, k=3, easy_margin=False, ls_eps=0.0, **kwargs):
        super(ArcMarginProductSubCenter, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.k = k
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'k': self.k,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        super(ArcMarginProductSubCenter, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes * self.k),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine_all = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        if self.k > 1:
            # Keep only the best-matching sub-center per class
            cosine_all = tf.reshape(cosine_all, [-1, self.n_classes, self.k])
            cosine = tf.math.reduce_max(cosine_all, axis=2)
        else:
            cosine = cosine_all
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output
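
# Minimal shape check for the sub-center head (a sketch; the class count, batch
# size, and label values below are arbitrary, not competition settings):
def _demo_arc_margin():
    n_classes, batch_size = 5, 4
    layer = ArcMarginProductSubCenter(n_classes=n_classes, s=30, m=0.3, k=3)
    X = tf.random.normal([batch_size, embedding_size])  # unnormalized embeddings
    y = tf.constant([0, 1, 2, 3])                       # integer class labels
    logits = layer([X, y])  # scaled cosines, margin applied to the target class
    assert logits.shape == (batch_size, n_classes)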
TFHUB = {
    'hub_efnv2s': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_s/feature_vector/2",
    'hub_efnv2m': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_m/feature_vector/2",
    'hub_efnv2l': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_l/feature_vector/2",
    'hub_efnv2xl': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_xl/feature_vector/2",
    'bit_m-r50x1': "https://tfhub.dev/google/bit/m-r50x1/1",
    'bit_m-r50x3': "https://tfhub.dev/google/bit/m-r50x3/1",
    'bit_m-r101x1': "https://tfhub.dev/google/bit/m-r101x1/1",
    'bit_m-r101x3': "https://tfhub.dev/google/bit/m-r101x3/1",
    'bit_m-r152x4': "https://tfhub.dev/google/bit/m-r152x4/1",
}


def get_model(cfg):
    "Build the training model and the (image -> embedding) inference model."
    aux_arcface = False  # Chris Deotte suggested this

    if cfg.head == 'arcface':
        head = ArcMarginProductSubCenter
    else:
        raise ValueError(f"Invalid head: {cfg.head}")

    if cfg.adaptive_margin:
        raise NotImplementedError

    if cfg.arch_name.startswith('efnv1'):
        EFN = {'efnv1b0': efnv1.EfficientNetB0, 'efnv1b1': efnv1.EfficientNetB1,
               'efnv1b2': efnv1.EfficientNetB2, 'efnv1b3': efnv1.EfficientNetB3,
               'efnv1b4': efnv1.EfficientNetB4, 'efnv1b5': efnv1.EfficientNetB5,
               'efnv1b6': efnv1.EfficientNetB6, 'efnv1b7': efnv1.EfficientNetB7}
    if cfg.arch_name.startswith('efnv2'):
        EFN = {'efnv2s': efnv2.EfficientNetV2S, 'efnv2m': efnv2.EfficientNetV2M,
               'efnv2l': efnv2.EfficientNetV2L, 'efnv2xl': efnv2.EfficientNetV2XL}

    with tf.distribute.get_strategy().scope():
        margin = head(
            n_classes=cfg.N_CLASSES,
            s=30,
            m=0.3,
            k=cfg.subcenters or 1,
            easy_margin=False,
            name=f'head/{cfg.head}',
            dtype='float32')

        inp = tf.keras.layers.Input(shape=[*cfg.IMAGE_SIZE, 3], name='inp1')
        label = tf.keras.layers.Input(shape=(), name='inp2')
        if aux_arcface:
            label2 = tf.keras.layers.Input(shape=(), name='inp3')

        if cfg.arch_name.startswith(('efnv1', 'efnv2')):
            if cfg.arch_name.startswith('efnv1'):
                x = EFN[cfg.arch_name](weights=cfg.pretrained, include_top=False)(inp)
            else:
                x = EFN[cfg.arch_name](input_shape=(None, None, 3), num_classes=0,
                                       pretrained=cfg.pretrained)(inp)
            # Pooling options, shared by both EfficientNet families
            if cfg.pool == 'flatten':
                embed = tf.keras.layers.Flatten()(x)
            elif cfg.pool == 'fc':
                embed = tf.keras.layers.Flatten()(x)
                embed = tf.keras.layers.Dropout(0.1)(embed)
                embed = tf.keras.layers.Dense(1024)(embed)
            elif cfg.pool == 'concat':
                # Average + max pooling concatenated (the original code pooled
                # twice with GlobalAveragePooling2D, duplicating the features)
                embed = tf.keras.layers.concatenate([
                    tf.keras.layers.GlobalAveragePooling2D()(x),
                    tf.keras.layers.GlobalMaxPooling2D()(x)])
            elif cfg.pool == 'max':
                embed = tf.keras.layers.GlobalMaxPooling2D()(x)
            else:
                embed = tf.keras.layers.GlobalAveragePooling2D()(x)
        elif cfg.arch_name in TFHUB:
            # tfhub models cannot be modified => pooling cannot be changed!
            url = TFHUB[cfg.arch_name]
            model = hub.KerasLayer(url, trainable=True)
            embed = model(inp)
            assert cfg.pool in [None, False, 'avg', ''], 'tfhub model, no custom pooling supported!'
        elif cfg.arch_name in tfimm.list_models(pretrained="timm"):
            embed = tfimm.create_model(cfg.arch_name, pretrained=None, nb_classes=0)(inp)

        if len(cfg.dropout_ps) > 0:
            # Chris Deotte posted model code without Dropout/FC1 after pooling
            embed = tf.keras.layers.Dropout(cfg.dropout_ps[0])(embed)
            embed = tf.keras.layers.Dense(1024)(embed)  # tunable embedding size
            embed = tf.keras.layers.BatchNormalization()(embed)  # missing in public notebooks

        x = margin([embed, label])
        output = tf.keras.layers.Softmax(dtype='float32', name='arc' if cfg.aux_loss else None)(x)

        if cfg.aux_loss:
            aux_features = tf.keras.layers.Dense(cfg.n_species)(embed)
            aux_output = tf.keras.layers.Softmax(dtype='float32', name='aux')(aux_features)

        inputs = [inp, label, label2] if (cfg.aux_loss and aux_arcface) else [inp, label]
        outputs = (output, aux_output) if cfg.aux_loss else [output]

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        embed_model = tf.keras.models.Model(inputs=inp, outputs=embed)

    if cfg.FREEZE_BATCH_NORM:
        raise NotImplementedError

    return model, embed_model
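
# End-to-end usage sketch (hypothetical file names, fold and threshold values;
# the inference notebook supplies the real ones):
def _demo_inference():
    cfg = get_cfg('config.json')                      # hypothetical config path
    model, embed_model = get_model(cfg)
    model.load_weights('model.h5')                    # hypothetical checkpoint
    comp_embs = get_comp_embeddings(['embeddings.npz'], use_folds=[0])
    image = tf.zeros([512, 512, 3], dtype=tf.uint8)   # stand-in for a decoded image
    test_emb = get_test_embedding(image, [embed_model], sizes=[cfg.IMAGE_SIZE])
    # Both sides are L2-normalized, so the matmul yields cosine similarities
    similarities = (test_emb @ comp_embs.T).squeeze()  # shape: (n_images,)
    nearest = int(np.argmax(similarities))
    confidence = get_confidence(similarities[nearest], threshold=0.6)  # made-up threshold
    print(nearest, confidence)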