import json
import math
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import tfimm
import efficientnet
import efficientnet.tfkeras as efnv1
import keras_efficientnet_v2 as efnv2
import tensorflow_hub as hub


class DotDict(dict):
    """dot.notation access to dictionary attributes

    Reference:
    https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary/23689767#23689767
    """
    __getattr__ = dict.get  # returns None if missing key, don't use getattr() with default!
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


def get_cfg(rst_file):
    "Load the training config that was saved alongside the checkpoint."
    json_file = str(rst_file).replace('.h5', '_config.json')
    with open(json_file) as f:
        config_dict = json.load(f)
    return DotDict(config_dict)


def get_embeddings(img, embed_model):
    "Return the embedding of a single image, shape [1, embedding_size]."
    inp = img[None, ...]
    embeddings = embed_model.predict(inp, verbose=1, batch_size=1, workers=4,
                                     use_multiprocessing=True)
    return embeddings


# Train embeddings have to be re-ordered: embeddings were concatenated (train, valid)
# in the training notebook and the valid fold is different for each ensemble model.
FOLDS = 10
shards, n_total = [], 0
for fold in range(10):
    n_img = 5104 if fold <= 2 else 5103  # the first three shards hold one extra image
    shards.append(list(range(n_total, n_total + n_img)))
    n_total += n_img
assert n_total == 51033


def get_train_idx(use_fold):
    "Return embedding index that restores the order of images in the tfrec files."
    train_folds = [i for i in range(10) if i % FOLDS != use_fold]
    valid_folds = [i for i in range(10) if i % FOLDS == use_fold]
    folds = train_folds + valid_folds  # order of saved embeddings (train + valid)

    train_idx = []
    for fold in folds:
        train_idx.append(shards[fold])
    train_idx = np.concatenate(train_idx)

    return np.argsort(train_idx)
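
# Illustrative sanity check (not from the pipeline itself): embeddings saved in
# (train folds, valid fold) order, when indexed with get_train_idx(), come back
# in plain shard order 0..n_total-1.
_saved_order = np.concatenate([shards[f] for f in range(10) if f != 3] + [shards[3]])
assert (_saved_order[get_train_idx(3)] == np.arange(n_total)).all()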

use_fold = {
    'efnv1b7_colab216_emb.npz': 4,
    'efnv1b7_colab225_emb.npz': 1,
    'efnv1b7_colab197_emb.npz': 0,
    'efnv1b7_colab227_emb.npz': 5,
    'efnv1b7_v72_emb.npz': 6,
    'efnv1b7_colab229_emb.npz': 9,
    'efnv1b6_colab217_emb.npz': 5,
    'efnv1b6_colab218_emb.npz': 6,
    'hub_efnv2xl_colab221_emb.npz': 8,
    'hub_efnv2xl_v69_emb.npz': 2,
    'hub_efnv2xl_v73_emb.npz': 0,
    'efnv1b6_colab226_emb.npz': 2,
    'hub_efnv2l_v70_emb.npz': 3,
    'hub_efnv2l_colab200_emb.npz': 2,
    'hub_efnv2l_colab199_emb.npz': 1,
    'convnext_base_384_in22ft1k_v68_emb.npz': 0,
    'convnext_base_384_in22ft1k_colab220_emb.npz': 9,
    'convnext_base_384_in22ft1k_colab201_emb.npz': 3,  # new
}


def get_comp_embeddings(rst_files):
    "Load embeddings for competition images, shape [n_images, n_models * embedding_size]."
    # Relies on notebook globals: emb_path, n_images, embedding_size
    comp_embeddings = []

    for rst_file in rst_files:
        # Get embeddings for all competition images
        npz_file = Path(rst_file.replace('.h5', '_emb.npz')).name
        d = np.load(str(Path(emb_path) / npz_file))
        comp_train_emb = d['train']
        comp_test_emb = d['test']

        # Restore original order of comp_train_emb, targets (use targets as fingerprint-check)
        comp_train_idx = get_train_idx(use_fold[npz_file])
        comp_train_emb = comp_train_emb[comp_train_idx, :]
        comp_embs = np.concatenate([comp_train_emb, comp_test_emb], axis=0)
        assert comp_embs.shape == (n_images, embedding_size)

        # Normalize embeddings
        comp_embs_norms = np.linalg.norm(comp_embs, axis=1)
        print("comp_embs norm:", comp_embs_norms.min(), "...", comp_embs_norms.max())
        comp_embs /= comp_embs_norms[:, None]

        comp_embeddings.append(comp_embs)

    return np.concatenate(comp_embeddings, axis=1)


def get_test_embedding(embed_models, sizes):
    "Embed the test image with each model and concatenate the normalized embeddings."
    # Relies on notebook globals: img (the query image), embedding_size
    test_embedding = []

    for embed_model, size in zip(embed_models, sizes):
        # Get model input
        scaled_img = tf.image.resize(img, size)
        scaled_img = tf.cast(scaled_img, tf.float32) / 255.0
        #print("test image normalized and resized to", scaled_img.shape[:2])

        # Get embedding for test image
        test_emb = get_embeddings(scaled_img, embed_model)  # shape: [1, embedding_size]
        assert test_emb.shape == (1, embedding_size)

        # Normalize embeddings
        test_emb_norm = np.linalg.norm(test_emb, axis=1)
        #print("test_emb norm: ", test_emb_norm[0])
        test_emb /= test_emb_norm[:, None]

        test_embedding.append(test_emb)

    return np.concatenate(test_embedding, axis=1)  # [1, n_models * embedding_size]
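
# Because every per-model segment of the concatenated embeddings is L2-normalized,
# the dot product of two concatenated vectors is the sum of per-model cosine
# similarities, so the neighbour lookup these two functions feed into reduces to
# a matrix product. A minimal sketch (the function name is illustrative, not from
# the original notebook):
def most_similar(comp_embeddings, test_embedding, top_k=5):
    "Return indices and scores of the top_k competition images closest to the test image."
    sims = comp_embeddings @ test_embedding[0]  # [n_images], sum of per-model cosines
    top = np.argsort(sims)[::-1][:top_k]
    return top, sims[top]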

class ArcMarginProductSubCenter(tf.keras.layers.Layer):
    '''
    Implements large margin arc distance.

    References:
        https://arxiv.org/pdf/1801.07698.pdf
        https://github.com/lyakaap/Landmark2019-1st-and-3rd-Place-Solution/
        https://github.com/haqishen/Google-Landmark-Recognition-2020-3rd-Place-Solution/

    Sub-center version: for k > 1, the embedding layer can learn k sub-centers per class
    '''
    def __init__(self, n_classes, s=30, m=0.50, k=3, easy_margin=False, ls_eps=0.0, **kwargs):
        super(ArcMarginProductSubCenter, self).__init__(**kwargs)

        self.n_classes = n_classes
        self.s = s
        self.m = m
        self.k = k
        self.ls_eps = ls_eps
        self.easy_margin = easy_margin
        self.cos_m = tf.math.cos(m)
        self.sin_m = tf.math.sin(m)
        self.th = tf.math.cos(math.pi - m)
        self.mm = tf.math.sin(math.pi - m) * m

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'n_classes': self.n_classes,
            's': self.s,
            'm': self.m,
            'k': self.k,
            'ls_eps': self.ls_eps,
            'easy_margin': self.easy_margin,
        })
        return config

    def build(self, input_shape):
        super(ArcMarginProductSubCenter, self).build(input_shape[0])

        self.W = self.add_weight(
            name='W',
            shape=(int(input_shape[0][-1]), self.n_classes * self.k),
            initializer='glorot_uniform',
            dtype='float32',
            trainable=True)

    def call(self, inputs):
        X, y = inputs
        y = tf.cast(y, dtype=tf.int32)
        cosine_all = tf.matmul(
            tf.math.l2_normalize(X, axis=1),
            tf.math.l2_normalize(self.W, axis=0)
        )
        if self.k > 1:
            # Sub-centers: keep only the best-matching sub-center per class
            cosine_all = tf.reshape(cosine_all, [-1, self.n_classes, self.k])
            cosine = tf.math.reduce_max(cosine_all, axis=2)
        else:
            cosine = cosine_all
        sine = tf.math.sqrt(1.0 - tf.math.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = tf.where(cosine > 0, phi, cosine)
        else:
            phi = tf.where(cosine > self.th, phi, cosine - self.mm)
        one_hot = tf.cast(
            tf.one_hot(y, depth=self.n_classes),
            dtype=cosine.dtype
        )
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.n_classes
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output


TFHUB = {
    'hub_efnv2s': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_s/feature_vector/2",
    'hub_efnv2m': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_m/feature_vector/2",
    'hub_efnv2l': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_l/feature_vector/2",
    'hub_efnv2xl': "https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_ft1k_xl/feature_vector/2",
    'bit_m-r50x1': "https://tfhub.dev/google/bit/m-r50x1/1",
    'bit_m-r50x3': "https://tfhub.dev/google/bit/m-r50x3/1",
    'bit_m-r101x1': "https://tfhub.dev/google/bit/m-r101x1/1",
    'bit_m-r101x3': "https://tfhub.dev/google/bit/m-r101x3/1",
    'bit_m-r152x4': "https://tfhub.dev/google/bit/m-r152x4/1",
}
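
# `GeMPoolingLayer` is referenced by get_model() below but not defined in this
# snippet. A minimal sketch, assuming the standard generalized-mean (GeM) pooling
# formulation (Radenovic et al. 2018): (mean(x^p))^(1/p) over the spatial
# dimensions, with optionally learnable exponent p (p=1 is average pooling,
# p -> inf approaches max pooling).
class GeMPoolingLayer(tf.keras.layers.Layer):
    def __init__(self, p=3., train_p=False, eps=1e-6, **kwargs):
        super().__init__(**kwargs)
        self.init_p = p
        self.train_p = train_p
        self.eps = eps

    def build(self, input_shape):
        self.p = self.add_weight(name='p', shape=[1],
                                 initializer=tf.keras.initializers.Constant(self.init_p),
                                 trainable=self.train_p)
        super().build(input_shape)

    def call(self, inputs):
        # Clamp activations away from zero so x**p stays well-defined
        x = tf.maximum(inputs, self.eps)
        x = tf.pow(x, self.p)
        x = tf.reduce_mean(x, axis=[1, 2])  # pool over height and width
        return tf.pow(x, 1. / self.p)

    def get_config(self):
        config = super().get_config()
        config.update({'p': self.init_p, 'train_p': self.train_p, 'eps': self.eps})
        return config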

def get_model(cfg):
    aux_arcface = False  # Chris Deotte suggested this

    if cfg.head == 'arcface2':
        head = ArcMarginPenaltyLogists       # defined elsewhere in the notebook
    elif cfg.head == 'arcface':
        head = ArcMarginProductSubCenter
    elif cfg.head == 'addface':
        head = AddMarginProductSubCenter     # defined elsewhere in the notebook
    else:
        assert False, "INVALID HEAD"

    if cfg.adaptive_margin:
        # Define adaptive margins depending on class frequencies (dynamic margins):
        # rare individuals get a larger margin than frequent ones.
        df = pd.read_csv(f'{project_dir}/train.csv')
        fewness = df['individual_id'].value_counts().sort_index() ** (-1/4)
        fewness -= fewness.min()
        fewness /= fewness.max() - fewness.min()
        adaptive_margin = cfg.margin_min + fewness * (cfg.margin_max - cfg.margin_min)

        # Align margins with targets
        splits_path = '/kaggle/input/happywhale-splits'
        with open(f'{splits_path}/individual_ids.json', "r") as f:
            target_encodings = json.loads(f.read())  # individual_id: index
        individual_ids = pd.Series(target_encodings).sort_values().index.values
        adaptive_margin = adaptive_margin.loc[individual_ids].values.astype(np.float32)

    if cfg.arch_name.startswith('efnv1'):
        EFN = {'efnv1b0': efnv1.EfficientNetB0, 'efnv1b1': efnv1.EfficientNetB1,
               'efnv1b2': efnv1.EfficientNetB2, 'efnv1b3': efnv1.EfficientNetB3,
               'efnv1b4': efnv1.EfficientNetB4, 'efnv1b5': efnv1.EfficientNetB5,
               'efnv1b6': efnv1.EfficientNetB6, 'efnv1b7': efnv1.EfficientNetB7}
    if cfg.arch_name.startswith('efnv2'):
        EFN = {'efnv2s': efnv2.EfficientNetV2S, 'efnv2m': efnv2.EfficientNetV2M,
               'efnv2l': efnv2.EfficientNetV2L, 'efnv2xl': efnv2.EfficientNetV2XL}

    def pool_features(x):
        "Turn the backbone feature map into an embedding vector according to cfg.pool."
        if cfg.pool == 'flatten':
            return tf.keras.layers.Flatten()(x)
        if cfg.pool == 'fc':
            x = tf.keras.layers.Flatten()(x)
            x = tf.keras.layers.Dropout(0.1)(x)
            return tf.keras.layers.Dense(1024)(x)
        if cfg.pool == 'gem':
            return GeMPoolingLayer(train_p=True)(x)
        if cfg.pool == 'concat':
            # concatenate average- and max-pooled features
            return tf.keras.layers.concatenate([tf.keras.layers.GlobalAveragePooling2D()(x),
                                                tf.keras.layers.GlobalMaxPooling2D()(x)])
        if cfg.pool == 'max':
            return tf.keras.layers.GlobalMaxPooling2D()(x)
        return tf.keras.layers.GlobalAveragePooling2D()(x)

    with strategy.scope():
        margin = head(
            n_classes=cfg.N_CLASSES,
            s=30,
            m=adaptive_margin if cfg.adaptive_margin else 0.3,
            k=cfg.subcenters or 1,
            easy_margin=False,
            name=f'head/{cfg.head}',
            dtype='float32')

        inp = tf.keras.layers.Input(shape=[*cfg.IMAGE_SIZE, 3], name='inp1')
        label = tf.keras.layers.Input(shape=(), name='inp2')
        if aux_arcface:
            label2 = tf.keras.layers.Input(shape=(), name='inp3')

        if cfg.arch_name.startswith('efnv1'):
            x = EFN[cfg.arch_name](weights=cfg.pretrained, include_top=False)(inp)
            embed = pool_features(x)
        elif cfg.arch_name.startswith('efnv2'):
            x = EFN[cfg.arch_name](input_shape=(None, None, 3), num_classes=0,
                                   pretrained=cfg.pretrained)(inp)
            embed = pool_features(x)
        elif cfg.arch_name in TFHUB:
            # tfhub models cannot be modified => pooling cannot be changed!
            url = TFHUB[cfg.arch_name]
            model = hub.KerasLayer(url, trainable=True)
            embed = model(inp)
            #print(f"{cfg.arch_name} from tfhub")
            assert cfg.pool in [None, False, 'avg', ''], 'tfhub model, no custom pooling supported!'
        elif cfg.arch_name in tfimm.list_models(pretrained="timm"):
            #print(f"{cfg.arch_name} from tfimm")
            #embed = tfimm.create_model(cfg.arch_name, pretrained="timm", nb_classes=0)(inp)
            embed = tfimm.create_model(cfg.arch_name, pretrained=None, nb_classes=0)(inp)
            # create_model(nb_classes=0) includes pooling as last layer

        if len(cfg.dropout_ps) > 0:
            # Chris Deotte posted model code without Dropout/FC1 after pooling
            embed = tf.keras.layers.Dropout(cfg.dropout_ps[0])(embed)
            embed = tf.keras.layers.Dense(1024)(embed)            # tunable embedding size
            embed = tf.keras.layers.BatchNormalization()(embed)   # missing in public notebooks

        x = margin([embed, label])
        output = tf.keras.layers.Softmax(dtype='float32', name='arc' if cfg.aux_loss else None)(x)

        if cfg.aux_loss and aux_arcface:
            # Use 2nd arcface head for species (aux loss)
            head2 = ArcMarginProductSubCenter
            margin2 = head2(
                n_classes=cfg.n_species,
                s=30,
                m=0.3,
                k=1,
                easy_margin=False,
                name=f'auxhead/{cfg.head}',
                dtype='float32')
            aux_features = margin2([embed, label2])
            aux_output = tf.keras.layers.Softmax(dtype='float32', name='aux')(aux_features)
        elif cfg.aux_loss:
            aux_features = tf.keras.layers.Dense(cfg.n_species)(embed)
            aux_output = tf.keras.layers.Softmax(dtype='float32', name='aux')(aux_features)

        inputs = [inp, label, label2] if (cfg.aux_loss and aux_arcface) else [inp, label]
        outputs = (output, aux_output) if cfg.aux_loss else [output]

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        embed_model = tf.keras.models.Model(inputs=inp, outputs=embed)

        opt = tf.keras.optimizers.Adam(learning_rate=cfg.LR)
        if cfg.FREEZE_BATCH_NORM:
            freeze_BN(model)

    return model, embed_model
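
# `freeze_BN` is called by get_model() above but not defined in this snippet.
# A minimal sketch, assuming it simply freezes all batch-norm layers (the usual
# trick when fine-tuning with small batch sizes):
def freeze_BN(model):
    "Set all BatchNormalization layers to non-trainable."
    for layer in model.layers:
        if isinstance(layer, tf.keras.layers.BatchNormalization):
            layer.trainable = False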