Upload TFBilma

Browse files

Files changed (4) hide show

config.json +4 -7
configuration_bilma.py +54 -0
modeling_bilma.py +441 -0
tf_model.h5 +1 -1

config.json CHANGED Viewed

@@ -1,20 +1,17 @@
 {
-  "_name_or_path": "bilma_MX_mean",
   "add_head": [
     512,
     15
   ],
   "auto_map": {
     "AutoConfig": "configuration_bilma.BilmaConfig",
-    "TFAutoModel": "modeling_bilma.TFBilma",
-    "TFAutoModelForMaskedLM": "modeling_bilma.TFBilma"
   },
   "hidden_dropout_prob": 0.1,
   "hidden_size": 512,
-  "include_head": [
-    512,
-    15
-  ],
   "include_top": false,
   "model_type": "bilma",
   "num_attention_heads": 4,

 {
   "add_head": [
     512,
     15
   ],
+  "architectures": [
+    "Bilma"
+  ],
   "auto_map": {
     "AutoConfig": "configuration_bilma.BilmaConfig",
+    "TFAutoModel": "modeling_bilma.TFBilma"
   },
   "hidden_dropout_prob": 0.1,
   "hidden_size": 512,
   "include_top": false,
   "model_type": "bilma",
   "num_attention_heads": 4,

configuration_bilma.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from transformers import PretrainedConfig
+class BilmaConfig(PretrainedConfig):
+    model_type = "bilma"
+    def __init__(
+        self,
+        weights="MX",
+        include_top = True,
+        add_head = None,
+        pooling = None,
+        num_attention_heads: int = 4,
+        num_hidden_layers: int = 2,
+        seq_max_length: int = 280,
+        hidden_size: int = 512,
+        vocab_size: int = 29025,
+        hidden_dropout_prob: float = 0.1,
+        **kwargs,
+    ):
+        countries = ["MX"]
+        poolings = ["mean", "cls", "max"]
+        if weights not in countries:
+            raise ValueError(f"`weights` must be one of {countries}, got {weights}.")
+        if add_head is not None and include_top == True:
+            raise ValueError(f"To add a head, 'include_top' must be False")
+        if pooling is not None and include_top == True:
+            raise ValueError(f"To specify a pooling, 'include_top' must be False")
+        if pooling is not None and pooling not in poolings:
+            raise ValueError(f"`pooling` must be one of {poolings}, got {pooling}.")
+        if weights is not None:
+            self.weights = weights
+            self.include_top = include_top
+            self.add_head = add_head
+            self.pooling = pooling
+            self.num_attention_heads = 4
+            self.num_hidden_layers = 2
+            self.seq_max_length = 280
+            self.hidden_size = 512
+            self.vocab_size = 29025
+            self.hidden_dropout_prob = 0.1
+            super().__init__(**kwargs)
+            return
+        self.weights = weights
+        self.include_top = include_top
+        self.add_head = add_head
+        self.pooling = pooling
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.seq_max_length = seq_max_length
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        super().__init__(**kwargs)

modeling_bilma.py ADDED Viewed

	@@ -0,0 +1,441 @@

+from transformers import TFPreTrainedModel, PreTrainedTokenizer, BatchEncoding
+from tensorflow.keras.models import Model, load_model, Sequential
+from tensorflow.keras.layers import Layer, Dense, concatenate, Input, add, Dropout, LayerNormalization, MultiHeadAttention, Embedding
+import tensorflow as tf
+import numpy as np
+from typing import Dict
+import re
+import unicodedata
+from configuration_bilma import BilmaConfig
+# copied from preprocessing.py
+BLANK = ' '
+RE_OPS = re.I | re.M | re.S
+RE_USR = re.compile(r"""@\S+""", RE_OPS)
+RE_TAG = re.compile(r"""#\S+""", RE_OPS)
+RE_URL = re.compile(r"""(http|ftp|https)://\S+""", RE_OPS)
+RE_NUM = re.compile(r"""[-+]?\d+\.?\d*""", RE_OPS)
+SYMBOLS_ = "()[]¿?¡!{}~<>|"
+SYMBOLS = set(";:,.@\\-\"/" + SYMBOLS_)
+# ------------------
+# Class declaration
+# ------------------
+class TFBilma(TFPreTrainedModel):
+    config_class = BilmaConfig
+    main_input_name = "input_ids"
+    #base_model_prefix = "bilma"
+    def __init__(self, config):
+        self.seq_max_length = config.seq_max_length
+        self.include_top = config.include_top
+        self.add_head = config.add_head
+        super().__init__(config)
+        self.model = bilma(num_enc=config.num_hidden_layers,
+                           embed_dim=config.hidden_size,
+                           max_length=config.seq_max_length,
+                           num_heads=config.num_attention_heads,
+                           ff_dim=config.hidden_size,
+                           vocab_size=config.vocab_size,
+                           rate=config.hidden_dropout_prob,
+                           include_top = config.include_top,
+                           add_head = config.add_head,
+                           pooling = config.pooling)
+    @property
+    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+        dummies = {}
+        for key, spec in self.input_signature.items():
+            dummy_shape = [dim if dim is not None else 2 for dim in spec.shape]
+            if spec.shape[0] is None:
+                dummy_shape[0] = 1
+            dummies[key] = tf.ones(shape=dummy_shape, dtype=spec.dtype)
+        return dummies
+    @property
+    def input_signature(self) -> Dict[str, tf.TensorSpec]:
+        sig = {}
+        sig["input_ids"] = tf.TensorSpec([None, self.seq_max_length], tf.int32, name="input_ids")
+        return sig
+    def call(self, inputs):
+        if isinstance(inputs, Dict) or isinstance(inputs, BatchEncoding):
+            ins = tf.cast(inputs["input_ids"], tf.float32)
+        else:
+            ins = inputs
+        if self.include_top:
+            output = {"logits":self.model(ins)}
+        else:
+            if self.add_head is None:
+                output = {"last_hidden_state":self.model(ins)}
+            else:
+                output = {"label":self.model(ins)}
+        return output
+    def get_loss_function():
+        return loss_funtion()
+    def get_acc_function():
+        return accuracy_function()
+# copied from bilma_model.py
+# --------------------------
+def loss_function(ignore_id=0):
+    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
+    def loss(real, pred):
+        mask = tf.math.logical_not(tf.math.equal(real, ignore_id))
+        loss_ = loss_object(real, pred)
+        mask = tf.cast(mask, dtype=loss_.dtype)
+        loss_ *= mask
+        sum_ = tf.reduce_sum(mask,axis=1)
+        loss_ = tf.math.divide_no_nan(tf.reduce_sum(loss_, axis=1), sum_)
+        return loss_
+    return loss
+def accuracy_function(ignore_id=0):
+    def acc_mlm(real, pred):
+        accuracies = tf.equal(tf.cast(real, tf.int64), tf.argmax(pred, axis=2))
+        mask = tf.math.logical_not(tf.math.equal(real, ignore_id))
+        accuracies = tf.math.logical_and(mask, accuracies)
+        accuracies = tf.cast(accuracies, dtype=tf.float32)
+        mask = tf.cast(mask, dtype=tf.float32)
+        return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))
+    return acc_mlm
+def mean_vectors(inputs, enc_vectors, max_length):
+    p = tf.where(inputs == 3)
+    pos = tf.transpose(p)[1]
+    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
+    C = tf.reshape(C, (-1, max_length, 1))
+    S = tf.reduce_sum(enc_vectors * C, 1)
+    x = S / tf.expand_dims(tf.cast(pos, tf.float32), (1))
+    return x
+def mean_diff_vectors(inputs, enc_vectors, max_length):
+    p = tf.where(inputs == 3)
+    pos = tf.transpose(p)[1]
+    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
+    C = tf.reshape(C, (-1, max_length, 1))
+    vecs = enc_vectors * C
+    S = tf.reduce_sum(vecs, 1)
+    mu = S / tf.expand_dims(tf.cast(pos, tf.float32), (1))
+    x = tf.reduce_sum(mu - vecs, 1) / tf.expand_dims(tf.cast(pos, tf.float32), (1))
+    return x
+def max_vectors(inputs, enc_vectors, max_length):
+    p = tf.where(inputs == 3)
+    pos = tf.transpose(p)[1]
+    C = tf.sequence_mask(pos, maxlen=max_length, dtype=tf.float32)
+    C = tf.reshape(C, (-1, max_length, 1))
+    x = tf.reduce_max(enc_vectors * C, 1)
+    return x
+def cls_vectors(inputs, enc_vectors, max_length):
+    x = tf.squeeze(enc_vectors[:, 0:1, :], axis=1)
+    return x
+def bilma(num_enc=6, embed_dim=300, max_length=50, num_heads=6, ff_dim=512, vocab_size=9739, rate=0.1, include_top=True, add_head=None, pooling=None):
+    capt_inputs_ids = Input(shape=(max_length, ), name='input_ids')
+    capt_embedding = Embedding(vocab_size, embed_dim, mask_zero=False, name="bilma/embedding")
+    capt_inputs = capt_embedding(capt_inputs_ids)
+    enc = Encoder(num_enc, embed_dim, max_length, num_heads, ff_dim, rate=rate, name="bilma/encoder")
+    enc_output = enc(capt_inputs)
+    if include_top:
+        fin_output = Dense(vocab_size, use_bias=True, name="bilma/dense_final")(enc_output)
+    else:
+        x = enc_output
+        if pooling == "mean":
+            x = mean_vectors(capt_inputs_ids, x, max_length)
+        elif pooling == "cls":
+            x = cls_vectors(capt_inputs_ids, x, max_length)
+        elif pooling == "max":
+            x = max_vectors(capt_inputs_ids, x, max_length)
+        if add_head is None:
+            fin_output = x
+        else:
+            for i, m in enumerate(add_head[:-1]):
+                x = Dense(m, use_bias=True, activation="relu", name=f"bilma/dense_ex_{i}")(x)
+            fin_output = Dense(add_head[-1], use_bias=True, activation="softmax", name=f"bilma/dense_ex_final")(x)
+    caption_model = Model(inputs=capt_inputs_ids, outputs=fin_output, name="bilma_model")
+    return caption_model
+def load(model_file):
+    custom_objects={"EncoderBlock": EncoderBlock,
+                    "Encoder": Encoder,
+                    "loss": loss_function(),
+                    "acc_mlm":accuracy_function(),
+                   }
+    return load_model(model_file, custom_objects=custom_objects)
+#
+# Copied from transformer_text.py
+# -------------------------------
+class EncoderBlock(Layer):
+    def __init__(self, layer_num, patch_dim, num_heads, ff_dim, rate=0.1, **kwargs):
+        super(EncoderBlock, self).__init__(**kwargs)
+        self.ln = layer_num
+        self.p_d = patch_dim
+        self.n_h = num_heads
+        self.f_d = ff_dim
+        self.rate = rate
+        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=patch_dim, name=f"bilma/MHA_{layer_num}")
+        self.ffn = Sequential(
+            #[Conv1D(ff_dim, kernel_size=1, activation=tf.nn.gelu),
+            # Conv1D(patch_dim, kernel_size=1),]
+            [Dense(ff_dim, activation=tf.nn.gelu, name=f"bilma/dense1_{layer_num}"),
+             Dense(patch_dim, name=f"bilma/dense2_{layer_num}")]
+        )
+        #self.layernorm0 = LayerNormalization(epsilon=1e-6)
+        self.layernorm1 = LayerNormalization(epsilon=1e-6, name=f"ln1_{layer_num}")
+        self.layernorm2 = LayerNormalization(epsilon=1e-6, name=f"ln2_{layer_num}")
+        self.dropout1 = Dropout(rate)
+        self.dropout2 = Dropout(rate)
+    def get_config(self):
+        config = super(EncoderBlock, self).get_config()
+        config.update({"layer_num":self.ln, "patch_dim":self.p_d, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
+        return config
+    def call(self, inputs, training=False):
+        #inputs = self.layernorm0(inputs)
+        attn_output = self.att(inputs, inputs)
+        attn_output = self.dropout1(attn_output, training=training)
+        out1 = self.layernorm1(add([inputs, attn_output]))
+        ffn_output = self.ffn(out1)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        return self.layernorm2(add([out1, ffn_output]))
+class DecoderBlock(Layer):
+    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
+        super(DecoderBlock, self).__init__(**kwargs)
+        self.e_d = embed_dim
+        self.n_h = num_heads
+        self.f_d = ff_dim
+        self.rate = rate
+        self.att1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+        self.att2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+        self.ffn = Sequential(
+            #[Conv1D(ff_dim, kernel_size=1, activation=tf.nn.gelu),
+            # Conv1D(embed_dim, kernel_size=1),]
+            [Dense(ff_dim, activation=tf.nn.gelu),
+             Dense(embed_dim),]
+        )
+        self.layernorm1 = LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = LayerNormalization(epsilon=1e-6)
+        self.dropout1 = Dropout(rate)
+        self.dropout2 = Dropout(rate)
+        self.dropout3 = Dropout(rate)
+    def get_config(self):
+        config = super(DecoderBlock, self).get_config()
+        config.update({"embed_dim":self.e_d, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
+        return config
+    def call(self, inputs, encoder_output, look_ahead_mask, padding_mask, training=None):
+        y, attn_output1 = self.att1(inputs, inputs, attention_mask=look_ahead_mask, return_attention_scores=True)
+        y = self.dropout1(y, training=training)
+        y = add([inputs, y])
+        out1 = self.layernorm1(y)
+        y, attn_encoder = self.att2(out1, encoder_output, attention_mask=padding_mask, return_attention_scores=True)
+        y = self.dropout2(y, training=training)
+        y = add([out1, y])
+        out2 = self.layernorm1(y)
+        ffn_output = self.ffn(out2)
+        ffn_output = self.dropout3(ffn_output, training=training)
+        final_output =  self.layernorm2(out2 + ffn_output)
+        return final_output, attn_output1, attn_encoder
+class Encoder(Layer):
+    def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
+        super(Encoder, self).__init__(**kwargs)
+        self.n = n
+        self.embed_dim = embed_dim
+        self.max_length = max_length
+        self.n_h = num_heads
+        self.f_d = ff_dim
+        self.rate = rate
+        self._layers = [EncoderBlock(i, embed_dim, num_heads, ff_dim, rate=0.1, name=f"enc_block_{i}") for i in range(n)]
+        self.pe = positional_encoding(self.max_length, self.embed_dim)
+    def get_config(self):
+        config = super(Encoder, self).get_config()
+        config.update({"n": self.n, "embed_dim":self.embed_dim, "max_length": self.max_length, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
+        return config
+    def call(self, x, training=False):
+        x *= tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
+        x = x + self.pe[:, :tf.shape(x)[1], :]
+        for layer in self._layers:
+            x = layer(x, training)
+        return x
+class Decoder(Layer):
+    def __init__(self, n, embed_dim, max_length, num_heads, ff_dim, rate=0.1, **kwargs):
+        super(Decoder, self).__init__(**kwargs)
+        self.n = n
+        self.embed_dim = embed_dim
+        self.max_length = max_length
+        self.n_h = num_heads
+        self.f_d = ff_dim
+        self.rate = rate
+        self._layers = [DecoderBlock(embed_dim, num_heads, ff_dim, rate=0.1) for _ in range(n)]
+        self.pe = positional_encoding(self.max_length, self.embed_dim)
+    def get_config(self):
+        config = super(Decoder, self).get_config()
+        config.update({"n": self.n, "embed_dim":self.embed_dim, "max_length": self.max_length, "num_heads":self.n_h, "ff_dim":self.f_d, "rate":self.rate})
+        return config
+    def call(self, x, encoder_output, look_ahead_mask, padding_mask, training):
+        x *= tf.math.sqrt(tf.cast(self.embed_dim, tf.float32))
+        x = x + self.pe[:, :tf.shape(x)[1], :]
+        for layer in self._layers:
+            x, self_att, enc_att = layer(x, encoder_output, look_ahead_mask, padding_mask, training)
+        return x
+# =========================================
+#   M A S K S
+# =========================================
+def create_padding_mask(seq):
+    """
+    For self-attention
+    seq shape(bs, max_length, emb_dim)
+    output shape (bs, max_length, max_length)
+    """
+    mask = tf.cast(tf.not_equal(seq, 0), tf.bool)
+    mask = tf.reduce_any(mask, 2)
+    mask = tf.repeat(mask, seq.shape[1], 0)
+    mask = tf.reshape(mask, (-1,seq.shape[1], seq.shape[1]))
+    return tf.cast(mask, tf.float32)
+def create_cross_padding_mask(seq, target_seq):
+    """
+    For cross-attention
+    seq shape(bs, k, image_features)
+    target_seq(bs, max_length, emb_dim)
+    output shape (bs, max_length, k)
+    """
+    mask = tf.cast(tf.not_equal(target_seq, 0), tf.bool)
+    mask = tf.reduce_any(mask, 2)
+    mask = tf.repeat(mask, seq.shape[1], 0)
+    mask = tf.reshape(mask, (-1, tf.shape(seq)[1], tf.shape(target_seq)[1]))
+    mask = tf.transpose(mask, [0, 2, 1])
+    return mask
+def create_look_ahead_mask(seq):
+    """
+    seq shape(bs, max_length, emb_dim)
+    output 2D matrix of shape (bs, max_length, max_length) with ones on the diagonal and below.
+    """
+    size = seq.shape[1]
+    mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
+    mask = tf.expand_dims(mask, 0)
+    mask = tf.repeat(mask, tf.shape(seq)[0], 0)
+    return mask
+def create_masks(seq, target_seq):
+    decoder_mask = create_padding_mask(target_seq)
+    decoder_mask *= create_look_ahead_mask(target_seq)
+    cross_att_mask = create_cross_padding_mask(seq, target_seq)
+    return decoder_mask, cross_att_mask
+def create_masks_looking_ahead(seq, target_seq):
+    decoder_mask = create_padding_mask(target_seq)
+    cross_att_mask = create_cross_padding_mask(seq, target_seq)
+    return decoder_mask, cross_att_mask
+# =========================================
+#   P O S I T I O N A L   E N C O D I N G
+# =========================================
+def get_angles(pos, i, d_model):
+    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
+    return pos * angle_rates
+@tf.autograph.experimental.do_not_convert
+def positional_encoding(position, d_model):
+    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
+                          np.arange(d_model)[np.newaxis, :],
+                          d_model)
+    # apply sin to even indices in the array; 2i
+    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
+    # apply cos to odd indices in the array; 2i+1
+    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
+    pos_encoding = angle_rads[np.newaxis, ...]
+    return tf.cast(pos_encoding, dtype=tf.float32)
+class PatchEncoder(Layer):
+    def __init__(self, num_patches, projection_dim, **kwargs):
+        super(PatchEncoder, self).__init__(**kwargs)
+        self.num_patches = num_patches
+        self.projection_dim = projection_dim
+        self.projection = Dense(units=projection_dim)
+        self.position_embedding = Embedding(
+            input_dim=num_patches, output_dim=projection_dim
+        )
+    def get_config(self):
+        config = super(PatchEncoder, self).get_config()
+        config.update({"num_patches": self.num_patches, "projection_dim":self.projection_dim})
+        return config
+    def call(self, patch):
+        positions = tf.range(start=0, limit=self.num_patches, delta=1)
+        encoded = self.projection(patch) + self.position_embedding(positions)
+        return encoded

tf_model.h5 CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f7ebeece34f34d796b0b483adcfc34d966b6829c284d0ff8be986dd86e7313b9
 size 98400100

 version https://git-lfs.github.com/spec/v1
+oid sha256:3adf655ed163d94a182df5698d86300ea8a65593bc3e3502dfbca39940ad2fa9
 size 98400100