small fix
modeling_lsg_albert.py  (+33 -28, CHANGED)
@@ -17,7 +17,7 @@ AUTO_MAP = {
 
 class LSGAlbertConfig(AlbertConfig):
     """
-    This class overrides :class:`~transformers.
+    This class overrides :class:`~transformers.AlbertConfig`. Please check the superclass for the appropriate
     documentation alongside usage examples.
     """
 
@@ -55,7 +55,8 @@ class LSGAlbertConfig(AlbertConfig):
 
         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'],
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                    setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
         if self.sparsity_type in ["stride", "block_stride"]:
@@ -71,7 +72,7 @@ class LSGAlbertConfig(AlbertConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512
 
@@ -79,7 +80,17 @@ class LSGAlbertConfig(AlbertConfig):
         assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
         assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
 
+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
 
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
+
+
 class BaseSelfAttention(nn.Module):
 
     def init_modules(self, config):
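The net effect of the config guards above, as a minimal usage sketch. This assumes the file is importable as `modeling_lsg_albert` and that `sparsity_type`, `num_global_tokens`, `mask_first_token` and `pool_with_global` are accepted as constructor arguments, as the checks above suggest; adjust the import path to your setup.

```python
from modeling_lsg_albert import LSGAlbertConfig  # hypothetical import path

# An unsupported sparsity_type only logs the [WARNING CONFIG] message and
# disables sparse attention instead of raising.
config = LSGAlbertConfig(sparsity_type="random")
print(config.sparsity_type)        # expected: None

# num_global_tokens is clamped to at most 512.
config = LSGAlbertConfig(num_global_tokens=1024)
print(config.num_global_tokens)    # expected: 512

# New guard: mask_first_token=True forces pool_with_global=True.
config = LSGAlbertConfig(mask_first_token=True, pool_with_global=False)
print(config.pool_with_global)     # expected: True
```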
@@ -635,9 +646,6 @@ class LSGAttention(BaseSelfAttention):
         hidden_states,
         attention_mask=None,
         head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        past_key_value=None,
         output_attentions=False,
     ):
 
@@ -655,11 +663,7 @@ class LSGAttention(BaseSelfAttention):
         context = self.output_dropout(context)
         context = self.LayerNorm(context + hidden_states)
 
-
-
-        #if head_mask is not None:
-        #    outputs = (outputs[0] * head_mask[:, :, :1, :1], ) + outputs[1:]
-        return outputs
+        return (context, ) + outputs[1:]
 
     def not_causal_forward(
         self,
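With this fix the attention module returns a tuple whose first element is the post-LayerNorm context, which is how Albert-style layer code typically consumes it (context at index 0, optional extras such as attention probabilities after). A toy stand-in illustrating that return contract, not the real LSG module:

```python
import torch
import torch.nn as nn

class ToyAttention(nn.Module):
    # Toy stand-in: element 0 of the returned tuple is the post-norm context,
    # any extras (here the attention probabilities) follow it.
    def __init__(self, dim):
        super().__init__()
        self.LayerNorm = nn.LayerNorm(dim)

    def forward(self, hidden_states, output_attentions=False):
        scores = hidden_states @ hidden_states.transpose(-1, -2)
        probs = torch.softmax(scores, dim=-1)
        context = self.LayerNorm(probs @ hidden_states + hidden_states)
        outputs = (context, probs) if output_attentions else (context,)
        return (context,) + outputs[1:]   # same shape of contract as the fixed return above

attn = ToyAttention(8)
x = torch.randn(2, 5, 8)
context, probs = attn(x, output_attentions=True)
print(context.shape, probs.shape)   # torch.Size([2, 5, 8]) torch.Size([2, 5, 5])
```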
@@ -751,6 +755,7 @@ class LSGAlbertLayer(AlbertLayer):
 class LSGAlbertLayerGroup(AlbertLayerGroup):
 
     def __init__(self, config):
+
         nn.Module.__init__(self)
 
         self.albert_layers = nn.ModuleList([LSGAlbertLayer(config) for _ in range(config.inner_group_num)])
@@ -759,10 +764,9 @@ class LSGAlbertLayerGroup(AlbertLayerGroup):
 class LSGAlbertTransformer(AlbertTransformer):
 
     def __init__(self, config):
-        nn.Module.__init__(self)
 
-
-
+        super().__init__(config)
+
         self.albert_layer_groups = nn.ModuleList([LSGAlbertLayerGroup(config) for _ in range(config.num_hidden_groups)])
 
 
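Switching `LSGAlbertTransformer.__init__` to `super().__init__(config)` follows the usual subclass pattern: let the parent constructor register everything it normally sets up, then overwrite only the layer groups. A toy sketch of that pattern, with illustrative names rather than the actual transformers classes:

```python
import torch.nn as nn

class ToyBaseTransformer(nn.Module):
    # stands in for AlbertTransformer: registers extra state besides the layer groups
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embedding_hidden_mapping_in = nn.Linear(8, 8)
        self.albert_layer_groups = nn.ModuleList([nn.Identity()])

class ToyLSGTransformer(ToyBaseTransformer):
    def __init__(self, config):
        super().__init__(config)          # parent state (config, mapping) is kept
        # only the layer groups are swapped for the LSG variants
        self.albert_layer_groups = nn.ModuleList([nn.Identity() for _ in range(2)])

model = ToyLSGTransformer(config={"num_hidden_groups": 2})
print(hasattr(model, "embedding_hidden_mapping_in"))   # True
```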
@@ -838,6 +842,12 @@ class LSGAlbertModel(LSGAlbertPreTrainedModel, AlbertModel):
         return_dict=None,
     ):
 
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs_ = input_ids if input_ids is not None else inputs_embeds
         n, t = inputs_.size()[:2]
 
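The three added assignments mirror the standard transformers idiom for resolving output flags: a caller-supplied value wins, otherwise the value stored on the config is used, so `None` never reaches the `if not return_dict:` branch further down. The idiom in isolation:

```python
def resolve(flag, config_default):
    # explicit argument wins; otherwise fall back to the value stored on the config
    return flag if flag is not None else config_default

print(resolve(None, True))    # True  -> config default used
print(resolve(False, True))   # False -> caller's explicit value kept
```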
@@ -878,31 +888,26 @@ class LSGAlbertModel(LSGAlbertPreTrainedModel, AlbertModel):
             return_dict=return_dict
         )
 
-
+        sequence_output = encoder_outputs[0]
         if self.pool_with_global:
-
+            sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
 
         diff = t - t_
-        n, _, d =
-
+        n, _, d = sequence_output.size()
+        sequence_output = sequence_output[..., self.num_global_tokens:, :]
 
         # Adapt sequence to initial shape
         if diff < 0:
-
+            sequence_output = sequence_output[:, :t]
 
-        encoder_outputs.last_hidden_state = context
-        sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
 
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]
-
-
-
-
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-        )
+
+        encoder_outputs.last_hidden_state = sequence_output
+        encoder_outputs.pooler_output = pooled_output
+        return encoder_outputs
 
 
 class LSGAlbertForPreTraining(LSGAlbertPreTrainedModel, AlbertForPreTraining):
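The last hunk reworks how the model copies the global representation, strips its global tokens, and trims the block padding before building the output. A toy rendering of that tensor bookkeeping with made-up sizes (batch 2, hidden 8, one global token, original length t=10, block-padded length t_=16); it mimics the operations only, not the real model:

```python
import torch

num_global_tokens, t, t_ = 1, 10, 16
# encoder output including the global tokens at the front
sequence_output = torch.randn(2, num_global_tokens + t_, 8)

pool_with_global = True
if pool_with_global:
    # copy the global token representation onto the first regular position
    sequence_output[:, num_global_tokens] = sequence_output[:, 0]

diff = t - t_
sequence_output = sequence_output[..., num_global_tokens:, :]  # drop the global tokens

# adapt sequence to the initial shape
if diff < 0:
    sequence_output = sequence_output[:, :t]                    # trim the block padding

print(sequence_output.shape)   # torch.Size([2, 10, 8])
```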