ccdv committed
Commit 92d5cca · 1 Parent(s): 3224af8
Files changed (1)
  1. modeling_lsg_roberta.py +37 -77
modeling_lsg_roberta.py CHANGED
@@ -55,7 +55,8 @@ class LSGRobertaConfig(RobertaConfig):
 
         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], setting sparsity_type=None, computation will skip sparse attention")
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
         if self.sparsity_type in ["stride", "block_stride"]:
@@ -71,7 +72,7 @@ class LSGRobertaConfig(RobertaConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512
 
@@ -79,6 +80,16 @@ class LSGRobertaConfig(RobertaConfig):
         assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
         assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
 
+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
+
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
+
 
 class BaseSelfAttention(nn.Module):
 
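The guards above all degrade gracefully instead of raising: an unknown sparsity_type falls back to None, num_global_tokens is capped at 512, and the new check forces pool_with_global back on whenever mask_first_token is set. A minimal sketch of that behaviour, assuming LSGRobertaConfig accepts these keyword arguments directly (other defaults such as block_size and sparsity_factor are not shown in this diff):

# Sketch only: exercises the config fallbacks shown in the hunks above.
from modeling_lsg_roberta import LSGRobertaConfig  # local module from this repository

config = LSGRobertaConfig(
    sparsity_type="lsh",       # valid value, sparse attention stays enabled
    num_global_tokens=1024,    # capped to 512 with a warning
    mask_first_token=True,
    pool_with_global=False,    # forced back to True with a warning
)
assert config.num_global_tokens == 512
assert config.pool_with_global is True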
@@ -436,39 +447,13 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         return embeddings
 
 
-class LSGRobertaSelfOutput(RobertaSelfOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGAttention(RobertaAttention):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
         self.self = LSGSelfAttention(config)
-        self.output = LSGRobertaSelfOutput(config)
-        self.pruned_heads = set()
-
-
-class LSGRobertaIntermediate(RobertaIntermediate):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGRobertaOutput(RobertaOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGRobertaPooler(RobertaPooler):
-
-    def __init__(self, config):
-        super().__init__(config)
 
 
 class LSGSelfAttention(BaseSelfAttention):
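The deleted wrapper classes (LSGRobertaSelfOutput, LSGRobertaIntermediate, LSGRobertaOutput, LSGRobertaPooler) only forwarded to super().__init__, so LSGAttention now builds the stock module through its parent constructor and swaps the single submodule that differs. The pattern, as a short sketch rather than the exact file contents:

from transformers.models.roberta.modeling_roberta import RobertaAttention

class LSGAttention(RobertaAttention):

    def __init__(self, config):
        # the parent constructor creates self.self, self.output and self.pruned_heads
        super().__init__(config)
        # only the self-attention module is replaced; LSGSelfAttention is defined later in this file
        self.self = LSGSelfAttention(config)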
@@ -726,9 +711,7 @@ class LSGSelfAttention(BaseSelfAttention):
             attention_mask=attention_mask,
             output_attentions=output_attentions
         )
-
-        #if head_mask is not None:
-        #    outputs = (outputs[0] * head_mask[:, :, :1, :1], ) + outputs[1:]
+
         return outputs
 
     def causal_forward(
@@ -898,29 +881,21 @@ class LSGRobertaLayer(RobertaLayer):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.chunk_size_feed_forward = config.chunk_size_feed_forward
-        self.seq_len_dim = 1
         self.attention = LSGAttention(config)
-        self.is_decoder = config.is_decoder
-        self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
             assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
             self.crossattention = LSGAttention(config)
-        self.intermediate = LSGRobertaIntermediate(config)
-        self.output = LSGRobertaOutput(config)
 
 
 class LSGRobertaEncoder(RobertaEncoder):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.config = config
         self.layer = nn.ModuleList([LSGRobertaLayer(config) for _ in range(config.num_hidden_layers)])
-        self.gradient_checkpointing = False
 
 
 class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
@@ -945,7 +920,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
     config_class = LSGRobertaConfig
 
 
-    def __init__(self, config, add_pooling_layer=False):
+    def __init__(self, config, add_pooling_layer=True):
 
         LSGRobertaPreTrainedModel.__init__(self, config)
 
@@ -961,7 +936,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
 
         self.embeddings = LSGRobertaEmbeddings(config)
         self.encoder = LSGRobertaEncoder(config)
-        self.pooler = LSGRobertaPooler(config) if add_pooling_layer else None
+        self.pooler = RobertaPooler(config) if add_pooling_layer else None
 
         if config.add_cross_attention:
             logger.warning(
@@ -988,6 +963,12 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
         return_dict=None
         ):
 
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs_ = input_ids if input_ids is not None else inputs_embeds
         n, t = inputs_.size()[:2]
 
@@ -1032,33 +1013,26 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
             return_dict=return_dict
         )
 
-        context = encoder_outputs[0]
+        sequence_output = encoder_outputs[0]
         if self.pool_with_global:
-            context[:, self.num_global_tokens] = context[:, 0]
+            sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
 
         diff = t - t_
-        n, _, d = context.size()
-        context = context[..., self.num_global_tokens:, :]
+        n, _, d = sequence_output.size()
+        sequence_output = sequence_output[..., self.num_global_tokens:, :]
 
         # Adapt sequence to initial shape
         if diff < 0:
-            context = context[:, :t]
+            sequence_output = sequence_output[:, :t]
 
-        encoder_outputs.last_hidden_state = context
-        sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
 
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]
-
-        return BaseModelOutputWithPoolingAndCrossAttentions(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-            past_key_values=encoder_outputs.past_key_values,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-            cross_attentions=encoder_outputs.cross_attentions,
-        )
+
+        encoder_outputs.last_hidden_state = sequence_output
+        encoder_outputs.pooler_output = pooled_output
+        return encoder_outputs
 
     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
 
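The rewritten post-processing keeps a single sequence_output tensor: pool_with_global copies the first global token into the first content position, the num_global_tokens prefix is then stripped, and any block padding is trimmed back to the original length t. A dummy-tensor sketch of those shape operations (the sizes are illustrative, not taken from the file):

import torch

num_global_tokens = 1
t, t_ = 16, 20                                      # original length vs. length padded to a block multiple
hidden = torch.randn(2, num_global_tokens + t_, 8)  # encoder output: [batch, global + padded tokens, dim]

# pool_with_global: write the first global token into the first content position
hidden[:, num_global_tokens] = hidden[:, 0]

# drop the global tokens, then trim the padding because diff = t - t_ < 0
sequence_output = hidden[..., num_global_tokens:, :]
if t - t_ < 0:
    sequence_output = sequence_output[:, :t]
assert sequence_output.shape == (2, t, 8)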
@@ -1092,7 +1066,7 @@ class LSGRobertaForCausalLM(LSGRobertaPreTrainedModel, RobertaForCausalLM):
             logger.warning("If you want to use `LSGRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
 
         self.roberta = LSGRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = LSGRobertaLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1122,7 +1096,7 @@ class LSGRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
             )
 
         self.roberta = LSGRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = LSGRobertaLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1131,13 +1105,6 @@ class LSGRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
         self.post_init()
 
 
-class LSGRobertaLMHead(RobertaLMHead):
-    """LSG Head for masked language modeling."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
     """
     This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
@@ -1154,19 +1121,12 @@ class LSGRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
         self.config = config
 
         self.roberta = LSGRobertaModel(config, add_pooling_layer=False)
-        self.classifier = LSGRobertaClassificationHead(config)
+        self.classifier = RobertaClassificationHead(config)
 
         # Initialize weights and apply final processing
         self.post_init()
 
 
-class LSGRobertaClassificationHead(RobertaClassificationHead):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
     """
     This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
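With the thin LSG head wrappers gone, the task models now hold the stock RobertaLMHead, RobertaClassificationHead and RobertaPooler directly, which leaves the public loading path untouched. A usage sketch (the repo id below is a placeholder, not taken from this commit):

from transformers import AutoModelForMaskedLM, AutoTokenizer

repo_id = "ccdv/lsg-roberta-base"  # placeholder: any checkpoint that ships this modeling file
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)

# the LM head is the stock RobertaLMHead rather than a pass-through subclass
print(type(model.lm_head).__name__)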
 