ccdv committed
Commit 92d5cca · 1 Parent(s): 3224af8
Files changed (1)
  1. modeling_lsg_roberta.py +37 -77
modeling_lsg_roberta.py CHANGED
@@ -55,7 +55,8 @@ class LSGRobertaConfig(RobertaConfig):
 
         if sparsity_type not in [None, "none", "norm", "lsh", "pooling", "stride", "block_stride"]:
             logger.warning(
-                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], setting sparsity_type=None, computation will skip sparse attention")
+                "[WARNING CONFIG]: sparsity_mode not in [None, 'none', 'norm', 'lsh', 'pooling', 'stride', 'block_stride'], \
+                setting sparsity_type=None, computation will skip sparse attention")
             self.sparsity_type = None
 
         if self.sparsity_type in ["stride", "block_stride"]:
@@ -71,7 +72,7 @@ class LSGRobertaConfig(RobertaConfig):
             self.num_global_tokens = 1
         elif self.num_global_tokens > 512:
             logger.warning(
-                "[WARNING CONFIG]: num_global_tokens > 512 is not compatible, setting num_global_tokens=512"
+                "[WARNING CONFIG]: num_global_tokens > 512 is not allowed, setting num_global_tokens=512"
             )
             self.num_global_tokens = 512
 
@@ -79,6 +80,16 @@ class LSGRobertaConfig(RobertaConfig):
         assert self.block_size % self.sparsity_factor == 0, "[ERROR CONFIG]: block_size must be divisible by sparsity_factor"
         assert self.block_size//self.sparsity_factor >= 1, "[ERROR CONFIG]: make sure block_size >= sparsity_factor"
 
+        if self.mask_first_token and not pool_with_global:
+            logger.warning(
+                "[WARNING CONFIG]: pool_with_global==False is not compatible with mask_first_token==True. Setting pool_with_global to True.")
+            self.pool_with_global = True
+
+        if hasattr(self, "position_embedding_type"):
+            if self.position_embedding_type != "absolute":
+                logger.warning(
+                    "[WARNING CONFIG]: LSG Attention is not compatible with relative positional embedding and will skip its computation. Set position_embedding_type='absolute' to remove this warning.")
+
 
 class BaseSelfAttention(nn.Module):
 
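The guards above all degrade gracefully instead of raising: an unknown sparsity_type falls back to None, num_global_tokens is capped at 512, and the new check forces pool_with_global back on whenever mask_first_token is set. A minimal sketch of that behaviour, assuming LSGRobertaConfig accepts these keyword arguments directly (other defaults such as block_size and sparsity_factor are not shown in this diff):

# Sketch only: exercises the config fallbacks shown in the hunks above.
from modeling_lsg_roberta import LSGRobertaConfig  # local module from this repository

config = LSGRobertaConfig(
    sparsity_type="lsh",       # valid value, sparse attention stays enabled
    num_global_tokens=1024,    # capped to 512 with a warning
    mask_first_token=True,
    pool_with_global=False,    # forced back to True with a warning
)
assert config.num_global_tokens == 512
assert config.pool_with_global is True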
@@ -436,39 +447,13 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         return embeddings
 
 
-class LSGRobertaSelfOutput(RobertaSelfOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGAttention(RobertaAttention):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
         self.self = LSGSelfAttention(config)
-        self.output = LSGRobertaSelfOutput(config)
-        self.pruned_heads = set()
-
-
-class LSGRobertaIntermediate(RobertaIntermediate):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGRobertaOutput(RobertaOutput):
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
-class LSGRobertaPooler(RobertaPooler):
-
-    def __init__(self, config):
-        super().__init__(config)
 
 
 class LSGSelfAttention(BaseSelfAttention):
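The deleted wrapper classes (LSGRobertaSelfOutput, LSGRobertaIntermediate, LSGRobertaOutput, LSGRobertaPooler) only forwarded to super().__init__, so LSGAttention now builds the stock module through its parent constructor and swaps the single submodule that differs. The pattern, as a short sketch rather than the exact file contents:

from transformers.models.roberta.modeling_roberta import RobertaAttention

class LSGAttention(RobertaAttention):

    def __init__(self, config):
        # the parent constructor creates self.self, self.output and self.pruned_heads
        super().__init__(config)
        # only the self-attention module is replaced; LSGSelfAttention is defined later in this file
        self.self = LSGSelfAttention(config)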
@@ -726,9 +711,7 @@ class LSGSelfAttention(BaseSelfAttention):
             attention_mask=attention_mask,
             output_attentions=output_attentions
         )
-
-        #if head_mask is not None:
-        #    outputs = (outputs[0] * head_mask[:, :, :1, :1], ) + outputs[1:]
+
         return outputs
 
     def causal_forward(
@@ -898,29 +881,21 @@ class LSGRobertaLayer(RobertaLayer):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.chunk_size_feed_forward = config.chunk_size_feed_forward
-        self.seq_len_dim = 1
         self.attention = LSGAttention(config)
-        self.is_decoder = config.is_decoder
-        self.add_cross_attention = config.add_cross_attention
         if self.add_cross_attention:
             assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added"
             self.crossattention = LSGAttention(config)
-        self.intermediate = LSGRobertaIntermediate(config)
-        self.output = LSGRobertaOutput(config)
 
 
 class LSGRobertaEncoder(RobertaEncoder):
 
     def __init__(self, config):
 
-        nn.Module.__init__(self)
+        super().__init__(config)
 
-        self.config = config
         self.layer = nn.ModuleList([LSGRobertaLayer(config) for _ in range(config.num_hidden_layers)])
-        self.gradient_checkpointing = False
 
 
 class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
@@ -945,7 +920,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
     config_class = LSGRobertaConfig
 
 
-    def __init__(self, config, add_pooling_layer=False):
+    def __init__(self, config, add_pooling_layer=True):
 
         LSGRobertaPreTrainedModel.__init__(self, config)
 
@@ -961,7 +936,7 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
 
         self.embeddings = LSGRobertaEmbeddings(config)
         self.encoder = LSGRobertaEncoder(config)
-        self.pooler = LSGRobertaPooler(config) if add_pooling_layer else None
+        self.pooler = RobertaPooler(config) if add_pooling_layer else None
 
         if config.add_cross_attention:
             logger.warning(
@@ -988,6 +963,12 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
         return_dict=None
         ):
 
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         inputs_ = input_ids if input_ids is not None else inputs_embeds
         n, t = inputs_.size()[:2]
 
@@ -1032,33 +1013,26 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
             return_dict=return_dict
         )
 
-        context = encoder_outputs[0]
+        sequence_output = encoder_outputs[0]
         if self.pool_with_global:
-            context[:, self.num_global_tokens] = context[:, 0]
+            sequence_output[:, self.num_global_tokens] = sequence_output[:, 0]
 
         diff = t - t_
-        n, _, d = context.size()
-        context = context[..., self.num_global_tokens:, :]
+        n, _, d = sequence_output.size()
+        sequence_output = sequence_output[..., self.num_global_tokens:, :]
 
         # Adapt sequence to initial shape
         if diff < 0:
-            context = context[:, :t]
+            sequence_output = sequence_output[:, :t]
 
-        encoder_outputs.last_hidden_state = context
-        sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
 
         if not return_dict:
             return (sequence_output, pooled_output) + encoder_outputs[1:]
-
-        return BaseModelOutputWithPoolingAndCrossAttentions(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-            past_key_values=encoder_outputs.past_key_values,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-            cross_attentions=encoder_outputs.cross_attentions,
-        )
+
+        encoder_outputs.last_hidden_state = sequence_output
+        encoder_outputs.pooler_output = pooled_output
+        return encoder_outputs
 
     def get_extended_attention_mask(self, attention_mask, input_shape, device=None):
 
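The rewritten post-processing keeps a single sequence_output tensor: pool_with_global copies the first global token into the first content position, the num_global_tokens prefix is then stripped, and any block padding is trimmed back to the original length t. A dummy-tensor sketch of those shape operations (the sizes are illustrative, not taken from the file):

import torch

num_global_tokens = 1
t, t_ = 16, 20                                      # original length vs. length padded to a block multiple
hidden = torch.randn(2, num_global_tokens + t_, 8)  # encoder output: [batch, global + padded tokens, dim]

# pool_with_global: write the first global token into the first content position
hidden[:, num_global_tokens] = hidden[:, 0]

# drop the global tokens, then trim the padding because diff = t - t_ < 0
sequence_output = hidden[..., num_global_tokens:, :]
if t - t_ < 0:
    sequence_output = sequence_output[:, :t]
assert sequence_output.shape == (2, t, 8)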
@@ -1092,7 +1066,7 @@ class LSGRobertaForCausalLM(LSGRobertaPreTrainedModel, RobertaForCausalLM):
             logger.warning("If you want to use `LSGRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
 
         self.roberta = LSGRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = LSGRobertaLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1122,7 +1096,7 @@ class LSGRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
             )
 
         self.roberta = LSGRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = LSGRobertaLMHead(config)
+        self.lm_head = RobertaLMHead(config)
 
         # The LM head weights require special treatment only when they are tied with the word embeddings
         self.update_keys_to_ignore(config, ["lm_head.decoder.weight"])
@@ -1131,13 +1105,6 @@ class LSGRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
         self.post_init()
 
 
-class LSGRobertaLMHead(RobertaLMHead):
-    """LSG Head for masked language modeling."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
     """
     This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
@@ -1154,19 +1121,12 @@ class LSGRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
         self.config = config
 
         self.roberta = LSGRobertaModel(config, add_pooling_layer=False)
-        self.classifier = LSGRobertaClassificationHead(config)
+        self.classifier = RobertaClassificationHead(config)
 
         # Initialize weights and apply final processing
         self.post_init()
 
 
-class LSGRobertaClassificationHead(RobertaClassificationHead):
-    """Head for sentence-level classification tasks."""
-
-    def __init__(self, config):
-        super().__init__(config)
-
-
 class LSGRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
     """
     This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
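With the thin LSG head wrappers gone, the task models now hold the stock RobertaLMHead, RobertaClassificationHead and RobertaPooler directly, which leaves the public loading path untouched. A usage sketch (the repo id below is a placeholder, not taken from this commit):

from transformers import AutoModelForMaskedLM, AutoTokenizer

repo_id = "ccdv/lsg-roberta-base"  # placeholder: any checkpoint that ships this modeling file
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)

# the LM head is the stock RobertaLMHead rather than a pass-through subclass
print(type(model.lm_head).__name__)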
 