small fix
Files changed:
- README.md (+1, -1)
- modeling_lsg_xlm_roberta.py (+51, -38)
README.md
CHANGED
```diff
@@ -7,7 +7,7 @@ pipeline_tag: fill-mask
 ---
 
 # LSG model
-**Transformers >= 4.…**\
+**Transformers >= 4.36.1**\
 **This model relies on a custom modeling file, you need to add trust_remote_code=True**\
 **See [\#13467](https://github.com/huggingface/transformers/pull/13467)**
 
```
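Since the card requires the custom modeling file and transformers >= 4.36.1, a minimal loading sketch for this fill-mask model could look like the snippet below. The repo id is a placeholder (it is not given on this page), and passing `attn_implementation="eager"` is only an assumption drawn from the flash-attention warning added in the modeling file further down.

```python
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

model_id = "<org>/<this-lsg-xlm-roberta-repo>"  # placeholder, replace with this repo's actual id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(
    model_id,
    trust_remote_code=True,        # required: loads the custom modeling_lsg_xlm_roberta.py from the repo
    attn_implementation="eager",   # precaution: LSG warns it does not support flash-attention 2
)

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
print(fill_mask("Paris is the <mask> of France."))
```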
modeling_lsg_xlm_roberta.py
CHANGED
```diff
@@ -1,5 +1,5 @@
 from logging import warn
-from transformers.models.…
+from transformers.models.xlm_roberta.modeling_xlm_roberta import *
 import torch
 import torch.nn as nn
 from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
@@ -153,7 +153,7 @@ class BaseAttentionProduct(nn.Module):
         del key_layer

         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in …
+            # Apply the attention mask is (precomputed for all layers in XLMRobertaModel forward() function)
             attention_scores = attention_scores + attention_mask
             del attention_mask

@@ -397,7 +397,7 @@ class LSGAttentionProduct(nn.Module):
         return x.reshape(*x.size()[:-2], n_blocks, -1, d)


-class LSGRobertaEmbeddings(RobertaEmbeddings):
+class LSGXLMRobertaEmbeddings(XLMRobertaEmbeddings):

     def __init__(self, config):
         super().__init__(config)
@@ -411,13 +411,11 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):

     def forward(
         self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-…
+    ):
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                ).to(input_ids.device)
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
             else:
                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

@@ -426,10 +424,18 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         else:
             input_shape = inputs_embeds.size()[:-1]

-        seq_length = input_shape[…
+        seq_length = input_shape[1]

+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
         if token_type_ids is None:
-            …
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
@@ -453,7 +459,7 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         return embeddings


-class LSGAttention(…
+class LSGAttention(XLMRobertaAttention):

     def __init__(self, config):

@@ -912,7 +918,7 @@ class LSGSelfAttention(BaseSelfAttention):
         return x.reshape(n, h, -1, chunk_size, d)


-class LSGRobertaLayer(RobertaLayer):
+class LSGXLMRobertaLayer(XLMRobertaLayer):

     def __init__(self, config):

@@ -924,12 +930,12 @@ class LSGRobertaLayer(RobertaLayer):
             self.crossattention = LSGAttention(config)


-class LSGRobertaEncoder(RobertaEncoder):
+class LSGXLMRobertaEncoder(XLMRobertaEncoder):

     def __init__(self, config):

         super().__init__(config)
-        self.layer = nn.ModuleList([…
+        self.layer = nn.ModuleList([LSGXLMRobertaLayer(config) for _ in range(config.num_hidden_layers)])

         assert hasattr(config, "num_global_tokens")
         self.num_global_tokens = config.num_global_tokens
@@ -997,7 +1003,8 @@ class LSGRobertaEncoder(RobertaEncoder):
         encoder_outputs.last_hidden_state = sequence_output
         return encoder_outputs

-class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
+
+class LSGXLMRobertaPreTrainedModel(XLMRobertaPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
@@ -1009,11 +1016,11 @@ class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
     _no_split_modules = []

     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (…
+        if isinstance(module, (XLMRobertaEncoder, LSGXLMRobertaEncoder)):
             module.gradient_checkpointing = value


-class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
+class LSGXLMRobertaModel(LSGXLMRobertaPreTrainedModel, XLMRobertaModel):
     """
     This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate
     documentation alongside usage examples.
@@ -1021,17 +1028,23 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):

     def __init__(self, config, add_pooling_layer=True):

-        …
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

-        self.embeddings = …
-        self.encoder = …
-        self.pooler = …
+        self.embeddings = LSGXLMRobertaEmbeddings(config)
+        self.encoder = LSGXLMRobertaEncoder(config)
+        self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None

         if config.add_cross_attention:
             logger.warning(
                 "Cross attention is computed using full attention since it is not LSG compatible."
             )

+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        if self._use_flash_attention_2:
+            logger.warning(
+                "[WARNING flash-attention]: LSG doesnt support flash-attention currently"
+            )
+
         # Initialize weights and apply final processing
         self.post_init()

@@ -1053,25 +1066,25 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
         return extended_attention_mask


-class LSGXLMRobertaForCausalLM(…
+class LSGXLMRobertaForCausalLM(LSGXLMRobertaPreTrainedModel, XLMRobertaForCausalLM):

     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):

-        …
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         if not config.is_decoder:
-            logger.warning("If you want to use `…
+            logger.warning("If you want to use `LSGXLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")

         self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = …
+        self.lm_head = XLMRobertaLMHead(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGXLMRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
+class LSGXLMRobertaForMaskedLM(LSGXLMRobertaPreTrainedModel, XLMRobertaForMaskedLM):
     """
     This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate
     documentation alongside usage examples.
@@ -1084,22 +1097,22 @@ class LSGXLMRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):

     def __init__(self, config):

-        …
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         if config.is_decoder:
             logger.warning(
-                "If you want to use `…
+                "If you want to use `LSGXLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
                 "bi-directional self-attention."
             )

         self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = …
+        self.lm_head = XLMRobertaLMHead(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
+class LSGXLMRobertaForSequenceClassification(LSGXLMRobertaPreTrainedModel, XLMRobertaForSequenceClassification):
     """
     This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1107,19 +1120,19 @@ class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):

     def __init__(self, config):

-        …
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.num_labels = config.num_labels
         self.config = config

         self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
-        self.classifier = …
+        self.classifier = XLMRobertaClassificationHead(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
+class LSGXLMRobertaForMultipleChoice(LSGXLMRobertaPreTrainedModel, XLMRobertaForMultipleChoice):
     """
     This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1129,7 +1142,7 @@ class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):

     def __init__(self, config):

-        …
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.roberta = LSGXLMRobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
@@ -1139,7 +1152,7 @@ class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
         self.post_init()


-class LSGXLMRobertaForTokenClassification(LSGRobertaPreTrainedModel, RobertaForTokenClassification):
+class LSGXLMRobertaForTokenClassification(LSGXLMRobertaPreTrainedModel, XLMRobertaForTokenClassification):
     """
     This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1147,7 +1160,7 @@ class LSGXLMRobertaForTokenClassification(LSGRobertaPreTrainedModel, RobertaForTokenClassification):

     def __init__(self, config):

-        …
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.num_labels = config.num_labels

@@ -1162,7 +1175,7 @@ class LSGXLMRobertaForTokenClassification(LSGRobertaPreTrainedModel, RobertaForTokenClassification):
         self.post_init()


-class LSGXLMRobertaForQuestionAnswering(LSGRobertaPreTrainedModel, RobertaForQuestionAnswering):
+class LSGXLMRobertaForQuestionAnswering(LSGXLMRobertaPreTrainedModel, XLMRobertaForQuestionAnswering):
     """
     This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1170,7 +1183,7 @@ class LSGXLMRobertaForQuestionAnswering(LSGRobertaPreTrainedModel, RobertaForQuestionAnswering):

     def __init__(self, config):

-        …
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.num_labels = config.num_labels

@@ -1191,4 +1204,4 @@ try:
         str_to_class(value.split(".")[-1]).register_for_auto_class(key)
 except:
     warn("AutoRegister isn't available, you'll have to manually copy modeling.py after .save_pretrained(...).")
-    warn("Update to transformers >= 4.…
+    warn("Update to transformers >= 4.36.1 to fix.")
```
|