Update modeling_hf_nomic_bert.py
modeling_hf_nomic_bert.py CHANGED
@@ -724,7 +724,7 @@ class NomicBertAttention(nn.Module):
 
         self.rotary_emb_dim = self.head_dim * config.rotary_emb_fraction
         if self.rotary_emb_dim > 0:
-            if config.rotary_scaling_factor:
+            if getattr(config, "rotary_scaling_factor", None):
                 self.rotary_emb = NomicBertDynamicNTKRotaryEmbedding(
                     dim=self.rotary_emb_dim,
                     base=config.rotary_emb_base,
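The guard above is the whole change in this hunk: getattr with a default lets configs that predate or omit rotary_scaling_factor skip the dynamic-NTK branch instead of raising AttributeError. A minimal sketch of the lookup pattern, using hypothetical stand-in configs rather than a real NomicBertConfig:

from types import SimpleNamespace

# Hypothetical stand-ins for NomicBertConfig; only the attribute lookup pattern matters here.
old_cfg = SimpleNamespace(rotary_emb_base=10000)                              # field absent
new_cfg = SimpleNamespace(rotary_emb_base=10000, rotary_scaling_factor=2.0)   # field set

for cfg in (old_cfg, new_cfg):
    if getattr(cfg, "rotary_scaling_factor", None):
        print("scaling factor set -> take the dynamic NTK rotary branch")
    else:
        print("field missing or falsy -> skip the dynamic NTK branch")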
@@ -859,7 +859,6 @@ class NomicBertBlock(nn.Module):
         max_seq_len: Optional[int] = None,
     ):
         r"""Pass the input through the encoder layer.
-
         Args:
             hidden_states: the sequence to the encoder layer (required).
             residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
@@ -1055,10 +1054,11 @@ class NomicBertModel(NomicBertPreTrainedModel):
     def forward(
         self,
         input_ids,
-        attention_mask=None,
-        token_type_ids=None,
         position_ids=None,
+        token_type_ids=None,
+        attention_mask=None,
         return_dict=None,
+        matryoshka_dim=None,
     ):
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
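This hunk reorders the keyword parameters and adds matryoshka_dim, so callers that passed attention_mask or token_type_ids positionally would now bind them to the wrong slots; keyword calls are unaffected. A usage sketch under the assumption that the checkpoint's remote code already includes this change (the model id and the matryoshka_dim value are illustrative, not taken from the diff):

import torch
from transformers import AutoModel, AutoTokenizer

# Assumption: this repo's trust_remote_code modeling file is the patched version.
model_id = "nomic-ai/nomic-bert-2048"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

batch = tokenizer("a short test sentence", return_tensors="pt")
with torch.no_grad():
    out = model(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],     # keyword args, so the reorder is harmless
        token_type_ids=batch.get("token_type_ids"),
        matryoshka_dim=256,                         # new optional argument from this diff
    )
print(out.last_hidden_state.shape)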
@@ -1071,6 +1071,9 @@ class NomicBertModel(NomicBertPreTrainedModel):
 
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
 
+        if matryoshka_dim:
+            sequence_output = sequence_output[:, :matryoshka_dim]
+
         return BaseModelOutputWithPoolingAndCrossAttentions(
             last_hidden_state=sequence_output,
             pooler_output=pooled_output,
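What the new matryoshka_dim branch does to the hidden states, shown on a dummy tensor (the shape is an assumption for illustration; with the usual (batch, seq_len, hidden_size) layout, the slice in the hunk acts on the second axis):

import torch

# Dummy stand-in for sequence_output; real shapes come from the model, not from the diff.
sequence_output = torch.randn(2, 16, 768)   # (batch, seq_len, hidden_size)
matryoshka_dim = 4                          # illustrative value

if matryoshka_dim:
    sequence_output = sequence_output[:, :matryoshka_dim]   # same slice as in the hunk

print(sequence_output.shape)   # torch.Size([2, 4, 768])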
@@ -1113,7 +1116,6 @@ class NomicBertForPreTraining(NomicBertPreTrainedModel):
         Outputs a tuple comprising
         - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
         - the next sentence classification logits of shape [batch_size, 2].
-
         """
         outputs = self.bert(
             input_ids,