Update modeling_hf_nomic_bert.py
modeling_hf_nomic_bert.py CHANGED
@@ -724,7 +724,7 @@ class NomicBertAttention(nn.Module):
 
         self.rotary_emb_dim = self.head_dim * config.rotary_emb_fraction
         if self.rotary_emb_dim > 0:
-            if config.rotary_scaling_factor:
+            if getattr(config, "rotary_scaling_factor", None):
                 self.rotary_emb = NomicBertDynamicNTKRotaryEmbedding(
                     dim=self.rotary_emb_dim,
                     base=config.rotary_emb_base,
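The guard above is the whole change in this hunk: getattr with a default lets configs that predate or omit rotary_scaling_factor skip the dynamic-NTK branch instead of raising AttributeError. A minimal sketch of the lookup pattern, using hypothetical stand-in configs rather than a real NomicBertConfig:

from types import SimpleNamespace

# Hypothetical stand-ins for NomicBertConfig; only the attribute lookup pattern matters here.
old_cfg = SimpleNamespace(rotary_emb_base=10000)                              # field absent
new_cfg = SimpleNamespace(rotary_emb_base=10000, rotary_scaling_factor=2.0)   # field set

for cfg in (old_cfg, new_cfg):
    if getattr(cfg, "rotary_scaling_factor", None):
        print("scaling factor set -> take the dynamic NTK rotary branch")
    else:
        print("field missing or falsy -> skip the dynamic NTK branch")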
@@ -859,7 +859,6 @@ class NomicBertBlock(nn.Module):
         max_seq_len: Optional[int] = None,
     ):
         r"""Pass the input through the encoder layer.
-
         Args:
             hidden_states: the sequence to the encoder layer (required).
             residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual))
@@ -1055,10 +1054,11 @@ class NomicBertModel(NomicBertPreTrainedModel):
     def forward(
         self,
         input_ids,
-        attention_mask=None,
-        token_type_ids=None,
         position_ids=None,
+        token_type_ids=None,
+        attention_mask=None,
         return_dict=None,
+        matryoshka_dim=None,
     ):
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
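This hunk reorders the keyword parameters and adds matryoshka_dim, so callers that passed attention_mask or token_type_ids positionally would now bind them to the wrong slots; keyword calls are unaffected. A usage sketch under the assumption that the checkpoint's remote code already includes this change (the model id and the matryoshka_dim value are illustrative, not taken from the diff):

import torch
from transformers import AutoModel, AutoTokenizer

# Assumption: this repo's trust_remote_code modeling file is the patched version.
model_id = "nomic-ai/nomic-bert-2048"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)

batch = tokenizer("a short test sentence", return_tensors="pt")
with torch.no_grad():
    out = model(
        batch["input_ids"],
        attention_mask=batch["attention_mask"],     # keyword args, so the reorder is harmless
        token_type_ids=batch.get("token_type_ids"),
        matryoshka_dim=256,                         # new optional argument from this diff
    )
print(out.last_hidden_state.shape)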
@@ -1071,6 +1071,9 @@ class NomicBertModel(NomicBertPreTrainedModel):
 
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
 
+        if matryoshka_dim:
+            sequence_output = sequence_output[:, :matryoshka_dim]
+
         return BaseModelOutputWithPoolingAndCrossAttentions(
             last_hidden_state=sequence_output,
             pooler_output=pooled_output,
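What the new matryoshka_dim branch does to the hidden states, shown on a dummy tensor (the shape is an assumption for illustration; with the usual (batch, seq_len, hidden_size) layout, the slice in the hunk acts on the second axis):

import torch

# Dummy stand-in for sequence_output; real shapes come from the model, not from the diff.
sequence_output = torch.randn(2, 16, 768)   # (batch, seq_len, hidden_size)
matryoshka_dim = 4                          # illustrative value

if matryoshka_dim:
    sequence_output = sequence_output[:, :matryoshka_dim]   # same slice as in the hunk

print(sequence_output.shape)   # torch.Size([2, 4, 768])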
@@ -1113,7 +1116,6 @@ class NomicBertForPreTraining(NomicBertPreTrainedModel):
         Outputs a tuple comprising
         - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
         - the next sentence classification logits of shape [batch_size, 2].
-
         """
         outputs = self.bert(
             input_ids,