Added optional input embeddings to bypass NeoBERT.encoder

#7 · opened by Lolalb
Files changed (1)
  1. model.py +20 -9
model.py CHANGED
@@ -6,7 +6,7 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from torch.nn.functional import scaled_dot_product_attention
 
-from typing import Optional
+from typing import Optional, Tuple
 import numpy as np
 
 from xformers.ops import SwiGLU
@@ -190,7 +190,7 @@ class EncoderBlock(nn.Module):
                 query=xq.transpose(1, 2),
                 key=xk.transpose(1, 2),
                 value=xv.transpose(1, 2),
-                attn_mask=attention_mask.bool(),
+                attn_mask=attention_mask,
                 dropout_p=0,
             ).transpose(1, 2)
 
@@ -199,7 +199,6 @@ class EncoderBlock(nn.Module):
 
 class NeoBERTPreTrainedModel(PreTrainedModel):
     config_class = NeoBERTConfig
-    base_model_prefix = "model"
     _supports_cache_class = True
 
     def _init_weights(self, module):
@@ -234,11 +233,12 @@ class NeoBERT(NeoBERTPreTrainedModel):
 
     def forward(
         self,
-        input_ids: torch.Tensor,
+        input_ids: Optional[torch.Tensor] = None,
         position_ids: torch.Tensor = None,
        max_seqlen: int = None,
         cu_seqlens: torch.Tensor = None,
         attention_mask: torch.Tensor = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
         output_hidden_states: bool = False,
         output_attentions: bool = False,
         **kwargs,
@@ -246,6 +246,9 @@ class NeoBERT(NeoBERTPreTrainedModel):
         # Initialize
         hidden_states, attentions = [], []
 
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
         # Expand and repeat: (Batch, Length) -> (Batch, Heads, Length, Length)
         if attention_mask is not None:
             attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, self.config.num_attention_heads, attention_mask.size(-1), 1)
@@ -257,14 +260,22 @@ class NeoBERT(NeoBERTPreTrainedModel):
             ), "Flash-attention is not available. Please ''pip install flash_attn'', or provide un-packed sequences."
             assert not output_attentions, "Output attentions is not supported when sequences are packed."
             assert max_seqlen is not None, "Missing max_seqlen. It must be provided when cu_seqlens are not None."
-            assert input_ids.shape[0] == 1, "Cumulative sequence lengths are provided but input_ids are not packed."
-            assert input_ids.is_cuda, "Packing uses an implementation of flash-attention and is only supported on GPU."
+            assert (input_ids if input_ids is not None else inputs_embeds).shape[
+                0
+            ] == 1, "Cumulative sequence lengths are provided but inputs are not packed."
+            assert (
+                input_ids if input_ids is not None else inputs_embeds
+            ).is_cuda, "Packing uses an implementation of flash-attention and is only supported on GPU."
 
         # RoPE
-        freqs_cis = self.freqs_cis[position_ids] if position_ids is not None else self.freqs_cis[: input_ids.shape[1]].unsqueeze(0)
+        freqs_cis = (
+            self.freqs_cis[position_ids]
+            if position_ids is not None
+            else self.freqs_cis[: (input_ids if input_ids is not None else inputs_embeds).shape[1]].unsqueeze(0)
+        )
 
         # Embedding
-        x = self.encoder(input_ids)
+        x = self.encoder(input_ids) if input_ids is not None else inputs_embeds
 
         # Transformer encoder
         for layer in self.transformer_encoder:
@@ -356,7 +367,7 @@ class NeoBERTForSequenceClassification(NeoBERTPreTrainedModel):
 
     def forward(
         self,
-        input_ids: torch.Tensor,
+        input_ids: Optional[torch.Tensor] = None,
        position_ids: torch.Tensor = None,
         max_seqlen: int = None,
         cu_seqlens: torch.Tensor = None,
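
A minimal usage sketch of the new argument, assuming a NeoBERT checkpoint loaded with trust_remote_code=True (the checkpoint id "chandar-lab/NeoBERT" and the last_hidden_state output field are illustrative assumptions, not taken from this diff): precompute token embeddings with the model's encoder embedding table and pass them back through inputs_embeds, bypassing the lookup inside forward().

import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative checkpoint id; any NeoBERT checkpoint using this model.py applies.
model_id = "chandar-lab/NeoBERT"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True).eval()

input_ids = tokenizer("NeoBERT now accepts precomputed embeddings.", return_tensors="pt").input_ids

with torch.no_grad():
    # Standard path: forward() embeds the ids internally via self.encoder.
    out_ids = model(input_ids=input_ids)

    # New path: look up the embeddings yourself (model.encoder is the
    # nn.Embedding table) and bypass it by passing inputs_embeds instead.
    embeds = model.encoder(input_ids)
    out_embeds = model(inputs_embeds=embeds)

# Both paths should produce the same hidden states (assuming the output
# object exposes last_hidden_state).
torch.testing.assert_close(out_ids.last_hidden_state, out_embeds.last_hidden_state)

Passing both input_ids and inputs_embeds, or neither, raises the ValueError added in this change.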