emozilla committed
Commit
2e25a9a
1 Parent(s): 4ece65f

add fast loading/inference

README.md CHANGED
@@ -9,6 +9,9 @@ datasets:
9
  inference: false
10
  ---
11
 
 
 
 
12
  # MPT-7B-StoryWriter-65k+
13
 
14
  MPT-7B-StoryWriter-65k+ is a model designed to read and write fictional stories with super long context lengths.
 
9
  inference: false
10
  ---
11
 
12
+ The code for this model has been updated to include the adaptations from [Birchlabs/mosaicml-mpt-7b-chat-qlora](https://huggingface.co/Birchlabs/mosaicml-mpt-7b-chat-qlora), which allow MPT models to be loaded with `device_map="auto"` and with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) support (e.g. `load_in_8bit`, `load_in_4bit`).
13
+ It also includes the [latest key-value cache MPT code](https://github.com/mosaicml/llm-foundry/pull/210) to allow fast inference with `transformers` (thus, `use_cache` is set to `True` in `config.json`).
14
+
15
  # MPT-7B-StoryWriter-65k+
16
 
17
  MPT-7B-StoryWriter-65k+ is a model designed to read and write fictional stories with super long context lengths.
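For orientation (not part of the commit itself), here is a minimal loading/inference sketch of what the README changes above describe. Assumptions: the repo id below is a hypothetical placeholder for this repository's actual path, `bitsandbytes` is installed for `load_in_8bit`, and the tokenizer is taken from `EleutherAI/gpt-neox-20b` as named in `config.json`.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "emozilla/mpt-7b-storywriter-65k"  # hypothetical placeholder; use this repository's id

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")  # tokenizer_name from config.json
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,  # MPT uses custom modeling code from the repo
    device_map="auto",       # enabled by the Birchlabs adaptations
    load_in_8bit=True,       # bitsandbytes quantization; load_in_4bit is the 4-bit variant
)

inputs = tokenizer("Once upon a time", return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64, use_cache=True)  # fast KV-cache decoding
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```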
adapt_tokenizer.py CHANGED
@@ -1,41 +1,27 @@
 
1
  from typing import Union
2
  from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
3
- Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
4
  NUM_SENTINEL_TOKENS: int = 100
5
 
6
  def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
7
- """Adds sentinel tokens and padding token (if missing).
8
-
9
- Expands the tokenizer vocabulary to include sentinel tokens
10
- used in mixture-of-denoiser tasks as well as a padding token.
11
-
12
- All added tokens are added as special tokens. No tokens are
13
- added if sentinel tokens and padding token already exist.
14
- """
15
  sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
16
  tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
17
- if tokenizer.pad_token is None:
18
  tokenizer.add_tokens('<pad>', special_tokens=True)
19
  tokenizer.pad_token = '<pad>'
20
- assert tokenizer.pad_token_id is not None
21
  sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)])
22
  _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
23
  tokenizer.sentinel_token_ids = _sentinel_token_ids
24
 
25
  class AutoTokenizerForMOD(AutoTokenizer):
26
- """AutoTokenizer + Adaptation for MOD.
27
-
28
- A simple wrapper around AutoTokenizer to make instantiating
29
- an MOD-adapted tokenizer a bit easier.
30
-
31
- MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
32
- a padding token, and a property to get the token ids of the
33
- sentinel tokens.
34
- """
35
 
36
  @classmethod
37
  def from_pretrained(cls, *args, **kwargs):
38
- """See `AutoTokenizer.from_pretrained` docstring."""
39
  tokenizer = super().from_pretrained(*args, **kwargs)
40
  adapt_tokenizer_for_denoising(tokenizer)
41
- return tokenizer
 
1
+
2
  from typing import Union
3
  from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
4
+ Tokenizer = Union[(PreTrainedTokenizer, PreTrainedTokenizerFast)]
5
  NUM_SENTINEL_TOKENS: int = 100
6
 
7
  def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
8
+ 'Adds sentinel tokens and padding token (if missing).\n\n Expands the tokenizer vocabulary to include sentinel tokens\n used in mixture-of-denoiser tasks as well as a padding token.\n\n All added tokens are added as special tokens. No tokens are\n added if sentinel tokens and padding token already exist.\n '
9
  sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
10
  tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
11
+ if (tokenizer.pad_token is None):
12
  tokenizer.add_tokens('<pad>', special_tokens=True)
13
  tokenizer.pad_token = '<pad>'
14
+ assert (tokenizer.pad_token_id is not None)
15
  sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)])
16
  _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
17
  tokenizer.sentinel_token_ids = _sentinel_token_ids
18
 
19
  class AutoTokenizerForMOD(AutoTokenizer):
20
+ 'AutoTokenizer + Adaptation for MOD.\n\n A simple wrapper around AutoTokenizer to make instantiating\n an MOD-adapted tokenizer a bit easier.\n\n MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),\n a padding token, and a property to get the token ids of the\n sentinel tokens.\n '
21
 
22
  @classmethod
23
  def from_pretrained(cls, *args, **kwargs):
24
+ 'See `AutoTokenizer.from_pretrained` docstring.'
25
  tokenizer = super().from_pretrained(*args, **kwargs)
26
  adapt_tokenizer_for_denoising(tokenizer)
27
+ return tokenizer
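For reference, a small usage sketch of the adapted tokenizer above. Assumptions: the file is importable from the working directory, the base tokenizer has no pad token (true for `EleutherAI/gpt-neox-20b`), and each sentinel string maps to a single added token id.

```python
from adapt_tokenizer import AutoTokenizerForMOD  # assumes running next to this repo's files

tokenizer = AutoTokenizerForMOD.from_pretrained("EleutherAI/gpt-neox-20b")
# The wrapper adds 100 <extra_id_*> sentinels and, if missing, a '<pad>' token.
print(tokenizer.pad_token)                # expected: '<pad>'
print(len(tokenizer.sentinel_token_ids))  # expected: 100, one id per sentinel
```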
attention.py CHANGED
@@ -1,13 +1,73 @@
1
  """Attention layers."""
2
  import math
3
  import warnings
4
- from typing import Optional
5
  import torch
6
  import torch.nn as nn
7
  from einops import rearrange
8
  from packaging import version
9
  from torch import nn
 
10
  from .norm import LPLayerNorm
11
 
12
  def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
13
  if original_is_causal and num_query_tokens != num_key_tokens:
@@ -17,25 +77,57 @@ def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_cau
17
  return False
18
  return original_is_causal
19
 
20
- def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
21
  q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
22
- k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads)
23
- v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads)
24
- min_val = torch.finfo(q.dtype).min
25
  (b, _, s_q, d) = q.shape
26
  s_k = k.size(-1)
27
  if softmax_scale is None:
28
  softmax_scale = 1 / math.sqrt(d)
29
  attn_weight = q.matmul(k) * softmax_scale
30
  if attn_bias is not None:
 
 
 
 
31
  if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
32
  raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
33
  attn_weight = attn_weight + attn_bias
 
34
  if key_padding_mask is not None:
35
  if attn_bias is not None:
36
- warnings.warn('Propogating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
37
  attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
38
- if is_causal:
39
  s = max(s_q, s_k)
40
  causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
41
  causal_mask = causal_mask.tril()
@@ -49,8 +141,8 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_s
49
  out = attn_weight.matmul(v)
50
  out = rearrange(out, 'b h s d -> b s (h d)')
51
  if needs_weights:
52
- return (out, attn_weight)
53
- return (out, None)
54
 
55
  def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
56
  for tensor in tensors:
@@ -59,12 +151,38 @@ def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
59
  if not tensor.is_cuda:
60
  raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
61
 
62
- def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
63
  try:
64
  from flash_attn import bert_padding, flash_attn_interface
65
  except:
66
  raise RuntimeError('Please install flash-attn==1.0.3.post0')
67
  check_valid_inputs(query, key, value)
68
  if attn_bias is not None:
69
  raise NotImplementedError(f'attn_bias not implemented for flash attn.')
70
  (batch_size, seqlen) = query.shape[:2]
@@ -84,9 +202,23 @@ def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None
84
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
85
  output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
86
  output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
87
- return (output, None)
88
 
89
- def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
90
  try:
91
  from .flash_attn_triton import flash_attn_func
92
  except:
@@ -100,6 +232,18 @@ def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bi
100
  if not _installed:
101
  raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed.')
102
  check_valid_inputs(query, key, value)
103
  if dropout_p:
104
  raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
105
  if needs_weights:
@@ -119,14 +263,16 @@ def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bi
119
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
120
  attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
121
  output = attn_output.view(*attn_output.shape[:2], -1)
122
- return (output, None)
123
 
124
- class MultiheadAttention(nn.Module):
125
  """Multi-head self attention.
126
 
127
  Using torch or triton attention implemetation enables user to also use
128
  additive bias.
129
  """
 
 
130
 
131
  def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
132
  super().__init__()
@@ -160,7 +306,15 @@ class MultiheadAttention(nn.Module):
160
  self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
161
  self.out_proj._is_residual = True
162
 
163
- def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
164
  qkv = self.Wqkv(x)
165
  if self.clip_qkv:
166
  qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
@@ -170,17 +324,71 @@ class MultiheadAttention(nn.Module):
170
  dtype = query.dtype
171
  query = self.q_ln(query).to(dtype)
172
  key = self.k_ln(key).to(dtype)
173
- if past_key_value is not None:
174
- if len(past_key_value) != 0:
175
- key = torch.cat([past_key_value[0], key], dim=1)
176
- value = torch.cat([past_key_value[1], value], dim=1)
177
- past_key_value = (key, value)
178
- if attn_bias is not None:
179
- attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
180
- (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights)
181
- return (self.out_proj(context), attn_weights, past_key_value)
182
 
183
- class MultiQueryAttention(nn.Module):
184
  """Multi-Query self attention.
185
 
186
  Using torch or triton attention implemetation enables user to also use
@@ -220,7 +428,15 @@ class MultiQueryAttention(nn.Module):
220
  self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
221
  self.out_proj._is_residual = True
222
 
223
- def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
224
  qkv = self.Wqkv(x)
225
  if self.clip_qkv:
226
  qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
@@ -234,11 +450,72 @@ class MultiQueryAttention(nn.Module):
234
  if len(past_key_value) != 0:
235
  key = torch.cat([past_key_value[0], key], dim=1)
236
  value = torch.cat([past_key_value[1], value], dim=1)
237
- past_key_value = (key, value)
238
  if attn_bias is not None:
239
  attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
240
- (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
241
- return (self.out_proj(context), attn_weights, past_key_value)
242
 
243
  def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
244
  if attn_impl == 'flash':
 
1
  """Attention layers."""
2
  import math
3
  import warnings
4
+ from typing import Optional, Dict, Any, NamedTuple, Protocol, Tuple, Union
5
  import torch
6
  import torch.nn as nn
7
  from einops import rearrange
8
  from packaging import version
9
  from torch import nn
10
+ from torch.utils.checkpoint import checkpoint
11
  from .norm import LPLayerNorm
12
+ from .is_torch_version import is_torch_version
13
+
14
+ class PastKeyValue(NamedTuple):
15
+ key: torch.Tensor
16
+ value: torch.Tensor
17
+
18
+ class AttnFnOutput(NamedTuple):
19
+ attns: torch.Tensor
20
+ attn_probs: Optional[torch.Tensor]
21
+ past_key_value: Union[PastKeyValue, Tuple, None]
22
+
23
+ class AttnFn(Protocol):
24
+ def __call__(
25
+ self,
26
+ query: torch.Tensor,
27
+ key: torch.Tensor,
28
+ value: torch.Tensor,
29
+ n_heads: int,
30
+ softmax_scale: Optional[float] = None,
31
+ attn_bias: Optional[torch.Tensor] = None,
32
+ key_padding_mask: Optional[torch.ByteTensor] = None,
33
+ is_causal = False,
34
+ dropout_p = 0.0,
35
+ training = False,
36
+ needs_weights = False,
37
+ multiquery = False,
38
+ ) -> AttnFnOutput: ...
39
+
40
+ class AttnFnCheckpointed(Protocol):
41
+ def __call__(
42
+ self,
43
+ query: torch.Tensor,
44
+ key: torch.Tensor,
45
+ value: torch.Tensor,
46
+ n_heads: int,
47
+ softmax_scale: Optional[float],
48
+ attn_bias: Optional[torch.Tensor],
49
+ key_padding_mask: Optional[torch.ByteTensor],
50
+ is_causal: bool,
51
+ dropout_p: float,
52
+ training: bool,
53
+ needs_weights: bool,
54
+ ) -> AttnFnOutput: ...
55
+
56
+ class AttnOutput(NamedTuple):
57
+ projected_context: torch.Tensor
58
+ attn_weights: Optional[torch.Tensor]
59
+ past_key_value: Union[PastKeyValue, Tuple, None]
60
+
61
+ class Attn(Protocol):
62
+ def __call__(
63
+ self,
64
+ x: torch.Tensor,
65
+ past_key_value: Union[PastKeyValue, Tuple, None] = None,
66
+ attn_bias: Optional[torch.Tensor] = None,
67
+ attention_mask: Optional[torch.ByteTensor] = None,
68
+ is_causal = True,
69
+ needs_weights = False,
70
+ ) -> AttnOutput: ...
71
 
72
  def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
73
  if original_is_causal and num_query_tokens != num_key_tokens:
 
77
  return False
78
  return original_is_causal
79
 
80
+ def scaled_multihead_dot_product_attention(
81
+ query: torch.Tensor,
82
+ key: torch.Tensor,
83
+ value: torch.Tensor,
84
+ n_heads: int,
85
+ past_key_value=None,
86
+ softmax_scale: Optional[float] = None,
87
+ attn_bias: Optional[torch.Tensor] = None,
88
+ key_padding_mask: Optional[torch.ByteTensor] = None,
89
+ is_causal = False,
90
+ dropout_p = 0.0,
91
+ training = False,
92
+ needs_weights = False,
93
+ multiquery = False,
94
+ ) -> AttnFnOutput:
95
  q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
96
+ kv_n_heads = 1 if multiquery else n_heads
97
+ k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
98
+ v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
99
+
100
+ if past_key_value is not None:
101
+ # attn_impl: flash & triton use kernels which expect input shape [b, s, h, d_head].
102
+ # kv_cache is therefore stored using that shape.
103
+ # attn_impl: torch stores the kv_cache in the ordering which is most advantageous
104
+ # for its attn computation ie
105
+ # keys are stored as tensors with shape [b, h, d_head, s] and
106
+ # values are stored as tensors with shape [b, h, s, d_head]
107
+ if len(past_key_value) != 0:
108
+ k = torch.cat([past_key_value[0], k], dim=3)
109
+ v = torch.cat([past_key_value[1], v], dim=2)
110
+
111
+ past_key_value = (k, v)
112
  (b, _, s_q, d) = q.shape
113
  s_k = k.size(-1)
114
  if softmax_scale is None:
115
  softmax_scale = 1 / math.sqrt(d)
116
  attn_weight = q.matmul(k) * softmax_scale
117
  if attn_bias is not None:
118
+ # clamp to 0 necessary for torch 2.0 compile()
119
+ _s_q = max(0, attn_bias.size(2) - s_q)
120
+ _s_k = max(0, attn_bias.size(3) - s_k)
121
+ attn_bias = attn_bias[:, :, _s_q:, _s_k:]
122
  if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
123
  raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
124
  attn_weight = attn_weight + attn_bias
125
+ min_val = torch.finfo(q.dtype).min
126
  if key_padding_mask is not None:
127
  if attn_bias is not None:
128
+ warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
129
  attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
130
+ if is_causal and (not q.size(2) == 1):
131
  s = max(s_q, s_k)
132
  causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
133
  causal_mask = causal_mask.tril()
 
141
  out = attn_weight.matmul(v)
142
  out = rearrange(out, 'b h s d -> b s (h d)')
143
  if needs_weights:
144
+ return AttnFnOutput(out, attn_weight, past_key_value)
145
+ return AttnFnOutput(out, None, past_key_value)
146
 
147
  def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
148
  for tensor in tensors:
 
151
  if not tensor.is_cuda:
152
  raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
153
 
154
+ def flash_attn_fn(
155
+ query: torch.Tensor,
156
+ key: torch.Tensor,
157
+ value: torch.Tensor,
158
+ n_heads: int,
159
+ past_key_value=None,
160
+ softmax_scale: Optional[float] = None,
161
+ attn_bias: Optional[torch.Tensor] = None,
162
+ key_padding_mask: Optional[torch.ByteTensor] = None,
163
+ is_causal = False,
164
+ dropout_p = 0.0,
165
+ training = False,
166
+ needs_weights = False,
167
+ multiquery = False,
168
+ ) -> AttnFnOutput:
169
  try:
170
  from flash_attn import bert_padding, flash_attn_interface
171
  except:
172
  raise RuntimeError('Please install flash-attn==1.0.3.post0')
173
  check_valid_inputs(query, key, value)
174
+ if past_key_value is not None:
175
+ if len(past_key_value) != 0:
176
+ key = torch.cat([past_key_value[0], key], dim=1)
177
+ value = torch.cat([past_key_value[1], value], dim=1)
178
+
179
+ past_key_value = (key, value)
180
+
181
+ if attn_bias is not None:
182
+ # clamp to 0 necessary for torch 2.0 compile()
183
+ _s_q = max(0, attn_bias.size(2) - query.size(1))
184
+ _s_k = max(0, attn_bias.size(3) - key.size(1))
185
+ attn_bias = attn_bias[:, :, _s_q:, _s_k:]
186
  if attn_bias is not None:
187
  raise NotImplementedError(f'attn_bias not implemented for flash attn.')
188
  (batch_size, seqlen) = query.shape[:2]
 
202
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
203
  output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
204
  output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
205
+ return AttnFnOutput(output, None, past_key_value)
206
 
207
+ def triton_flash_attn_fn(
208
+ query: torch.Tensor,
209
+ key: torch.Tensor,
210
+ value: torch.Tensor,
211
+ n_heads: int,
212
+ past_key_value=None,
213
+ softmax_scale: Optional[float] = None,
214
+ attn_bias: Optional[torch.Tensor] = None,
215
+ key_padding_mask: Optional[torch.ByteTensor] = None,
216
+ is_causal = False,
217
+ dropout_p = 0.0,
218
+ training = False,
219
+ needs_weights = False,
220
+ multiquery = False,
221
+ ) -> AttnFnOutput:
222
  try:
223
  from .flash_attn_triton import flash_attn_func
224
  except:
 
232
  if not _installed:
233
  raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed.')
234
  check_valid_inputs(query, key, value)
235
+ if past_key_value is not None:
236
+ if len(past_key_value) != 0:
237
+ key = torch.cat([past_key_value[0], key], dim=1)
238
+ value = torch.cat([past_key_value[1], value], dim=1)
239
+
240
+ past_key_value = (key, value)
241
+
242
+ if attn_bias is not None:
243
+ # clamp to 0 necessary for torch 2.0 compile()
244
+ _s_q = max(0, attn_bias.size(2) - query.size(1))
245
+ _s_k = max(0, attn_bias.size(3) - key.size(1))
246
+ attn_bias = attn_bias[:, :, _s_q:, _s_k:]
247
  if dropout_p:
248
  raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
249
  if needs_weights:
 
263
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
264
  attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
265
  output = attn_output.view(*attn_output.shape[:2], -1)
266
+ return AttnFnOutput(output, None, past_key_value)
267
 
268
+ class MultiheadAttention(nn.Module, Attn):
269
  """Multi-head self attention.
270
 
271
  Using torch or triton attention implemetation enables user to also use
272
  additive bias.
273
  """
274
+ gradient_checkpointing = False
275
+ attn_fn: AttnFn
276
 
277
  def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, device: Optional[str]=None):
278
  super().__init__()
 
306
  self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
307
  self.out_proj._is_residual = True
308
 
309
+ def forward(
310
+ self,
311
+ x: torch.Tensor,
312
+ past_key_value: Union[PastKeyValue, Tuple, None] = None,
313
+ attn_bias: Optional[torch.Tensor] = None,
314
+ attention_mask: Optional[torch.ByteTensor] = None,
315
+ is_causal = True,
316
+ needs_weights = False,
317
+ ) -> AttnOutput:
318
  qkv = self.Wqkv(x)
319
  if self.clip_qkv:
320
  qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
 
324
  dtype = query.dtype
325
  query = self.q_ln(query).to(dtype)
326
  key = self.k_ln(key).to(dtype)
327
+ if self.training and self.gradient_checkpointing:
328
+ ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
329
+ def create_custom_forward(attn_fn: AttnFn) -> AttnFnCheckpointed:
330
+ def custom_forward(
331
+ query: torch.Tensor,
332
+ key: torch.Tensor,
333
+ value: torch.Tensor,
334
+ n_heads: int,
335
+ softmax_scale: Optional[float],
336
+ attn_bias: Optional[torch.Tensor],
337
+ key_padding_mask: Optional[torch.ByteTensor],
338
+ is_causal: bool,
339
+ dropout_p: float,
340
+ training: bool,
341
+ needs_weights: bool,
342
+ ):
343
+ return attn_fn(
344
+ query,
345
+ key,
346
+ value,
347
+ n_heads,
348
+ softmax_scale,
349
+ attn_bias,
350
+ key_padding_mask,
351
+ is_causal,
352
+ dropout_p,
353
+ training,
354
+ needs_weights,
355
+ False, # multiquery
356
+ )
357
+ return custom_forward
358
+ attn_fn_out: AttnFnOutput = checkpoint(
359
+ create_custom_forward(self.attn_fn),
360
+ query,
361
+ key,
362
+ value,
363
+ self.n_heads,
364
+ self.softmax_scale,
365
+ attn_bias,
366
+ key_padding_mask,
367
+ is_causal,
368
+ self.attn_dropout_p,
369
+ self.training,
370
+ needs_weights,
371
+ **ckpt_kwargs,
372
+ )
373
+ else:
374
+ attn_fn_out: AttnFnOutput = self.attn_fn(
375
+ query,
376
+ key,
377
+ value,
378
+ self.n_heads,
379
+ past_key_value=past_key_value,
380
+ softmax_scale=self.softmax_scale,
381
+ attn_bias=attn_bias,
382
+ key_padding_mask=key_padding_mask,
383
+ is_causal=is_causal,
384
+ dropout_p=self.attn_dropout_p,
385
+ training=self.training,
386
+ needs_weights=needs_weights,
387
+ )
388
+ context, attn_weights, past_key_value = attn_fn_out
389
+ return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
390
 
391
+ class MultiQueryAttention(nn.Module, Attn):
392
  """Multi-Query self attention.
393
 
394
  Using torch or triton attention implemetation enables user to also use
 
428
  self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
429
  self.out_proj._is_residual = True
430
 
431
+ def forward(
432
+ self,
433
+ x: torch.Tensor,
434
+ past_key_value: Union[PastKeyValue, Tuple, None] = None,
435
+ attn_bias: Optional[torch.Tensor] = None,
436
+ attention_mask: Optional[torch.ByteTensor] = None,
437
+ is_causal = True,
438
+ needs_weights = False,
439
+ ) -> AttnOutput:
440
  qkv = self.Wqkv(x)
441
  if self.clip_qkv:
442
  qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
 
450
  if len(past_key_value) != 0:
451
  key = torch.cat([past_key_value[0], key], dim=1)
452
  value = torch.cat([past_key_value[1], value], dim=1)
453
+ past_key_value = PastKeyValue(key, value)
454
  if attn_bias is not None:
455
  attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
456
+ if self.training and self.gradient_checkpointing:
457
+ ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
458
+ def create_custom_forward(attn_fn: AttnFn) -> AttnFnCheckpointed:
459
+ def custom_forward(
460
+ query: torch.Tensor,
461
+ key: torch.Tensor,
462
+ value: torch.Tensor,
463
+ n_heads: int,
464
+ softmax_scale: Optional[float],
465
+ attn_bias: Optional[torch.Tensor],
466
+ key_padding_mask: Optional[torch.ByteTensor],
467
+ is_causal: bool,
468
+ dropout_p: float,
469
+ training: bool,
470
+ needs_weights: bool,
471
+ ):
472
+ return attn_fn(
473
+ query,
474
+ key,
475
+ value,
476
+ n_heads,
477
+ softmax_scale,
478
+ attn_bias,
479
+ key_padding_mask,
480
+ is_causal,
481
+ dropout_p,
482
+ training,
483
+ needs_weights,
484
+ True, # multiquery
485
+ )
486
+ return custom_forward
487
+ attn_fn_out: AttnFnOutput = checkpoint(
488
+ create_custom_forward(self.attn_fn),
489
+ query,
490
+ key,
491
+ value,
492
+ self.n_heads,
493
+ self.softmax_scale,
494
+ attn_bias,
495
+ key_padding_mask,
496
+ is_causal,
497
+ self.attn_dropout_p,
498
+ self.training,
499
+ needs_weights,
500
+ **ckpt_kwargs,
501
+ )
502
+ else:
503
+ attn_fn_out: AttnFnOutput = self.attn_fn(
504
+ query,
505
+ key,
506
+ value,
507
+ self.n_heads,
508
+ past_key_value=past_key_value,
509
+ softmax_scale=self.softmax_scale,
510
+ attn_bias=attn_bias,
511
+ key_padding_mask=key_padding_mask,
512
+ is_causal=is_causal,
513
+ dropout_p=self.attn_dropout_p,
514
+ training=self.training,
515
+ needs_weights=needs_weights,
516
+ )
517
+ context, attn_weights = attn_fn_out
518
+ return AttnOutput(self.out_proj(context), attn_weights, past_key_value)
519
 
520
  def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
521
  if attn_impl == 'flash':
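To make the kv-cache comments above concrete, here is a standalone shape sketch (not from the repo) of the torch `attn_impl` convention: cached keys have shape `[b, h, d_head, s]` and cached values `[b, h, s, d_head]`, so new decode steps are appended along `dim=3` and `dim=2` respectively before the usual `q.matmul(k)` / softmax / `matmul(v)` steps.

```python
import torch

b, h, d_head = 1, 4, 8
k_cache = torch.randn(b, h, d_head, 10)   # cached keys:   [b, h, d_head, s]
v_cache = torch.randn(b, h, 10, d_head)   # cached values: [b, h, s, d_head]
k_step = torch.randn(b, h, d_head, 1)     # key for one new decode step
v_step = torch.randn(b, h, 1, d_head)     # value for one new decode step

k = torch.cat([k_cache, k_step], dim=3)   # -> [1, 4, 8, 11]
v = torch.cat([v_cache, v_step], dim=2)   # -> [1, 4, 11, 8]

q = torch.randn(b, h, 1, d_head)          # query for the new step only
attn_weight = torch.softmax(q.matmul(k) / d_head ** 0.5, dim=-1)  # [1, 4, 1, 11]
out = attn_weight.matmul(v)                                       # [1, 4, 1, 8]
print(k.shape, v.shape, out.shape)
```

The rewritten `forward` methods also wrap `self.attn_fn` in `torch.utils.checkpoint.checkpoint` (passing `use_reentrant=False` on torch >= 1.11) when `gradient_checkpointing` is enabled during training, trading recomputation for lower activation memory.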
blocks.py CHANGED
@@ -1,10 +1,15 @@
1
  """GPT Blocks used for the GPT Model."""
2
- from typing import Dict, Optional, Tuple
3
  import torch
4
  import torch.nn as nn
5
- from .attention import ATTN_CLASS_REGISTRY
6
  from .norm import NORM_CLASS_REGISTRY
7
 
 
 
 
 
 
8
  class MPTMLP(nn.Module):
9
 
10
  def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
@@ -18,6 +23,7 @@ class MPTMLP(nn.Module):
18
  return self.down_proj(self.act(self.up_proj(x)))
19
 
20
  class MPTBlock(nn.Module):
 
21
 
22
  def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
23
  del kwargs
@@ -31,11 +37,11 @@ class MPTBlock(nn.Module):
31
  self.resid_attn_dropout = nn.Dropout(resid_pdrop)
32
  self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
33
 
34
- def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
35
  a = self.norm_1(x)
36
- (b, _, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
37
  x = x + self.resid_attn_dropout(b)
38
  m = self.norm_2(x)
39
  n = self.ffn(m)
40
  x = x + self.resid_ffn_dropout(n)
41
- return (x, past_key_value)
 
1
  """GPT Blocks used for the GPT Model."""
2
+ from typing import Dict, Optional, Tuple, NamedTuple, Union
3
  import torch
4
  import torch.nn as nn
5
+ from .attention import ATTN_CLASS_REGISTRY, Attn, PastKeyValue
6
  from .norm import NORM_CLASS_REGISTRY
7
 
8
+ class MPTBlockOutput(NamedTuple):
9
+ hidden_states: torch.Tensor
10
+ attn_probs: Optional[torch.Tensor]
11
+ past_key_value: Union[PastKeyValue, Tuple, None]
12
+
13
  class MPTMLP(nn.Module):
14
 
15
  def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
 
23
  return self.down_proj(self.act(self.up_proj(x)))
24
 
25
  class MPTBlock(nn.Module):
26
+ attn: Attn
27
 
28
  def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', device: Optional[str]=None, **kwargs):
29
  del kwargs
 
37
  self.resid_attn_dropout = nn.Dropout(resid_pdrop)
38
  self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
39
 
40
+ def forward(self, x: torch.Tensor, past_key_value: Union[PastKeyValue, Tuple, None] = None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> MPTBlockOutput:
41
  a = self.norm_1(x)
42
+ (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
43
  x = x + self.resid_attn_dropout(b)
44
  m = self.norm_2(x)
45
  n = self.ffn(m)
46
  x = x + self.resid_ffn_dropout(n)
47
+ return MPTBlockOutput(x, attn_weights, past_key_value)
config.json CHANGED
@@ -46,7 +46,7 @@
46
  "tokenizer_name": "EleutherAI/gpt-neox-20b",
47
  "torch_dtype": "bfloat16",
48
  "transformers_version": "4.28.1",
49
- "use_cache": false,
50
  "verbose": 0,
51
  "vocab_size": 50432
52
  }
 
46
  "tokenizer_name": "EleutherAI/gpt-neox-20b",
47
  "torch_dtype": "bfloat16",
48
  "transformers_version": "4.28.1",
49
+ "use_cache": true,
50
  "verbose": 0,
51
  "vocab_size": 50432
52
  }
configuration_mpt.py CHANGED
@@ -1,67 +1,15 @@
1
- """A HuggingFace-style model configuration."""
 
2
  from typing import Dict, Optional, Union
3
  from transformers import PretrainedConfig
4
  attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
5
- init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu'}
6
 
7
  class MPTConfig(PretrainedConfig):
8
  model_type = 'mpt'
9
 
10
- def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
11
- """The MPT configuration class.
12
-
13
- Args:
14
- d_model (int): The size of the embedding dimension of the model.
15
- n_heads (int): The number of attention heads.
16
- n_layers (int): The number of layers in the model.
17
- expansion_ratio (int): The ratio of the up/down scale in the MLP.
18
- max_seq_len (int): The maximum sequence length of the model.
19
- vocab_size (int): The size of the vocabulary.
20
- resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
21
- emb_pdrop (float): The dropout probability for the embedding layer.
22
- learned_pos_emb (bool): Whether to use learned positional embeddings
23
- attn_config (Dict): A dictionary used to configure the model's attention module:
24
- attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
25
- attn_pdrop (float): The dropout probability for the attention layers.
26
- attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
27
- qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
28
- clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
29
- this value.
30
- softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
31
- use the default scale of ``1/sqrt(d_keys)``.
32
- prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
33
- extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
34
- can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
35
- attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
36
- When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
37
- which sub-sequence each token belongs to.
38
- Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
39
- alibi (bool): Whether to use the alibi bias instead of position embeddings.
40
- alibi_bias_max (int): The maximum value of the alibi bias.
41
- init_device (str): The device to use for parameter initialization.
42
- logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
43
- no_bias (bool): Whether to use bias in all layers.
44
- verbose (int): The verbosity level. 0 is silent.
45
- embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
46
- norm_type (str): choose type of norm to use
47
- multiquery_attention (bool): Whether to use multiquery attention implementation.
48
- use_cache (bool): Whether or not the model should return the last key/values attentions
49
- init_config (Dict): A dictionary used to configure the model initialization:
50
- init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
51
- 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
52
- 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
53
- init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
54
- emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
55
- emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
56
- used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
57
- init_std (float): The standard deviation of the normal distribution used to initialize the model,
58
- if using the baseline_ parameter initialization scheme.
59
- init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
60
- fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
61
- init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
62
- ---
63
- See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
64
- """
65
  self.d_model = d_model
66
  self.n_heads = n_heads
67
  self.n_layers = n_layers
@@ -80,39 +28,39 @@ class MPTConfig(PretrainedConfig):
80
  self.norm_type = norm_type
81
  self.use_cache = use_cache
82
  self.init_config = init_config
83
- if 'name' in kwargs:
84
  del kwargs['name']
85
- if 'loss_fn' in kwargs:
86
  del kwargs['loss_fn']
87
  super().__init__(**kwargs)
88
  self._validate_config()
89
 
90
  def _set_config_defaults(self, config, config_defaults):
91
  for (k, v) in config_defaults.items():
92
- if k not in config:
93
  config[k] = v
94
  return config
95
 
96
  def _validate_config(self):
97
  self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
98
  self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
99
- if self.d_model % self.n_heads != 0:
100
  raise ValueError('d_model must be divisible by n_heads')
101
- if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
102
  raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
103
- if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
104
  raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
105
- if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
106
  raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
107
- if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
108
  raise NotImplementedError('alibi only implemented with torch and triton attention.')
109
- if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
110
  raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
111
- if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
112
  raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
113
- if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
114
  raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
115
- if self.init_config.get('name', None) is None:
116
  raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
117
- if not self.learned_pos_emb and (not self.attn_config['alibi']):
118
- raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
 
1
+
2
+ 'A HuggingFace-style model configuration.'
3
  from typing import Dict, Optional, Union
4
  from transformers import PretrainedConfig
5
  attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
6
+ init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}
7
 
8
  class MPTConfig(PretrainedConfig):
9
  model_type = 'mpt'
10
 
11
+ def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[(float, str)]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
12
+ "The MPT configuration class.\n\n Args:\n d_model (int): The size of the embedding dimension of the model.\n n_heads (int): The number of attention heads.\n n_layers (int): The number of layers in the model.\n expansion_ratio (int): The ratio of the up/down scale in the MLP.\n max_seq_len (int): The maximum sequence length of the model.\n vocab_size (int): The size of the vocabulary.\n resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.\n emb_pdrop (float): The dropout probability for the embedding layer.\n learned_pos_emb (bool): Whether to use learned positional embeddings\n attn_config (Dict): A dictionary used to configure the model's attention module:\n attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention\n attn_pdrop (float): The dropout probability for the attention layers.\n attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.\n qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.\n clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to\n this value.\n softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,\n use the default scale of ``1/sqrt(d_keys)``.\n prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an\n extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix\n can attend to one another bi-directionally. Tokens outside the prefix use causal attention.\n attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.\n When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates\n which sub-sequence each token belongs to.\n Defaults to ``False`` meaning any provided `sequence_id` will be ignored.\n alibi (bool): Whether to use the alibi bias instead of position embeddings.\n alibi_bias_max (int): The maximum value of the alibi bias.\n init_device (str): The device to use for parameter initialization.\n logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.\n no_bias (bool): Whether to use bias in all layers.\n verbose (int): The verbosity level. 0 is silent.\n embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.\n norm_type (str): choose type of norm to use\n multiquery_attention (bool): Whether to use multiquery attention implementation.\n use_cache (bool): Whether or not the model should return the last key/values attentions\n init_config (Dict): A dictionary used to configure the model initialization:\n init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',\n 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or\n 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.\n init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.\n emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.\n emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution\n used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.\n init_std (float): The standard deviation of the normal distribution used to initialize the model,\n if using the baseline_ parameter initialization scheme.\n init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.\n fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.\n init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.\n ---\n See llmfoundry.models.utils.param_init_fns.py for info on other param init config options\n "
13
  self.d_model = d_model
14
  self.n_heads = n_heads
15
  self.n_layers = n_layers
 
28
  self.norm_type = norm_type
29
  self.use_cache = use_cache
30
  self.init_config = init_config
31
+ if ('name' in kwargs):
32
  del kwargs['name']
33
+ if ('loss_fn' in kwargs):
34
  del kwargs['loss_fn']
35
  super().__init__(**kwargs)
36
  self._validate_config()
37
 
38
  def _set_config_defaults(self, config, config_defaults):
39
  for (k, v) in config_defaults.items():
40
+ if (k not in config):
41
  config[k] = v
42
  return config
43
 
44
  def _validate_config(self):
45
  self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
46
  self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
47
+ if ((self.d_model % self.n_heads) != 0):
48
  raise ValueError('d_model must be divisible by n_heads')
49
+ if any((((prob < 0) or (prob > 1)) for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
50
  raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
51
+ if (self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']):
52
  raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
53
+ if (self.attn_config['prefix_lm'] and (self.attn_config['attn_impl'] not in ['torch', 'triton'])):
54
  raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
55
+ if (self.attn_config['alibi'] and (self.attn_config['attn_impl'] not in ['torch', 'triton'])):
56
  raise NotImplementedError('alibi only implemented with torch and triton attention.')
57
+ if (self.attn_config['attn_uses_sequence_id'] and (self.attn_config['attn_impl'] not in ['torch', 'triton'])):
58
  raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
59
+ if ((self.embedding_fraction > 1) or (self.embedding_fraction <= 0)):
60
  raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
61
+ if (isinstance(self.logit_scale, str) and (self.logit_scale != 'inv_sqrt_d_model')):
62
  raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
63
+ if (self.init_config.get('name', None) is None):
64
  raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
65
+ if ((not self.learned_pos_emb) and (not self.attn_config['alibi'])):
66
+ raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
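A hedged sketch of constructing the configuration class above directly, assuming `configuration_mpt.py` is importable from the working directory; keys omitted from `attn_config` and `init_config` are filled from the module-level defaults during `_validate_config`.

```python
from configuration_mpt import MPTConfig  # assumes running next to this repo's files

config = MPTConfig(
    d_model=2048,
    n_heads=16,
    n_layers=24,
    max_seq_len=2048,
    attn_config={'attn_type': 'multihead_attention', 'attn_impl': 'torch', 'alibi': True},
    use_cache=True,  # mirrors the config.json change in this commit
)
print(config.attn_config['attn_pdrop'])  # 0.0, filled in from attn_config_defaults
```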
flash_attn_triton.py CHANGED
@@ -1,324 +1,283 @@
1
- """
2
- Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py
3
- update imports to use 'triton_pre_mlir'
4
 
5
- *Experimental* implementation of FlashAttention in Triton.
6
- Tested with triton==2.0.0.dev20221202.
7
- Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions
8
- other than 64:
9
- https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
10
- We'll update this implementation with the new Triton backend once this is fixed.
11
-
12
- We use the FlashAttention implementation from Phil Tillet a starting point.
13
- https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
14
-
15
- Changes:
16
- - Implement both causal and non-causal attention.
17
- - Implement both self-attention and cross-attention.
18
- - Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
19
- - Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
20
- - Support attention bias.
21
- - Speed up the forward pass a bit, and only store the LSE instead of m and l.
22
- - Make the backward for d=128 much faster by reducing register spilling.
23
- - Optionally parallelize the backward pass across seqlen_k, to deal with the case of
24
- small batch size * nheads.
25
-
26
- Caution:
27
- - This is an *experimental* implementation. The forward pass should be quite robust but
28
- I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
29
- - This implementation has only been tested on A100.
30
- - If you plan to use headdim other than 64 and 128, you should test for race conditions
31
- (due to the Triton compiler), as done in tests/test_flash_attn.py
32
- "test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
33
- for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
34
- that there are none left for other head dimensions.
35
-
36
- Differences between this Triton version and the CUDA version:
37
- - Triton version doesn't support dropout.
38
- - Triton forward is generally faster than CUDA forward, while Triton backward is
39
- generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
40
- than CUDA forward + backward.
41
- - Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
42
- - Triton version supports attention bias, while CUDA version doesn't.
43
- """
44
  import math
45
  import torch
46
  import triton_pre_mlir as triton
47
  import triton_pre_mlir.language as tl
48
 
49
- @triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
50
  @triton.jit
51
  def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_ob, stride_oh, stride_om, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
52
  start_m = tl.program_id(0)
53
  off_hb = tl.program_id(1)
54
- off_b = off_hb // nheads
55
- off_h = off_hb % nheads
56
- offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
57
  offs_n = tl.arange(0, BLOCK_N)
58
  offs_d = tl.arange(0, BLOCK_HEADDIM)
59
- q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
60
- k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
61
- v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
62
- if BIAS_TYPE == 'vector':
63
- b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
64
- elif BIAS_TYPE == 'matrix':
65
- b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :])
66
- t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
67
- lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
68
- m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
69
  acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
70
- if EVEN_M & EVEN_N:
71
  if EVEN_HEADDIM:
72
  q = tl.load(q_ptrs)
73
  else:
74
- q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
75
  elif EVEN_HEADDIM:
76
- q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
77
  else:
78
- q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
79
- end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
80
  for start_n in range(0, end_n, BLOCK_N):
81
  start_n = tl.multiple_of(start_n, BLOCK_N)
82
- if EVEN_N & EVEN_M:
83
  if EVEN_HEADDIM:
84
- k = tl.load(k_ptrs + start_n * stride_kn)
85
  else:
86
- k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
87
  elif EVEN_HEADDIM:
88
- k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
89
  else:
90
- k = tl.load(k_ptrs + start_n * stride_kn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
91
  qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
92
  qk += tl.dot(q, k, trans_b=True)
93
- if not EVEN_N:
94
- qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float('-inf'))
95
  if IS_CAUSAL:
96
- qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float('-inf'))
97
- if BIAS_TYPE != 'none':
98
- if BIAS_TYPE == 'vector':
99
  if EVEN_N:
100
- bias = tl.load(b_ptrs + start_n).to(tl.float32)
101
  else:
102
- bias = tl.load(b_ptrs + start_n, mask=start_n + offs_n < seqlen_k, other=0.0).to(tl.float32)
103
  bias = bias[None, :]
104
- elif BIAS_TYPE == 'matrix':
105
- if EVEN_M & EVEN_N:
106
- bias = tl.load(b_ptrs + start_n).to(tl.float32)
107
  else:
108
- bias = tl.load(b_ptrs + start_n, mask=(offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k), other=0.0).to(tl.float32)
109
- qk = qk * softmax_scale + bias
110
  m_ij = tl.maximum(tl.max(qk, 1), lse_i)
111
- p = tl.exp(qk - m_ij[:, None])
112
  else:
113
- m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
114
- p = tl.exp(qk * softmax_scale - m_ij[:, None])
115
  l_ij = tl.sum(p, 1)
116
- acc_o_scale = tl.exp(m_i - m_ij)
117
  tl.store(t_ptrs, acc_o_scale)
118
  acc_o_scale = tl.load(t_ptrs)
119
- acc_o = acc_o * acc_o_scale[:, None]
120
- if EVEN_N & EVEN_M:
121
  if EVEN_HEADDIM:
122
- v = tl.load(v_ptrs + start_n * stride_vn)
123
  else:
124
- v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
125
  elif EVEN_HEADDIM:
126
- v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
127
  else:
128
- v = tl.load(v_ptrs + start_n * stride_vn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
129
  p = p.to(v.dtype)
130
  acc_o += tl.dot(p, v)
131
  m_i = m_ij
132
- l_i_new = tl.exp(lse_i - m_ij) + l_ij
133
- lse_i = m_ij + tl.log(l_i_new)
134
- o_scale = tl.exp(m_i - lse_i)
135
  tl.store(t_ptrs, o_scale)
136
  o_scale = tl.load(t_ptrs)
137
- acc_o = acc_o * o_scale[:, None]
138
  start_m = tl.program_id(0)
139
- offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
140
- lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
141
  tl.store(lse_ptrs, lse_i)
142
  offs_d = tl.arange(0, BLOCK_HEADDIM)
143
- out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :])
144
  if EVEN_M:
145
  if EVEN_HEADDIM:
146
  tl.store(out_ptrs, acc_o)
147
  else:
148
- tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
149
  elif EVEN_HEADDIM:
150
- tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
151
  else:
152
- tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
153
 
154
  @triton.jit
155
  def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, stride_om, stride_dob, stride_doh, stride_dom, nheads, seqlen_q, seqlen_q_rounded, headdim, BLOCK_M: tl.constexpr, BLOCK_HEADDIM: tl.constexpr):
156
  start_m = tl.program_id(0)
157
  off_hb = tl.program_id(1)
158
- off_b = off_hb // nheads
159
- off_h = off_hb % nheads
160
- offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
161
  offs_d = tl.arange(0, BLOCK_HEADDIM)
162
- o = tl.load(Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
163
- do = tl.load(DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
164
- delta = tl.sum(o * do, axis=1)
165
- tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
166
 
167
  @triton.jit
168
  def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr):
169
- if EVEN_N & EVEN_M:
170
  if EVEN_HEADDIM:
171
  tl.store(dv_ptrs, dv)
172
  tl.store(dk_ptrs, dk)
173
  else:
174
- tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
175
- tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
176
  elif EVEN_HEADDIM:
177
- tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
178
- tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
179
  else:
180
- tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
181
- tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
182
 
183
  @triton.jit
184
  def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD: tl.constexpr, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
185
- begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M
186
- offs_qm = begin_m + tl.arange(0, BLOCK_M)
187
- offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
188
  offs_m = tl.arange(0, BLOCK_M)
189
  offs_d = tl.arange(0, BLOCK_HEADDIM)
190
- q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
191
- k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
192
- v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
193
- do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
194
- dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
195
- if BIAS_TYPE == 'vector':
196
- b_ptrs = Bias + offs_n
197
- elif BIAS_TYPE == 'matrix':
198
- b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
199
  dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
200
  dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
201
- if begin_m >= seqlen_q:
202
- dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
203
- dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
204
  _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
205
  return
206
- if EVEN_N & EVEN_M:
207
  if EVEN_HEADDIM:
208
  k = tl.load(k_ptrs)
209
  v = tl.load(v_ptrs)
210
  else:
211
- k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
212
- v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
213
  elif EVEN_HEADDIM:
214
- k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
215
- v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
216
  else:
217
- k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
218
- v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
219
  num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
220
- for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
221
  start_m = tl.multiple_of(start_m, BLOCK_M)
222
- offs_m_curr = start_m + offs_m
223
- if EVEN_M & EVEN_HEADDIM:
224
  q = tl.load(q_ptrs)
225
  elif EVEN_HEADDIM:
226
- q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
227
  else:
228
- q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
229
  qk = tl.dot(q, k, trans_b=True)
230
- if not EVEN_N:
231
- qk = tl.where(offs_n[None, :] < seqlen_k, qk, float('-inf'))
232
  if IS_CAUSAL:
233
- qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float('-inf'))
234
- if BIAS_TYPE != 'none':
235
  tl.debug_barrier()
236
- if BIAS_TYPE == 'vector':
237
  if EVEN_N:
238
  bias = tl.load(b_ptrs).to(tl.float32)
239
  else:
240
- bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
241
  bias = bias[None, :]
242
- elif BIAS_TYPE == 'matrix':
243
- if EVEN_M & EVEN_N:
244
  bias = tl.load(b_ptrs).to(tl.float32)
245
  else:
246
- bias = tl.load(b_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), other=0.0).to(tl.float32)
247
- qk = qk * softmax_scale + bias
248
- if not EVEN_M & EVEN_HEADDIM:
249
  tl.debug_barrier()
250
- lse_i = tl.load(LSE + offs_m_curr)
251
- if BIAS_TYPE == 'none':
252
- p = tl.exp(qk * softmax_scale - lse_i[:, None])
253
  else:
254
- p = tl.exp(qk - lse_i[:, None])
255
- if EVEN_M & EVEN_HEADDIM:
256
  do = tl.load(do_ptrs)
257
  else:
258
- do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
259
  dv += tl.dot(p.to(do.dtype), do, trans_a=True)
260
- if not EVEN_M & EVEN_HEADDIM:
261
  tl.debug_barrier()
262
  dp = tl.dot(do, v, trans_b=True)
263
- if not EVEN_HEADDIM:
264
  tl.debug_barrier()
265
- Di = tl.load(D + offs_m_curr)
266
- ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
267
  dk += tl.dot(ds, q, trans_a=True)
268
- if not EVEN_M & EVEN_HEADDIM:
269
  tl.debug_barrier()
270
- if not ATOMIC_ADD:
271
- if EVEN_M & EVEN_HEADDIM:
272
  dq = tl.load(dq_ptrs, eviction_policy='evict_last')
273
  dq += tl.dot(ds, k)
274
  tl.store(dq_ptrs, dq, eviction_policy='evict_last')
275
  elif EVEN_HEADDIM:
276
- dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, eviction_policy='evict_last')
277
  dq += tl.dot(ds, k)
278
- tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, eviction_policy='evict_last')
279
  else:
280
- dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0, eviction_policy='evict_last')
281
  dq += tl.dot(ds, k)
282
- tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), eviction_policy='evict_last')
283
  else:
284
  dq = tl.dot(ds, k)
285
- if EVEN_M & EVEN_HEADDIM:
286
  tl.atomic_add(dq_ptrs, dq)
287
  elif EVEN_HEADDIM:
288
- tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
289
  else:
290
- tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
291
- dq_ptrs += BLOCK_M * stride_dqm
292
- q_ptrs += BLOCK_M * stride_qm
293
- do_ptrs += BLOCK_M * stride_dom
294
- if BIAS_TYPE == 'matrix':
295
- b_ptrs += BLOCK_M * stride_bm
296
- dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
297
- dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
298
  _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
299
 
300
  def init_to_zero(name):
301
- return lambda nargs: nargs[name].zero_()
302
 
303
  @triton.autotune(configs=[triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ'))], key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'])
304
- @triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
305
  @triton.jit
306
  def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_dob, stride_doh, stride_dom, stride_dqb, stride_dqh, stride_dqm, stride_dkb, stride_dkh, stride_dkn, stride_dvb, stride_dvh, stride_dvn, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, SEQUENCE_PARALLEL: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
307
  off_hb = tl.program_id(1)
308
- off_b = off_hb // nheads
309
- off_h = off_hb % nheads
310
- Q += off_b * stride_qb + off_h * stride_qh
311
- K += off_b * stride_kb + off_h * stride_kh
312
- V += off_b * stride_vb + off_h * stride_vh
313
- DO += off_b * stride_dob + off_h * stride_doh
314
- DQ += off_b * stride_dqb + off_h * stride_dqh
315
- DK += off_b * stride_dkb + off_h * stride_dkh
316
- DV += off_b * stride_dvb + off_h * stride_dvh
317
- if BIAS_TYPE != 'none':
318
- Bias += off_b * stride_bb + off_h * stride_bh
319
- D += off_hb * seqlen_q_rounded
320
- LSE += off_hb * seqlen_q_rounded
321
- if not SEQUENCE_PARALLEL:
322
  num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
323
  for start_n in range(0, num_block_n):
324
  _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=False, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
@@ -329,86 +288,81 @@ def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qb,
329
  def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
330
  (batch, seqlen_q, nheads, d) = q.shape
331
  (_, seqlen_k, _, _) = k.shape
332
- assert k.shape == (batch, seqlen_k, nheads, d)
333
- assert v.shape == (batch, seqlen_k, nheads, d)
334
- assert d <= 128, 'FlashAttention only support head dimensions up to 128'
335
- assert q.dtype == k.dtype == v.dtype, 'All tensors must have the same type'
336
- assert q.dtype in [torch.float16, torch.bfloat16], 'Only support fp16 and bf16'
337
- assert q.is_cuda and k.is_cuda and v.is_cuda
338
- softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
339
- has_bias = bias is not None
340
  bias_type = 'none'
341
  if has_bias:
342
- assert bias.dtype in [q.dtype, torch.float]
343
  assert bias.is_cuda
344
- assert bias.dim() == 4
345
- if bias.stride(-1) != 1:
346
  bias = bias.contiguous()
347
- if bias.shape[2:] == (1, seqlen_k):
348
  bias_type = 'vector'
349
- elif bias.shape[2:] == (seqlen_q, seqlen_k):
350
  bias_type = 'matrix'
351
  else:
352
  raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
353
  bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
354
- bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
355
- seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
356
  lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
357
  tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
358
  o = torch.empty_like(q)
359
  BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
360
  BLOCK = 128
361
- num_warps = 4 if d <= 64 else 8
362
- grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
363
- _fwd_kernel[grid](q, k, v, bias, o, lse, tmp, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, o.stride(0), o.stride(2), o.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM, BLOCK_M=BLOCK, BLOCK_N=BLOCK, num_warps=num_warps, num_stages=1)
364
  return (o, lse, softmax_scale)
365
 
366
  def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
367
- if do.stride(-1) != 1:
368
  do = do.contiguous()
369
  (batch, seqlen_q, nheads, d) = q.shape
370
  (_, seqlen_k, _, _) = k.shape
371
- assert d <= 128
372
- seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
373
- assert lse.shape == (batch, nheads, seqlen_q_rounded)
374
- assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
375
- assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
376
- softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
377
  dq_accum = torch.empty_like(q, dtype=torch.float32)
378
  delta = torch.empty_like(lse)
379
  BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
380
- grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
381
  _bwd_preprocess_do_o_dot[grid](o, do, delta, o.stride(0), o.stride(2), o.stride(1), do.stride(0), do.stride(2), do.stride(1), nheads, seqlen_q, seqlen_q_rounded, d, BLOCK_M=128, BLOCK_HEADDIM=BLOCK_HEADDIM)
382
- has_bias = bias is not None
383
  bias_type = 'none'
384
  if has_bias:
385
- assert bias.dtype in [q.dtype, torch.float]
386
  assert bias.is_cuda
387
- assert bias.dim() == 4
388
- assert bias.stride(-1) == 1
389
- if bias.shape[2:] == (1, seqlen_k):
390
  bias_type = 'vector'
391
- elif bias.shape[2:] == (seqlen_q, seqlen_k):
392
  bias_type = 'matrix'
393
  else:
394
  raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
395
  bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
396
- bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
397
- grid = lambda META: (triton.cdiv(seqlen_k, META['BLOCK_N']) if META['SEQUENCE_PARALLEL'] else 1, batch * nheads)
398
- _bwd_kernel[grid](q, k, v, bias, do, dq_accum, dk, dv, lse, delta, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, do.stride(0), do.stride(2), do.stride(1), dq_accum.stride(0), dq_accum.stride(2), dq_accum.stride(1), dk.stride(0), dk.stride(2), dk.stride(1), dv.stride(0), dv.stride(2), dv.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM)
399
  dq.copy_(dq_accum)
400
 
401
  class FlashAttnQKVPackedFunc(torch.autograd.Function):
402
 
403
  @staticmethod
404
  def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
405
- """
406
- qkv: (batch, seqlen, 3, nheads, headdim)
407
- bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).
408
- For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
409
- ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
410
- """
411
- if qkv.stride(-1) != 1:
412
  qkv = qkv.contiguous()
413
  (o, lse, ctx.softmax_scale) = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
414
  ctx.save_for_backward(qkv, o, lse, bias)
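
The docstring removed above documents the packed layout expected by `FlashAttnQKVPackedFunc`: `qkv` of shape `(batch, seqlen, 3, nheads, headdim)` plus an optional bias broadcastable to `(batch, nheads, seqlen, seqlen)`. A minimal sketch of preparing inputs in that layout; the module name `flash_attn_triton` is an assumption about how this file is imported, and a CUDA device with fp16 tensors is required by the assertions in `_flash_attn_forward`.

```python
import torch
from flash_attn_triton import FlashAttnQKVPackedFunc  # assumed import path for this file

batch, seqlen, nheads, headdim = 2, 256, 8, 64  # illustrative sizes only
# Packed self-attention input: q, k and v share one tensor along dim 2.
qkv = torch.randn(batch, seqlen, 3, nheads, headdim,
                  device='cuda', dtype=torch.float16)

# apply(qkv, bias, causal, softmax_scale); the output matches qkv[:, :, 0].
out = FlashAttnQKVPackedFunc.apply(qkv, None, True, None)
assert out.shape == (batch, seqlen, nheads, headdim)
```
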
@@ -418,7 +372,7 @@ class FlashAttnQKVPackedFunc(torch.autograd.Function):
418
  @staticmethod
419
  def backward(ctx, do):
420
  (qkv, o, lse, bias) = ctx.saved_tensors
421
- assert not ctx.needs_input_grad[1], 'FlashAttention does not support bias gradient yet'
422
  with torch.inference_mode():
423
  dqkv = torch.empty_like(qkv)
424
  _flash_attn_backward(do, qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], o, lse, dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
@@ -429,14 +383,8 @@ class FlashAttnKVPackedFunc(torch.autograd.Function):
429
 
430
  @staticmethod
431
  def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
432
- """
433
- q: (batch, seqlen_q, nheads, headdim)
434
- kv: (batch, seqlen_k, 2, nheads, headdim)
435
- bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
436
- For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
437
- ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
438
- """
439
- (q, kv) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
440
  (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
441
  ctx.save_for_backward(q, kv, o, lse, bias)
442
  ctx.causal = causal
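
`FlashAttnKVPackedFunc` covers the cross-attention case the removed docstring describes: separate query and key lengths, with k and v packed together. A hedged sketch under the same assumptions as above (assumed `flash_attn_triton` import path, CUDA device, fp16 tensors).

```python
import torch
from flash_attn_triton import FlashAttnKVPackedFunc  # assumed import path

batch, seqlen_q, seqlen_k, nheads, headdim = 2, 128, 512, 8, 64
q = torch.randn(batch, seqlen_q, nheads, headdim, device='cuda', dtype=torch.float16)
# k and v packed along dim 2; their sequence length may differ from the queries'.
kv = torch.randn(batch, seqlen_k, 2, nheads, headdim, device='cuda', dtype=torch.float16)

# apply(q, kv, bias, causal, softmax_scale)
out = FlashAttnKVPackedFunc.apply(q, kv, None, False, None)
assert out.shape == (batch, seqlen_q, nheads, headdim)
```
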
@@ -445,8 +393,8 @@ class FlashAttnKVPackedFunc(torch.autograd.Function):
445
  @staticmethod
446
  def backward(ctx, do):
447
  (q, kv, o, lse, bias) = ctx.saved_tensors
448
- if len(ctx.needs_input_grad) >= 3:
449
- assert not ctx.needs_input_grad[2], 'FlashAttention does not support bias gradient yet'
450
  with torch.inference_mode():
451
  dq = torch.empty_like(q)
452
  dkv = torch.empty_like(kv)
@@ -458,14 +406,8 @@ class FlashAttnFunc(torch.autograd.Function):
458
 
459
  @staticmethod
460
  def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
461
- """
462
- q: (batch_size, seqlen_q, nheads, headdim)
463
- k, v: (batch_size, seqlen_k, nheads, headdim)
464
- bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
465
- For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
466
- ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
467
- """
468
- (q, k, v) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
469
  (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
470
  ctx.save_for_backward(q, k, v, o, lse, bias)
471
  ctx.causal = causal
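
The `bias` argument here is what `_flash_attn_forward` later classifies as `'vector'` (last two dims `(1, seqlen_k)`) or `'matrix'` (`(seqlen_q, seqlen_k)`). A sketch of the vector case, using a key-padding mask turned into a large negative additive bias; an ALiBi bias, as the docstring notes, would instead have shape `(1, nheads, 1, seqlen_k)`. Import path and device assumptions as above; the batch lengths are illustrative.

```python
import torch
from flash_attn_triton import flash_attn_func  # assumed import path

batch, seqlen_q, seqlen_k, nheads, headdim = 2, 128, 128, 8, 64
q, k, v = (torch.randn(batch, s, nheads, headdim, device='cuda', dtype=torch.float16)
           for s in (seqlen_q, seqlen_k, seqlen_k))

# Key-padding bias: 0 for real keys, a large negative number for padded ones.
# Shape (batch, 1, 1, seqlen_k) -> bias_type == 'vector' in _flash_attn_forward.
lengths = torch.tensor([128, 100], device='cuda')            # valid keys per batch item
keep = torch.arange(seqlen_k, device='cuda')[None, :] < lengths[:, None]
bias = torch.zeros(batch, 1, 1, seqlen_k, device='cuda', dtype=torch.float16)
bias.masked_fill_(~keep.view(batch, 1, 1, seqlen_k), -1e4)   # mask out padded keys

out = flash_attn_func(q, k, v, bias, True, None)  # (q, k, v, bias, causal, softmax_scale)
```
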
@@ -474,11 +416,11 @@ class FlashAttnFunc(torch.autograd.Function):
474
  @staticmethod
475
  def backward(ctx, do):
476
  (q, k, v, o, lse, bias) = ctx.saved_tensors
477
- assert not ctx.needs_input_grad[3], 'FlashAttention does not support bias gradient yet'
478
  with torch.inference_mode():
479
  dq = torch.empty_like(q)
480
  dk = torch.empty_like(k)
481
  dv = torch.empty_like(v)
482
  _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
483
  return (dq, dk, dv, None, None, None)
484
- flash_attn_func = FlashAttnFunc.apply
 
 
 
 
1
 
2
+ '\nCopied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py\nupdate imports to use \'triton_pre_mlir\'\n\n*Experimental* implementation of FlashAttention in Triton.\nTested with triton==2.0.0.dev20221202.\nTriton 2.0 has a new backend (MLIR) but seems like it doesn\'t yet work for head dimensions\nother than 64:\nhttps://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207\nWe\'ll update this implementation with the new Triton backend once this is fixed.\n\nWe use the FlashAttention implementation from Phil Tillet a starting point.\nhttps://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py\n\nChanges:\n- Implement both causal and non-causal attention.\n- Implement both self-attention and cross-attention.\n- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.\n- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.\n- Support attention bias.\n- Speed up the forward pass a bit, and only store the LSE instead of m and l.\n- Make the backward for d=128 much faster by reducing register spilling.\n- Optionally parallelize the backward pass across seqlen_k, to deal with the case of\nsmall batch size * nheads.\n\nCaution:\n- This is an *experimental* implementation. The forward pass should be quite robust but\nI\'m not 100% sure that the backward pass doesn\'t have race conditions (due to the Triton compiler).\n- This implementation has only been tested on A100.\n- If you plan to use headdim other than 64 and 128, you should test for race conditions\n(due to the Triton compiler), as done in tests/test_flash_attn.py\n"test_flash_attn_triton_race_condition". I\'ve tested and fixed many race conditions\nfor different head dimensions (40, 48, 64, 128, 80, 88, 96), but I\'m still not 100% confident\nthat there are none left for other head dimensions.\n\nDifferences between this Triton version and the CUDA version:\n- Triton version doesn\'t support dropout.\n- Triton forward is generally faster than CUDA forward, while Triton backward is\ngenerally slower than CUDA backward. Overall Triton forward + backward is slightly slower\nthan CUDA forward + backward.\n- Triton version doesn\'t support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).\n- Triton version supports attention bias, while CUDA version doesn\'t.\n'
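
As the module docstring notes, the forward kernel keeps only a running max (`m_i`), a running log-sum-exp (`lse_i`) and a rescaled accumulator (`acc_o`) instead of materialising the full score matrix. Below is a plain-PyTorch reference of that online-softmax recurrence (CPU, float32, no Triton); it is only an illustration of the update the kernel implements, not the kernel itself.

```python
import torch

def streaming_attention(q, k, v, block=32, scale=None):
    # Mirrors the m_i / lse_i / acc_o recurrence in _fwd_kernel, one key block at a time.
    seqlen_q, d = q.shape
    scale = scale if scale is not None else d ** -0.5
    m = torch.full((seqlen_q,), float('-inf'))      # running max of scaled scores
    lse = torch.full((seqlen_q,), float('-inf'))    # running log-sum-exp
    acc = torch.zeros(seqlen_q, v.shape[1])         # un-normalised output accumulator
    for start in range(0, k.shape[0], block):
        kb, vb = k[start:start + block], v[start:start + block]
        s = (q @ kb.T) * scale
        m_new = torch.maximum(s.max(dim=-1).values, lse)
        p = torch.exp(s - m_new[:, None])
        acc = acc * torch.exp(m - m_new)[:, None] + p @ vb   # rescale, then accumulate
        lse = m_new + torch.log(torch.exp(lse - m_new) + p.sum(dim=-1))
        m = m_new
    return acc * torch.exp(m - lse)[:, None]        # final o_scale, as in the kernel

q, k, v = (torch.randn(64, 32) for _ in range(3))
ref = torch.softmax((q @ k.T) * 32 ** -0.5, dim=-1) @ v
print((streaming_attention(q, k, v) - ref).abs().max())  # ~1e-6
```
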
3
  import math
4
  import torch
5
  import triton_pre_mlir as triton
6
  import triton_pre_mlir.language as tl
7
 
8
+ @triton.heuristics({'EVEN_M': (lambda args: ((args['seqlen_q'] % args['BLOCK_M']) == 0)), 'EVEN_N': (lambda args: ((args['seqlen_k'] % args['BLOCK_N']) == 0)), 'EVEN_HEADDIM': (lambda args: (args['headdim'] == args['BLOCK_HEADDIM']))})
9
  @triton.jit
10
  def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_ob, stride_oh, stride_om, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
11
  start_m = tl.program_id(0)
12
  off_hb = tl.program_id(1)
13
+ off_b = (off_hb // nheads)
14
+ off_h = (off_hb % nheads)
15
+ offs_m = ((start_m * BLOCK_M) + tl.arange(0, BLOCK_M))
16
  offs_n = tl.arange(0, BLOCK_N)
17
  offs_d = tl.arange(0, BLOCK_HEADDIM)
18
+ q_ptrs = (((Q + (off_b * stride_qb)) + (off_h * stride_qh)) + ((offs_m[:, None] * stride_qm) + offs_d[None, :]))
19
+ k_ptrs = (((K + (off_b * stride_kb)) + (off_h * stride_kh)) + ((offs_n[:, None] * stride_kn) + offs_d[None, :]))
20
+ v_ptrs = (((V + (off_b * stride_vb)) + (off_h * stride_vh)) + ((offs_n[:, None] * stride_vn) + offs_d[None, :]))
21
+ if (BIAS_TYPE == 'vector'):
22
+ b_ptrs = (((Bias + (off_b * stride_bb)) + (off_h * stride_bh)) + offs_n)
23
+ elif (BIAS_TYPE == 'matrix'):
24
+ b_ptrs = (((Bias + (off_b * stride_bb)) + (off_h * stride_bh)) + ((offs_m[:, None] * stride_bm) + offs_n[None, :]))
25
+ t_ptrs = ((TMP + (off_hb * seqlen_q_rounded)) + offs_m)
26
+ lse_i = (tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf'))
27
+ m_i = (tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf'))
28
  acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
29
+ if (EVEN_M & EVEN_N):
30
  if EVEN_HEADDIM:
31
  q = tl.load(q_ptrs)
32
  else:
33
+ q = tl.load(q_ptrs, mask=(offs_d[None, :] < headdim), other=0.0)
34
  elif EVEN_HEADDIM:
35
+ q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q), other=0.0)
36
  else:
37
+ q = tl.load(q_ptrs, mask=((offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)), other=0.0)
38
+ end_n = (seqlen_k if (not IS_CAUSAL) else tl.minimum(((start_m + 1) * BLOCK_M), seqlen_k))
39
  for start_n in range(0, end_n, BLOCK_N):
40
  start_n = tl.multiple_of(start_n, BLOCK_N)
41
+ if (EVEN_N & EVEN_M):
42
  if EVEN_HEADDIM:
43
+ k = tl.load((k_ptrs + (start_n * stride_kn)))
44
  else:
45
+ k = tl.load((k_ptrs + (start_n * stride_kn)), mask=(offs_d[None, :] < headdim), other=0.0)
46
  elif EVEN_HEADDIM:
47
+ k = tl.load((k_ptrs + (start_n * stride_kn)), mask=((start_n + offs_n)[:, None] < seqlen_k), other=0.0)
48
  else:
49
+ k = tl.load((k_ptrs + (start_n * stride_kn)), mask=(((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim)), other=0.0)
50
  qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
51
  qk += tl.dot(q, k, trans_b=True)
52
+ if (not EVEN_N):
53
+ qk += tl.where(((start_n + offs_n)[None, :] < seqlen_k), 0, float('-inf'))
54
  if IS_CAUSAL:
55
+ qk += tl.where((offs_m[:, None] >= (start_n + offs_n)[None, :]), 0, float('-inf'))
56
+ if (BIAS_TYPE != 'none'):
57
+ if (BIAS_TYPE == 'vector'):
58
  if EVEN_N:
59
+ bias = tl.load((b_ptrs + start_n)).to(tl.float32)
60
  else:
61
+ bias = tl.load((b_ptrs + start_n), mask=((start_n + offs_n) < seqlen_k), other=0.0).to(tl.float32)
62
  bias = bias[None, :]
63
+ elif (BIAS_TYPE == 'matrix'):
64
+ if (EVEN_M & EVEN_N):
65
+ bias = tl.load((b_ptrs + start_n)).to(tl.float32)
66
  else:
67
+ bias = tl.load((b_ptrs + start_n), mask=((offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k)), other=0.0).to(tl.float32)
68
+ qk = ((qk * softmax_scale) + bias)
69
  m_ij = tl.maximum(tl.max(qk, 1), lse_i)
70
+ p = tl.exp((qk - m_ij[:, None]))
71
  else:
72
+ m_ij = tl.maximum((tl.max(qk, 1) * softmax_scale), lse_i)
73
+ p = tl.exp(((qk * softmax_scale) - m_ij[:, None]))
74
  l_ij = tl.sum(p, 1)
75
+ acc_o_scale = tl.exp((m_i - m_ij))
76
  tl.store(t_ptrs, acc_o_scale)
77
  acc_o_scale = tl.load(t_ptrs)
78
+ acc_o = (acc_o * acc_o_scale[:, None])
79
+ if (EVEN_N & EVEN_M):
80
  if EVEN_HEADDIM:
81
+ v = tl.load((v_ptrs + (start_n * stride_vn)))
82
  else:
83
+ v = tl.load((v_ptrs + (start_n * stride_vn)), mask=(offs_d[None, :] < headdim), other=0.0)
84
  elif EVEN_HEADDIM:
85
+ v = tl.load((v_ptrs + (start_n * stride_vn)), mask=((start_n + offs_n)[:, None] < seqlen_k), other=0.0)
86
  else:
87
+ v = tl.load((v_ptrs + (start_n * stride_vn)), mask=(((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim)), other=0.0)
88
  p = p.to(v.dtype)
89
  acc_o += tl.dot(p, v)
90
  m_i = m_ij
91
+ l_i_new = (tl.exp((lse_i - m_ij)) + l_ij)
92
+ lse_i = (m_ij + tl.log(l_i_new))
93
+ o_scale = tl.exp((m_i - lse_i))
94
  tl.store(t_ptrs, o_scale)
95
  o_scale = tl.load(t_ptrs)
96
+ acc_o = (acc_o * o_scale[:, None])
97
  start_m = tl.program_id(0)
98
+ offs_m = ((start_m * BLOCK_M) + tl.arange(0, BLOCK_M))
99
+ lse_ptrs = ((Lse + (off_hb * seqlen_q_rounded)) + offs_m)
100
  tl.store(lse_ptrs, lse_i)
101
  offs_d = tl.arange(0, BLOCK_HEADDIM)
102
+ out_ptrs = (((Out + (off_b * stride_ob)) + (off_h * stride_oh)) + ((offs_m[:, None] * stride_om) + offs_d[None, :]))
103
  if EVEN_M:
104
  if EVEN_HEADDIM:
105
  tl.store(out_ptrs, acc_o)
106
  else:
107
+ tl.store(out_ptrs, acc_o, mask=(offs_d[None, :] < headdim))
108
  elif EVEN_HEADDIM:
109
+ tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q))
110
  else:
111
+ tl.store(out_ptrs, acc_o, mask=((offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)))
112
 
113
  @triton.jit
114
  def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, stride_om, stride_dob, stride_doh, stride_dom, nheads, seqlen_q, seqlen_q_rounded, headdim, BLOCK_M: tl.constexpr, BLOCK_HEADDIM: tl.constexpr):
115
  start_m = tl.program_id(0)
116
  off_hb = tl.program_id(1)
117
+ off_b = (off_hb // nheads)
118
+ off_h = (off_hb % nheads)
119
+ offs_m = ((start_m * BLOCK_M) + tl.arange(0, BLOCK_M))
120
  offs_d = tl.arange(0, BLOCK_HEADDIM)
121
+ o = tl.load(((((Out + (off_b * stride_ob)) + (off_h * stride_oh)) + (offs_m[:, None] * stride_om)) + offs_d[None, :]), mask=((offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)), other=0.0).to(tl.float32)
122
+ do = tl.load(((((DO + (off_b * stride_dob)) + (off_h * stride_doh)) + (offs_m[:, None] * stride_dom)) + offs_d[None, :]), mask=((offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)), other=0.0).to(tl.float32)
123
+ delta = tl.sum((o * do), axis=1)
124
+ tl.store(((Delta + (off_hb * seqlen_q_rounded)) + offs_m), delta)
125
 
126
  @triton.jit
127
  def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr):
128
+ if (EVEN_N & EVEN_M):
129
  if EVEN_HEADDIM:
130
  tl.store(dv_ptrs, dv)
131
  tl.store(dk_ptrs, dk)
132
  else:
133
+ tl.store(dv_ptrs, dv, mask=(offs_d[None, :] < headdim))
134
+ tl.store(dk_ptrs, dk, mask=(offs_d[None, :] < headdim))
135
  elif EVEN_HEADDIM:
136
+ tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k))
137
+ tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k))
138
  else:
139
+ tl.store(dv_ptrs, dv, mask=((offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)))
140
+ tl.store(dk_ptrs, dk, mask=((offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)))
141
 
142
  @triton.jit
143
  def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD: tl.constexpr, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
144
+ begin_m = (0 if (not IS_CAUSAL) else (((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M))
145
+ offs_qm = (begin_m + tl.arange(0, BLOCK_M))
146
+ offs_n = ((start_n * BLOCK_N) + tl.arange(0, BLOCK_N))
147
  offs_m = tl.arange(0, BLOCK_M)
148
  offs_d = tl.arange(0, BLOCK_HEADDIM)
149
+ q_ptrs = (Q + ((offs_qm[:, None] * stride_qm) + offs_d[None, :]))
150
+ k_ptrs = (K + ((offs_n[:, None] * stride_kn) + offs_d[None, :]))
151
+ v_ptrs = (V + ((offs_n[:, None] * stride_vn) + offs_d[None, :]))
152
+ do_ptrs = (DO + ((offs_qm[:, None] * stride_dom) + offs_d[None, :]))
153
+ dq_ptrs = (DQ + ((offs_qm[:, None] * stride_dqm) + offs_d[None, :]))
154
+ if (BIAS_TYPE == 'vector'):
155
+ b_ptrs = (Bias + offs_n)
156
+ elif (BIAS_TYPE == 'matrix'):
157
+ b_ptrs = (Bias + ((offs_qm[:, None] * stride_bm) + offs_n[None, :]))
158
  dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
159
  dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
160
+ if (begin_m >= seqlen_q):
161
+ dv_ptrs = (DV + ((offs_n[:, None] * stride_dvn) + offs_d[None, :]))
162
+ dk_ptrs = (DK + ((offs_n[:, None] * stride_dkn) + offs_d[None, :]))
163
  _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
164
  return
165
+ if (EVEN_N & EVEN_M):
166
  if EVEN_HEADDIM:
167
  k = tl.load(k_ptrs)
168
  v = tl.load(v_ptrs)
169
  else:
170
+ k = tl.load(k_ptrs, mask=(offs_d[None, :] < headdim), other=0.0)
171
+ v = tl.load(v_ptrs, mask=(offs_d[None, :] < headdim), other=0.0)
172
  elif EVEN_HEADDIM:
173
+ k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k), other=0.0)
174
+ v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k), other=0.0)
175
  else:
176
+ k = tl.load(k_ptrs, mask=((offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)), other=0.0)
177
+ v = tl.load(v_ptrs, mask=((offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)), other=0.0)
178
  num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
179
+ for start_m in range(begin_m, (num_block_m * BLOCK_M), BLOCK_M):
180
  start_m = tl.multiple_of(start_m, BLOCK_M)
181
+ offs_m_curr = (start_m + offs_m)
182
+ if (EVEN_M & EVEN_HEADDIM):
183
  q = tl.load(q_ptrs)
184
  elif EVEN_HEADDIM:
185
+ q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q), other=0.0)
186
  else:
187
+ q = tl.load(q_ptrs, mask=((offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim)), other=0.0)
188
  qk = tl.dot(q, k, trans_b=True)
189
+ if (not EVEN_N):
190
+ qk = tl.where((offs_n[None, :] < seqlen_k), qk, float('-inf'))
191
  if IS_CAUSAL:
192
+ qk = tl.where((offs_m_curr[:, None] >= offs_n[None, :]), qk, float('-inf'))
193
+ if (BIAS_TYPE != 'none'):
194
  tl.debug_barrier()
195
+ if (BIAS_TYPE == 'vector'):
196
  if EVEN_N:
197
  bias = tl.load(b_ptrs).to(tl.float32)
198
  else:
199
+ bias = tl.load(b_ptrs, mask=(offs_n < seqlen_k), other=0.0).to(tl.float32)
200
  bias = bias[None, :]
201
+ elif (BIAS_TYPE == 'matrix'):
202
+ if (EVEN_M & EVEN_N):
203
  bias = tl.load(b_ptrs).to(tl.float32)
204
  else:
205
+ bias = tl.load(b_ptrs, mask=((offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k)), other=0.0).to(tl.float32)
206
+ qk = ((qk * softmax_scale) + bias)
207
+ if (not (EVEN_M & EVEN_HEADDIM)):
208
  tl.debug_barrier()
209
+ lse_i = tl.load((LSE + offs_m_curr))
210
+ if (BIAS_TYPE == 'none'):
211
+ p = tl.exp(((qk * softmax_scale) - lse_i[:, None]))
212
  else:
213
+ p = tl.exp((qk - lse_i[:, None]))
214
+ if (EVEN_M & EVEN_HEADDIM):
215
  do = tl.load(do_ptrs)
216
  else:
217
+ do = tl.load(do_ptrs, mask=((offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim)), other=0.0)
218
  dv += tl.dot(p.to(do.dtype), do, trans_a=True)
219
+ if (not (EVEN_M & EVEN_HEADDIM)):
220
  tl.debug_barrier()
221
  dp = tl.dot(do, v, trans_b=True)
222
+ if (not EVEN_HEADDIM):
223
  tl.debug_barrier()
224
+ Di = tl.load((D + offs_m_curr))
225
+ ds = ((p * (dp - Di[:, None])) * softmax_scale).to(q.dtype)
226
  dk += tl.dot(ds, q, trans_a=True)
227
+ if (not (EVEN_M & EVEN_HEADDIM)):
228
  tl.debug_barrier()
229
+ if (not ATOMIC_ADD):
230
+ if (EVEN_M & EVEN_HEADDIM):
231
  dq = tl.load(dq_ptrs, eviction_policy='evict_last')
232
  dq += tl.dot(ds, k)
233
  tl.store(dq_ptrs, dq, eviction_policy='evict_last')
234
  elif EVEN_HEADDIM:
235
+ dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q), other=0.0, eviction_policy='evict_last')
236
  dq += tl.dot(ds, k)
237
+ tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q), eviction_policy='evict_last')
238
  else:
239
+ dq = tl.load(dq_ptrs, mask=((offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim)), other=0.0, eviction_policy='evict_last')
240
  dq += tl.dot(ds, k)
241
+ tl.store(dq_ptrs, dq, mask=((offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim)), eviction_policy='evict_last')
242
  else:
243
  dq = tl.dot(ds, k)
244
+ if (EVEN_M & EVEN_HEADDIM):
245
  tl.atomic_add(dq_ptrs, dq)
246
  elif EVEN_HEADDIM:
247
+ tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q))
248
  else:
249
+ tl.atomic_add(dq_ptrs, dq, mask=((offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim)))
250
+ dq_ptrs += (BLOCK_M * stride_dqm)
251
+ q_ptrs += (BLOCK_M * stride_qm)
252
+ do_ptrs += (BLOCK_M * stride_dom)
253
+ if (BIAS_TYPE == 'matrix'):
254
+ b_ptrs += (BLOCK_M * stride_bm)
255
+ dv_ptrs = (DV + ((offs_n[:, None] * stride_dvn) + offs_d[None, :]))
256
+ dk_ptrs = (DK + ((offs_n[:, None] * stride_dkn) + offs_d[None, :]))
257
  _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
258
 
259
  def init_to_zero(name):
260
+ return (lambda nargs: nargs[name].zero_())
261
 
262
  @triton.autotune(configs=[triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ'))], key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'])
263
+ @triton.heuristics({'EVEN_M': (lambda args: ((args['seqlen_q'] % args['BLOCK_M']) == 0)), 'EVEN_N': (lambda args: ((args['seqlen_k'] % args['BLOCK_N']) == 0)), 'EVEN_HEADDIM': (lambda args: (args['headdim'] == args['BLOCK_HEADDIM']))})
264
  @triton.jit
265
  def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_dob, stride_doh, stride_dom, stride_dqb, stride_dqh, stride_dqm, stride_dkb, stride_dkh, stride_dkn, stride_dvb, stride_dvh, stride_dvn, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, SEQUENCE_PARALLEL: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
266
  off_hb = tl.program_id(1)
267
+ off_b = (off_hb // nheads)
268
+ off_h = (off_hb % nheads)
269
+ Q += ((off_b * stride_qb) + (off_h * stride_qh))
270
+ K += ((off_b * stride_kb) + (off_h * stride_kh))
271
+ V += ((off_b * stride_vb) + (off_h * stride_vh))
272
+ DO += ((off_b * stride_dob) + (off_h * stride_doh))
273
+ DQ += ((off_b * stride_dqb) + (off_h * stride_dqh))
274
+ DK += ((off_b * stride_dkb) + (off_h * stride_dkh))
275
+ DV += ((off_b * stride_dvb) + (off_h * stride_dvh))
276
+ if (BIAS_TYPE != 'none'):
277
+ Bias += ((off_b * stride_bb) + (off_h * stride_bh))
278
+ D += (off_hb * seqlen_q_rounded)
279
+ LSE += (off_hb * seqlen_q_rounded)
280
+ if (not SEQUENCE_PARALLEL):
281
  num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
282
  for start_n in range(0, num_block_n):
283
  _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=False, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
 
288
  def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
289
  (batch, seqlen_q, nheads, d) = q.shape
290
  (_, seqlen_k, _, _) = k.shape
291
+ assert (k.shape == (batch, seqlen_k, nheads, d))
292
+ assert (v.shape == (batch, seqlen_k, nheads, d))
293
+ assert (d <= 128), 'FlashAttention only support head dimensions up to 128'
294
+ assert (q.dtype == k.dtype == v.dtype), 'All tensors must have the same type'
295
+ assert (q.dtype in [torch.float16, torch.bfloat16]), 'Only support fp16 and bf16'
296
+ assert (q.is_cuda and k.is_cuda and v.is_cuda)
297
+ softmax_scale = (softmax_scale or (1.0 / math.sqrt(d)))
298
+ has_bias = (bias is not None)
299
  bias_type = 'none'
300
  if has_bias:
301
+ assert (bias.dtype in [q.dtype, torch.float])
302
  assert bias.is_cuda
303
+ assert (bias.dim() == 4)
304
+ if (bias.stride((- 1)) != 1):
305
  bias = bias.contiguous()
306
+ if (bias.shape[2:] == (1, seqlen_k)):
307
  bias_type = 'vector'
308
+ elif (bias.shape[2:] == (seqlen_q, seqlen_k)):
309
  bias_type = 'matrix'
310
  else:
311
  raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
312
  bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
313
+ bias_strides = ((bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0))
314
+ seqlen_q_rounded = (math.ceil((seqlen_q / 128)) * 128)
315
  lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
316
  tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
317
  o = torch.empty_like(q)
318
  BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
319
  BLOCK = 128
320
+ num_warps = (4 if (d <= 64) else 8)
321
+ grid = (lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), (batch * nheads)))
322
+ _fwd_kernel[grid](q, k, v, bias, o, lse, tmp, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, o.stride(0), o.stride(2), o.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, (seqlen_q // 32), (seqlen_k // 32), bias_type, causal, BLOCK_HEADDIM, BLOCK_M=BLOCK, BLOCK_N=BLOCK, num_warps=num_warps, num_stages=1)
323
  return (o, lse, softmax_scale)
324
 
325
  def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
326
+ if (do.stride((- 1)) != 1):
327
  do = do.contiguous()
328
  (batch, seqlen_q, nheads, d) = q.shape
329
  (_, seqlen_k, _, _) = k.shape
330
+ assert (d <= 128)
331
+ seqlen_q_rounded = (math.ceil((seqlen_q / 128)) * 128)
332
+ assert (lse.shape == (batch, nheads, seqlen_q_rounded))
333
+ assert (q.stride((- 1)) == k.stride((- 1)) == v.stride((- 1)) == o.stride((- 1)) == 1)
334
+ assert (dq.stride((- 1)) == dk.stride((- 1)) == dv.stride((- 1)) == 1)
335
+ softmax_scale = (softmax_scale or (1.0 / math.sqrt(d)))
336
  dq_accum = torch.empty_like(q, dtype=torch.float32)
337
  delta = torch.empty_like(lse)
338
  BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
339
+ grid = (lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), (batch * nheads)))
340
  _bwd_preprocess_do_o_dot[grid](o, do, delta, o.stride(0), o.stride(2), o.stride(1), do.stride(0), do.stride(2), do.stride(1), nheads, seqlen_q, seqlen_q_rounded, d, BLOCK_M=128, BLOCK_HEADDIM=BLOCK_HEADDIM)
341
+ has_bias = (bias is not None)
342
  bias_type = 'none'
343
  if has_bias:
344
+ assert (bias.dtype in [q.dtype, torch.float])
345
  assert bias.is_cuda
346
+ assert (bias.dim() == 4)
347
+ assert (bias.stride((- 1)) == 1)
348
+ if (bias.shape[2:] == (1, seqlen_k)):
349
  bias_type = 'vector'
350
+ elif (bias.shape[2:] == (seqlen_q, seqlen_k)):
351
  bias_type = 'matrix'
352
  else:
353
  raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
354
  bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
355
+ bias_strides = ((bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0))
356
+ grid = (lambda META: ((triton.cdiv(seqlen_k, META['BLOCK_N']) if META['SEQUENCE_PARALLEL'] else 1), (batch * nheads)))
357
+ _bwd_kernel[grid](q, k, v, bias, do, dq_accum, dk, dv, lse, delta, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, do.stride(0), do.stride(2), do.stride(1), dq_accum.stride(0), dq_accum.stride(2), dq_accum.stride(1), dk.stride(0), dk.stride(2), dk.stride(1), dv.stride(0), dv.stride(2), dv.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, (seqlen_q // 32), (seqlen_k // 32), bias_type, causal, BLOCK_HEADDIM)
358
  dq.copy_(dq_accum)
359
 
360
  class FlashAttnQKVPackedFunc(torch.autograd.Function):
361
 
362
  @staticmethod
363
  def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
364
+ '\n qkv: (batch, seqlen, 3, nheads, headdim)\n bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).\n For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).\n ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)\n '
365
+ if (qkv.stride((- 1)) != 1):
366
  qkv = qkv.contiguous()
367
  (o, lse, ctx.softmax_scale) = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
368
  ctx.save_for_backward(qkv, o, lse, bias)
 
372
  @staticmethod
373
  def backward(ctx, do):
374
  (qkv, o, lse, bias) = ctx.saved_tensors
375
+ assert (not ctx.needs_input_grad[1]), 'FlashAttention does not support bias gradient yet'
376
  with torch.inference_mode():
377
  dqkv = torch.empty_like(qkv)
378
  _flash_attn_backward(do, qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], o, lse, dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
 
383
 
384
  @staticmethod
385
  def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
386
+ '\n q: (batch, seqlen_q, nheads, headdim)\n kv: (batch, seqlen_k, 2, nheads, headdim)\n bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).\n For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).\n ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)\n '
387
+ (q, kv) = [(x if (x.stride((- 1)) == 1) else x.contiguous()) for x in [q, kv]]
388
  (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
389
  ctx.save_for_backward(q, kv, o, lse, bias)
390
  ctx.causal = causal
 
393
  @staticmethod
394
  def backward(ctx, do):
395
  (q, kv, o, lse, bias) = ctx.saved_tensors
396
+ if (len(ctx.needs_input_grad) >= 3):
397
+ assert (not ctx.needs_input_grad[2]), 'FlashAttention does not support bias gradient yet'
398
  with torch.inference_mode():
399
  dq = torch.empty_like(q)
400
  dkv = torch.empty_like(kv)
 
406
 
407
  @staticmethod
408
  def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
409
+ '\n q: (batch_size, seqlen_q, nheads, headdim)\n k, v: (batch_size, seqlen_k, nheads, headdim)\n bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).\n For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).\n ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)\n '
410
+ (q, k, v) = [(x if (x.stride((- 1)) == 1) else x.contiguous()) for x in [q, k, v]]
411
  (o, lse, ctx.softmax_scale) = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
412
  ctx.save_for_backward(q, k, v, o, lse, bias)
413
  ctx.causal = causal
 
416
  @staticmethod
417
  def backward(ctx, do):
418
  (q, k, v, o, lse, bias) = ctx.saved_tensors
419
+ assert (not ctx.needs_input_grad[3]), 'FlashAttention does not support bias gradient yet'
420
  with torch.inference_mode():
421
  dq = torch.empty_like(q)
422
  dk = torch.empty_like(k)
423
  dv = torch.empty_like(v)
424
  _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
425
  return (dq, dk, dv, None, None, None)
426
+ flash_attn_func = FlashAttnFunc.apply
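
The module ends by exposing `FlashAttnFunc.apply` as `flash_attn_func`. Gradients flow back to `q`, `k` and `v` through `_flash_attn_backward`, but the backward pass asserts that no gradient is requested for the bias. A hedged end-to-end sketch (assumed import path; CUDA and fp16 required):

```python
import torch
from flash_attn_triton import flash_attn_func  # assumed import path

batch, seqlen, nheads, headdim = 2, 256, 8, 64
q, k, v = (torch.randn(batch, seqlen, nheads, headdim, device='cuda',
                       dtype=torch.float16, requires_grad=True) for _ in range(3))
# A bias is allowed, but it must not require grad (no bias gradient support yet).
bias = torch.zeros(1, nheads, 1, seqlen, device='cuda', dtype=torch.float16)

out = flash_attn_func(q, k, v, bias, True, None)   # causal self-attention
out.float().sum().backward()                       # dq, dk, dv via _flash_attn_backward
print(q.grad.shape, k.grad.shape, v.grad.shape)
```
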
generation_config.json CHANGED
@@ -1,5 +1,7 @@
1
  {
2
  "_from_model_config": true,
3
  "transformers_version": "4.28.1",
4
- "use_cache": false
5
  }
 
1
  {
2
  "_from_model_config": true,
3
  "transformers_version": "4.28.1",
4
+ "eos_token_id": 0,
5
+ "pad_token_id": 0,
6
+ "use_cache": true
7
  }
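
With `use_cache` enabled and EOS/pad both mapped to token id 0, generation can rely on the key-value cache by default. A hedged sketch of the standard `transformers` path; the checkpoint id is a placeholder, `trust_remote_code=True` is assumed because the modelling code lives in this repo, and the token ids are passed explicitly only to mirror this config file.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "path/to/this-checkpoint"  # placeholder for the local or Hub model id
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(
    name, torch_dtype=torch.bfloat16, trust_remote_code=True
).to("cuda")

inputs = tokenizer("It was a dark and stormy night", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64, use_cache=True,
                     eos_token_id=0, pad_token_id=0)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```
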
hf_prefixlm_converter.py CHANGED
@@ -1,11 +1,5 @@
1
- """Converts Huggingface Causal LM to Prefix LM.
2
 
3
- Conversion does lightweight surgery on a HuggingFace
4
- Causal LM to convert it to a Prefix LM.
5
-
6
- Prefix LMs accepts a `bidirectional_mask` input in `forward`
7
- and treat the input prompt as the prefix in `generate`.
8
- """
9
  import math
10
  import warnings
11
  from types import MethodType
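
The removed module docstring summarises the contract: a converted model accepts a `bidirectional_mask` in `forward` (1 over the prefix/prompt tokens, 0 elsewhere) and treats the prompt as the prefix in `generate`. A hedged sketch of building that mask for GPT-2, assuming this file is importable as `hf_prefixlm_converter` and using the `convert_hf_causal_lm_to_prefix_lm` entry point referenced in the docstrings in this file; the prefix split is illustrative.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from hf_prefixlm_converter import convert_hf_causal_lm_to_prefix_lm  # assumed import path

model = convert_hf_causal_lm_to_prefix_lm(GPT2LMHeadModel.from_pretrained("gpt2"))
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

text = "Translate to French: Hello world. =>"
input_ids = tokenizer(text, return_tensors="pt").input_ids
prefix_len = input_ids.shape[1] - 2                 # pretend the last 2 tokens are the target
bidirectional_mask = torch.zeros_like(input_ids)
bidirectional_mask[:, :prefix_len] = 1              # 1 over the prefix, 0 over the continuation

out = model(input_ids=input_ids, bidirectional_mask=bidirectional_mask)
print(out.logits.shape)
```
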
@@ -24,31 +18,17 @@ from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_op
24
  from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt
25
  logger = logging.get_logger(__name__)
26
  _SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)
27
- CAUSAL_GPT_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM]
28
 
29
  def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
30
- """Converts a GPT-style Causal LM to a Prefix LM.
31
-
32
- Supported HuggingFace model classes:
33
- - `GPT2LMHeadModel`
34
- - `GPTNeoForCausalLM`
35
- - `GPTNeoXForCausalLM`
36
- - `GPTJForCausalLM`
37
-
38
- See `convert_hf_causal_lm_to_prefix_lm` for more details.
39
- """
40
  if hasattr(model, '_prefix_lm_converted'):
41
  return model
42
  assert isinstance(model, _SUPPORTED_GPT_MODELS)
43
- assert model.config.add_cross_attention == False, 'Only supports GPT-style decoder-only models'
44
 
45
  def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
46
- """Helper that gets a list of the model's attention modules.
47
-
48
- Each module has a `bias` buffer used for causal masking. The Prefix LM
49
- conversion adds logic to dynamically manipulate these biases to support
50
- Prefix LM attention masking.
51
- """
52
  attn_modules = []
53
  if isinstance(model, GPTNeoXForCausalLM):
54
  blocks = model.gpt_neox.layers
@@ -56,7 +36,7 @@ def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_T
56
  blocks = model.transformer.h
57
  for block in blocks:
58
  if isinstance(model, GPTNeoForCausalLM):
59
- if block.attn.attention_type != 'global':
60
  continue
61
  attn_module = block.attn.attention
62
  elif isinstance(model, GPTNeoXForCausalLM):
@@ -69,41 +49,41 @@ def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_T
69
  setattr(model, '_original_generate', getattr(model, 'generate'))
70
 
71
  def forward(self: CAUSAL_GPT_TYPES, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]]=None, attention_mask: Optional[torch.FloatTensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, token_type_ids: Optional[torch.LongTensor]=None, position_ids: Optional[torch.LongTensor]=None, head_mask: Optional[torch.FloatTensor]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
72
- """Wraps original forward to enable PrefixLM attention."""
73
 
74
  def call_og_forward():
75
  if isinstance(self, GPTNeoXForCausalLM):
76
  return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
77
  else:
78
  return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
79
- if bidirectional_mask is None:
80
  return call_og_forward()
81
  assert isinstance(bidirectional_mask, torch.Tensor)
82
  attn_modules = _get_attn_modules(model)
83
  (b, s) = bidirectional_mask.shape
84
- max_length = attn_modules[0].bias.shape[-1]
85
- if s > max_length:
86
- raise ValueError(f'bidirectional_mask sequence length (={s}) exceeds the ' + f'max length allowed by the model ({max_length}).')
87
- assert s <= max_length
88
- if s < max_length:
89
- pad = torch.zeros((int(b), int(max_length - s)), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device)
90
  bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1)
91
  bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1)
92
  for attn_module in attn_modules:
93
  attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional)
94
  output = call_og_forward()
95
  for attn_module in attn_modules:
96
- attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
97
  return output
98
 
99
- def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]):
100
- """Wraps original generate to enable PrefixLM attention."""
101
  attn_modules = _get_attn_modules(model)
102
  for attn_module in attn_modules:
103
  attn_module.bias.data[:] = 1
104
  output = self._original_generate(*args, **kwargs)
105
  for attn_module in attn_modules:
106
- attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
107
  return output
108
  setattr(model, 'forward', MethodType(forward, model))
109
  setattr(model, 'generate', MethodType(generate, model))
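
The wrapped `forward` above widens each attention module's causal `bias` buffer with `torch.logical_or` and restores the lower-triangular mask afterwards. A small standalone illustration of the resulting prefix-LM pattern (pure tensor manipulation, simplified to 2-D; no model involved):

```python
import torch

seq_len, prefix_len = 6, 3
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

# bidirectional_mask: 1 over the prefix positions, 0 afterwards (shape (1, seq_len) here).
bidirectional = torch.zeros(1, seq_len, dtype=torch.bool)
bidirectional[:, :prefix_len] = True

# Same update as the wrapper: prefix tokens attend to each other in both directions,
# while the continuation stays strictly causal.
prefix_lm_mask = torch.logical_or(causal, bidirectional)
print(prefix_lm_mask.int())
```
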
@@ -111,85 +91,79 @@ def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_T
111
  return model
112
 
113
  def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM:
114
- """Converts a BLOOM Causal LM to a Prefix LM.
115
-
116
- Supported HuggingFace model classes:
117
- - `BloomForCausalLM`
118
-
119
- See `convert_hf_causal_lm_to_prefix_lm` for more details.
120
- """
121
  if hasattr(model, '_prefix_lm_converted'):
122
  return model
123
  assert isinstance(model, BloomForCausalLM)
124
- assert model.config.add_cross_attention == False, 'Only supports BLOOM decoder-only models'
125
 
126
- def _prepare_attn_mask(self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[int, int], past_key_values_length: int) -> torch.BoolTensor:
127
  combined_attention_mask = None
128
  device = attention_mask.device
129
  (_, src_length) = input_shape
130
- if src_length > 1:
131
  combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length)
132
- if bidirectional_mask is not None:
133
- assert attention_mask.shape == bidirectional_mask.shape
134
  expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length)
135
  combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask)
136
  expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length)
137
- combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
138
  return combined_attention_mask
139
 
140
  def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
141
  num_heads = self.config.n_head
142
- closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
143
- base = torch.tensor(2 ** (-2 ** (-(math.log2(closest_power_of_2) - 3))), device=device, dtype=torch.float32)
144
- powers = torch.arange(1, 1 + closest_power_of_2, device=device, dtype=torch.int32)
145
  slopes = torch.pow(base, powers)
146
- if closest_power_of_2 != num_heads:
147
- extra_base = torch.tensor(2 ** (-2 ** (-(math.log2(2 * closest_power_of_2) - 3))), device=device, dtype=torch.float32)
148
- num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
149
- extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32)
150
  slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
151
- qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1)
152
- ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1)
153
- diffs = qa - ka + key_length - query_length
154
- diffs = -diffs.abs()
155
- alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length)
156
- alibi = alibi.expand(batch_size, -1, -1, -1).reshape(-1, query_length, key_length)
157
  return alibi.to(dtype)
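
`_build_alibi_tensor` derives the per-head ALiBi slopes from the nearest power of two at or below `num_heads`, filling any remaining heads from a second, interleaved geometric series. A standalone sketch of just the slope computation (CPU, no model state), mirroring the formula above:

```python
import math
import torch

def alibi_slopes(num_heads: int) -> torch.Tensor:
    # Same construction as _build_alibi_tensor, restricted to the slopes.
    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
    base = torch.tensor(2 ** (-2 ** (-(math.log2(closest_power_of_2) - 3))))
    slopes = torch.pow(base, torch.arange(1, 1 + closest_power_of_2))
    if closest_power_of_2 != num_heads:
        extra_base = torch.tensor(2 ** (-2 ** (-(math.log2(2 * closest_power_of_2) - 3))))
        num_remaining = min(closest_power_of_2, num_heads - closest_power_of_2)
        extra_powers = torch.arange(1, 1 + 2 * num_remaining, 2)
        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)])
    return slopes

print(alibi_slopes(8))    # 1/2, 1/4, ..., 1/256
print(alibi_slopes(12))   # 8 base slopes plus 4 interleaved extras
```
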
158
- KeyValueT = Tuple[torch.Tensor, torch.Tensor]
159
 
160
- def forward(self: BloomModel, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.LongTensor]=None, inputs_embeds: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
161
- if deprecated_arguments.pop('position_ids', False) is not False:
162
- warnings.warn('`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. ' + 'You can safely ignore passing `position_ids`.', FutureWarning)
163
- if len(deprecated_arguments) > 0:
164
  raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
165
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
166
- output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
167
- use_cache = use_cache if use_cache is not None else self.config.use_cache
168
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
169
- if input_ids is not None and inputs_embeds is not None:
170
  raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
171
- elif input_ids is not None:
172
  (batch_size, seq_length) = input_ids.shape
173
- elif inputs_embeds is not None:
174
  (batch_size, seq_length, _) = inputs_embeds.shape
175
  else:
176
  raise ValueError('You have to specify either input_ids or inputs_embeds')
177
- if past_key_values is None:
178
- past_key_values = tuple([None] * len(self.h))
179
  head_mask = self.get_head_mask(head_mask, self.config.n_layer)
180
- if inputs_embeds is None:
181
  inputs_embeds = self.word_embeddings(input_ids)
182
  hidden_states = self.word_embeddings_layernorm(inputs_embeds)
183
- presents = () if use_cache else None
184
- all_self_attentions = () if output_attentions else None
185
- all_hidden_states = () if output_hidden_states else None
186
  seq_length_with_past = seq_length
187
  past_key_values_length = 0
188
- if past_key_values[0] is not None:
189
  tmp = past_key_values[0][0]
190
  past_key_values_length = tmp.shape[2]
191
- seq_length_with_past = seq_length_with_past + past_key_values_length
192
- if attention_mask is None:
193
  attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
194
  else:
195
  attention_mask = attention_mask.to(hidden_states.device)
@@ -198,8 +172,8 @@ def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCa
198
  for (i, (block, layer_past)) in enumerate(zip(self.h, past_key_values)):
199
  if output_hidden_states:
200
  hst = (hidden_states,)
201
- all_hidden_states = all_hidden_states + hst
202
- if self.gradient_checkpointing and self.training:
203
  if use_cache:
204
  logger.warning('`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...')
205
  use_cache = False
@@ -213,50 +187,50 @@ def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCa
213
  else:
214
  outputs = block(hidden_states, layer_past=layer_past, attention_mask=causal_mask, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, alibi=alibi)
215
  hidden_states = outputs[0]
216
- if use_cache is True:
217
- presents = presents + (outputs[1],)
218
  if output_attentions:
219
- oa = (outputs[2 if use_cache else 1],)
220
- all_self_attentions = all_self_attentions + oa
221
  hidden_states = self.ln_f(hidden_states)
222
  if output_hidden_states:
223
  hst = (hidden_states,)
224
- all_hidden_states = all_hidden_states + hst
225
- if not return_dict:
226
- return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None))
227
  return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions)
228
  setattr(model.transformer, '_prepare_attn_mask', MethodType(_prepare_attn_mask, model.transformer))
229
  setattr(model.transformer, '_build_alibi_tensor', MethodType(_build_alibi_tensor, model.transformer))
230
  setattr(model.transformer, 'forward', MethodType(forward, model.transformer))
231
- KeyValueT = Tuple[torch.Tensor, torch.Tensor]
232
 
233
- def forward(self: BloomForCausalLM, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.Tensor]=None, inputs_embeds: Optional[torch.Tensor]=None, labels: Optional[torch.Tensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
234
- """Replacement forward method for BloomCausalLM."""
235
- if deprecated_arguments.pop('position_ids', False) is not False:
236
- warnings.warn('`position_ids` have no functionality in BLOOM and will be removed ' + 'in v5.0.0. You can safely ignore passing `position_ids`.', FutureWarning)
237
- if len(deprecated_arguments) > 0:
238
  raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
239
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
240
  transformer_outputs = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask, bidirectional_mask=bidirectional_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
241
  hidden_states = transformer_outputs[0]
242
  lm_logits = self.lm_head(hidden_states)
243
  loss = None
244
- if labels is not None:
245
- shift_logits = lm_logits[..., :-1, :].contiguous()
246
  shift_labels = labels[..., 1:].contiguous()
247
  (batch_size, seq_length, vocab_size) = shift_logits.shape
248
  loss_fct = CrossEntropyLoss()
249
- loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length))
250
- if not return_dict:
251
- output = (lm_logits,) + transformer_outputs[1:]
252
- return (loss,) + output if loss is not None else output
253
  return CausalLMOutputWithCrossAttentions(loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions)
254
 
255
  def prepare_inputs_for_generation(self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, **kwargs) -> dict:
256
  if past:
257
- input_ids = input_ids[:, -1].unsqueeze(-1)
258
  bidirectional_mask = None
259
- if past[0][0].shape[0] == input_ids.shape[0]:
260
  past = self._convert_to_bloom_cache(past)
261
  else:
262
  bidirectional_mask = torch.ones_like(input_ids)
@@ -267,36 +241,30 @@ def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCa
267
  return model
268
 
269
  def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM:
270
- """Converts an OPT Causal LM to a Prefix LM.
271
-
272
- Supported HuggingFace model classes:
273
- - `OPTForCausalLM`
274
-
275
- See `convert_hf_causal_lm_to_prefix_lm` for more details.
276
- """
277
  if hasattr(model, '_prefix_lm_converted'):
278
  return model
279
  assert isinstance(model, OPTForCausalLM)
280
- assert model.config.add_cross_attention == False, 'Only supports OPT decoder-only models'
281
  setattr(model, '_original_forward', getattr(model, 'forward'))
282
  setattr(model, '_original_generate', getattr(model, 'generate'))
283
  model.model.decoder.bidirectional_mask = None
284
 
285
  def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
286
  combined_attention_mask = None
287
- if input_shape[-1] > 1:
288
- if self.bidirectional_mask == 'g':
289
  (bsz, src_length) = input_shape
290
- combined_attention_mask = torch.zeros((bsz, 1, src_length, src_length + past_key_values_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
291
  else:
292
  combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to(inputs_embeds.device)
293
- if self.bidirectional_mask is not None:
294
- assert attention_mask.shape == self.bidirectional_mask.shape
295
- expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
296
  combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask)
297
- if attention_mask is not None:
298
- expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
299
- combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
300
  return combined_attention_mask
301
  setattr(model.model.decoder, '_prepare_decoder_attention_mask', MethodType(_prepare_decoder_attention_mask, model.model.decoder))
302
 
@@ -304,7 +272,7 @@ def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM
304
 
305
  def call_og_forward():
306
  return self._original_forward(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
307
- if bidirectional_mask is None:
308
  return call_og_forward()
309
  self.model.decoder.bidirectional_mask = bidirectional_mask
310
  try:
@@ -315,8 +283,8 @@ def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM
315
  self.model.decoder.bidirectional_mask = None
316
  return outputs
317
 
318
- def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]):
319
- """Wraps original generate to enable PrefixLM-style attention."""
320
  self.model.decoder.bidirectional_mask = 'g'
321
  try:
322
  output = self._original_generate(*args, **kwargs)
@@ -329,66 +297,11 @@ def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM
329
  setattr(model, 'generate', MethodType(generate, model))
330
  setattr(model, '_prefix_lm_converted', True)
331
  return model
332
- _SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM)
333
- CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM]
334
 
335
  def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
336
- """Converts a HuggingFace Causal LM to a Prefix LM.
337
-
338
- Supported HuggingFace model classes:
339
- - `GPT2LMHeadModel`
340
- - `GPTNeoForCausalLM`
341
- - `GPTNeoXForCausalLM`
342
- - `GPTJForCausalLM`
343
- - `BloomForCausalLM`
344
- - `OPTForCausalLM`
345
-
346
- Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
347
- `generate` method and/or select underlying methods depending on the model class.
348
-
349
- These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".
350
-
351
- Notes on training:
352
- To actually train the converted model as a Prefix LM, training batches will need to indicate
353
- the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.
354
-
355
- **This is not a standard input and requires custom layers either within or after your dataloader.**
356
-
357
- In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`
358
- such that `batch['labels'][batch['bidirectional_mask'] == 1] == -100`.
359
- That is, the prefix portion of the sequence should not generate any loss. Loss should only be
360
- generated by the target portion of the sequence.
361
-
362
- Notes on `GPTNeoForCausalLM`:
363
- To simplify the implementation, "global" and "local" attention layers are handled differently.
364
- For "global" layers, we handle conversion as described above. For "local" layers, which use a
365
- causal attention mask within a restricted local window, we do not alter the masking.
366
-
367
- Notes on `forward` method conversion:
368
- After conversion, the `forward` method will handle a new input, `bidirectional_mask`,
369
- which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions
370
- belonging to the prefix (prefix tokens can attend to one another bidirectionally), and
371
- 0 indicates token positions belonging to the target.
372
-
373
- The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing
374
- causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset
375
- the causal masks before returning the result.
376
-
377
- Notes on `generate` method conversion:
378
- After conversion, the `generate` method will have the same signature but will internally
379
- convert all causal masks to be purely bidirectional, call the original `generate` method, and
380
- (where appropriate) reset the causal masks before returning the result.
381
-
382
- This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token
383
- "prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates
384
- each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one
385
- another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and
386
- previously-generated tokens (also as expected in a Prefix LM).
387
-
388
- To preserve the API, the original methods are renamed to `_original_forward` and
389
- `_original_generate`, and replaced with new `forward` and `generate` methods that wrap
390
- them, respectively. Although implementation details vary by model class.
391
- """
392
  if isinstance(model, _SUPPORTED_GPT_MODELS):
393
  return _convert_gpt_causal_lm_to_prefix_lm(model)
394
  elif isinstance(model, BloomForCausalLM):
@@ -396,20 +309,17 @@ def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES
396
  elif isinstance(model, OPTForCausalLM):
397
  return _convert_opt_causal_lm_to_prefix_lm(model)
398
  else:
399
- raise TypeError(f'Cannot convert model to Prefix LM. ' + f'Model does not belong to set of supported HF models:' + f'\n{_SUPPORTED_HF_MODELS}')
400
-
401
- def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
402
- """Attempts to add bidirectional_mask to batch if missing.
403
 
404
- Raises:
405
- KeyError if bidirectional_mask is missing and can't be inferred
406
- """
407
- if 'bidirectional_mask' not in batch:
408
- if batch.get('mode', None) == 'icl_task':
409
  batch['bidirectional_mask'] = batch['attention_mask'].clone()
410
  for (i, continuation_indices) in enumerate(batch['continuation_indices']):
411
- batch['bidirectional_mask'][i, continuation_indices] = 0
412
- elif 'labels' in batch and 'attention_mask' in batch:
413
- batch['bidirectional_mask'] = torch.logical_and(torch.eq(batch['attention_mask'], 1), torch.eq(batch['labels'], -100)).type_as(batch['attention_mask'])
414
  else:
415
- raise KeyError('No bidirectional_mask in batch and not sure how to construct one.')
 
 
1
 
2
+ 'Converts Huggingface Causal LM to Prefix LM.\n\nConversion does lightweight surgery on a HuggingFace\nCausal LM to convert it to a Prefix LM.\n\nPrefix LMs accepts a `bidirectional_mask` input in `forward`\nand treat the input prompt as the prefix in `generate`.\n'
 
 
 
 
 
3
  import math
4
  import warnings
5
  from types import MethodType
 
18
  from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt
19
  logger = logging.get_logger(__name__)
20
  _SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)
21
+ CAUSAL_GPT_TYPES = Union[(GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)]
22
 
23
  def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
24
+ 'Converts a GPT-style Causal LM to a Prefix LM.\n\n Supported HuggingFace model classes:\n - `GPT2LMHeadModel`\n - `GPTNeoForCausalLM`\n - `GPTNeoXForCausalLM`\n - `GPTJForCausalLM`\n\n See `convert_hf_causal_lm_to_prefix_lm` for more details.\n '
 
 
 
 
 
 
 
 
 
25
  if hasattr(model, '_prefix_lm_converted'):
26
  return model
27
  assert isinstance(model, _SUPPORTED_GPT_MODELS)
28
+ assert (model.config.add_cross_attention == False), 'Only supports GPT-style decoder-only models'
29
 
30
  def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
31
+ "Helper that gets a list of the model's attention modules.\n\n Each module has a `bias` buffer used for causal masking. The Prefix LM\n conversion adds logic to dynamically manipulate these biases to support\n Prefix LM attention masking.\n "
 
 
 
 
 
32
  attn_modules = []
33
  if isinstance(model, GPTNeoXForCausalLM):
34
  blocks = model.gpt_neox.layers
 
36
  blocks = model.transformer.h
37
  for block in blocks:
38
  if isinstance(model, GPTNeoForCausalLM):
39
+ if (block.attn.attention_type != 'global'):
40
  continue
41
  attn_module = block.attn.attention
42
  elif isinstance(model, GPTNeoXForCausalLM):
 
49
  setattr(model, '_original_generate', getattr(model, 'generate'))
50
 
51
  def forward(self: CAUSAL_GPT_TYPES, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]]=None, attention_mask: Optional[torch.FloatTensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, token_type_ids: Optional[torch.LongTensor]=None, position_ids: Optional[torch.LongTensor]=None, head_mask: Optional[torch.FloatTensor]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
52
+ 'Wraps original forward to enable PrefixLM attention.'
53
 
54
  def call_og_forward():
55
  if isinstance(self, GPTNeoXForCausalLM):
56
  return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
57
  else:
58
  return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
59
+ if (bidirectional_mask is None):
60
  return call_og_forward()
61
  assert isinstance(bidirectional_mask, torch.Tensor)
62
  attn_modules = _get_attn_modules(model)
63
  (b, s) = bidirectional_mask.shape
64
+ max_length = attn_modules[0].bias.shape[(- 1)]
65
+ if (s > max_length):
66
+ raise ValueError((f'bidirectional_mask sequence length (={s}) exceeds the ' + f'max length allowed by the model ({max_length}).'))
67
+ assert (s <= max_length)
68
+ if (s < max_length):
69
+ pad = torch.zeros((int(b), int((max_length - s))), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device)
70
  bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1)
71
  bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1)
72
  for attn_module in attn_modules:
73
  attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional)
74
  output = call_og_forward()
75
  for attn_module in attn_modules:
76
+ attn_module.bias.data = torch.tril(attn_module.bias.data[(0, 0)])[(None, None)]
77
  return output
78
 
79
+ def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[(str, Any)]):
80
+ 'Wraps original generate to enable PrefixLM attention.'
81
  attn_modules = _get_attn_modules(model)
82
  for attn_module in attn_modules:
83
  attn_module.bias.data[:] = 1
84
  output = self._original_generate(*args, **kwargs)
85
  for attn_module in attn_modules:
86
+ attn_module.bias.data = torch.tril(attn_module.bias.data[(0, 0)])[(None, None)]
87
  return output
88
  setattr(model, 'forward', MethodType(forward, model))
89
  setattr(model, 'generate', MethodType(generate, model))
 
91
  return model
92
 
93
  def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM:
94
+ 'Converts a BLOOM Causal LM to a Prefix LM.\n\n Supported HuggingFace model classes:\n - `BloomForCausalLM`\n\n See `convert_hf_causal_lm_to_prefix_lm` for more details.\n '
 
 
 
 
 
 
95
  if hasattr(model, '_prefix_lm_converted'):
96
  return model
97
  assert isinstance(model, BloomForCausalLM)
98
+ assert (model.config.add_cross_attention == False), 'Only supports BLOOM decoder-only models'
99
 
100
+ def _prepare_attn_mask(self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[(int, int)], past_key_values_length: int) -> torch.BoolTensor:
101
  combined_attention_mask = None
102
  device = attention_mask.device
103
  (_, src_length) = input_shape
104
+ if (src_length > 1):
105
  combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length)
106
+ if (bidirectional_mask is not None):
107
+ assert (attention_mask.shape == bidirectional_mask.shape)
108
  expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length)
109
  combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask)
110
  expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length)
111
+ combined_attention_mask = (expanded_attn_mask if (combined_attention_mask is None) else (expanded_attn_mask | combined_attention_mask))
112
  return combined_attention_mask
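A standalone illustration of how the causal and bidirectional masks combine (plain tensors only, assuming the boolean convention above where `True` means "masked out"); a 4-token sequence whose first two tokens form the prefix:

```python
import torch

seq_len = 4
# causal mask: True above the diagonal, i.e. future key positions are masked
causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
bidirectional_mask = torch.tensor([1, 1, 0, 0], dtype=torch.bool)  # 1 = prefix token
# a key position can stay masked only if it is NOT a prefix token
combined = torch.logical_and(causal, ~bidirectional_mask.expand(seq_len, seq_len))
print(combined.long())
# tensor([[0, 0, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]])
```

Prefix columns end up unmasked for every query, so prefix tokens attend to one another bidirectionally while target tokens remain strictly causal.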
113
 
114
  def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
115
  num_heads = self.config.n_head
116
+ closest_power_of_2 = (2 ** math.floor(math.log2(num_heads)))
117
+ base = torch.tensor((2 ** (- (2 ** (- (math.log2(closest_power_of_2) - 3))))), device=device, dtype=torch.float32)
118
+ powers = torch.arange(1, (1 + closest_power_of_2), device=device, dtype=torch.int32)
119
  slopes = torch.pow(base, powers)
120
+ if (closest_power_of_2 != num_heads):
121
+ extra_base = torch.tensor((2 ** (- (2 ** (- (math.log2((2 * closest_power_of_2)) - 3))))), device=device, dtype=torch.float32)
122
+ num_remaining_heads = min(closest_power_of_2, (num_heads - closest_power_of_2))
123
+ extra_powers = torch.arange(1, (1 + (2 * num_remaining_heads)), 2, device=device, dtype=torch.int32)
124
  slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
125
+ qa = torch.arange(query_length, device=device, dtype=torch.int32).view((- 1), 1)
126
+ ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, (- 1))
127
+ diffs = (((qa - ka) + key_length) - query_length)
128
+ diffs = (- diffs.abs())
129
+ alibi = (slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length))
130
+ alibi = alibi.expand(batch_size, (- 1), (- 1), (- 1)).reshape((- 1), query_length, key_length)
131
  return alibi.to(dtype)
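A quick numeric check of the ALiBi slope schedule above (illustrative, 8 heads; with a power-of-two head count the extra-head branch is skipped):

```python
import math
import torch

num_heads = 8
closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))    # 8
base = 2 ** (-(2 ** (-(math.log2(closest_power_of_2) - 3))))  # 0.5
slopes = torch.pow(torch.tensor(base), torch.arange(1, 1 + closest_power_of_2))
print(slopes)
# tensor([0.5000, 0.2500, 0.1250, 0.0625, 0.0312, 0.0156, 0.0078, 0.0039])
```

Each head's slope then scales the `-|query - key|` relative-distance term built by `diffs` above.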
132
+ KeyValueT = Tuple[(torch.Tensor, torch.Tensor)]
133
 
134
+ def forward(self: BloomModel, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[(KeyValueT, ...)]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.LongTensor]=None, inputs_embeds: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[(Tuple[(torch.Tensor, ...)], BaseModelOutputWithPastAndCrossAttentions)]:
135
+ if (deprecated_arguments.pop('position_ids', False) is not False):
136
+ warnings.warn(('`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. ' + 'You can safely ignore passing `position_ids`.'), FutureWarning)
137
+ if (len(deprecated_arguments) > 0):
138
  raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
139
+ output_attentions = (output_attentions if (output_attentions is not None) else self.config.output_attentions)
140
+ output_hidden_states = (output_hidden_states if (output_hidden_states is not None) else self.config.output_hidden_states)
141
+ use_cache = (use_cache if (use_cache is not None) else self.config.use_cache)
142
+ return_dict = (return_dict if (return_dict is not None) else self.config.use_return_dict)
143
+ if ((input_ids is not None) and (inputs_embeds is not None)):
144
  raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
145
+ elif (input_ids is not None):
146
  (batch_size, seq_length) = input_ids.shape
147
+ elif (inputs_embeds is not None):
148
  (batch_size, seq_length, _) = inputs_embeds.shape
149
  else:
150
  raise ValueError('You have to specify either input_ids or inputs_embeds')
151
+ if (past_key_values is None):
152
+ past_key_values = tuple(([None] * len(self.h)))
153
  head_mask = self.get_head_mask(head_mask, self.config.n_layer)
154
+ if (inputs_embeds is None):
155
  inputs_embeds = self.word_embeddings(input_ids)
156
  hidden_states = self.word_embeddings_layernorm(inputs_embeds)
157
+ presents = (() if use_cache else None)
158
+ all_self_attentions = (() if output_attentions else None)
159
+ all_hidden_states = (() if output_hidden_states else None)
160
  seq_length_with_past = seq_length
161
  past_key_values_length = 0
162
+ if (past_key_values[0] is not None):
163
  tmp = past_key_values[0][0]
164
  past_key_values_length = tmp.shape[2]
165
+ seq_length_with_past = (seq_length_with_past + past_key_values_length)
166
+ if (attention_mask is None):
167
  attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
168
  else:
169
  attention_mask = attention_mask.to(hidden_states.device)
 
172
  for (i, (block, layer_past)) in enumerate(zip(self.h, past_key_values)):
173
  if output_hidden_states:
174
  hst = (hidden_states,)
175
+ all_hidden_states = (all_hidden_states + hst)
176
+ if (self.gradient_checkpointing and self.training):
177
  if use_cache:
178
  logger.warning('`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...')
179
  use_cache = False
 
187
  else:
188
  outputs = block(hidden_states, layer_past=layer_past, attention_mask=causal_mask, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, alibi=alibi)
189
  hidden_states = outputs[0]
190
+ if (use_cache is True):
191
+ presents = (presents + (outputs[1],))
192
  if output_attentions:
193
+ oa = (outputs[(2 if use_cache else 1)],)
194
+ all_self_attentions = (all_self_attentions + oa)
195
  hidden_states = self.ln_f(hidden_states)
196
  if output_hidden_states:
197
  hst = (hidden_states,)
198
+ all_hidden_states = (all_hidden_states + hst)
199
+ if (not return_dict):
200
+ return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if (v is not None)))
201
  return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions)
202
  setattr(model.transformer, '_prepare_attn_mask', MethodType(_prepare_attn_mask, model.transformer))
203
  setattr(model.transformer, '_build_alibi_tensor', MethodType(_build_alibi_tensor, model.transformer))
204
  setattr(model.transformer, 'forward', MethodType(forward, model.transformer))
205
+ KeyValueT = Tuple[(torch.Tensor, torch.Tensor)]
206
 
207
+ def forward(self: BloomForCausalLM, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[(KeyValueT, ...)]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.Tensor]=None, inputs_embeds: Optional[torch.Tensor]=None, labels: Optional[torch.Tensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[(Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions)]:
208
+ 'Replacement forward method for BloomCausalLM.'
209
+ if (deprecated_arguments.pop('position_ids', False) is not False):
210
+ warnings.warn(('`position_ids` have no functionality in BLOOM and will be removed ' + 'in v5.0.0. You can safely ignore passing `position_ids`.'), FutureWarning)
211
+ if (len(deprecated_arguments) > 0):
212
  raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
213
+ return_dict = (return_dict if (return_dict is not None) else self.config.use_return_dict)
214
  transformer_outputs = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask, bidirectional_mask=bidirectional_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
215
  hidden_states = transformer_outputs[0]
216
  lm_logits = self.lm_head(hidden_states)
217
  loss = None
218
+ if (labels is not None):
219
+ shift_logits = lm_logits[..., :(- 1), :].contiguous()
220
  shift_labels = labels[..., 1:].contiguous()
221
  (batch_size, seq_length, vocab_size) = shift_logits.shape
222
  loss_fct = CrossEntropyLoss()
223
+ loss = loss_fct(shift_logits.view((batch_size * seq_length), vocab_size), shift_labels.view((batch_size * seq_length)))
224
+ if (not return_dict):
225
+ output = ((lm_logits,) + transformer_outputs[1:])
226
+ return (((loss,) + output) if (loss is not None) else output)
227
  return CausalLMOutputWithCrossAttentions(loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions)
228
 
229
  def prepare_inputs_for_generation(self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, **kwargs) -> dict:
230
  if past:
231
+ input_ids = input_ids[:, (- 1)].unsqueeze((- 1))
232
  bidirectional_mask = None
233
+ if (past[0][0].shape[0] == input_ids.shape[0]):
234
  past = self._convert_to_bloom_cache(past)
235
  else:
236
  bidirectional_mask = torch.ones_like(input_ids)
 
241
  return model
242
 
243
  def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM:
244
+ 'Converts an OPT Causal LM to a Prefix LM.\n\n Supported HuggingFace model classes:\n - `OPTForCausalLM`\n\n See `convert_hf_causal_lm_to_prefix_lm` for more details.\n '
 
 
 
 
 
 
245
  if hasattr(model, '_prefix_lm_converted'):
246
  return model
247
  assert isinstance(model, OPTForCausalLM)
248
+ assert (model.config.add_cross_attention == False), 'Only supports OPT decoder-only models'
249
  setattr(model, '_original_forward', getattr(model, 'forward'))
250
  setattr(model, '_original_generate', getattr(model, 'generate'))
251
  model.model.decoder.bidirectional_mask = None
252
 
253
  def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
254
  combined_attention_mask = None
255
+ if (input_shape[(- 1)] > 1):
256
+ if (self.bidirectional_mask == 'g'):
257
  (bsz, src_length) = input_shape
258
+ combined_attention_mask = torch.zeros((bsz, 1, src_length, (src_length + past_key_values_length)), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
259
  else:
260
  combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to(inputs_embeds.device)
261
+ if (self.bidirectional_mask is not None):
262
+ assert (attention_mask.shape == self.bidirectional_mask.shape)
263
+ expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[(- 1)]).to(inputs_embeds.device)
264
  combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask)
265
+ if (attention_mask is not None):
266
+ expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[(- 1)]).to(inputs_embeds.device)
267
+ combined_attention_mask = (expanded_attn_mask if (combined_attention_mask is None) else (expanded_attn_mask + combined_attention_mask))
268
  return combined_attention_mask
269
  setattr(model.model.decoder, '_prepare_decoder_attention_mask', MethodType(_prepare_decoder_attention_mask, model.model.decoder))
270
 
 
272
 
273
  def call_og_forward():
274
  return self._original_forward(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
275
+ if (bidirectional_mask is None):
276
  return call_og_forward()
277
  self.model.decoder.bidirectional_mask = bidirectional_mask
278
  try:
 
283
  self.model.decoder.bidirectional_mask = None
284
  return outputs
285
 
286
+ def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[(str, Any)]):
287
+ 'Wraps original generate to enable PrefixLM-style attention.'
288
  self.model.decoder.bidirectional_mask = 'g'
289
  try:
290
  output = self._original_generate(*args, **kwargs)
 
297
  setattr(model, 'generate', MethodType(generate, model))
298
  setattr(model, '_prefix_lm_converted', True)
299
  return model
300
+ _SUPPORTED_HF_MODELS = (_SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM))
301
+ CAUSAL_LM_TYPES = Union[(GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM)]
302
 
303
  def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
304
+ 'Converts a HuggingFace Causal LM to a Prefix LM.\n\n Supported HuggingFace model classes:\n - `GPT2LMHeadModel`\n - `GPTNeoForCausalLM`\n - `GPTNeoXForCausalLM`\n - `GPTJForCausalLM`\n - `BloomForCausalLM`\n - `OPTForCausalLM`\n\n Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the\n `generate` method and/or select underlying methods depending on the model class.\n\n These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".\n\n Notes on training:\n To actually train the converted model as a Prefix LM, training batches will need to indicate\n the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.\n\n **This is not a standard input and requires custom layers either within or after your dataloader.**\n\n In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`\n such that `batch[\'labels\'][batch[\'bidirectional_mask\'] == 1] == -100`.\n That is, the prefix portion of the sequence should not generate any loss. Loss should only be\n generated by the target portion of the sequence.\n\n Notes on `GPTNeoForCausalLM`:\n To simplify the implementation, "global" and "local" attention layers are handled differently.\n For "global" layers, we handle conversion as described above. For "local" layers, which use a\n causal attention mask within a restricted local window, we do not alter the masking.\n\n Notes on `forward` method conversion:\n After conversion, the `forward` method will handle a new input, `bidirectional_mask`,\n which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions\n belonging to the prefix (prefix tokens can attend to one another bidirectionally), and\n 0 indicates token positions belonging to the target.\n\n The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing\n causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset\n the causal masks before returning the result.\n\n Notes on `generate` method conversion:\n After conversion, the `generate` method will have the same signature but will internally\n convert all causal masks to be purely bidirectional, call the original `generate` method, and\n (where appropriate) reset the causal masks before returning the result.\n\n This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token\n "prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates\n each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one\n another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and\n previously-generated tokens (also as expected in a Prefix LM).\n\n To preserve the API, the original methods are renamed to `_original_forward` and\n `_original_generate`, and replaced with new `forward` and `generate` methods that wrap\n them, respectively. Although implementation details vary by model class.\n '
305
  if isinstance(model, _SUPPORTED_GPT_MODELS):
306
  return _convert_gpt_causal_lm_to_prefix_lm(model)
307
  elif isinstance(model, BloomForCausalLM):
 
309
  elif isinstance(model, OPTForCausalLM):
310
  return _convert_opt_causal_lm_to_prefix_lm(model)
311
  else:
312
+ raise TypeError(((f'Cannot convert model to Prefix LM. ' + f'Model does not belong to set of supported HF models:') + f'''
313
+ {_SUPPORTED_HF_MODELS}'''))
 
 
314
 
315
+ def add_bidirectional_mask_if_missing(batch: Dict[(str, Any)]):
316
+ "Attempts to add bidirectional_mask to batch if missing.\n\n Raises:\n KeyError if bidirectional_mask is missing and can't be inferred\n "
317
+ if ('bidirectional_mask' not in batch):
318
+ if (batch.get('mode', None) == 'icl_task'):
 
319
  batch['bidirectional_mask'] = batch['attention_mask'].clone()
320
  for (i, continuation_indices) in enumerate(batch['continuation_indices']):
321
+ batch['bidirectional_mask'][(i, continuation_indices)] = 0
322
+ elif (('labels' in batch) and ('attention_mask' in batch)):
323
+ batch['bidirectional_mask'] = torch.logical_and(torch.eq(batch['attention_mask'], 1), torch.eq(batch['labels'], (- 100))).type_as(batch['attention_mask'])
324
  else:
325
+ raise KeyError('No bidirectional_mask in batch and not sure how to construct one.')
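A minimal sketch of the labels-based fallback above (tensor values are illustrative; prefix tokens carry `-100` labels so they contribute no loss):

```python
import torch

batch = {
    'attention_mask': torch.tensor([[1, 1, 1, 1, 0]]),
    'labels':         torch.tensor([[-100, -100, 42, 43, -100]]),
}
add_bidirectional_mask_if_missing(batch)
print(batch['bidirectional_mask'])  # tensor([[1, 1, 0, 0, 0]])
```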
is_torch_version.py ADDED
@@ -0,0 +1,56 @@
1
+ import sys
2
+ import logging
3
+ import operator as op
4
+ from packaging import version
5
+ from packaging.version import Version, parse
6
+ from typing import Union
7
+ import importlib.util
8
+
9
+ # The package importlib_metadata is in a different place, depending on the python version.
10
+ if sys.version_info < (3, 8):
11
+ import importlib_metadata
12
+ else:
13
+ import importlib.metadata as importlib_metadata
14
+
15
+ STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ _torch_available = importlib.util.find_spec("torch") is not None
20
+ if _torch_available:
21
+ try:
22
+ _torch_version = importlib_metadata.version("torch")
23
+ logger.info(f"PyTorch version {_torch_version} available.")
24
+ except importlib_metadata.PackageNotFoundError:
25
+ _torch_available = False
26
+
27
+ # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319
28
+ def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str):
29
+ """
30
+ Compares a library version to some requirement using a given operation.
31
+ Args:
32
+ library_or_version (`str` or `packaging.version.Version`):
33
+ A library name or a version to check.
34
+ operation (`str`):
35
+ A string representation of an operator, such as `">"` or `"<="`.
36
+ requirement_version (`str`):
37
+ The version to compare the library version against
38
+ """
39
+ if operation not in STR_OPERATION_TO_FUNC.keys():
40
+ raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}")
41
+ operation = STR_OPERATION_TO_FUNC[operation]
42
+ if isinstance(library_or_version, str):
43
+ library_or_version = parse(importlib_metadata.version(library_or_version))
44
+ return operation(library_or_version, parse(requirement_version))
45
+
46
+ # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338
47
+ def is_torch_version(operation: str, version: str):
48
+ """
49
+ Compares the current PyTorch version to a given reference with an operation.
50
+ Args:
51
+ operation (`str`):
52
+ A string representation of an operator, such as `">"` or `"<="`
53
+ version (`str`):
54
+ A string version of PyTorch
55
+ """
56
+ return compare_versions(parse(_torch_version), operation, version)
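Usage sketch for the helpers above (assumes they are in scope; the transformers requirement shown is illustrative). `is_torch_version` is used the same way by `modeling_mpt.py` below to choose checkpointing kwargs:

```python
# pass `use_reentrant=False` only on PyTorch >= 1.11 (mirrors modeling_mpt.py below)
ckpt_kwargs = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}

# compare any installed library's version against a requirement
new_enough = compare_versions('transformers', '>=', '4.28.0')
```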
meta_init_context.py CHANGED
@@ -1,72 +1,31 @@
 
1
  from contextlib import contextmanager
2
  import torch
3
  import torch.nn as nn
4
 
5
  @contextmanager
6
  def init_empty_weights(include_buffers: bool=False):
7
- """Meta initialization context manager.
8
-
9
- A context manager under which models are initialized with all parameters
10
- on the meta device, therefore creating an empty model. Useful when just
11
- initializing the model would blow the available RAM.
12
-
13
- Args:
14
- include_buffers (`bool`, *optional*, defaults to `False`): Whether or
15
- not to also put all buffers on the meta device while initializing.
16
-
17
- Example:
18
- ```python
19
- import torch.nn as nn
20
-
21
- # Initialize a model with 100 billions parameters in no time and without using any RAM.
22
- with init_empty_weights():
23
- tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
24
- ```
25
-
26
- <Tip warning={true}>
27
-
28
- Any model created under this context manager has no weights. As such you can't do something like
29
- `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
30
-
31
- </Tip>
32
- """
33
  with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f:
34
- yield f
35
 
36
  @contextmanager
37
  def init_on_device(device: torch.device, include_buffers: bool=False):
38
- """Device initialization context manager.
39
-
40
- A context manager under which models are initialized with all parameters
41
- on the specified device.
42
-
43
- Args:
44
- device (`torch.device`): Device to initialize all parameters on.
45
- include_buffers (`bool`, *optional*, defaults to `False`): Whether or
46
- not to also put all buffers on the meta device while initializing.
47
-
48
- Example:
49
- ```python
50
- import torch.nn as nn
51
-
52
- with init_on_device(device=torch.device("cuda")):
53
- tst = nn.Liner(100, 100) # on `cuda` device
54
- ```
55
- """
56
  old_register_parameter = nn.Module.register_parameter
57
  if include_buffers:
58
  old_register_buffer = nn.Module.register_buffer
59
 
60
  def register_empty_parameter(module, name, param):
61
  old_register_parameter(module, name, param)
62
- if param is not None:
63
  param_cls = type(module._parameters[name])
64
  kwargs = module._parameters[name].__dict__
65
  module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
66
 
67
  def register_empty_buffer(module, name, buffer):
68
  old_register_buffer(module, name, buffer)
69
- if buffer is not None:
70
  module._buffers[name] = module._buffers[name].to(device)
71
  if include_buffers:
72
  tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']}
@@ -85,10 +44,10 @@ def init_on_device(device: torch.device, include_buffers: bool=False):
85
  nn.Module.register_buffer = register_empty_buffer
86
  for torch_function_name in tensor_constructors_to_patch.keys():
87
  setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
88
- yield
89
  finally:
90
  nn.Module.register_parameter = old_register_parameter
91
  if include_buffers:
92
  nn.Module.register_buffer = old_register_buffer
93
  for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
94
- setattr(torch, torch_function_name, old_torch_function)
 
1
+
2
  from contextlib import contextmanager
3
  import torch
4
  import torch.nn as nn
5
 
6
  @contextmanager
7
  def init_empty_weights(include_buffers: bool=False):
8
+ "Meta initialization context manager.\n\n A context manager under which models are initialized with all parameters\n on the meta device, therefore creating an empty model. Useful when just\n initializing the model would blow the available RAM.\n\n Args:\n include_buffers (`bool`, *optional*, defaults to `False`): Whether or\n not to also put all buffers on the meta device while initializing.\n\n Example:\n ```python\n import torch.nn as nn\n\n # Initialize a model with 100 billions parameters in no time and without using any RAM.\n with init_empty_weights():\n tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])\n ```\n\n <Tip warning={true}>\n\n Any model created under this context manager has no weights. As such you can't do something like\n `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].\n\n </Tip>\n "
9
  with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f:
10
+ (yield f)
11
 
12
  @contextmanager
13
  def init_on_device(device: torch.device, include_buffers: bool=False):
14
+ 'Device initialization context manager.\n\n A context manager under which models are initialized with all parameters\n on the specified device.\n\n Args:\n device (`torch.device`): Device to initialize all parameters on.\n include_buffers (`bool`, *optional*, defaults to `False`): Whether or\n not to also put all buffers on the meta device while initializing.\n\n Example:\n ```python\n import torch.nn as nn\n\n with init_on_device(device=torch.device("cuda")):\n tst = nn.Linear(100, 100) # on `cuda` device\n ```\n '
15
  old_register_parameter = nn.Module.register_parameter
16
  if include_buffers:
17
  old_register_buffer = nn.Module.register_buffer
18
 
19
  def register_empty_parameter(module, name, param):
20
  old_register_parameter(module, name, param)
21
+ if (param is not None):
22
  param_cls = type(module._parameters[name])
23
  kwargs = module._parameters[name].__dict__
24
  module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
25
 
26
  def register_empty_buffer(module, name, buffer):
27
  old_register_buffer(module, name, buffer)
28
+ if (buffer is not None):
29
  module._buffers[name] = module._buffers[name].to(device)
30
  if include_buffers:
31
  tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']}
 
44
  nn.Module.register_buffer = register_empty_buffer
45
  for torch_function_name in tensor_constructors_to_patch.keys():
46
  setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
47
+ (yield)
48
  finally:
49
  nn.Module.register_parameter = old_register_parameter
50
  if include_buffers:
51
  nn.Module.register_buffer = old_register_buffer
52
  for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
53
+ setattr(torch, torch_function_name, old_torch_function)
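Usage sketch for the two context managers above (layer sizes are illustrative; parameters created under `init_empty_weights` live on the `meta` device and must be materialized before use):

```python
import torch
import torch.nn as nn

with init_empty_weights():
    skeleton = nn.Sequential(*[nn.Linear(4096, 4096) for _ in range(8)])
print(next(skeleton.parameters()).device)  # meta

with init_on_device(torch.device('cpu')):
    small = nn.Linear(16, 16)              # parameters created directly on the target device
```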
modeling_mpt.py CHANGED
@@ -4,25 +4,45 @@ Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
4
  """
5
  import math
6
  import warnings
7
- from typing import List, Optional, Tuple, Union
8
  import torch
9
  import torch.nn as nn
10
  import torch.nn.functional as F
 
11
  from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
12
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
13
- from .attention import attn_bias_shape, build_attn_bias
14
- from .blocks import MPTBlock
 
15
  from .norm import NORM_CLASS_REGISTRY
16
  from .configuration_mpt import MPTConfig
17
  from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
18
  from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
19
  from .meta_init_context import init_empty_weights
20
  from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
 
 
21
  Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
22
 
 
 
 
 
 
 
 
 
 
 
 
23
  class MPTPreTrainedModel(PreTrainedModel):
24
  config_class = MPTConfig
25
  base_model_prefix = 'model'
 
 
 
 
 
26
 
27
  class MPTModel(MPTPreTrainedModel):
28
 
@@ -64,6 +84,7 @@ class MPTModel(MPTPreTrainedModel):
64
  if self.config.init_config['verbose'] > 1:
65
  init_fn_name = self.config.init_config['name']
66
  warnings.warn(f'Using {init_fn_name} initialization.')
 
67
 
68
  def get_input_embeddings(self):
69
  return self.wte
@@ -95,7 +116,9 @@ class MPTModel(MPTPreTrainedModel):
95
  if attn_bias is None:
96
  attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
97
  else:
98
- attn_bias = attn_bias[:, :, :, -s_k:]
 
 
99
  if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
100
  raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
101
  min_val = torch.finfo(attn_bias.dtype).min
@@ -130,6 +153,12 @@ class MPTModel(MPTPreTrainedModel):
130
  def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
131
  return_dict = return_dict if return_dict is not None else self.config.return_dict
132
  use_cache = use_cache if use_cache is not None else self.config.use_cache
 
 
 
 
 
 
133
  if attention_mask is not None:
134
  attention_mask = attention_mask.bool()
135
  if prefix_mask is not None:
@@ -137,7 +166,10 @@ class MPTModel(MPTPreTrainedModel):
137
  if not return_dict:
138
  raise NotImplementedError('return_dict False is not implemented yet for MPT')
139
  if output_attentions:
140
- raise NotImplementedError('output_attentions is not implemented yet for MPT')
 
 
 
141
  if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
142
  raise NotImplementedError('MPT does not support training with left padding.')
143
  if self.prefix_lm and prefix_mask is None:
@@ -157,7 +189,12 @@ class MPTModel(MPTPreTrainedModel):
157
  if past_key_values is not None:
158
  if len(past_key_values) != self.config.n_layers:
159
  raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
 
 
 
160
  past_position = past_key_values[0][0].size(1)
 
 
161
  if S + past_position > self.config.max_seq_len:
162
  raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
163
  pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
@@ -175,16 +212,60 @@ class MPTModel(MPTPreTrainedModel):
175
  if use_cache and past_key_values is None:
176
  past_key_values = [() for _ in range(self.config.n_layers)]
177
  all_hidden_states = () if output_hidden_states else None
 
178
  for (b_idx, block) in enumerate(self.blocks):
179
  if output_hidden_states:
180
  assert all_hidden_states is not None
181
  all_hidden_states = all_hidden_states + (x,)
182
  past_key_value = past_key_values[b_idx] if past_key_values is not None else None
183
- (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
 
184
  if past_key_values is not None:
185
  past_key_values[b_idx] = past_key_value
 
 
 
186
  x = self.norm_f(x)
187
- return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)
 
 
 
 
188
 
189
  def param_init_fn(self, module):
190
  init_fn_name = self.config.init_config['name']
@@ -231,7 +312,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
231
  def get_decoder(self):
232
  return self.transformer
233
 
234
- def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
235
  return_dict = return_dict if return_dict is not None else self.config.return_dict
236
  use_cache = use_cache if use_cache is not None else self.config.use_cache
237
  outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
@@ -245,7 +326,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
245
  labels = torch.roll(labels, shifts=-1)
246
  labels[:, -1] = -100
247
  loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
248
- return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
249
 
250
  def param_init_fn(self, module):
251
  init_fn_name = self.config.init_config['name']
 
4
  """
5
  import math
6
  import warnings
7
+ from typing import Any, List, Optional, Tuple, Union, Protocol, Dict
8
  import torch
9
  import torch.nn as nn
10
  import torch.nn.functional as F
11
+ from torch.utils.checkpoint import checkpoint
12
  from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
13
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
14
+ from transformers.utils import logging
15
+ from .attention import attn_bias_shape, build_attn_bias, PastKeyValue, MultiheadAttention, MultiQueryAttention
16
+ from .blocks import MPTBlock, MPTBlockOutput
17
  from .norm import NORM_CLASS_REGISTRY
18
  from .configuration_mpt import MPTConfig
19
  from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
20
  from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
21
  from .meta_init_context import init_empty_weights
22
  from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
23
+ from .is_torch_version import is_torch_version
24
+
25
  Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
26
 
27
+ logger = logging.get_logger(__name__)
28
+
29
+ class MPTBlockCheckpointedForward(Protocol):
30
+ def __call__(
31
+ x: torch.Tensor,
32
+ past_key_value: Union[PastKeyValue, Tuple, None],
33
+ attn_bias: Optional[torch.Tensor],
34
+ attention_mask: Optional[torch.ByteTensor],
35
+ is_causal: bool,
36
+ ) -> MPTBlockOutput: ...
37
+
38
  class MPTPreTrainedModel(PreTrainedModel):
39
  config_class = MPTConfig
40
  base_model_prefix = 'model'
41
+ _no_split_modules = ['MPTBlock']
42
+ supports_gradient_checkpointing = True
43
+ def _set_gradient_checkpointing(self, module: nn.Module, value=False) -> None:
44
+ if isinstance(module, MPTModel) or isinstance(module, MultiheadAttention) or isinstance(module, MultiQueryAttention):
45
+ module.gradient_checkpointing = value
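With `supports_gradient_checkpointing = True` and the setter above, the standard `transformers` toggle applies; a hedged sketch (the model id is a placeholder for this repository):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'your-org/your-mpt-checkpoint',  # placeholder model id
    trust_remote_code=True,
)
model.gradient_checkpointing_enable()  # sets `gradient_checkpointing = True` on MPTModel and its attention modules
model.train()
```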
46
 
47
  class MPTModel(MPTPreTrainedModel):
48
 
 
84
  if self.config.init_config['verbose'] > 1:
85
  init_fn_name = self.config.init_config['name']
86
  warnings.warn(f'Using {init_fn_name} initialization.')
87
+ self.gradient_checkpointing = False
88
 
89
  def get_input_embeddings(self):
90
  return self.wte
 
116
  if attn_bias is None:
117
  attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
118
  else:
119
+ # clamp to 0 necessary for torch 2.0 compile()
120
+ _s_k = max(0, attn_bias.size(-1) - s_k)
121
+ attn_bias = attn_bias[:, :, :, _s_k:]
122
  if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
123
  raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
124
  min_val = torch.finfo(attn_bias.dtype).min
 
153
  def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
154
  return_dict = return_dict if return_dict is not None else self.config.return_dict
155
  use_cache = use_cache if use_cache is not None else self.config.use_cache
156
+ if self.gradient_checkpointing and self.training:
157
+ if use_cache:
158
+ logger.warning_once(
159
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
160
+ )
161
+ use_cache = False
162
  if attention_mask is not None:
163
  attention_mask = attention_mask.bool()
164
  if prefix_mask is not None:
 
166
  if not return_dict:
167
  raise NotImplementedError('return_dict False is not implemented yet for MPT')
168
  if output_attentions:
169
+ if self.attn_impl != 'torch':
170
+ raise NotImplementedError(
171
+ 'output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.'
172
+ )
173
  if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
174
  raise NotImplementedError('MPT does not support training with left padding.')
175
  if self.prefix_lm and prefix_mask is None:
 
189
  if past_key_values is not None:
190
  if len(past_key_values) != self.config.n_layers:
191
  raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
192
+ # For attn_impl: triton and flash the past key tensor spec is (batch, seq, dim).
193
+ # For attn_impl: torch the past key tensor spec is (batch, heads, head_dim, seq).
194
+ # Here we shift position embedding using the `seq` dim of the past key
195
  past_position = past_key_values[0][0].size(1)
196
+ if self.attn_impl == 'torch':
197
+ past_position = past_key_values[0][0].size(3)
198
  if S + past_position > self.config.max_seq_len:
199
  raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
200
  pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
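The shape convention in the comment above is easy to trip over, so here is a toy illustration (shapes only, not a real cache) of which axis `past_position` is read from for each `attn_impl`:

```python
# Toy tensors showing which dimension holds the cached sequence length.
import torch

batch, n_heads, head_dim, cached_len = 1, 8, 64, 5
past_key_flash_triton = torch.zeros(batch, cached_len, n_heads * head_dim)  # (batch, seq, dim)
past_key_torch_impl = torch.zeros(batch, n_heads, head_dim, cached_len)     # (batch, heads, head_dim, seq)

print(past_key_flash_triton.size(1))  # 5 -> past_position for attn_impl 'flash' or 'triton'
print(past_key_torch_impl.size(3))    # 5 -> past_position for attn_impl 'torch'
```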
 
212
  if use_cache and past_key_values is None:
213
  past_key_values = [() for _ in range(self.config.n_layers)]
214
  all_hidden_states = () if output_hidden_states else None
215
+ all_self_attns = () if output_attentions else None
216
  for (b_idx, block) in enumerate(self.blocks):
217
  if output_hidden_states:
218
  assert all_hidden_states is not None
219
  all_hidden_states = all_hidden_states + (x,)
220
  past_key_value = past_key_values[b_idx] if past_key_values is not None else None
221
+ if self.gradient_checkpointing and self.training:
222
+ ckpt_kwargs: Dict[str, Any] = {'use_reentrant': False} if is_torch_version('>=', '1.11.0') else {}
223
+ def create_custom_forward(module: MPTBlock) -> MPTBlockCheckpointedForward:
224
+ def custom_forward(
225
+ x: torch.Tensor,
226
+ past_key_value: Union[PastKeyValue, Tuple, None],
227
+ attn_bias: Optional[torch.Tensor],
228
+ attention_mask: Optional[torch.ByteTensor],
229
+ is_causal: bool
230
+ ):
231
+ return module.forward(
232
+ x,
233
+ past_key_value,
234
+ attn_bias,
235
+ attention_mask,
236
+ is_causal,
237
+ )
238
+ return custom_forward
239
+ block_out: MPTBlockOutput = checkpoint(
240
+ create_custom_forward(block),
241
+ x,
242
+ past_key_value,
243
+ attn_bias,
244
+ attention_mask,
245
+ self.is_causal,
246
+ **ckpt_kwargs,
247
+ )
248
+ else:
249
+ block_out: MPTBlockOutput = block(
250
+ x,
251
+ past_key_value=past_key_value,
252
+ attn_bias=attn_bias,
253
+ attention_mask=attention_mask,
254
+ is_causal=self.is_causal,
255
+ )
256
+ x, attn_weights, past_key_value = block_out
257
+ del block_out
258
  if past_key_values is not None:
259
  past_key_values[b_idx] = past_key_value
260
+ if output_attentions:
261
+ assert all_self_attns is not None # pyright
262
+ all_self_attns = all_self_attns + (attn_weights,)
263
  x = self.norm_f(x)
264
+ # add hidden states from the last decoder layer
265
+ if output_hidden_states:
266
+ assert all_hidden_states is not None # pyright
267
+ all_hidden_states = all_hidden_states + (x,)
268
+ return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states, attentions=all_self_attns)
269
 
270
  def param_init_fn(self, module):
271
  init_fn_name = self.config.init_config['name']
 
312
  def get_decoder(self):
313
  return self.transformer
314
 
315
+ def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, *args, **kwargs):
316
  return_dict = return_dict if return_dict is not None else self.config.return_dict
317
  use_cache = use_cache if use_cache is not None else self.config.use_cache
318
  outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
 
326
  labels = torch.roll(labels, shifts=-1)
327
  labels[:, -1] = -100
328
  loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
329
+ return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
330
 
331
  def param_init_fn(self, module):
332
  init_fn_name = self.config.init_config['name']
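To make the `past_key_values` handling above concrete, here is a hedged sketch of one greedy decoding step that feeds the cache back into the next forward pass. The repo id and prompt are placeholders; in practice `model.generate(..., use_cache=True)` performs this loop internally.

```python
# Hedged sketch of incremental decoding with the KV cache returned by forward().
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "emozilla/mpt-7b-storywriter"  # placeholder repo id for illustration
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16, trust_remote_code=True)
model.eval()

input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
with torch.no_grad():
    out = model(input_ids=input_ids, use_cache=True)           # prefill: cache keys/values for the prompt
    next_id = out.logits[:, -1].argmax(dim=-1, keepdim=True)   # greedy pick of the next token
    out = model(input_ids=next_id,                             # decode step: only the new token...
                past_key_values=out.past_key_values,           # ...plus the cached keys/values
                use_cache=True)
print(tokenizer.decode(next_id[0]))
```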
norm.py CHANGED
@@ -1,10 +1,11 @@
 
1
  import torch
2
 
3
  def _cast_if_autocast_enabled(tensor):
4
  if torch.is_autocast_enabled():
5
- if tensor.device.type == 'cuda':
6
  dtype = torch.get_autocast_gpu_dtype()
7
- elif tensor.device.type == 'cpu':
8
  dtype = torch.get_autocast_cpu_dtype()
9
  else:
10
  raise NotImplementedError()
@@ -19,15 +20,15 @@ class LPLayerNorm(torch.nn.LayerNorm):
19
  def forward(self, x):
20
  module_device = x.device
21
  downcast_x = _cast_if_autocast_enabled(x)
22
- downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
23
- downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
24
  with torch.autocast(enabled=False, device_type=module_device.type):
25
  return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
26
 
27
  def rms_norm(x, weight=None, eps=1e-05):
28
- output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
29
- if weight is not None:
30
- return output * weight
31
  return output
32
 
33
  class RMSNorm(torch.nn.Module):
@@ -50,7 +51,7 @@ class LPRMSNorm(RMSNorm):
50
 
51
  def forward(self, x):
52
  downcast_x = _cast_if_autocast_enabled(x)
53
- downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
54
  with torch.autocast(enabled=False, device_type=x.device.type):
55
  return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
56
- NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
 
1
+
2
  import torch
3
 
4
  def _cast_if_autocast_enabled(tensor):
5
  if torch.is_autocast_enabled():
6
+ if (tensor.device.type == 'cuda'):
7
  dtype = torch.get_autocast_gpu_dtype()
8
+ elif (tensor.device.type == 'cpu'):
9
  dtype = torch.get_autocast_cpu_dtype()
10
  else:
11
  raise NotImplementedError()
 
20
  def forward(self, x):
21
  module_device = x.device
22
  downcast_x = _cast_if_autocast_enabled(x)
23
+ downcast_weight = (_cast_if_autocast_enabled(self.weight) if (self.weight is not None) else self.weight)
24
+ downcast_bias = (_cast_if_autocast_enabled(self.bias) if (self.bias is not None) else self.bias)
25
  with torch.autocast(enabled=False, device_type=module_device.type):
26
  return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
27
 
28
  def rms_norm(x, weight=None, eps=1e-05):
29
+ output = (x * torch.rsqrt((x.pow(2).mean((- 1), keepdim=True) + eps)))
30
+ if (weight is not None):
31
+ return (output * weight)
32
  return output
33
 
34
  class RMSNorm(torch.nn.Module):
 
51
 
52
  def forward(self, x):
53
  downcast_x = _cast_if_autocast_enabled(x)
54
+ downcast_weight = (_cast_if_autocast_enabled(self.weight) if (self.weight is not None) else self.weight)
55
  with torch.autocast(enabled=False, device_type=x.device.type):
56
  return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
57
+ NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
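As a quick sanity check of the `rms_norm` formula above (y = x · rsqrt(mean(x²) + eps), optionally scaled by `weight`), the standalone snippet below reproduces it on a hand-checkable input; the numbers are illustrative only.

```python
# Standalone numeric check of the RMS-norm formula used above.
import torch

def rms_norm(x, weight=None, eps=1e-05):
    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    if weight is not None:
        return output * weight
    return output

x = torch.tensor([[3.0, 4.0]])
# mean(x^2) = (9 + 16) / 2 = 12.5, so rsqrt ~= 0.2828 and y ~= [[0.8485, 1.1314]]
print(rms_norm(x))
```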
param_init_fns.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import math
2
  import warnings
3
  from collections.abc import Sequence
@@ -9,110 +10,110 @@ from .norm import NORM_CLASS_REGISTRY
9
 
10
  def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
11
  del kwargs
12
- if verbose > 1:
13
  warnings.warn(f"Initializing network using module's reset_parameters attribute")
14
  if hasattr(module, 'reset_parameters'):
15
  module.reset_parameters()
16
 
17
  def fused_init_helper_(module: nn.Module, init_fn_):
18
  _fused = getattr(module, '_fused', None)
19
- if _fused is None:
20
  raise RuntimeError(f'Internal logic error')
21
  (dim, splits) = _fused
22
  splits = (0, *splits, module.weight.size(dim))
23
- for (s, e) in zip(splits[:-1], splits[1:]):
24
- slice_indices = [slice(None)] * module.weight.ndim
25
  slice_indices[dim] = slice(s, e)
26
  init_fn_(module.weight[slice_indices])
27
 
28
- def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
29
  del kwargs
30
- if verbose > 1:
31
  warnings.warn(f'If model has bias parameters they are initialized to 0.')
32
  init_div_is_residual = init_div_is_residual
33
- if init_div_is_residual is False:
34
  div_is_residual = 1.0
35
- elif init_div_is_residual is True:
36
- div_is_residual = math.sqrt(2 * n_layers)
37
- elif isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int):
38
  div_is_residual = init_div_is_residual
39
- elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
40
  div_is_residual = float(init_div_is_residual)
41
  else:
42
  div_is_residual = 1.0
43
  raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
44
- if init_div_is_residual is not False:
45
- if verbose > 1:
46
- warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
47
  if isinstance(module, nn.Linear):
48
  if hasattr(module, '_fused'):
49
  fused_init_helper_(module, init_fn_)
50
  else:
51
  init_fn_(module.weight)
52
- if module.bias is not None:
53
  torch.nn.init.zeros_(module.bias)
54
- if init_div_is_residual is not False and getattr(module, '_is_residual', False):
55
  with torch.no_grad():
56
  module.weight.div_(div_is_residual)
57
  elif isinstance(module, nn.Embedding):
58
- if emb_init_std is not None:
59
  std = emb_init_std
60
- if std == 0:
61
  warnings.warn(f'Embedding layer initialized to 0.')
62
  emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
63
- if verbose > 1:
64
  warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
65
- elif emb_init_uniform_lim is not None:
66
  lim = emb_init_uniform_lim
67
  if isinstance(lim, Sequence):
68
- if len(lim) > 2:
69
  raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
70
- if lim[0] == lim[1]:
71
  warnings.warn(f'Embedding layer initialized to {lim[0]}.')
72
  else:
73
- if lim == 0:
74
  warnings.warn(f'Embedding layer initialized to 0.')
75
- lim = [-lim, lim]
76
  (a, b) = lim
77
  emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
78
- if verbose > 1:
79
  warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
80
  else:
81
  emb_init_fn_ = init_fn_
82
  emb_init_fn_(module.weight)
83
  elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
84
- if verbose > 1:
85
  warnings.warn(f'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.')
86
- if hasattr(module, 'weight') and module.weight is not None:
87
  torch.nn.init.ones_(module.weight)
88
- if hasattr(module, 'bias') and module.bias is not None:
89
  torch.nn.init.zeros_(module.bias)
90
  elif isinstance(module, nn.MultiheadAttention):
91
  if module._qkv_same_embed_dim:
92
- assert module.in_proj_weight is not None
93
- assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
94
- assert d_model is not None
95
  _d = d_model
96
- splits = (0, _d, 2 * _d, 3 * _d)
97
- for (s, e) in zip(splits[:-1], splits[1:]):
98
  init_fn_(module.in_proj_weight[s:e])
99
  else:
100
- assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
101
- assert module.in_proj_weight is None
102
  init_fn_(module.q_proj_weight)
103
  init_fn_(module.k_proj_weight)
104
  init_fn_(module.v_proj_weight)
105
- if module.in_proj_bias is not None:
106
  torch.nn.init.zeros_(module.in_proj_bias)
107
- if module.bias_k is not None:
108
  torch.nn.init.zeros_(module.bias_k)
109
- if module.bias_v is not None:
110
  torch.nn.init.zeros_(module.bias_v)
111
  init_fn_(module.out_proj.weight)
112
- if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
113
  with torch.no_grad():
114
  module.out_proj.weight.div_(div_is_residual)
115
- if module.out_proj.bias is not None:
116
  torch.nn.init.zeros_(module.out_proj.bias)
117
  else:
118
  for _ in module.parameters(recurse=False):
@@ -121,61 +122,56 @@ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model:
121
  def _normal_init_(std, mean=0.0):
122
  return partial(torch.nn.init.normal_, mean=mean, std=std)
123
 
124
- def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
125
  del kwargs
126
  init_fn_ = _normal_init_(std=std)
127
- if verbose > 1:
128
  warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
129
  generic_param_init_fn_(module=module, init_fn_=init_fn_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
130
 
131
- def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
132
  del kwargs
133
- if init_std is None:
134
  raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
135
  _normal_param_init_fn_(module=module, std=init_std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
136
 
137
- def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
138
  del kwargs
139
- std = math.sqrt(2 / (5 * d_model))
140
  _normal_param_init_fn_(module=module, std=std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
141
 
142
- def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
143
- """From section 2.3.1 of GPT-NeoX-20B:
144
-
145
- An Open-Source Autoregressive Language Model — Black et al. (2022)
146
- see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
147
- and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
148
- """
149
  del kwargs
150
- residual_div = n_layers / math.sqrt(10)
151
- if verbose > 1:
152
  warnings.warn(f'setting init_div_is_residual to {residual_div}')
153
  small_param_init_fn_(module=module, d_model=d_model, n_layers=n_layers, init_div_is_residual=residual_div, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
154
 
155
- def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
156
  del kwargs
157
- if verbose > 1:
158
- warnings.warn(f'Using nn.init.kaiming_uniform_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
159
  kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
160
  generic_param_init_fn_(module=module, init_fn_=kaiming_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
161
 
162
- def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
163
  del kwargs
164
- if verbose > 1:
165
- warnings.warn(f'Using nn.init.kaiming_normal_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
166
  kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
167
  generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
168
 
169
- def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
170
  del kwargs
171
  xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
172
- if verbose > 1:
173
- warnings.warn(f'Using torch.nn.init.xavier_uniform_ init fn with parameters: ' + f'gain={init_gain}')
174
  generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
175
 
176
- def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
177
  xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
178
- if verbose > 1:
179
- warnings.warn(f'Using torch.nn.init.xavier_normal_ init fn with parameters: ' + f'gain={init_gain}')
180
  generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
181
- MODEL_INIT_REGISTRY = {'default_': torch_default_param_init_fn_, 'baseline_': baseline_param_init_fn_, 'kaiming_uniform_': kaiming_uniform_param_init_fn_, 'kaiming_normal_': kaiming_normal_param_init_fn_, 'neox_init_': neox_param_init_fn_, 'small_init_': small_param_init_fn_, 'xavier_uniform_': xavier_uniform_param_init_fn_, 'xavier_normal_': xavier_normal_param_init_fn_}
 
1
+
2
  import math
3
  import warnings
4
  from collections.abc import Sequence
 
10
 
11
  def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
12
  del kwargs
13
+ if (verbose > 1):
14
  warnings.warn(f"Initializing network using module's reset_parameters attribute")
15
  if hasattr(module, 'reset_parameters'):
16
  module.reset_parameters()
17
 
18
  def fused_init_helper_(module: nn.Module, init_fn_):
19
  _fused = getattr(module, '_fused', None)
20
+ if (_fused is None):
21
  raise RuntimeError(f'Internal logic error')
22
  (dim, splits) = _fused
23
  splits = (0, *splits, module.weight.size(dim))
24
+ for (s, e) in zip(splits[:(- 1)], splits[1:]):
25
+ slice_indices = ([slice(None)] * module.weight.ndim)
26
  slice_indices[dim] = slice(s, e)
27
  init_fn_(module.weight[slice_indices])
28
 
29
+ def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, verbose: int=0, **kwargs):
30
  del kwargs
31
+ if (verbose > 1):
32
  warnings.warn(f'If model has bias parameters they are initialized to 0.')
33
  init_div_is_residual = init_div_is_residual
34
+ if (init_div_is_residual is False):
35
  div_is_residual = 1.0
36
+ elif (init_div_is_residual is True):
37
+ div_is_residual = math.sqrt((2 * n_layers))
38
+ elif (isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int)):
39
  div_is_residual = init_div_is_residual
40
+ elif (isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric()):
41
  div_is_residual = float(init_div_is_residual)
42
  else:
43
  div_is_residual = 1.0
44
  raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
45
+ if (init_div_is_residual is not False):
46
+ if (verbose > 1):
47
+ warnings.warn((f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.'))
48
  if isinstance(module, nn.Linear):
49
  if hasattr(module, '_fused'):
50
  fused_init_helper_(module, init_fn_)
51
  else:
52
  init_fn_(module.weight)
53
+ if (module.bias is not None):
54
  torch.nn.init.zeros_(module.bias)
55
+ if ((init_div_is_residual is not False) and getattr(module, '_is_residual', False)):
56
  with torch.no_grad():
57
  module.weight.div_(div_is_residual)
58
  elif isinstance(module, nn.Embedding):
59
+ if (emb_init_std is not None):
60
  std = emb_init_std
61
+ if (std == 0):
62
  warnings.warn(f'Embedding layer initialized to 0.')
63
  emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
64
+ if (verbose > 1):
65
  warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
66
+ elif (emb_init_uniform_lim is not None):
67
  lim = emb_init_uniform_lim
68
  if isinstance(lim, Sequence):
69
+ if (len(lim) > 2):
70
  raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
71
+ if (lim[0] == lim[1]):
72
  warnings.warn(f'Embedding layer initialized to {lim[0]}.')
73
  else:
74
+ if (lim == 0):
75
  warnings.warn(f'Embedding layer initialized to 0.')
76
+ lim = [(- lim), lim]
77
  (a, b) = lim
78
  emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
79
+ if (verbose > 1):
80
  warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
81
  else:
82
  emb_init_fn_ = init_fn_
83
  emb_init_fn_(module.weight)
84
  elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
85
+ if (verbose > 1):
86
  warnings.warn(f'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.')
87
+ if (hasattr(module, 'weight') and (module.weight is not None)):
88
  torch.nn.init.ones_(module.weight)
89
+ if (hasattr(module, 'bias') and (module.bias is not None)):
90
  torch.nn.init.zeros_(module.bias)
91
  elif isinstance(module, nn.MultiheadAttention):
92
  if module._qkv_same_embed_dim:
93
+ assert (module.in_proj_weight is not None)
94
+ assert ((module.q_proj_weight is None) and (module.k_proj_weight is None) and (module.v_proj_weight is None))
95
+ assert (d_model is not None)
96
  _d = d_model
97
+ splits = (0, _d, (2 * _d), (3 * _d))
98
+ for (s, e) in zip(splits[:(- 1)], splits[1:]):
99
  init_fn_(module.in_proj_weight[s:e])
100
  else:
101
+ assert ((module.q_proj_weight is not None) and (module.k_proj_weight is not None) and (module.v_proj_weight is not None))
102
+ assert (module.in_proj_weight is None)
103
  init_fn_(module.q_proj_weight)
104
  init_fn_(module.k_proj_weight)
105
  init_fn_(module.v_proj_weight)
106
+ if (module.in_proj_bias is not None):
107
  torch.nn.init.zeros_(module.in_proj_bias)
108
+ if (module.bias_k is not None):
109
  torch.nn.init.zeros_(module.bias_k)
110
+ if (module.bias_v is not None):
111
  torch.nn.init.zeros_(module.bias_v)
112
  init_fn_(module.out_proj.weight)
113
+ if ((init_div_is_residual is not False) and getattr(module.out_proj, '_is_residual', False)):
114
  with torch.no_grad():
115
  module.out_proj.weight.div_(div_is_residual)
116
+ if (module.out_proj.bias is not None):
117
  torch.nn.init.zeros_(module.out_proj.bias)
118
  else:
119
  for _ in module.parameters(recurse=False):
 
122
  def _normal_init_(std, mean=0.0):
123
  return partial(torch.nn.init.normal_, mean=mean, std=std)
124
 
125
+ def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, verbose: int=0, **kwargs):
126
  del kwargs
127
  init_fn_ = _normal_init_(std=std)
128
+ if (verbose > 1):
129
  warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
130
  generic_param_init_fn_(module=module, init_fn_=init_fn_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
131
 
132
+ def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, verbose: int=0, **kwargs):
133
  del kwargs
134
+ if (init_std is None):
135
  raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
136
  _normal_param_init_fn_(module=module, std=init_std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
137
 
138
+ def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, verbose: int=0, **kwargs):
139
  del kwargs
140
+ std = math.sqrt((2 / (5 * d_model)))
141
  _normal_param_init_fn_(module=module, std=std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
142
 
143
+ def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, verbose: int=0, **kwargs):
144
+ 'From section 2.3.1 of GPT-NeoX-20B:\n\n An Open-Source Autoregressive Language Model — Black et al. (2022)\n see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151\n and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py\n '
 
 
 
 
 
145
  del kwargs
146
+ residual_div = (n_layers / math.sqrt(10))
147
+ if (verbose > 1):
148
  warnings.warn(f'setting init_div_is_residual to {residual_div}')
149
  small_param_init_fn_(module=module, d_model=d_model, n_layers=n_layers, init_div_is_residual=residual_div, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
150
 
151
+ def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
152
  del kwargs
153
+ if (verbose > 1):
154
+ warnings.warn((f'Using nn.init.kaiming_uniform_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'))
155
  kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
156
  generic_param_init_fn_(module=module, init_fn_=kaiming_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
157
 
158
+ def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
159
  del kwargs
160
+ if (verbose > 1):
161
+ warnings.warn((f'Using nn.init.kaiming_normal_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'))
162
  kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
163
  generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
164
 
165
+ def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, init_gain: float=0, verbose: int=0, **kwargs):
166
  del kwargs
167
  xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
168
+ if (verbose > 1):
169
+ warnings.warn((f'Using torch.nn.init.xavier_uniform_ init fn with parameters: ' + f'gain={init_gain}'))
170
  generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
171
 
172
+ def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[(int, float, str, bool)]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[(Tuple[(float, float)], float)]]=None, init_gain: float=0, verbose: int=0, **kwargs):
173
  xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
174
+ if (verbose > 1):
175
+ warnings.warn((f'Using torch.nn.init.xavier_normal_ init fn with parameters: ' + f'gain={init_gain}'))
176
  generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
177
+ MODEL_INIT_REGISTRY = {'default_': torch_default_param_init_fn_, 'baseline_': baseline_param_init_fn_, 'kaiming_uniform_': kaiming_uniform_param_init_fn_, 'kaiming_normal_': kaiming_normal_param_init_fn_, 'neox_init_': neox_param_init_fn_, 'small_init_': small_param_init_fn_, 'xavier_uniform_': xavier_uniform_param_init_fn_, 'xavier_normal_': xavier_normal_param_init_fn_}
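For context, the registry above is applied module-by-module: `MPTModel.param_init_fn` and `MPTForCausalLM.param_init_fn` look up `init_config['name']` and call the matching entry on each submodule. The sketch below shows the same pattern on a toy module; the import path assumes these files ship as a package (the `mpt` name is hypothetical) and the hyperparameters are illustrative.

```python
# Hedged sketch: applying a MODEL_INIT_REGISTRY entry module-by-module,
# mirroring MPTModel.param_init_fn. Adjust the import to wherever
# param_init_fns.py actually lives.
from functools import partial
import torch.nn as nn
from mpt.param_init_fns import MODEL_INIT_REGISTRY  # hypothetical package name

d_model, n_layers = 512, 8
init_fn = partial(MODEL_INIT_REGISTRY['kaiming_normal_'], n_layers=n_layers, d_model=d_model)

toy = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(), nn.Linear(4 * d_model, d_model))
toy.apply(init_fn)  # nn.Module.apply calls init_fn on every submodule, as param_init_fn does
```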