AmberYifan committed on
Commit
74a8b24
1 Parent(s): d4098dd

Upload 6 files

configuration_phi3_small.py ADDED
@@ -0,0 +1,250 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ from typing import Any, Dict, List, Optional, Union
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+ from functools import cached_property
22
+
23
+ """ Phi3Small model configuration """
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ def next_mult(x, y):
28
+ return (x + y - 1) // y * y
29
+
30
+ class Phi3SmallConfig(PretrainedConfig):
31
+ """
32
+ This is the configuration class to store the configuration of a `Phi3Small` model. It is used to
33
+ instantiate a Phi-3-small model according to the specified arguments, defining the model architecture.
34
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Phi-3-small
35
+ [phi3](https://arxiv.org/pdf/2404.14219) architecture.
36
+
37
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
38
+ documentation from [`PretrainedConfig`] for more information.
39
+
40
+
41
+ Args:
42
+ vocab_size (`int`, *optional*, defaults to 100352):
43
+ Vocabulary size of the Phi3Small model. Defines the number of different tokens that can be represented by the
44
+ `input_ids` passed when calling `Phi3SmallModel`.
45
+ max_position_embeddings (`int`, *optional*, defaults to 8192):
46
+ The maximum sequence length that this model might safely be used with.
47
+ rope_embedding_base (`float`, *optional*, defaults to 10^6):
48
+ The base value for the RoPE (Rotary Position Embedding).
49
+ rope_position_scale (`float`, *optional*, defaults to 1.0):
50
+ The scale factor for the RoPE position encoding.
51
+ rope_scaling (`Optional[Dict[str, Union[float, List[float], int]]]`, *optional*, defaults to None):
52
+ The scaling configuration used for LongRoPE.
53
+ hidden_size (`int`, *optional*, defaults to 4096):
54
+ The size of the hidden layers in the model.
55
+ num_hidden_layers (`int`, *optional*, defaults to 32):
56
+ The number of layers in the model.
57
+ num_attention_heads (`int`, *optional*, defaults to 32):
58
+ The number of query heads in the model.
59
+ num_key_value_heads (`int`, *optional*, defaults to 8):
60
+ The number of key-value heads in the model.
61
+ hidden_act (`str`, *optional*, defaults to "gegelu"):
62
+ The activation function used in the model.
63
+ gegelu_limit (`float`, *optional*, defaults to 20.0):
64
+ The clamping limit used inside the `gegelu` activation (for numerical stability).
65
+ gegelu_pad_to_256 (`bool`, *optional*, defaults to True):
66
+ Whether to pad the intermediate size to a multiple of 256 (for faster matmul ops).
67
+ ff_dim_multiplier (`Optional[int]`, *optional*, defaults to None):
68
+ The dimension multiplier for the feed-forward layers.
69
+ ff_intermediate_size (`Optional[int]`, *optional*, defaults to 14336):
70
+ The intermediate size for the feed-forward layers.
71
+ Exactly one of `ff_dim_multiplier` or `ff_intermediate_size` must be specified.
72
+ blocksparse_homo_head_pattern (`bool`, *optional*, defaults to False):
73
+ Whether to use a homogeneous head pattern for block-sparse attention.
74
+ blocksparse_block_size (`int`, *optional*, defaults to 64):
75
+ The block size for block-sparse attention.
76
+ blocksparse_num_local_blocks (`int`, *optional*, defaults to 16):
77
+ The number of local blocks for block-sparse attention.
78
+ The local window used in blocksparse equals `blocksparse_num_local_blocks * blocksparse_block_size`
79
+ blocksparse_vert_stride (`int`, *optional*, defaults to 8):
80
+ The vertical stride for block-sparse attention.
81
+ blocksparse_triton_kernel_block_size (`int`, *optional*, defaults to 64):
82
+ The kernel block size for block-sparse attention.
83
+ dense_attention_every_n_layers (`Optional[int]`, *optional*, defaults to 2):
84
+ How often a dense (non-blocksparse) attention layer is used: every n-th layer uses dense attention.
85
+ embedding_dropout_prob (`float`, *optional*, defaults to 0.1):
86
+ The dropout probability for the embedding layer.
87
+ attention_dropout_prob (`float`, *optional*, defaults to 0.0):
88
+ The dropout probability for the attention layers.
89
+ ffn_dropout_prob (`float`, *optional*, defaults to 0.1):
90
+ The dropout probability for the feed-forward layers.
91
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
92
+ The epsilon value for layer normalization.
93
+ initializer_range (`float`, *optional*, defaults to 0.02):
94
+ The range for weight initialization.
95
+ mup_use_scaling (`bool`, *optional*, defaults to True):
96
+ Whether to use scaling for MuP parameters (see: https://arxiv.org/abs/2203.03466).
97
+ mup_width_multiplier (`float`, *optional*, defaults to 8.0):
98
+ The width multiplier for MuP.
99
+ mup_embedding_multiplier (`float`, *optional*, defaults to 10.0):
100
+ The embedding multiplier for MuP.
101
+ mup_attn_multiplier (`float`, *optional*, defaults to 1.0):
102
+ The attention multiplier for MuP.
103
+ use_cache (`bool`, *optional*, defaults to True):
104
+ Whether to use cache for the model.
105
+ bos_token_id (`int`, *optional*, defaults to 100257):
106
+ The token ID for the beginning of sentence.
107
+ eos_token_id (`int`, *optional*, defaults to 100257):
108
+ The token ID for the end of sentence.
109
+ reorder_and_upcast_attn (`bool`, *optional*, defaults to False):
110
+ Whether to reorder and upcast attention.
111
+ pad_sequence_to_multiple_of_64 (`bool`, *optional*, defaults to True):
112
+ Whether to pad the sequence length to a multiple of 64.
113
+ **kwargs:
114
+ Additional keyword arguments.
115
+
116
+ Example:
117
+
118
+ ```python
119
+ >>> from transformers import Phi3SmallConfig, Phi3SmallModel
120
+
121
+ >>> # Initializing a Phi3Small configuration
122
+ >>> configuration = Phi3SmallConfig()
123
+
124
+ >>> # Initializing a model (with random weights) from the configuration
125
+ >>> model = Phi3SmallModel(configuration)
126
+
127
+ >>> # Accessing the model configuration
128
+ >>> configuration = model.config
129
+ ```
130
+ """
131
+
132
+ model_type = "phi3small"
133
+ keys_to_ignore_at_inference = ["past_key_values"]
134
+
135
+
136
+ def __init__(
137
+ self,
138
+ # General information about the model
139
+ vocab_size: int = 100352,
140
+ max_position_embeddings: int = 8192,
141
+ # RoPE Related Parameters
142
+ rope_embedding_base: float = 10**6,
143
+ rope_position_scale: float = 1.0,
144
+ rope_scaling: Optional[Dict[str, Union[float, List[float], int]]] = None,
145
+ # General Model Parameters
146
+ hidden_size: int = 4096,
147
+ num_hidden_layers: int = 32,
148
+ # KV Shared Attention Configurations
149
+ num_attention_heads: int = 32,
150
+ num_key_value_heads: int = 8,
151
+ # GEGELU Related Parameters
152
+ hidden_act: str = "gegelu",
153
+ gegelu_limit: float = 20.0,
154
+ gegelu_pad_to_256: bool = True,
155
+ ff_dim_multiplier: Optional[int] = None,
156
+ ff_intermediate_size: Optional[int] = 14336,
157
+ # Block Sparse Attention Parameters
158
+ blocksparse_homo_head_pattern: bool = False,
159
+ blocksparse_block_size: int = 64,
160
+ blocksparse_num_local_blocks: int = 16,
161
+ blocksparse_vert_stride: int = 8,
162
+ blocksparse_triton_kernel_block_size: int = 64,
163
+ # Frequency of block-sparsity
164
+ dense_attention_every_n_layers: Optional[int] = 2,
165
+ # Regularization parameters
166
+ embedding_dropout_prob: float = 0.1,
167
+ attention_dropout_prob: float = 0.0,
168
+ ffn_dropout_prob: float = 0.1,
169
+ layer_norm_epsilon=1e-5,
170
+ initializer_range=0.02,
171
+ # MuP parameters
172
+ mup_use_scaling: bool = True,
173
+ mup_width_multiplier: float = 8.0,
174
+ mup_embedding_multiplier: float = 10.0,
175
+ mup_attn_multiplier: float = 1.0,
176
+ use_cache=True,
177
+ # The model does not have a bos token id
178
+ # However, in order for some of the downstream libraries to not break
179
+ # we set this to be the same as the eos_token_id
180
+ bos_token_id: int = 100257,
181
+ eos_token_id: int = 100257,
182
+ reorder_and_upcast_attn=False,
183
+ # Configuration to pad sequence length to a multiple of 64
184
+ pad_sequence_to_multiple_of_64: bool = True,
185
+ **kwargs,
186
+ ):
187
+ self.vocab_size = vocab_size
188
+ self.max_position_embeddings = max_position_embeddings
189
+ self.rope_embedding_base = rope_embedding_base
190
+ self.rope_position_scale = rope_position_scale
191
+ self.rope_scaling = rope_scaling
192
+ self.hidden_size = hidden_size
193
+ # KV Shared Attention
194
+ self.num_hidden_layers = num_hidden_layers
195
+ self.num_attention_heads = num_attention_heads
196
+ self.num_key_value_heads = num_key_value_heads
197
+ # Block Sparse Attention Pattern
198
+ self.blocksparse_homo_head_pattern = blocksparse_homo_head_pattern
199
+ self.blocksparse_block_size = blocksparse_block_size
200
+ self.blocksparse_num_local_blocks = blocksparse_num_local_blocks
201
+ self.blocksparse_vert_stride = blocksparse_vert_stride
202
+ self.blocksparse_triton_kernel_block_size = blocksparse_triton_kernel_block_size
203
+ # Frequency of block sparsity
204
+ self.dense_attention_every_n_layers = dense_attention_every_n_layers
205
+ # Activation function
206
+ self.hidden_act = hidden_act
207
+ self.gegelu_limit = gegelu_limit
208
+ self.gegelu_pad_to_256 = gegelu_pad_to_256
209
+ self.ff_dim_multiplier = ff_dim_multiplier
210
+ self.ff_intermediate_size = ff_intermediate_size
211
+ if self.ff_dim_multiplier is None and self.ff_intermediate_size is None:
212
+ raise ValueError("Cannot have both `ff_dim_multiplier` and `ff_intermediate_size` as None")
213
+ if self.ff_dim_multiplier is not None and self.ff_intermediate_size is not None:
214
+ raise ValueError("Cannot specify both `ff_dim_multiplier` and `ff_intermediate_size`.")
215
+ # General regularization
216
+ self.embedding_dropout_prob = embedding_dropout_prob
217
+ self.attention_dropout_prob = attention_dropout_prob
218
+ self.ffn_dropout_prob = ffn_dropout_prob
219
+ self.layer_norm_epsilon = layer_norm_epsilon
220
+ self.initializer_range = initializer_range
221
+ # MuP parameters
222
+ self.mup_use_scaling = mup_use_scaling
223
+ self.mup_width_multiplier = mup_width_multiplier
224
+ self.mup_embedding_multiplier = mup_embedding_multiplier
225
+ self.mup_attn_multiplier = mup_attn_multiplier
226
+ self.use_cache = use_cache
227
+
228
+ self.reorder_and_upcast_attn = reorder_and_upcast_attn
229
+ self.pad_sequence_to_multiple_of_64 = pad_sequence_to_multiple_of_64
230
+
231
+ self.bos_token_id = bos_token_id
232
+ self.eos_token_id = eos_token_id
233
+
234
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
235
+
236
+ @cached_property
237
+ def dummy_token_indices(self) -> List[int]:
238
+ # Importing here to avoid circular imports
239
+ from .tokenization_phi3_small import Phi3SmallTokenizer
240
+ tokenizer = Phi3SmallTokenizer()
241
+ return tokenizer.dummy_token_indices
242
+
243
+ @property
244
+ def intermediate_size(self) -> int:
245
+ if self.ff_intermediate_size is not None:
246
+ return self.ff_intermediate_size
247
+ intermediate_size = self.ff_dim_multiplier * (self.hidden_size // 3) * 2
248
+ if self.gegelu_pad_to_256:
249
+ intermediate_size = next_mult(intermediate_size, 256)
250
+ return intermediate_size
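For reference, a minimal sketch of how the `intermediate_size` property resolves under the two allowed configurations. The multiplier value below is hypothetical and only meant to exercise the fallback branch; it is not taken from any released checkpoint.

```python
# Assumes the file above is importable as a local module.
from configuration_phi3_small import Phi3SmallConfig, next_mult

# Default: ff_intermediate_size is given explicitly and returned as-is.
cfg = Phi3SmallConfig()
assert cfg.intermediate_size == 14336

# Alternative: derive the width from a multiplier (hypothetical value of 8).
cfg2 = Phi3SmallConfig(ff_intermediate_size=None, ff_dim_multiplier=8)
# 8 * (4096 // 3) * 2 = 21840, padded up to the next multiple of 256 (= 22016)
assert cfg2.intermediate_size == next_mult(8 * (4096 // 3) * 2, 256)
```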
modeling_phi3_small.py ADDED
@@ -0,0 +1,1140 @@
1
+ import math
2
+ from typing import Any, Dict, Optional, List, Tuple, Union
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+
8
+ from einops import rearrange
9
+
10
+ from transformers.modeling_outputs import SequenceClassifierOutputWithPast, CausalLMOutputWithPast, BaseModelOutputWithPast
11
+ from transformers.modeling_utils import PreTrainedModel
12
+ from transformers.utils import logging
13
+
14
+ from transformers.cache_utils import Cache, DynamicCache
15
+
16
+ from .triton_flash_blocksparse_attn import BlockSparseParams
17
+ from .triton_blocksparse_attention_layer import BlockSparseAttentionLayer
18
+ from .positional_embedding import RotaryEmbedding
19
+
20
+ from .configuration_phi3_small import Phi3SmallConfig
21
+
22
+ # Flash Attention Related Imports
23
+ is_flash_attention_available = False
24
+ try:
25
+ import flash_attn
26
+ if int(flash_attn.__version__.split('.')[0]) < 2:
27
+ from flash_attn.flash_attn_interface import (
28
+ flash_attn_func,
29
+ flash_attn_unpadded_kvpacked_func as flash_attn_varlen_kvpacked_func,
30
+ )
31
+
32
+ # rename `max_seqlen`
33
+ def flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens, max_seqlen, dropout_p=0.0, **kwargs):
34
+ return flash_attn_func(qkv, cu_seqlens, dropout_p=dropout_p, max_s=max_seqlen, **kwargs)
35
+
36
+ else:
37
+ from flash_attn.flash_attn_interface import (
38
+ flash_attn_varlen_kvpacked_func,
39
+ )
40
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
41
+ is_flash_attention_available = True
42
+ except ImportError:
43
+ pass
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+ LegacyCache = Tuple[Tuple[torch.FloatTensor]]
48
+
49
+ # Taken from https://github.com/allenai/allennlp/blob/main/allennlp/nn/util.py
50
+ def info_value_of_dtype(dtype: torch.dtype):
51
+ """
52
+ Returns the `finfo` or `iinfo` object of a given PyTorch data type. Does not allow torch.bool.
53
+ """
54
+ if dtype == torch.bool:
55
+ raise TypeError("Does not support torch.bool")
56
+ elif dtype.is_floating_point:
57
+ return torch.finfo(dtype)
58
+ else:
59
+ return torch.iinfo(dtype)
60
+
61
+
62
+ # Taken from https://github.com/allenai/allennlp/blob/main/allennlp/nn/util.py
63
+ def min_value_of_dtype(dtype: torch.dtype):
64
+ """
65
+ Returns the minimum value of a given PyTorch data type. Does not allow torch.bool.
66
+ """
67
+ return info_value_of_dtype(dtype).min
68
+
69
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
70
+ def _get_unpad_data(attention_mask):
71
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
72
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
73
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
74
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
75
+ return (
76
+ indices,
77
+ cu_seqlens,
78
+ max_seqlen_in_batch,
79
+ )
80
+
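As a concrete illustration of what `_get_unpad_data` returns (a small sketch, not part of the original file): for a padded batch, the cumulative sequence lengths follow the flash-attention `cu_seqlens` convention.

```python
import torch

# Hypothetical 2-sequence batch: 2 real tokens in the first row, 3 in the second.
mask = torch.tensor([[1, 1, 0],
                     [1, 1, 1]])
indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
# indices    -> tensor([0, 1, 3, 4, 5])   positions of real tokens in the flattened batch
# cu_seqlens -> tensor([0, 2, 5], dtype=torch.int32)
# max_seqlen -> 3
```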
81
+
82
+ @torch.jit.script
83
+ def quick_gelu(x):
84
+ return x * torch.sigmoid(1.702 * x)
85
+
86
+
87
+ @torch.jit.script
88
+ def gegelu(input, limit: Optional[float] = None):
89
+ a_gelu, a_linear = input[..., ::2], input[..., 1::2]
90
+ if limit is not None:
91
+ a_gelu = torch.where(
92
+ torch.isinf(a_gelu), a_gelu, a_gelu.clamp(min=None, max=limit)
93
+ )
94
+ a_linear = torch.where(
95
+ torch.isinf(a_linear), a_linear, a_linear.clamp(min=-limit, max=limit)
96
+ )
97
+ out_gelu = quick_gelu(a_gelu)
98
+ return out_gelu * (a_linear + 1)
99
+
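A small sketch of the `gegelu` gating on a toy tensor (illustrative only): even-indexed channels go through `quick_gelu`, odd-indexed channels act as a shifted linear gate, and both halves are clamped at `limit` for numerical stability, so the output has half the input width.

```python
import torch

x = torch.tensor([[1.0, 2.0, -3.0, 50.0]])  # interleaved as (gelu, linear, gelu, linear)
out = gegelu(x, limit=20.0)
# a_gelu   = [1.0, -3.0]   -> clamped to max 20.0 (no-op here)
# a_linear = [2.0, 50.0]   -> clamped to [-20, 20] -> [2.0, 20.0]
# out = quick_gelu(a_gelu) * (a_linear + 1)   # shape (1, 2)
```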
100
+ def collapse_first_n_dims(x: torch.Tensor, n: int) -> torch.Tensor:
101
+ """
102
+ Collapse the first `n` dimensions of a tensor into a single dimension.
103
+
104
+ Args:
105
+ x (torch.Tensor): The input tensor.
106
+ n (int): The number of dimensions to collapse.
107
+
108
+ Returns:
109
+ torch.Tensor: The output tensor.
110
+ """
111
+ return x.view(-1, *x.shape[n:])
112
+
113
+ def pad_tensor_to_next_mult_of(
114
+ tensor: torch.Tensor,
115
+ dim: int,
116
+ n: int,
117
+ ) -> Tuple[torch.Tensor, int]:
118
+ """
119
+ Pads a tensor along a specified dimension to the next multiple of a given number.
120
+
121
+ Args:
122
+ tensor (torch.Tensor): The input tensor.
123
+ dim (int): The dimension along which to pad the tensor.
124
+ n (int): The number to pad the tensor to the next multiple of.
125
+
126
+ Returns:
127
+ Tuple[torch.Tensor, int]: A tuple containing the padded tensor and the amount of padding added.
128
+ """
129
+ residual = tensor.size(dim) % n
130
+ if residual == 0:
131
+ return tensor, 0
132
+ padding = n - residual
133
+ padding_tensor = torch.zeros((*tensor.size()[:dim], padding, *tensor.size()[dim + 1:]), device=tensor.device, dtype=tensor.dtype)
134
+ return torch.cat([tensor, padding_tensor], dim=dim), padding
135
+
136
+ def strip_padding_from_tensor(
137
+ tensor: torch.Tensor,
138
+ dim: int,
139
+ residual: int,
140
+ ) -> torch.Tensor:
141
+ """
142
+ Removes padding from a tensor along a specified dimension.
143
+
144
+ Args:
145
+ tensor (torch.Tensor): The input tensor.
146
+ dim (int): The dimension along which to remove padding.
147
+ residual (int): The amount of padding to remove.
148
+
149
+ Returns:
150
+ torch.Tensor: The tensor with padding removed along the specified dimension.
151
+ """
152
+ return torch.narrow(tensor, dim, 0, tensor.size(dim) - residual)
153
+
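The two helpers above form a simple round trip, sketched below on a toy tensor (illustrative, assuming the sequence dimension is dim=1):

```python
import torch

x = torch.randn(2, 70, 8)                                   # seq_len = 70
padded, pad = pad_tensor_to_next_mult_of(x, dim=1, n=64)    # padded: (2, 128, 8), pad = 58
restored = strip_padding_from_tensor(padded, dim=1, residual=pad)
assert restored.shape == x.shape and torch.equal(restored, x)
```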
154
+ class Phi3SmallMLP(nn.Module):
155
+ def __init__(self, config: Phi3SmallConfig):
156
+ super().__init__()
157
+ self.config = config
158
+ assert self.config.hidden_act == "gegelu", "Only `gegelu` is supported for the Phi-3-small model .."
159
+ self.hidden_size = config.hidden_size
160
+ self.gegelu_limit = config.gegelu_limit
161
+ self.intermediate_size = config.intermediate_size
162
+
163
+ self.up_proj = nn.Linear(self.hidden_size, 2 * self.intermediate_size)
164
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size)
165
+ self.dropout = nn.Dropout(config.ffn_dropout_prob)
166
+
167
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
168
+ return self.dropout(
169
+ self.down_proj(
170
+ gegelu(self.up_proj(x), limit=self.gegelu_limit)
171
+ )
172
+ )
173
+
174
+
175
+ class Phi3SmallSelfAttention(nn.Module):
176
+ def __init__(self, config: Phi3SmallConfig, layer_idx: Optional[int] = None) -> None:
177
+ super().__init__()
178
+ self.config = config
179
+ self.layer_idx = layer_idx
180
+ if layer_idx is None:
181
+ logger.warning_once(
182
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
183
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
184
+ "when creating this class."
185
+ )
186
+
187
+ self.hidden_size = config.hidden_size
188
+ # Number of Query Heads
189
+ self.num_heads = config.num_attention_heads
190
+ self.head_dim = self.hidden_size // self.num_heads
191
+ # Number of Key Value Heads
192
+ self.num_key_value_heads = config.num_key_value_heads
193
+ self.num_q_per_kv = self.num_heads // self.num_key_value_heads
194
+ self.max_position_embeddings = config.max_position_embeddings
195
+ self.rope_embedding_base = config.rope_embedding_base
196
+ self.rope_position_scale = config.rope_position_scale
197
+ self.is_causal = True
198
+
199
+ self.attention_dropout_rate = config.attention_dropout_prob
200
+
201
+ norm_factor = None
202
+ if config.mup_use_scaling:
203
+ norm_factor = self.head_dim / config.mup_attn_multiplier
204
+ else:
205
+ norm_factor = math.sqrt(self.head_dim)
206
+ self.softmax_scale = 1.0 / norm_factor
207
+
208
+ self.query_key_value = nn.Linear(self.hidden_size, (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim)
209
+ self.dense = nn.Linear(self.hidden_size, self.hidden_size)
210
+
211
+ self.blocksparse_params = None
212
+ # layer_idx is 0 indexed because that's what the KV Cache expects.
213
+ if self.config.dense_attention_every_n_layers and ((self.layer_idx + 1) % self.config.dense_attention_every_n_layers == 0):
214
+ logger.info(
215
+ f"Layer {layer_idx + 1} is using dense attention since it is divisible by "
216
+ f"{self.config.dense_attention_every_n_layers}"
217
+ )
218
+ assert is_flash_attention_available, "Flash Attention is not available, but is needed for dense attention"
219
+ else:
220
+ # BlockSparse related Parameters
221
+ self.blocksparse_params = BlockSparseParams.from_config(config)
222
+
223
+ if self.blocksparse:
224
+ active_head_range = None
225
+ """
226
+ ... note(bapatra)::
227
+
228
+ In case of tensor parallelism and while using the heterogeneous head patterns,
229
+ the active head range needs to be modified based on the tensor parallel rank
230
+ and the tensor parallel world size.
231
+
232
+ This is because in the case of heterogeneous head patterns, the kernel needs to know
233
+ which head is on which device, so that it can pick the corresponding blocksparse head
234
+ pattern correctly.
235
+
236
+ Example:
237
+ ```python
238
+
239
+ if not self.blocksparse_params.homo_head_pattern:
240
+ tp_rank = torch.distributed.get_rank() % tp_world_size
241
+ num_heads_per_partition = num_heads // tp_world_size
242
+ active_head_range = (tp_rank * num_heads_per_partition, (tp_rank + 1) * num_heads_per_partition)
243
+
244
+ ```
245
+
246
+ """
247
+
248
+ self._blocksparse_layer = BlockSparseAttentionLayer(
249
+ n_heads=self.num_heads,
250
+ max_seq_len=self.max_position_embeddings,
251
+ sparse_block_size=self.blocksparse_params.block_size,
252
+ local_blocks=self.blocksparse_params.num_local_blocks,
253
+ vert_stride=self.blocksparse_params.vert_stride,
254
+ kernel_block_size=self.blocksparse_params.kernel_block_size,
255
+ homo_head=self.blocksparse_params.homo_head_pattern,
256
+ active_head_range=active_head_range,
257
+ )
258
+ self.rotary_emb = RotaryEmbedding.from_config(config)
259
+
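As a rough sanity check of the layer-type selection in the constructor above (a sketch using the documented default config values, not a statement about any released checkpoint): with `dense_attention_every_n_layers = 2`, every second layer (1-indexed) uses dense flash attention and the rest use the blocksparse kernel, whose local window spans `num_local_blocks * block_size` tokens.

```python
n = 2  # config.dense_attention_every_n_layers (default)
num_layers = 32
dense_layers = [i for i in range(num_layers) if (i + 1) % n == 0]   # 0-indexed: 1, 3, 5, ...
sparse_layers = [i for i in range(num_layers) if (i + 1) % n != 0]  # 0-indexed: 0, 2, 4, ...

# Blocksparse local attention window (defaults: 16 local blocks of 64 tokens).
local_window_tokens = 16 * 64  # = 1024
```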
260
+
261
+ @property
262
+ def blocksparse(self):
263
+ return self.blocksparse_params is not None
264
+
265
+ def _split_heads(self, mixed_x_layer: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
266
+ bs, sq, _ = mixed_x_layer.size()
267
+ r"""
268
+ The main idea is that we group tensors as
269
+ [bs, sq, (q00, q01, ... q0m, k0, v0), (q10, q11, ... q1m, k1, v1), ... (qn0, qn1, ... qnm, kn, vn)]
270
+ That way, when the MP column sharding happens, this tensor will be sharded keeping all the
271
+ queries and keys intact. In order to get the correct qkv, we first break into groups, and then
272
+ index into the groups.
273
+ """
274
+
275
+ intermediate_shape = (bs, sq, -1, (self.num_q_per_kv + 2), self.head_dim)
276
+ mixed_x_layer = mixed_x_layer.view(*intermediate_shape)
277
+ q = mixed_x_layer[:, :, :, :-2]
278
+ k = mixed_x_layer[:, :, :, [-2]]
279
+ v = mixed_x_layer[:, :, :, [-1]]
280
+ q, k, v = [
281
+ rearrange(
282
+ x,
283
+ "bs sq group nh hn -> bs sq (group nh) hn"
284
+ ) for x in (q, k, v)
285
+ ]
286
+ return q, k, v
287
+
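A compact sketch of the packed layout that `_split_heads` assumes (shapes only, with hypothetical small sizes): each key/value head is stored together with its `num_q_per_kv` query heads, so indexing the second-to-last entry of each group yields keys and the last entry yields values.

```python
import torch

bs, sq, nkv, num_q_per_kv, hn = 1, 5, 2, 4, 8                 # toy sizes
mixed = torch.randn(bs, sq, nkv * (num_q_per_kv + 2) * hn)    # output of query_key_value
grouped = mixed.view(bs, sq, nkv, num_q_per_kv + 2, hn)
q = grouped[:, :, :, :-2]    # (bs, sq, nkv, num_q_per_kv, hn) -> rearranged to (bs, sq, nkv*num_q_per_kv, hn)
k = grouped[:, :, :, [-2]]   # (bs, sq, nkv, 1, hn)            -> rearranged to (bs, sq, nkv, hn)
v = grouped[:, :, :, [-1]]   # (bs, sq, nkv, 1, hn)            -> rearranged to (bs, sq, nkv, hn)
```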
288
+ # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._unpad_input
289
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
290
+ batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
291
+
292
+
293
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
294
+
295
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
296
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
297
+
298
+ if query_length == kv_seq_len:
299
+ query_layer = index_first_axis(
300
+ query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
301
+ )
302
+ cu_seqlens_q = cu_seqlens_k
303
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
304
+ indices_q = indices_k
305
+ elif query_length == 1:
306
+ max_seqlen_in_batch_q = 1
307
+ cu_seqlens_q = torch.arange(
308
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
309
+ ) # There is a memcpy here, that is very bad.
310
+ indices_q = cu_seqlens_q[:-1]
311
+ query_layer = query_layer.squeeze(1)
312
+ else:
313
+ # The -q_len: slice assumes left padding.
314
+ attention_mask = attention_mask[:, -query_length:]
315
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
316
+
317
+ return (
318
+ query_layer,
319
+ key_layer,
320
+ value_layer,
321
+ indices_q,
322
+ (cu_seqlens_q, cu_seqlens_k),
323
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
324
+ )
325
+
326
+ def _apply_blocksparse_attention(
327
+ self,
328
+ q: torch.Tensor,
329
+ k: torch.Tensor,
330
+ v: torch.Tensor,
331
+ attention_mask: Optional[torch.LongTensor],
332
+ return_attention_probs: bool = False,
333
+ ) -> torch.Tensor:
334
+ """
335
+ Applies blocksparse attention to the input tensors.
336
+
337
+ Args:
338
+ q (torch.Tensor): The query tensor of shape (bs, nqp, seq_len, hn).
339
+ k (torch.Tensor): The key tensor of shape (bs, nkp, seq_len, hn).
340
+ v (torch.Tensor): The value tensor of shape (bs, nkp, seq_len, hn).
341
+ attention_mask (Optional[torch.LongTensor]): The attention mask tensor of shape (bs, seq_len).
342
+ return_attention_probs (bool, optional): Whether to return attention probabilities. Defaults to False.
343
+
344
+ Returns:
345
+ torch.Tensor: The context layer tensor of shape (bs, nqp, seq_len, hn).
346
+ """
347
+ assert not return_attention_probs, "return_attention_probs is not supported for blocksparse attention"
348
+ q, k, v = q.contiguous(), k.contiguous(), v.contiguous()
349
+ # shape: (bs, nqp, seq_len, hn)
350
+ if torch.is_grad_enabled():
351
+ # Training or non-batched inference
352
+ context_layer = self._blocksparse_layer(
353
+ q=q, k=k, v=v, sm_scale=self.softmax_scale
354
+ )
355
+ elif attention_mask is None:
356
+ if q.size(0) != 1:
357
+ logger.warning_once(
358
+ "You are attempting to do batched inference without passing the attention mask.\n"
359
+ "This is okay if you are running loglikelihood requests. However, if you want to do generation, "
360
+ "this probably won't work as expected. Please pass the attention mask to the forward function."
361
+ )
362
+ context_layer = self._blocksparse_layer(
363
+ q=q, k=k, v=v, sm_scale=self.softmax_scale
364
+ )
365
+ else:
366
+ """
367
+ Shapes of tensors are as follows:
368
+ q: (bs, nqp, seq_len, hdim)
369
+ k: (bs, nkp, seq_len, hdim)
370
+ v: (bs, nkp, seq_len, hdim)
371
+ We first need to transpose the shapes to fit what the
372
+ kernel needs, and then invert it back at the end of the operations
373
+ """
374
+ assert attention_mask.ndim == 2, "The kernel, like flash-attention-2, only supports 2d attention masks ..."
375
+ left_paddings = attention_mask.shape[1] - attention_mask.sum(dim=-1)
376
+ # shape: (bs, seq_len, nqp, hdim)
377
+ q = q.transpose(1, 2).contiguous()
378
+ # shape: (bs, seq_len, nkp, hdim)
379
+ k = k.transpose(1, 2).contiguous()
380
+ # shape: (bs, seq_len, nkp, hdim)
381
+ v = v.transpose(1, 2).contiguous()
382
+ context_layer = self._blocksparse_layer(
383
+ q=q, k=k, v=v, sm_scale=self.softmax_scale, left_paddings=left_paddings.to(torch.int32)
384
+ )
385
+ # shape: (bs, nqp, seq_len, hdim)
386
+ context_layer = context_layer.transpose(1, 2).contiguous()
387
+ return context_layer
388
+
389
+ def _apply_dense_attention(
390
+ self,
391
+ q: torch.Tensor,
392
+ k: torch.Tensor,
393
+ v: torch.Tensor,
394
+ attention_mask: torch.Tensor,
395
+ return_attention_probs: bool = False,
396
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
397
+ """
398
+ Apply dense attention
399
+
400
+ Args:
401
+ q (torch.Tensor):
402
+ The query tensor, shape: (bs, num_query_heads, seq_len, head_size)
403
+ k (torch.Tensor):
404
+ The key tensor, shape: (bs, num_query_heads, seq_len, head_size)
405
+ v (torch.Tensor):
406
+ The value tensor, shape: (bs, num_query_heads, seq_len, head_size)
407
+
408
+ return_attention_probs (bool, optional):
409
+ Return the attention probabilities. Defaults to False.
410
+
411
+ Returns:
412
+ Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
413
+ Return the output of the attention aggregation. If `return_attention_probs` is True, then
414
+ also return the attention probabilities
415
+
416
+ .. note::
417
+ Right now, I am assuming the expansion for the query key values is already done
418
+ outside. But ideally, since Flash attention handles the GQA correctly, we can
419
+ avoid doing that.
420
+
421
+ """
422
+ attention_dropout_prob = self.attention_dropout_rate if self.training else 0.0
423
+ # Get into the correct shape for the Flash Attention API
424
+ # shape: (bs, seq_len, nqp, hn)
425
+ q = q.transpose(1, 2).contiguous()
426
+ query_length = q.size(1)
427
+ # shape: (bs, seq_len, nqp, hn)
428
+ k = k.transpose(1, 2).contiguous()
429
+ # shape: (bs, seq_len, nqp, hn)
430
+ v = v.transpose(1, 2).contiguous()
431
+
432
+ if attention_mask is not None:
433
+ causal = q.size(2) == k.size(2)
434
+ batch_size = q.shape[0]
435
+ flat_q, flat_k, flat_v, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
436
+ q, k, v, attention_mask, query_length
437
+ )
438
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
439
+ max_seqlen_q, max_seqlen_k = max_seq_lens
440
+ flat_kv = torch.cat((flat_k.unsqueeze(1), flat_v.unsqueeze(1)), dim=1)
441
+ attn_output_unpad = flash_attn_varlen_kvpacked_func(
442
+ q=flat_q,
443
+ kv=flat_kv,
444
+ cu_seqlens_q=cu_seqlens_q,
445
+ cu_seqlens_k=cu_seqlens_k,
446
+ max_seqlen_q=max_seqlen_q,
447
+ max_seqlen_k=max_seqlen_k,
448
+ dropout_p=attention_dropout_prob,
449
+ softmax_scale=self.softmax_scale,
450
+ causal=causal,
451
+ return_attn_probs=return_attention_probs
452
+ )
453
+ attention_output = pad_input(
454
+ attn_output_unpad, indices_q, batch_size, query_length
455
+ )
456
+ else:
457
+ kv = torch.cat((k.unsqueeze(2), v.unsqueeze(2)), dim=2)
458
+ cu_seqlens_q = torch.arange(
459
+ 0, (q.size(0) + 1), device=q.device, dtype=torch.int32
460
+ ) * q.size(1)
461
+ cu_seqlens_kv = torch.arange(
462
+ 0, (kv.size(0) + 1), device=kv.device, dtype=torch.int32
463
+ ) * kv.size(1)
464
+ max_seqlen_q = q.size(1)
465
+ max_seqlen_k = kv.size(1)
466
+ attention_output = flash_attn_varlen_kvpacked_func(
467
+ q=collapse_first_n_dims(q, 2),
468
+ kv=collapse_first_n_dims(kv, 2),
469
+ cu_seqlens_q=cu_seqlens_q,
470
+ cu_seqlens_k=cu_seqlens_kv,
471
+ max_seqlen_q=max_seqlen_q,
472
+ max_seqlen_k=max_seqlen_k,
473
+ dropout_p=attention_dropout_prob,
474
+ softmax_scale=self.softmax_scale,
475
+ causal=q.size(1) == kv.size(1),
476
+ return_attn_probs=return_attention_probs
477
+ )
478
+ if return_attention_probs:
479
+ (context_layer, attn_probs) = attention_output
480
+ context_layer = context_layer.view(q.size(0), q.size(1), -1, q.size(3)).transpose(1, 2).contiguous()
481
+ return (context_layer, attn_probs)
482
+ context_layer = attention_output
483
+ context_layer = context_layer.view(q.size(0), q.size(1), -1, q.size(3)).transpose(1, 2).contiguous()
484
+ return context_layer
485
+
486
+
487
+ def expand_kv_to_q_size(self, kv: torch.Tensor, num_q_per_kv: int) -> torch.Tensor:
488
+ """
489
+ Expand the key-value tensor to match the size of the query tensor.
490
+
491
+ Args:
492
+ kv (torch.Tensor): The key-value tensor of shape (bsz, nkp, 2, seq_len, hdim).
493
+ num_q_per_kv (int): The number of queries per key-value.
494
+
495
+ Returns:
496
+ torch.Tensor: The expanded key-value tensor of shape (bsz, nqp, 2, seq_len, hdim).
497
+ Where nqp = num_q_per_kv * nkp
498
+
499
+ .. note(bapatra)::
500
+ Right now, I am using a repeat_interleave to expand the kv to the size of q.
501
+ This incurs a memory penalty, since the tensors are actually copied.
502
+ TODO: If this does yield benefits, then potentially we can use the re-written
503
+ flash attention kernel that can handle GQA.
504
+ """
505
+
506
+ repeats = torch.tensor([num_q_per_kv] * kv.size(1)).to(kv.device)
507
+ total = repeats.sum()
508
+ expanded_kv = torch.repeat_interleave(
509
+ kv,
510
+ repeats=repeats,
511
+ dim=1,
512
+ output_size=total
513
+ )
514
+ return expanded_kv
515
+
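A toy illustration of the expansion (shapes only; the sizes are hypothetical): with 2 KV heads and 4 queries per KV head, each KV head is repeated 4 times along the head dimension so that the dense and blocksparse kernels see matching query and key/value head counts.

```python
import torch

bs, nkv, seq, hn = 1, 2, 5, 8
num_q_per_kv = 4
kv = torch.randn(bs, nkv, 2, seq, hn)                       # (k, v) stacked on dim=2
expanded = torch.repeat_interleave(kv, repeats=num_q_per_kv, dim=1)
assert expanded.shape == (bs, nkv * num_q_per_kv, 2, seq, hn)   # (1, 8, 2, 5, 8)
```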
516
+ def forward(
517
+ self,
518
+ hidden_states: torch.Tensor,
519
+ attention_mask: Optional[torch.Tensor] = None,
520
+ position_ids: Optional[torch.LongTensor] = None,
521
+ past_key_values: Optional[Cache] = None,
522
+ output_attentions: bool = False,
523
+ use_cache: bool = False,
524
+ **kwargs,
525
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
526
+ """
527
+ The forward function of the Self Attention Layer.
528
+
529
+ Args:
530
+ hidden_states (torch.Tensor):
531
+ The input tensor of shape (bs, q_len, h).
532
+ attention_mask (Optional[torch.Tensor], optional):
533
+ The attention mask tensor of shape (bs, seq_len). This is the 2D attention mask tensor as is standard in the flash-attention
534
+ kernel.
535
+ Defaults to None.
536
+ position_ids (Optional[torch.LongTensor], optional):
537
+ The position ids tensor of shape (bs, q_len). Defaults to None. Unused by the function.
538
+ past_key_value (Optional[Cache], optional):
539
+ The previous kv cache values. Defaults to None.
540
+ output_attentions (bool, optional):
541
+ Whether to return the attention scores. Defaults to False.
542
+ .. note::
543
+ For the blocksparse attention kernel, we do not support returning the attention scores.
544
+ use_cache (bool, optional):
545
+ Whether to use the cache for storing the kv. Defaults to False.
546
+
547
+ Returns:
548
+ Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
549
+ The output tensor of shape (bs, q_len, h),
550
+ the attention scores tensor of shape (bs, nqp, q_len, seq_len) if `output_attentions` is True,
551
+ and the updated cache values if `use_cache` is True.
552
+
553
+ Notations:
554
+ ------------
555
+ bs: batch size
556
+ sq_len: sequence length of the entire sequence
557
+ q_len: sequence length of the query
558
+ cache_sq: sequence length in the cache
559
+ If there is no cache then cache_sq = 0
560
+ and sq_len = q_len
561
+ otherwise sq_len = q_len + cache_sq
562
+ h: hidden size
563
+ nq: number of query heads
564
+ nkv: number of key heads
565
+ hn: hidden size per head
566
+ hn = h // nq
567
+ nqp: number of query heads (per MP partition)
568
+ nqp = nq // (num mp partitions)
569
+ nkvp: number of key-value heads (per MP partition)
570
+ nkvp = nkv // (num mp partitions)
571
+
572
+ """
573
+ # shape: (bs, q_len, h)
574
+ bsz, q_len, _ = hidden_states.size()
575
+
576
+ # shape: (bs, q_len, (nqp + 2 * nkvp) * hn)
577
+ mixed_x_layer = self.query_key_value(hidden_states)
578
+ # shape: (bs, q_len, nqp, hn), shape: (bs, q_len, nkvp, hn), shape: (bs, q_len, nkvp, hn)
579
+ q, k, v = self._split_heads(mixed_x_layer)
580
+
581
+ # shape: (bs, nqp, q_len, hn)
582
+ query_states = q.permute(0, 2, 1, 3).contiguous()
583
+ # shape: (bs, nkvp, q_len, hn)
584
+ key_states = k.permute(0, 2, 1, 3).contiguous()
585
+ # shape: (bs, nkvp, q_len, hn)
586
+ value_states = v.permute(0, 2, 1, 3).contiguous()
587
+
588
+ kv_seq_len = key_states.shape[-2]
589
+ if past_key_values is not None:
590
+ if self.layer_idx is None:
591
+ raise ValueError(
592
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
593
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
594
+ "with a layer index."
595
+ )
596
+ if self.rotary_emb is not None:
597
+ seqlen_offset = past_key_values.get_usable_length(kv_seq_len, layer_idx=self.layer_idx)
598
+ # shape: (bs, nqp, q_len, hn), shape: (bs, nkvp, q_len, hn)
599
+ query_states, key_states = self.rotary_emb(
600
+ query_states, key_states, seq_dimension=2, seqlen_offset=seqlen_offset
601
+ )
602
+ key_states, value_states = past_key_values.update(key_states=key_states, value_states=value_states, layer_idx=self.layer_idx)
603
+ else:
604
+ # In this case seq_len = q_len and cache_sq = 0
605
+ if self.rotary_emb is not None:
606
+ # shape: (bs, nqp, seq_len, hn), shape: (bs, nkvp, seq_len, hn)
607
+ query_states, key_states = self.rotary_emb(query_states, key_states, seq_dimension=2)
608
+
609
+ # shape: (bs, nkvp, 2, seq_len, hn)
610
+ kv_states = torch.cat((key_states.unsqueeze(2), value_states.unsqueeze(2)), dim=2)
611
+ # shape: (bs, nqp, 2, seq_len, hn)
612
+ expanded_kv_states = self.expand_kv_to_q_size(kv_states, num_q_per_kv=self.num_q_per_kv)
613
+ # shape: (bs, nqp, seq_len, hn), shape: (bs, nqp, seq_len, hn)
614
+ expanded_key_states, expanded_value_states = expanded_kv_states[:, :, 0], expanded_kv_states[:, :, 1]
615
+ if self.blocksparse:
616
+ attn_function_output = self._apply_blocksparse_attention(
617
+ q=query_states,
618
+ k=expanded_key_states,
619
+ v=expanded_value_states,
620
+ attention_mask=attention_mask,
621
+ return_attention_probs=output_attentions
622
+ )
623
+ else:
624
+ attn_function_output = self._apply_dense_attention(
625
+ q=query_states,
626
+ k=expanded_key_states,
627
+ v=expanded_value_states,
628
+ attention_mask=attention_mask,
629
+ return_attention_probs=output_attentions
630
+ )
631
+
632
+ attn_weights = None
633
+ if output_attentions:
634
+ attn_output, attn_weights = attn_function_output
635
+ else:
636
+ # shape: (bs, nqp, seq_len, hn)
637
+ attn_output = attn_function_output
638
+ # shape: (bs, seq_len, nqp, hn)
639
+ attn_output = attn_output.transpose(1, 2).contiguous()
640
+
641
+ # shape: (bs, seq_len, h)
642
+ attn_output = attn_output.view(bsz, q_len, -1)
643
+ attn_output = self.dense(attn_output)
644
+ return attn_output, attn_weights, past_key_values
645
+
646
+
647
+ class Phi3SmallDecoderLayer(nn.Module):
648
+ def __init__(self, config: Phi3SmallConfig, layer_idx: int):
649
+ super().__init__()
650
+ self.hidden_size = config.hidden_size
651
+ self.self_attn = Phi3SmallSelfAttention(config, layer_idx)
652
+ self.mlp = Phi3SmallMLP(config)
653
+
654
+ self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
655
+ self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
656
+
657
+ def forward(
658
+ self,
659
+ hidden_states: torch.Tensor,
660
+ attention_mask: Optional[torch.Tensor] = None,
661
+ position_ids: Optional[torch.LongTensor] = None,
662
+ past_key_values: Optional[Cache] = None,
663
+ output_attentions: Optional[bool] = None,
664
+ use_cache: Optional[bool] = None,
665
+ **kwargs,
666
+ ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Cache]]:
667
+ residual = hidden_states
668
+ hidden_states = self.input_layernorm(hidden_states)
669
+
670
+ # Self Attention
671
+ hidden_states, self_attn_weights, present_key_values = self.self_attn(
672
+ hidden_states=hidden_states,
673
+ attention_mask=attention_mask,
674
+ position_ids=position_ids,
675
+ past_key_values=past_key_values,
676
+ output_attentions=output_attentions,
677
+ use_cache=use_cache,
678
+ )
679
+ hidden_states = residual + hidden_states
680
+
681
+ # Fully Connected
682
+ residual = hidden_states
683
+ hidden_states = self.post_attention_layernorm(hidden_states)
684
+ hidden_states = self.mlp(hidden_states)
685
+ hidden_states = residual + hidden_states
686
+
687
+ outputs = (hidden_states,)
688
+
689
+ if output_attentions:
690
+ outputs += (self_attn_weights,)
691
+
692
+ if use_cache:
693
+ outputs += (present_key_values,)
694
+
695
+ return outputs
696
+
697
+
698
+
699
+ class Phi3SmallPreTrainedModel(PreTrainedModel):
700
+ config_class = Phi3SmallConfig
701
+ base_model_prefix = "model"
702
+ supports_gradient_checkpointing = True
703
+ _no_split_modules = ["Phi3SmallDecoderLayer"]
704
+ skip_keys_device_placement = "past_key_values"
705
+ _supports_flash_attn_2 = True
706
+ _supports_sdpa = False
707
+ _supports_cache_class = True
708
+
709
+ def _init_weights(self, module: nn.Module):
710
+ std = self.config.initializer_range
711
+ if isinstance(module, nn.Linear):
712
+ # Slightly different from the TF version which uses truncated_normal for initialization
713
+ # cf https://github.com/pytorch/pytorch/pull/5617
714
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
715
+ elif isinstance(module, nn.Embedding):
716
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
717
+ if module.padding_idx is not None:
718
+ module.weight.data[module.padding_idx].zero_()
719
+ elif isinstance(module, nn.LayerNorm):
720
+ module.bias.data.zero_()
721
+ module.weight.data.fill_(1.0)
722
+
723
+ # The output projection on the decoder attention layer as well as the down_proj in the MLP are scaled
724
+ # differently (dubbed `output_layer_init_method` in the Megatron code). This is replicated here
725
+ for name, p in module.named_parameters():
726
+ if any(x in name for x in ("c_proj.weight", "down_proj.weight", "o_proj.weight")):
727
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
728
+ p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers)))
729
+
730
+
731
+ class Phi3SmallModel(Phi3SmallPreTrainedModel):
732
+
733
+ def __init__(self, config):
734
+ super().__init__(config)
735
+ self.config = config
736
+
737
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
738
+
739
+ # Embedding Dropout
740
+ self.embedding_dropout = nn.Dropout(config.embedding_dropout_prob)
741
+
742
+ # MuP Embedding scaling
743
+ self.mup_embedding_multiplier = config.mup_embedding_multiplier
744
+
745
+ self.layers = nn.ModuleList([Phi3SmallDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
746
+
747
+ self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
748
+
749
+ self.gradient_checkpointing = False
750
+
751
+ # Initialize weights and apply final processing
752
+ self.post_init()
753
+
754
+ def get_input_embeddings(self):
755
+ return self.embed_tokens
756
+
757
+ def set_input_embeddings(self, value):
758
+ self.embed_tokens = value
759
+
760
+ @property
761
+ def pad_sequence_to_multiple_of_64(self):
762
+ # We only need to do this for the backward pass. So only required
763
+ # when we are in the context of generating gradients
764
+ return self.config.pad_sequence_to_multiple_of_64 and torch.is_grad_enabled()
765
+
766
+ def forward(
767
+ self,
768
+ input_ids: torch.LongTensor = None,
769
+ attention_mask: Optional[torch.Tensor] = None,
770
+ position_ids: Optional[torch.LongTensor] = None,
771
+ past_key_values: Optional[Union[Cache, LegacyCache]] = None,
772
+ inputs_embeds: Optional[torch.FloatTensor] = None,
773
+ use_cache: Optional[bool] = None,
774
+ output_attentions: Optional[bool] = None,
775
+ output_hidden_states: Optional[bool] = None,
776
+ return_dict: Optional[bool] = None,
777
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
778
+
779
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
780
+ output_hidden_states = (
781
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
782
+ )
783
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
784
+
785
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
786
+
787
+ if input_ids is not None and inputs_embeds is not None:
788
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
789
+ elif input_ids is not None:
790
+ batch_size, seq_length = input_ids.shape
791
+ elif inputs_embeds is not None:
792
+ batch_size, seq_length, _ = inputs_embeds.shape
793
+ else:
794
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
795
+
796
+ if self.gradient_checkpointing and self.training:
797
+ if use_cache:
798
+ logger.warning_once(
799
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
800
+ )
801
+ use_cache = False
802
+
803
+ past_key_values_length = 0
804
+
805
+ if use_cache:
806
+ use_legacy_cache = not isinstance(past_key_values, Cache)
807
+ if use_legacy_cache:
808
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
809
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
810
+
811
+ if position_ids is None:
812
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
813
+ position_ids = torch.arange(
814
+ past_key_values_length, past_key_values_length + seq_length, dtype=torch.long, device=device
815
+ )
816
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
817
+ else:
818
+ position_ids = position_ids.view(-1, seq_length).long()
819
+
820
+ if attention_mask is not None:
821
+ if batch_size <= 0:
822
+ raise ValueError("batch_size has to be defined and > 0")
823
+
824
+ if inputs_embeds is None:
825
+ inputs_embeds = self.embed_tokens(input_ids)
826
+ inputs_embeds = self.embedding_dropout(inputs_embeds)
827
+
828
+ if self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0:
829
+ inputs_embeds = inputs_embeds * self.mup_embedding_multiplier
830
+
831
+ residual = 0
832
+ if self.pad_sequence_to_multiple_of_64:
833
+ # note(bapatra): Since we don't particularly use the position_ids and the attention mask
834
+ # we don't need to pad them
835
+ inputs_embeds, residual = pad_tensor_to_next_mult_of(tensor=inputs_embeds, dim=1, n=64)
836
+
837
+ hidden_states = inputs_embeds
838
+
839
+ # decoder layers
840
+ all_hidden_states = () if output_hidden_states else None
841
+ all_self_attns = () if output_attentions else None
842
+ next_decoder_cache = None
843
+
844
+ for decoder_layer in self.layers:
845
+ if output_hidden_states:
846
+ all_hidden_states += (hidden_states,)
847
+
848
+ if self.gradient_checkpointing and self.training:
849
+ layer_outputs = self._gradient_checkpointing_func(
850
+ decoder_layer.__call__,
851
+ hidden_states,
852
+ attention_mask,
853
+ position_ids,
854
+ past_key_values,
855
+ output_attentions,
856
+ use_cache,
857
+ )
858
+ else:
859
+ layer_outputs = decoder_layer(
860
+ hidden_states,
861
+ attention_mask=attention_mask,
862
+ position_ids=position_ids,
863
+ past_key_values=past_key_values,
864
+ output_attentions=output_attentions,
865
+ use_cache=use_cache,
866
+ )
867
+ hidden_states = layer_outputs[0]
868
+
869
+ if use_cache:
870
+ # Following the Mistral schema for layer return values
871
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
872
+ if output_attentions:
873
+ all_self_attns += (layer_outputs[1],)
874
+
875
+ hidden_states = self.final_layernorm(hidden_states)
876
+
877
+ if residual > 0:
878
+ hidden_states = strip_padding_from_tensor(tensor=hidden_states, dim=1, residual=residual)
879
+
880
+ # add hidden states from the last decoder layer
881
+ if output_hidden_states:
882
+ all_hidden_states += (hidden_states,)
883
+
884
+ next_cache = None
885
+ if use_cache:
886
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
887
+
888
+ if not return_dict:
889
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
890
+ return BaseModelOutputWithPast(
891
+ last_hidden_state=hidden_states,
892
+ past_key_values=next_cache,
893
+ hidden_states=all_hidden_states,
894
+ attentions=all_self_attns,
895
+ )
896
+
897
+
898
+ class Phi3SmallForCausalLM(Phi3SmallPreTrainedModel):
899
+ _tied_weights_keys = ["lm_head.weight"]
900
+
901
+ def __init__(self, config):
902
+ super().__init__(config)
903
+ self.model = Phi3SmallModel(config)
904
+ self.vocab_size = config.vocab_size
905
+ self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=False)
906
+ self.mup_width_multiplier = config.mup_width_multiplier
907
+
908
+ # Create the mask for the dummy tokens in the vocabulary
909
+ dummy_token_indices = config.dummy_token_indices
910
+ dummy_tokens_mask = torch.zeros(self.vocab_size).bool()
911
+ dummy_tokens_mask[dummy_token_indices] = True
912
+ # shape: (vocab_size,)
913
+ self.register_buffer("dummy_tokens_mask", dummy_tokens_mask, persistent=False)
914
+
915
+ # Initialize weights and apply final processing
916
+ self.post_init()
917
+
918
+ def get_input_embeddings(self):
919
+ return self.model.embed_tokens
920
+
921
+ def set_input_embeddings(self, value):
922
+ self.model.embed_tokens = value
923
+
924
+ def get_output_embeddings(self):
925
+ return self.lm_head
926
+
927
+ def set_output_embeddings(self, value):
928
+ self.lm_head = value
929
+
930
+ def set_decoder(self, decoder):
931
+ self.model = decoder
932
+
933
+ def get_decoder(self):
934
+ return self.model
935
+
936
+ def forward(
937
+ self,
938
+ input_ids: torch.LongTensor = None,
939
+ attention_mask: Optional[torch.Tensor] = None,
940
+ position_ids: Optional[torch.LongTensor] = None,
941
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
942
+ inputs_embeds: Optional[torch.FloatTensor] = None,
943
+ labels: Optional[torch.LongTensor] = None,
944
+ use_cache: Optional[bool] = None,
945
+ output_attentions: Optional[bool] = None,
946
+ output_hidden_states: Optional[bool] = None,
947
+ return_dict: Optional[bool] = None,
948
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
949
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
950
+ output_hidden_states = (
951
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
952
+ )
953
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
954
+
955
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
956
+ outputs = self.model(
957
+ input_ids=input_ids,
958
+ attention_mask=attention_mask,
959
+ position_ids=position_ids,
960
+ past_key_values=past_key_values,
961
+ inputs_embeds=inputs_embeds,
962
+ use_cache=use_cache,
963
+ output_attentions=output_attentions,
964
+ output_hidden_states=output_hidden_states,
965
+ return_dict=return_dict,
966
+ )
967
+
968
+ hidden_states = outputs[0]
969
+ logits = self.lm_head(hidden_states)
970
+ logits = logits.float()
971
+ if self.mup_width_multiplier:
972
+ logits = logits / self.mup_width_multiplier
973
+ logits = logits.masked_fill(self.dummy_tokens_mask, min_value_of_dtype(logits.dtype))
974
+
975
+ loss = None
976
+ if labels is not None:
977
+ # Shift so that tokens < n predict n
978
+ shift_logits = logits[..., :-1, :].contiguous()
979
+ shift_labels = labels[..., 1:].contiguous()
980
+ # Flatten the tokens
981
+ loss_fct = nn.CrossEntropyLoss()
982
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
983
+ shift_labels = shift_labels.view(-1)
984
+ # Enable model parallelism
985
+ shift_labels = shift_labels.to(shift_logits.device)
986
+ loss = loss_fct(shift_logits, shift_labels)
987
+
988
+ if not return_dict:
989
+ output = (logits,) + outputs[1:]
990
+ return (loss,) + output if loss is not None else output
991
+
992
+ return CausalLMOutputWithPast(
993
+ loss=loss,
994
+ logits=logits,
995
+ past_key_values=outputs.past_key_values,
996
+ hidden_states=outputs.hidden_states,
997
+ attentions=outputs.attentions,
998
+ )
999
+
1000
+ def prepare_inputs_for_generation(
1001
+ self,
1002
+ input_ids: torch.LongTensor,
1003
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1004
+ attention_mask: Optional[torch.FloatTensor] = None,
1005
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1006
+ **kwargs
1007
+ ) -> Dict[str, Any]:
1008
+ # only use the last token of input_ids if past_key_values is defined
1009
+ if past_key_values:
1010
+ input_ids = input_ids[:, -1].unsqueeze(-1)
1011
+
1012
+ position_ids = kwargs.get("position_ids", None)
1013
+
1014
+ if attention_mask is not None and position_ids is None:
1015
+ # create position_ids on the fly for batch generation
1016
+ position_ids = attention_mask.long().cumsum(-1) - 1
1017
+ position_ids.masked_fill_(attention_mask == 0, 1)
1018
+ if past_key_values:
1019
+ position_ids = position_ids[:, -1].unsqueeze(-1)
1020
+ else:
1021
+ position_ids = None
1022
+
1023
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1024
+ if inputs_embeds is not None and past_key_values is None:
1025
+ model_inputs = {"inputs_embeds": inputs_embeds}
1026
+ else:
1027
+ model_inputs = {"input_ids": input_ids}
1028
+
1029
+ model_inputs.update(
1030
+ {
1031
+ "past_key_values": past_key_values,
1032
+ "use_cache": kwargs.get("use_cache"),
1033
+ "position_ids": position_ids,
1034
+ "attention_mask": attention_mask,
1035
+ }
1036
+ )
1037
+ return model_inputs
1038
+
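The on-the-fly `position_ids` construction in `prepare_inputs_for_generation` above can be checked with a small left-padded example (a sketch, not part of the original file): padded positions are cumulatively counted and then overwritten with a dummy value of 1, which is harmless because those positions are masked out of attention.

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],   # left-padded sequence of length 3
                               [1, 1, 1, 1, 1]])  # full-length sequence
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
# position_ids -> [[1, 1, 0, 1, 2],
#                  [0, 1, 2, 3, 4]]
```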
1039
+
1040
+ # Copied from transformers.models.mistral.modeling_mistral.MistralForSequenceClassification with Mistral -> Phi3Small
1041
+ class Phi3SmallForSequenceClassification(Phi3SmallPreTrainedModel):
1042
+ def __init__(self, config):
1043
+ super().__init__(config)
1044
+ self.num_labels = config.num_labels
1045
+ self.model = Phi3SmallModel(config)
1046
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1047
+
1048
+ # Initialize weights and apply final processing
1049
+ self.post_init()
1050
+
1051
+ def get_input_embeddings(self):
1052
+ return self.model.embed_tokens
1053
+
1054
+ def set_input_embeddings(self, value):
1055
+ self.model.embed_tokens = value
1056
+
1057
+
1058
+ def forward(
1059
+ self,
1060
+ input_ids: torch.LongTensor = None,
1061
+ attention_mask: Optional[torch.Tensor] = None,
1062
+ position_ids: Optional[torch.LongTensor] = None,
1063
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1064
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1065
+ labels: Optional[torch.LongTensor] = None,
1066
+ use_cache: Optional[bool] = None,
1067
+ output_attentions: Optional[bool] = None,
1068
+ output_hidden_states: Optional[bool] = None,
1069
+ return_dict: Optional[bool] = None,
1070
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1071
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1072
+
1073
+ transformer_outputs = self.model(
1074
+ input_ids,
1075
+ attention_mask=attention_mask,
1076
+ position_ids=position_ids,
1077
+ past_key_values=past_key_values,
1078
+ inputs_embeds=inputs_embeds,
1079
+ use_cache=use_cache,
1080
+ output_attentions=output_attentions,
1081
+ output_hidden_states=output_hidden_states,
1082
+ return_dict=return_dict,
1083
+ )
1084
+ hidden_states = transformer_outputs[0]
1085
+ logits = self.score(hidden_states)
1086
+
1087
+ if input_ids is not None:
1088
+ batch_size = input_ids.shape[0]
1089
+ else:
1090
+ batch_size = inputs_embeds.shape[0]
1091
+
1092
+ if self.config.pad_token_id is None and batch_size != 1:
1093
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1094
+ if self.config.pad_token_id is None:
1095
+ sequence_lengths = -1
1096
+ else:
1097
+ if input_ids is not None:
1098
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1099
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1100
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1101
+ sequence_lengths = sequence_lengths.to(logits.device)
1102
+ else:
1103
+ sequence_lengths = -1
1104
+
1105
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1106
+
1107
+ loss = None
1108
+ if labels is not None:
1109
+ labels = labels.to(logits.device)
1110
+ if self.config.problem_type is None:
1111
+ if self.num_labels == 1:
1112
+ self.config.problem_type = "regression"
1113
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1114
+ self.config.problem_type = "single_label_classification"
1115
+ else:
1116
+ self.config.problem_type = "multi_label_classification"
1117
+
1118
+ if self.config.problem_type == "regression":
1119
+ loss_fct = nn.MSELoss()
1120
+ if self.num_labels == 1:
1121
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1122
+ else:
1123
+ loss = loss_fct(pooled_logits, labels)
1124
+ elif self.config.problem_type == "single_label_classification":
1125
+ loss_fct = nn.CrossEntropyLoss()
1126
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1127
+ elif self.config.problem_type == "multi_label_classification":
1128
+ loss_fct = nn.BCEWithLogitsLoss()
1129
+ loss = loss_fct(pooled_logits, labels)
1130
+ if not return_dict:
1131
+ output = (pooled_logits,) + transformer_outputs[1:]
1132
+ return ((loss,) + output) if loss is not None else output
1133
+
1134
+ return SequenceClassifierOutputWithPast(
1135
+ loss=loss,
1136
+ logits=pooled_logits,
1137
+ past_key_values=transformer_outputs.past_key_values,
1138
+ hidden_states=transformer_outputs.hidden_states,
1139
+ attentions=transformer_outputs.attentions,
1140
+ )
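
The causal-LM loss above shifts logits and labels so that position i predicts token i+1, and the sequence-classification head pools the logit at the last non-padding position. Below is a minimal, self-contained sketch of those two steps; the tensor sizes and pad_token_id are illustrative assumptions, not values taken from the model.

import torch
import torch.nn as nn

# Toy sizes only; the real model uses much larger dimensions.
batch_size, seq_len, vocab_size, pad_token_id = 2, 5, 11, 0

logits = torch.randn(batch_size, seq_len, vocab_size)
labels = torch.randint(1, vocab_size, (batch_size, seq_len))

# Causal-LM loss: drop the last logit and the first label so that tokens < n predict n.
shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)
lm_loss = nn.CrossEntropyLoss()(shift_logits, shift_labels)

# Sequence-classification pooling: index of the first pad token minus one,
# with a modulo so rows without padding fall back to the final position.
input_ids = torch.tensor([[5, 6, 7, 0, 0], [3, 4, 5, 6, 7]])
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
pooled = logits[torch.arange(batch_size), sequence_lengths]  # (batch_size, vocab_size)
print(lm_loss.item(), pooled.shape)
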
positional_embedding.py ADDED
@@ -0,0 +1,288 @@
1
+ """
2
+ Originally taken verbatim from the xformers library
3
+ https://github.com/facebookresearch/xformers/blob/bcb707576c6a80eaf850aa80e8643d3497ec2bc4/xformers/components/positional_embedding/rotary.py
4
+
5
+ The difference is that xformers seems to assume the inputs to be
6
+ (bs, head, seq_len, dim) while we assume (bs, seq_len, head, dim)
7
+
8
+ """
9
+ # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
10
+ #
11
+ # This source code is licensed under the BSD license found in the
12
+ # LICENSE file in the root directory of this source tree.
13
+
14
+
15
+ # CREDITS: This implementation is inspired by GPT-NeoX https://github.com/EleutherAI/gpt-neox
16
+ # NOTE: Almost the same right now, moving parts to Triton is the next step
17
+
18
+ import math
19
+ from typing import List, Optional, Tuple, Dict, Union
20
+
21
+ import torch
22
+ import dataclasses
23
+ from transformers.utils import logging
24
+
25
+ from transformers import PretrainedConfig
26
+
27
+ is_dacite_available = False
28
+ try:
29
+ import dacite
30
+ is_dacite_available = True
31
+ except ImportError:
32
+ pass
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+ @dataclasses.dataclass
37
+ class LongRopeConfig(object):
38
+ short_factor: List[float]
39
+ long_factor: List[float]
40
+ original_max_position_embeddings: int
41
+ type: str = "longrope"
42
+ short_mscale: float = -1
43
+ long_mscale: float = -1
44
+
45
+
46
+ def __post_init__(self):
47
+ assert self.type in ("longrope", "su"), f"Invalid type {self.type} for LongRopeConfig. Expected longrope / su"
48
+
49
+
50
+ @classmethod
51
+ def from_dict(cls, config_dict: Dict[str, Union[float, List[float], int]]) -> "LongRopeConfig":
52
+ if is_dacite_available:
53
+ # Preferred since we can also type check the input
54
+ return dacite.from_dict(data_class=cls, data=config_dict)
55
+ kwargs = {}
56
+ for field in dataclasses.fields(cls):
57
+ if field.name in config_dict:
58
+ if field.init:
59
+ kwargs[field.name] = config_dict[field.name]
60
+ else:
61
+ raise ValueError(f"Field {field.name} is not initiable")
62
+ else:
63
+ if field.default is dataclasses.MISSING:
64
+ raise ValueError(f"Field {field.name} is required")
65
+ extra_keys = set(config_dict.keys()) - set(kwargs.keys())
66
+ if len(extra_keys) > 0:
67
+ for key in extra_keys:
68
+ logger.error(f"Unrecognized key {key} in config_dict")
69
+ raise ValueError(f"Unrecognized keys in config_dict")
70
+ return cls(**kwargs)
71
+
72
+ def rotate_half(x):
73
+ x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
74
+ return torch.cat((-x2, x1), dim=x1.ndim - 1)
75
+
76
+
77
+
78
+ @torch.jit.script
79
+ def apply_rotary_pos_emb(x, cos, sin, seq_dimension: int):
80
+ # NOTE: This could probably be moved to Triton
81
+
82
+ if seq_dimension == 0:
83
+ cos = cos[: x.shape[0], None, None, :]
84
+ sin = sin[: x.shape[0], None, None, :]
85
+ elif seq_dimension == 1:
86
+ # Handle a possible sequence length mismatch in between q and k
87
+ cos = cos[None, : x.shape[1], None, :]
88
+ sin = sin[None, : x.shape[1], None, :]
89
+ elif seq_dimension == 2:
90
+ cos = cos[None, None, : x.shape[2], :]
91
+ sin = sin[None, None, : x.shape[2], :]
92
+
93
+ return (x * cos) + (rotate_half(x) * sin)
94
+
95
+
96
+
97
+ class RotaryEmbedding(torch.nn.Module):
98
+ """
99
+ Adapted from the xformers library
100
+
101
+ The rotary position embeddings from RoFormer_ (Su et. al).
102
+ A crucial insight from the method is that the query and keys are
103
+ transformed by rotation matrices which depend on the relative positions.
104
+ Other implementations are available in the Rotary Transformer repo_ and in
105
+ GPT-NeoX_, GPT-NeoX was an inspiration
106
+ .. _RoFormer: https://arxiv.org/abs/2104.09864
107
+ .. _repo: https://github.com/ZhuiyiTechnology/roformer
108
+ .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
109
+ .. warning: Please note that this embedding is not registered on purpose, as it is transformative
110
+ (it does not create the embedding dimension) and will likely be picked up (imported) on an ad-hoc basis
111
+
112
+ # Arguments
113
+ :param dim_model: head dimension
114
+ :param max_seq_len:
115
+ :param default_seq_dimension: which dim is the sequence length
116
+ :param dtype: cos/sin dtype
117
+ :param use_fused_kernel: whether to use the customized fused kernel.
118
+ Note: if used, q and k will be modified in place. This is fine for both forward & backward.
119
+ """
120
+
121
+ def __init__(
122
+ self,
123
+ dim_model: int,
124
+ *,
125
+ max_seq_len: Optional[int] = None,
126
+ dtype: Optional[torch.dtype] = None,
127
+ base=10000,
128
+ position_scale=1,
129
+ device: Optional[torch.device] = None,
130
+ longrope_config: Optional[LongRopeConfig] = None,
131
+ ):
132
+ super().__init__()
133
+ self.base = base
134
+ self.dim_model = dim_model
135
+ self.max_seq_len = max_seq_len
136
+ self.longrope_config = longrope_config
137
+
138
+ if self.is_longrope:
139
+ # Keep the maximum range vector, and slice from it as needed
140
+ self.register_buffer(
141
+ "range_vector",
142
+ torch.arange(max_seq_len, device=device, dtype=torch.float32),
143
+ persistent=False
144
+ )
145
+ self.register_buffer(
146
+ "short_factors",
147
+ torch.tensor(self.longrope_config.short_factor, dtype=torch.float32),
148
+ persistent=False
149
+ )
150
+ self.register_buffer(
151
+ "long_factors",
152
+ torch.tensor(self.longrope_config.long_factor, dtype=torch.float32),
153
+ persistent=False
154
+ )
155
+ else:
156
+ # Generate and save the inverse frequency buffer (non trainable)
157
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim_model, 2).float().to(device) / self.dim_model))
158
+ self.register_buffer("inv_freq", inv_freq)
159
+
160
+ self.position_scale = position_scale
161
+
162
+ if not self.is_longrope:
163
+ dtype = dtype or torch.get_default_dtype()
164
+ self._set_cos_sin_cache(
165
+ seq_len=max_seq_len,
166
+ device=self.inv_freq.device,
167
+ dtype=dtype,
168
+ )
169
+ @property
170
+ def is_longrope(self):
171
+ return self.longrope_config is not None
172
+
173
+ @property
174
+ def original_max_seq_len(self):
175
+ if self.longrope_config is not None:
176
+ return self.longrope_config.original_max_position_embeddings
177
+ logger.warning_once(
178
+ (
179
+ "``original_max_seq_len'' is being accessed, but longrope_config has not been set. "
180
+ "Please only do this if you are sure about the context."
181
+ )
182
+ )
183
+ return self.max_seq_len
184
+
185
+ def get_range_vector(self, seq_len: int, device: torch.device):
186
+ if self.is_longrope:
187
+ assert seq_len < self.range_vector.shape[0], f"Found seq_len {seq_len} greater than max_seq_len {self.range_vector.shape[0]}"
188
+ if self.range_vector.device != device:
189
+ self.range_vector = self.range_vector.to(device)
190
+ return self.range_vector[:seq_len]
191
+ return torch.arange(seq_len, device=device, dtype=torch.float32)
192
+
193
+
194
+ def _calc_mscale(self, scale: torch.Tensor) -> torch.Tensor:
195
+ if scale <= 1.0:
196
+ return 1.0
197
+ return math.sqrt(1 + math.log(scale) / math.log(self.original_max_seq_len))
198
+
199
+ def _set_cos_sin_cache(
200
+ self,
201
+ seq_len: int,
202
+ device: Optional[torch.device] = None,
203
+ dtype: Optional[torch.dtype] = None,
204
+ ) -> None:
205
+ dtype = dtype or torch.get_default_dtype()
206
+ self.max_seq_len_cached = seq_len
207
+ t = (torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32) * self.position_scale).type_as(self.inv_freq)
208
+ device_type = device.type if device is not None else "cpu"
209
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
210
+ with torch.autocast(device_type=device_type, enabled=False):
211
+ # shape: (seq_len, dim_model // 2)
212
+ freqs = torch.outer(t, self.inv_freq)
213
+ # shape: (seq_len, dim_model)
214
+ emb = torch.cat((freqs, freqs), dim=-1)
215
+ cos = emb.cos()
216
+ sin = emb.sin()
217
+ self.register_buffer("cos_cached", cos.to(dtype), persistent=False)
218
+ self.register_buffer("sin_cached", sin.to(dtype), persistent=False)
219
+
220
+ def forward(
221
+ self, q: torch.Tensor,
222
+ k: torch.Tensor,
223
+ seq_dimension: int = 1,
224
+ seqlen_offset: int = 0,
225
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
226
+ """q, k does not include `seqlen_offset`
227
+ q: Either (bs, seq_len, num_heads, head_dim) or (seq_len, bs, num_heads, head_dim)
228
+ k: Either (bs, seq_len, num_heads, head_dim) or (seq_len, bs, num_heads, head_dim)
229
+ """
230
+ if seq_dimension < 0:
231
+ seq_dimension = k.ndim + seq_dimension
232
+ assert seq_dimension in (0, 1, 2)
233
+ seq_len = k.shape[seq_dimension] + seqlen_offset
234
+
235
+ if self.is_longrope:
236
+ if seq_len > self.original_max_seq_len:
237
+ t = self.get_range_vector(seq_len, device=q.device)
238
+ rescale_factors = self.long_factors.to(q.device)
239
+ long_mscale = self.longrope_config.long_mscale
240
+ mscale = long_mscale if long_mscale > 0 else self._calc_mscale(self.max_seq_len / self.original_max_seq_len)
241
+ else:
242
+ t = self.get_range_vector(self.original_max_seq_len, device=q.device)
243
+ rescale_factors = self.short_factors.to(q.device)
244
+ short_mscale = self.longrope_config.short_mscale
245
+ mscale = short_mscale if short_mscale > 0 else 1.0
246
+ assert rescale_factors.shape == (self.dim_model // 2, ), (
247
+ f"misaligned shape for LongRoPE rescale factors:\n"
248
+ f"\tExpected {(self.dim_model // 2, )}, got {rescale_factors.shape}."
249
+ )
250
+ inv_freq = 1.0 / (rescale_factors * (self.base ** (torch.arange(0, self.dim_model, 2).float().to(q.device) / self.dim_model)))
251
+ device_type = q.device.type if q.device is not None else "cpu"
252
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
253
+ with torch.autocast(device_type=device_type, enabled=False):
254
+ freqs = torch.outer(t, inv_freq)
255
+ emb = torch.cat((freqs, freqs), dim=-1)
256
+ cos = emb.cos() * mscale
257
+ sin = emb.sin() * mscale
258
+ cos_cached = cos.to(q.dtype)
259
+ sin_cached = sin.to(q.dtype)
260
+ else:
261
+ if seq_len > self.max_seq_len_cached:
262
+ self._set_cos_sin_cache(
263
+ seq_len=seq_len,
264
+ device=k.device,
265
+ dtype=k.dtype,
266
+ )
267
+ cos_cached = self.cos_cached
268
+ sin_cached = self.sin_cached
269
+ return (
270
+ apply_rotary_pos_emb(
271
+ q, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
272
+ ).to(q.dtype),
273
+ apply_rotary_pos_emb(
274
+ k, cos_cached[seqlen_offset:seq_len], sin_cached[seqlen_offset:seq_len], seq_dimension=seq_dimension
275
+ ).to(k.dtype),
276
+ )
277
+
278
+ @classmethod
279
+ def from_config(cls, config: PretrainedConfig) -> "RotaryEmbedding":
280
+ kwargs = dict(
281
+ dim_model=config.hidden_size // config.num_attention_heads,
282
+ max_seq_len=config.max_position_embeddings,
283
+ base=config.rope_embedding_base,
284
+ position_scale=config.rope_position_scale,
285
+ )
286
+ if config.rope_scaling is not None:
287
+ kwargs["longrope_config"] = LongRopeConfig.from_dict(config.rope_scaling)
288
+ return cls(**kwargs)
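
The rotary frequencies used above follow the standard RoPE recipe; LongRoPE only divides each frequency by a learned per-dimension rescale factor and multiplies cos/sin by an attention-temperature term (mscale). A small sketch of that construction follows; dim_model, base, the rescale values, and the assumed original context length are placeholder toy values, not the model's real settings.

import math
import torch

dim_model, base, seq_len = 8, 10000.0, 6               # toy sizes, not model values
rescale_factors = torch.full((dim_model // 2,), 2.0)   # placeholder LongRoPE factors

# Standard RoPE inverse frequencies, then the LongRoPE per-dimension rescaling.
inv_freq = 1.0 / (base ** (torch.arange(0, dim_model, 2).float() / dim_model))
long_inv_freq = inv_freq / rescale_factors

t = torch.arange(seq_len, dtype=torch.float32)
freqs = torch.outer(t, long_inv_freq)                  # (seq_len, dim_model // 2)
emb = torch.cat((freqs, freqs), dim=-1)                # (seq_len, dim_model)

# mscale grows logarithmically with the context-extension ratio (cf. _calc_mscale above).
scale = 4.0                                            # assumed max_seq_len / original_max_seq_len
original_max_seq_len = 2048                            # assumed original context length
mscale = math.sqrt(1 + math.log(scale) / math.log(original_max_seq_len))
cos, sin = emb.cos() * mscale, emb.sin() * mscale
print(cos.shape, sin.shape)
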
tokenization_phi3_small.py ADDED
@@ -0,0 +1,313 @@
1
+ # Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py
2
+ import os
3
+ from typing import Collection, List, Optional, Dict, Set, Tuple, Union
4
+
5
+ from functools import cached_property
6
+
7
+ import base64
8
+
9
+ from transformers import PreTrainedTokenizer, AddedToken, AutoConfig
10
+ from transformers.models.auto.tokenization_auto import get_tokenizer_config
11
+ import tiktoken
12
+
13
+
14
+ """
15
+ This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base")
16
+ with a few additional special tokens to support the ChatML format.
17
+
18
+ TODO(bapatra): Right now, I do not save the special tokens to the vocab file.
19
+ Maybe in the future, that would be useful? Can add that support later.
20
+
21
+ """
22
+
23
+ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
24
+ with open(tiktoken_bpe_file, "rb") as f:
25
+ contents = f.read()
26
+ return {
27
+ base64.b64decode(token): int(rank)
28
+ for token, rank in (line.split() for line in contents.splitlines() if line)
29
+ }
30
+
31
+ # In the Megatron codebase, we pad vocabularies to ensure that matrix multiplication is fast.
32
+ # This in turn causes some indices to be empty. We account for these empty indices by adding
33
+ # dummy tokens to the tokenizer.
34
+
35
+ EFFECTIVE_PADDED_VOCAB_SIZE = 100352
36
+ ACTUAL_VOCAB_SIZE = 100276
37
+
38
+
39
+ DUMMY_TOKENS = {
40
+ f"<|dummy_id_{11 + offset}|>": 100276 + offset
41
+ for offset in range(1, EFFECTIVE_PADDED_VOCAB_SIZE - ACTUAL_VOCAB_SIZE)
42
+ }
43
+
44
+ SPECIAL_TOKENS = {
45
+ # tiktoken.get_encoding("cl100k_base")._special_tokens
46
+ '<|endoftext|>': 100257,
47
+ '<|fim_prefix|>': 100258,
48
+ '<|fim_middle|>': 100259,
49
+ '<|fim_suffix|>': 100260,
50
+ # Special tokens for post-training
51
+ "<|system|>": 100261,
52
+ "<|user|>": 100262,
53
+ "<|assistant|>": 100263,
54
+ # Dummy unused tokens
55
+ "<|dummy_id_0|>": 100264,
56
+ "<|dummy_id_1|>": 100265,
57
+ # Special tokens for post-training continued
58
+ "<|end|>": 100266,
59
+ # Some dummy tokens, so that tokenization is contiguous and does not cause issues
60
+ # Note that the 100256th token of tiktoken.get_encoding("cl100k_base") does not
61
+ # actually map to anything. So we use a dummy token here.
62
+ "<|dummy_id_2|>": 100256,
63
+ # Likewise, tokens from 100267 to 100275 are also unused
64
+ "<|dummy_id_3|>": 100267,
65
+ "<|dummy_id_4|>": 100268,
66
+ "<|dummy_id_5|>": 100269,
67
+ "<|dummy_id_6|>": 100270,
68
+ "<|dummy_id_7|>": 100271,
69
+ "<|dummy_id_8|>": 100272,
70
+ "<|dummy_id_9|>": 100273,
71
+ "<|dummy_id_10|>": 100274,
72
+ "<|dummy_id_11|>": 100275,
73
+ # The final end of prompt token
74
+ # (unused, but present as a part of tiktoken.get_encoding("cl100k_base")._special_tokens)
75
+ '<|endofprompt|>': 100276,
76
+ # Dummy tokens to account for padding of the tokenizer
77
+ # We pad to ensure tensor cores are used for vocab multiplication
78
+ **DUMMY_TOKENS
79
+ }
80
+
81
+ class Phi3SmallTokenizer(PreTrainedTokenizer):
82
+ vocab_files_names = {
83
+ "vocab_file": "cl100k_base.tiktoken"
84
+ }
85
+
86
+ model_input_names: List[str] = ["input_ids", "attention_mask"]
87
+ padding_side = "left"
88
+
89
+ def __init__(
90
+ self,
91
+ vocab_file: Optional[str] = None,
92
+ errors: str = "replace",
93
+ **kwargs
94
+ ) -> None:
95
+ # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
96
+ # if the token is present in `self.special_tokens`. Hence we instantiate it here.
97
+ # The way Qwen gets around this is by checking against SPECIAL_TOKENS
98
+ # But I think it's better to check against the object's own `special_tokens`
99
+ # in case we eventually want to allow the tokenizer to have special tokens.
100
+ self.special_tokens = SPECIAL_TOKENS
101
+
102
+ super().__init__(**kwargs)
103
+ self.errors = errors
104
+
105
+ base = tiktoken.get_encoding("cl100k_base")
106
+ if vocab_file is None:
107
+ self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
108
+ else:
109
+ self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
110
+
111
+ self.pat_str = base._pat_str
112
+
113
+ enc = tiktoken.Encoding(
114
+ name="phi3small",
115
+ pat_str=self.pat_str,
116
+ mergeable_ranks=self.mergeable_ranks,
117
+ special_tokens=self.special_tokens,
118
+ )
119
+ self.tokenizer = enc
120
+
121
+ self.decoder: Dict[int, bytes] = {
122
+ v: k for k, v in self.mergeable_ranks.items()
123
+ }
124
+ self.decoder.update({v: k for k, v in self.special_tokens.items()})
125
+
126
+ self.eod_id = self.tokenizer.eot_token
127
+ self._eos_token = self._convert_id_to_token(self.eod_id)
128
+
129
+ # Setting the bos_token to be the same as the eos_token
130
+ # Note that this is **not** the correct thing to do, and is done
131
+ # just so that some of the downstream libraries do not break.
132
+ self._bos_token = self._eos_token
133
+
134
+ # Assign the special tokens to class variables
135
+ self.system_id = self.special_tokens["<|system|>"]
136
+ self.user_id = self.special_tokens["<|user|>"]
137
+ self.assistant_id = self.special_tokens["<|assistant|>"]
138
+ self.end_id = self.special_tokens["<|end|>"]
139
+
140
+ @cached_property
141
+ def dummy_token_indices(self) -> List[int]:
142
+ # There are some additional special tokens in the cl100k_base tokenizer
143
+ # that we do not use. Hence, we also consider them to be dummy tokens.
144
+ additional_tokens = [
145
+ "<|fim_prefix|>",
146
+ "<|fim_middle|>",
147
+ "<|fim_suffix|>",
148
+ "<|endofprompt|>"
149
+ ]
150
+ dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
151
+ dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
152
+ return sorted(dummy_token_indices)
153
+
154
+ def __getstate__(self):
155
+ state = self.__dict__.copy()
156
+ del state["tokenizer"]
157
+ return state
158
+
159
+ def __setstate__(self, state):
160
+ self.__dict__ = state
161
+ enc = tiktoken.Encoding(
162
+ name="cl100k_im",
163
+ pat_str=self.pat_str,
164
+ mergeable_ranks=self.mergeable_ranks,
165
+ special_tokens=self.special_tokens,
166
+ )
167
+ self.tokenizer = enc
168
+
169
+ def __len__(self):
170
+ return self.tokenizer.n_vocab
171
+
172
+ @classmethod
173
+ def from_pretrained(
174
+ cls,
175
+ pretrained_model_name_or_path: Union[str, os.PathLike],
176
+ *init_inputs,
177
+ **kwargs,
178
+ ):
179
+ cls_kwargs = kwargs
180
+ # First try to load from the tokenization config if it exists
181
+ tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
182
+ if tokenization_config:
183
+ cls_kwargs = {
184
+ **tokenization_config,
185
+ **cls_kwargs
186
+ }
187
+ else:
188
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
189
+ cls_kwargs["model_max_length"] = config.max_position_embeddings
190
+ return cls(**cls_kwargs)
191
+
192
+ def get_vocab(self) -> Dict[Union[str, bytes], int]:
193
+ return {**self.mergeable_ranks, **self.special_tokens}
194
+
195
+ def convert_tokens_to_ids(
196
+ self,
197
+ tokens: Union[bytes, str, List[Union[bytes, str]]]
198
+ ) -> Union[int, List[int]]:
199
+ ids = []
200
+ if isinstance(tokens, (str, bytes)):
201
+ if tokens in self.special_tokens:
202
+ return self.special_tokens[tokens]
203
+ else:
204
+ return self.mergeable_ranks.get(tokens)
205
+ ids: List[int] = []
206
+ for token in tokens:
207
+ ids.append(self.convert_tokens_to_ids(token))
208
+ return ids
209
+
210
+ def _add_tokens(
211
+ self,
212
+ new_tokens: Union[List[str], List[AddedToken]],
213
+ special_tokens: bool = False,
214
+ ) -> int:
215
+ if not special_tokens and new_tokens:
216
+ raise ValueError("Only special tokens can be added to this tokenizer")
217
+ for token in new_tokens:
218
+ surface_form = token.content if isinstance(token, AddedToken) else token
219
+ if surface_form not in self.special_tokens:
220
+ raise ValueError(
221
+ "For now, we do not support unknown special tokens\n"
222
+ "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
223
+ "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
224
+ "And finally, we can re-construct the enc object back\n"
225
+ )
226
+ return 0
227
+
228
+ def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
229
+ file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
230
+ with open(file_path, "w") as f:
231
+ for token, rank in self.mergeable_ranks.items():
232
+ line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
233
+ f.write(line)
234
+ return (file_path,)
235
+
236
+ def tokenize(
237
+ self,
238
+ text: str,
239
+ allowed_special: Union[Set, str] = "all",
240
+ disallowed_special: Union[Collection, str] = (),
241
+ **kwargs
242
+ ) -> List[Union[bytes, str]]:
243
+ tokens: List[Union[bytes, str]] = []
244
+ for token_id in self.tokenizer.encode(
245
+ text, allowed_special=allowed_special, disallowed_special=disallowed_special
246
+ ):
247
+ tokens.append(self.decoder[token_id])
248
+ return tokens
249
+
250
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
251
+ """
252
+ Converts a sequence of tokens into a single string.
253
+ """
254
+ text = ""
255
+ temp = b""
256
+ for t in tokens:
257
+ if isinstance(t, str):
258
+ if temp:
259
+ text += temp.decode("utf-8", errors=self.errors)
260
+ temp = b""
261
+ text += t
262
+ elif isinstance(t, bytes):
263
+ temp += t
264
+ else:
265
+ raise TypeError("token should only be of type types or str")
266
+ if temp:
267
+ text += temp.decode("utf-8", errors=self.errors)
268
+ return text
269
+
270
+ @property
271
+ def vocab_size(self):
272
+ return self.tokenizer.n_vocab
273
+
274
+ @property
275
+ def eos_token_id(self) -> int:
276
+ return self.eod_id
277
+
278
+ def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
279
+ """Converts an id to a token, special tokens included"""
280
+ if index in self.decoder:
281
+ return self.decoder[index]
282
+ raise ValueError("unknown ids")
283
+
284
+ def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
285
+ """Converts a token to an id using the vocab, special tokens included"""
286
+ if token in self.special_tokens:
287
+ return self.special_tokens[token]
288
+ if token in self.mergeable_ranks:
289
+ return self.mergeable_ranks[token]
290
+ raise ValueError("unknown token")
291
+
292
+ def _tokenize(self, text: str, **kwargs):
293
+ """
294
+ Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for word-based
295
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
296
+ Do NOT take care of added tokens.
297
+ """
298
+ raise NotImplementedError
299
+
300
+ def _decode(
301
+ self,
302
+ token_ids: Union[int, List[int]],
303
+ skip_special_tokens: bool = False,
304
+ errors: str = None,
305
+ **kwargs,
306
+ ) -> str:
307
+ if isinstance(token_ids, int):
308
+ token_ids = [token_ids]
309
+ if skip_special_tokens:
310
+ token_ids = [i for i in token_ids if i < self.eod_id]
311
+ return self.tokenizer.decode(token_ids, errors=errors or self.errors)
312
+
313
+
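
The vocabulary file that save_vocabulary writes and _load_tiktoken_bpe reads is the plain tiktoken BPE format: one "<base64-encoded token bytes> <rank>" pair per line. A tiny round-trip sketch with made-up tokens (not entries from cl100k_base):

import base64

# Two toy entries in the "<base64 token> <rank>" format written by save_vocabulary.
lines = [
    base64.b64encode(b"hello").decode("utf-8") + " 0",
    base64.b64encode(b" world").decode("utf-8") + " 1",
]
contents = "\n".join(lines).encode("utf-8")

# Mirrors _load_tiktoken_bpe: decode each token back to bytes and parse its rank.
mergeable_ranks = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in contents.splitlines() if line)
}
print(mergeable_ranks)  # {b'hello': 0, b' world': 1}
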
triton_blocksparse_attention_layer.py ADDED
@@ -0,0 +1,176 @@
1
+ import math
2
+ from typing import Optional, Tuple, TypeVar
3
+ import torch.nn as nn
4
+ import torch
5
+ import triton
6
+
7
+ from functools import lru_cache
8
+
9
+
10
+ from .triton_flash_blocksparse_attn import get_local_strided_sparse_attention_op, _get_sparse_attn_mask, blocksparse_flash_attn_padded_fwd, blocksparse_flash_attn_varlen_fwd
11
+
12
+
13
+ Layout = Tuple[torch.LongTensor, torch.LongTensor]
14
+
15
+
16
+ def create_sparse_attn_mask(
17
+ n_heads: int,
18
+ max_seq_len: int,
19
+ max_seq_len_k: int,
20
+ dtype: torch.dtype,
21
+ device: torch.device,
22
+ BLOCK: int,
23
+ local_blocks: int,
24
+ vert_stride: int,
25
+ homo_head: bool,
26
+ return_dense: bool
27
+ ) -> Tuple[Layout, torch.Tensor, Optional[torch.Tensor]]:
28
+ layout, block_sparse_pattern, _ = _get_sparse_attn_mask(
29
+ n_heads=n_heads,
30
+ q_len=max_seq_len,
31
+ N_CTX=max_seq_len_k,
32
+ dtype=dtype,
33
+ device=device,
34
+ BLOCK=BLOCK,
35
+ local_blocks=local_blocks,
36
+ vert_stride=vert_stride,
37
+ homo_head=homo_head,
38
+ return_dense=return_dense
39
+ )
40
+ return layout, block_sparse_pattern
41
+
42
+
43
+ class BlockSparseAttentionLayer(nn.Module):
44
+ def __init__(
45
+ self,
46
+ n_heads: int,
47
+ max_seq_len: int,
48
+ sparse_block_size: int,
49
+ local_blocks: int,
50
+ vert_stride: int,
51
+ kernel_block_size: Optional[int] = None,
52
+ homo_head: bool = False,
53
+ active_head_range: Optional[Tuple[int]] = None
54
+ ) -> None:
55
+ super().__init__()
56
+
57
+ self.n_heads = n_heads
58
+ self.max_seq_len = max_seq_len
59
+ self.sparse_block_size = sparse_block_size
60
+ self.kernel_block_size = kernel_block_size or sparse_block_size
61
+ self.local_blocks = local_blocks
62
+ self.vert_stride = vert_stride
63
+ self.homo_head = homo_head
64
+ self.active_head_range = active_head_range
65
+
66
+ # Internal Parameters used by the layer
67
+ self._sparse_block_mask = None
68
+ self._sparse_layout = None
69
+ self._dtype = None
70
+ self._device = None
71
+
72
+ # TODO(bapatra): Ideally, I'd want to keep all the code for
73
+ # forward to be handled here, and not branch for training and inference.
74
+ # However, that refactor would need a lot of testing. For now, using the
75
+ # training op as is, and will refactor again later.
76
+
77
+ def prune_blocksparse_layout_to_heads(self, h_start: int, h_end: int) -> None:
78
+ self._sparse_block_mask = self._sparse_block_mask[h_start: h_end]
79
+ self._sparse_layout[0] = self._sparse_layout[0][h_start: h_end]
80
+ self._sparse_layout[1] = self._sparse_layout[1][h_start: h_end]
81
+
82
+ def _initialize_internals(
83
+ self,
84
+ dtype: torch.dtype,
85
+ device: torch.device
86
+ ) -> None:
87
+ self._dtype, self._device = dtype, device
88
+ self._sparse_layout, self._sparse_block_mask = create_sparse_attn_mask(
89
+ n_heads=self.n_heads,
90
+ max_seq_len=self.max_seq_len,
91
+ max_seq_len_k=self.max_seq_len,
92
+ dtype=dtype,
93
+ device=device,
94
+ BLOCK=self.sparse_block_size,
95
+ local_blocks=self.local_blocks,
96
+ vert_stride=self.vert_stride,
97
+ homo_head=self.homo_head,
98
+ return_dense=False,
99
+ )
100
+ if (not self.homo_head) and (self.active_head_range is not None):
101
+ assert len(self.active_head_range) == 2, "\"active_head_range\" should be a tuple of start/end index of the heads."
102
+ h_start, h_end = self.active_head_range
103
+ self.prune_blocksparse_layout_to_heads(h_start=h_start, h_end=h_end)
104
+
105
+ assert self.sparse_block_size % self.kernel_block_size == 0, f"The sparse block size must be a multiple of {self.kernel_block_size}. Found {self.sparse_block_size}."
106
+ assert self.kernel_block_size >= 16 and math.log2(self.kernel_block_size) % 1 == 0, f"kernel_block_size must be a power of 2 and at least 16, but {self.kernel_block_size} was given"
107
+ if self.sparse_block_size // self.kernel_block_size > 1:
108
+ _mul = self.sparse_block_size // self.kernel_block_size
109
+ # need to consider if block_m and block_n are different
110
+ self._sparse_block_mask = torch.kron(self._sparse_block_mask, self._sparse_block_mask.new_ones(_mul, _mul))
111
+ num_sparse_blocks = self._sparse_block_mask.size(-1)
112
+ block_causal_mask = torch.arange(0, num_sparse_blocks)[:, None] >= torch.arange(0, num_sparse_blocks)[None]
113
+ self._sparse_block_mask *= block_causal_mask.type_as(self._sparse_block_mask)
114
+
115
+
116
+ def forward(
117
+ self,
118
+ q: torch.Tensor,
119
+ k: torch.Tensor,
120
+ v: torch.Tensor,
121
+ sm_scale: float,
122
+ *,
123
+ # Arguments Related to Block Attention Inference
124
+ left_paddings: Optional[torch.LongTensor] = None,
125
+ seqlens: Optional[torch.LongTensor] = None,
126
+ # Arguements Related to Variable Length Inference
127
+ cu_seqlens_k: Optional[torch.LongTensor] = None,
128
+ cu_seqlens_q: Optional[torch.LongTensor] = None,
129
+ ) -> torch.Tensor:
130
+
131
+ if left_paddings is None and seqlens is None and cu_seqlens_k is None and cu_seqlens_q is None:
132
+ blocksparse_op = get_local_strided_sparse_attention_op(
133
+ n_heads=self.n_heads,
134
+ max_seq_len=self.max_seq_len,
135
+ sparse_block_size=self.sparse_block_size,
136
+ kernel_block_size=self.kernel_block_size,
137
+ local_blocks=self.local_blocks,
138
+ vert_stride=self.vert_stride,
139
+ homo_head=self.homo_head,
140
+ device=q.device,
141
+ inference=not self.training
142
+ )
143
+ return blocksparse_op(q, k, v, sm_scale)
144
+
145
+ assert not torch.is_grad_enabled(), "Variable Length Inference / Batched inference is not supported during training. Please run it in a torch.no_grad() context"
146
+ # First set internals if they have not been set
147
+ if self._sparse_block_mask is None or (self._dtype != q.dtype) or (self._device != q.device):
148
+ self._initialize_internals(dtype=q.dtype, device=q.device)
149
+
150
+ if k.dim() == 3:
151
+ assert cu_seqlens_k is not None
152
+ return blocksparse_flash_attn_varlen_fwd(
153
+ q=q,
154
+ k=k,
155
+ v=v,
156
+ cu_seqlens_k=cu_seqlens_k,
157
+ cu_seqlens_q=cu_seqlens_q,
158
+ sm_scale=sm_scale,
159
+ sparse_layout=self._sparse_layout,
160
+ block_size=self.kernel_block_size,
161
+ max_seqlen=self.max_seq_len,
162
+ )
163
+ if k.dim() == 4:
164
+ assert not (left_paddings is None and seqlens is None), "Either left_paddings or seqlens must be provided for batched inference."
165
+ return blocksparse_flash_attn_padded_fwd(
166
+ q=q,
167
+ k=k,
168
+ v=v,
169
+ sm_scale=sm_scale,
170
+ sparse_layout=self._sparse_layout,
171
+ left_paddings=left_paddings,
172
+ seqlens=seqlens,
173
+ block_size=self.kernel_block_size,
174
+ max_seqlen=self.max_seq_len,
175
+ )
176
+ raise ValueError('q/k/v must be either 3 dim for variable-length input or 4 dim for fixed-length.')
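
The sparse layout consumed by this layer is the "local + vertical stride" pattern: each query block attends to the most recent local_blocks key blocks plus every vert_stride-th earlier block, under a block-causal constraint. A toy dense rendering of that block mask follows, mirroring the homogeneous-head branch of _get_sparse_attn_mask in the kernel file below; the sizes are chosen purely for illustration and are not the model's values.

import torch

n_blocks, local_blocks, vert_stride = 8, 2, 4   # toy sizes, not the model's values

q_pos = torch.arange(n_blocks)[:, None]
k_pos = torch.arange(n_blocks)[None, :]
vert = (torch.arange(n_blocks) + 1) % vert_stride == 0   # every vert_stride-th key block

# block-causal AND (recent local block OR strided block)
block_mask = (q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | vert)
print(block_mask.int())
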
triton_flash_blocksparse_attn.py ADDED
@@ -0,0 +1,1943 @@
1
+ """
2
+ Author: Eric Lin (xihlin)
3
+ """
4
+ """
5
+ ... note(bapatra)::
6
+ This is written as one big file, instead of splitting into logical components because I was running into issues with transformers auto module
7
+ imports when splitting into different files. I've tried keeping the logical partitions demarkated with comment blocks, but it is not ideal.
8
+ In the future, would be really good to revisit this and refactor into a more readable file structure.
9
+
10
+ """
11
+ from typing import TypeVar
12
+ from functools import lru_cache
13
+ import math
14
+ import pytest
15
+ import torch
16
+ import numpy as np
17
+
18
+ import triton
19
+ import triton.language as tl
20
+
21
+ import os
22
+
23
+ import dataclasses
24
+
25
+ Phi3SmallConfig = TypeVar('Phi3SmallConfig')
26
+
27
+ # triton 2.0.0: fails at backward on A100, for the examples, if h_dim=128.
28
+
29
+ # Done
30
+ # 1. strided of qkv
31
+ # 2. seq len not power of 2
32
+ # 3. bf16 with Triton May, 2023
33
+
34
+ # TODO:
35
+ # 1. wip: support non-contiguous backward, also help reduce memory allocation in training (q, k, v split)
36
+ # 2. block sparse with different BLOCK_M, BLOCK_N?
37
+ # 3. for Lq not divisible by BLOCK_M, BLOCK_N, only apply the mask to K/V on the last batch; still need to apply the mask on Q.
38
+ # Attempted; fails to compile
39
+ # 4. For the 2nd iteration of inference, BLOCK_M=1; how to make things work? K/V may not be divisible by BLOCK_N.
40
+ # 5. The inner loop can also be parallelized via a bigger num_stage (better) or on different thread-blocks (via m/L and atomic updates, but then there is no communication/sync between blocks)
41
+
42
+
43
+ ###########################################################
44
+ ################### Kernel Parameters #####################
45
+ ###########################################################
46
+
47
+ @dataclasses.dataclass
48
+ class BlockSparseParams(object):
49
+ block_size: int
50
+ kernel_block_size: int
51
+ num_local_blocks: int
52
+ vert_stride: int
53
+ homo_head_pattern: bool = False
54
+
55
+ @classmethod
56
+ def from_config(cls, config: Phi3SmallConfig) -> "BlockSparseParams":
57
+ return cls(
58
+ block_size=config.blocksparse_block_size,
59
+ kernel_block_size=config.blocksparse_triton_kernel_block_size,
60
+ num_local_blocks=config.blocksparse_num_local_blocks,
61
+ vert_stride=config.blocksparse_vert_stride,
62
+ homo_head_pattern=config.blocksparse_homo_head_pattern,
63
+ )
64
+
65
+
66
+ ###########################################################
67
+ ###########################################################
68
+
69
+ ###########################################################
70
+ ################### Utility Functions #####################
71
+ ###########################################################
72
+
73
+ # helper functions for 3D sparse pattern
74
+ # these functions are not optimized and are very inefficient. Avoid calling them too frequently.
75
+ # currently, they are only called within `get_local_strided_sparse_attention_op`, which is cached.
76
+ def dense_to_crow_col(x):
77
+ ''' Turn a 2D/3D torch tensor (x) into CSR row/col indexing.
78
+ param:
79
+ TODO:
80
+ 1. improve efficiency, is it faster if done in CPU, or customize a cuda kernel for it?
81
+ NOTE: col_indices is padded with -1
82
+ '''
83
+ pad = -1
84
+ dim = x.dim()
85
+ assert x.dim() in (2, 3)
86
+ if x.dim() == 2:
87
+ x = x[None]
88
+ x = [xi.to_sparse_csr() for xi in x]
89
+ crows = torch.vstack([xi.crow_indices() for xi in x])
90
+ cols = [xi.col_indices() for xi in x]
91
+ max_cols = max(len(xi) for xi in cols)
92
+ cols = [torch.cat([xi, pad + xi.new_zeros(max_cols - xi.shape[0])]) for xi in cols]
93
+ cols = torch.vstack(cols)
94
+ if dim == 2:
95
+ crows = crows[0]
96
+ cols = cols[0]
97
+ return crows, cols
98
+
99
+
100
+ def crow_col_to_dense(crows, cols, dtype=torch.float16):
101
+ dim = crows.dim()
102
+ if dim == 1:
103
+ crows = crows[None]
104
+ cols = cols[None]
105
+ device = crows.device
106
+ crows, cols = crows.cpu(), cols.cpu() # faster in cpu
107
+ shape = (crows.shape[0], crows.shape[1] - 1, cols.max() + 1)
108
+ x = torch.zeros(shape, dtype=dtype)
109
+ for i in range(shape[0]):
110
+ for j in range(shape[1]):
111
+ x[i, j, cols[i, crows[i, j]:crows[i, j+1]]] = 1
112
+ if dim == 1:
113
+ x = x[0]
114
+ return x.to(device)
115
+
116
+
117
+ def dense_to_ccol_row(x):
118
+ '''Similar, but to CSC format
119
+ '''
120
+ x = x.transpose(-2, -1)
121
+ return dense_to_crow_col(x)
122
+
123
+
124
+ def ccol_row_to_dense(ccol, rows, dtype=torch.float16):
125
+ return crow_col_to_dense(ccol, rows, dtype).permute(0, 2, 1).contiguous()
126
+
127
+
128
+ def _get_sparse_attn_mask_homo_head(q_len, N_CTX, dtype, device, BLOCK=128, local_blocks=4, vert_stride=4, return_dense=False):
129
+ '''
130
+ :return: a tuple of 3:
131
+ - tuple of crow_indices, col_indices representation of CSR format.
132
+ - block dense mask
133
+ - all token dense mask (be aware that it can be OOM if it is too big) if `return_dense==True`, otherwise, None
134
+ '''
135
+ with torch.no_grad():
136
+ N_BLOCK = triton.cdiv(N_CTX, BLOCK)
137
+ q_pos = torch.arange(N_BLOCK)[:, None]
138
+ k_pos = torch.arange(N_BLOCK)[None]
139
+ mask_vert_strided = (torch.arange(N_BLOCK) + 1) % vert_stride == 0
140
+ block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).to(device).to(dtype)
141
+ N_BLOCK_Q = triton.cdiv(q_len, BLOCK)
142
+ block_mask_dense_output = block_mask_dense[-N_BLOCK_Q:].contiguous().to_sparse_csr()
143
+ if return_dense:
144
+ mask_dense = torch.kron(block_mask_dense, block_mask_dense.new_ones((BLOCK, BLOCK)))
145
+ causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(mask_dense)[-q_len:]
146
+ mask_dense = mask_dense[-q_len:, :N_CTX] * causal_mask
147
+ return (block_mask_dense_output.crow_indices(), block_mask_dense_output.col_indices()), block_mask_dense, mask_dense
148
+ else:
149
+ return (block_mask_dense_output.crow_indices(), block_mask_dense_output.col_indices()), block_mask_dense, None
150
+
151
+
152
+ def _get_sparse_attn_mask(n_heads, q_len, N_CTX, dtype, device, BLOCK=128, local_blocks=4, vert_stride=4, homo_head=True, return_dense=False):
153
+ '''
154
+ :return: a tuple of 3:
155
+ - tuple of crow_indices, col_indices representation of CSR format.
156
+ - block dense mask
157
+ - all token dense mask (be aware that it can be OOM if it is too big) if `return_dense==True`, otherwise, None
158
+ '''
159
+ if homo_head:
160
+ with torch.no_grad():
161
+ (crow, col), block_mask_dense, mask_dense = _get_sparse_attn_mask_homo_head(q_len, N_CTX, dtype, device, BLOCK, local_blocks, vert_stride, return_dense)
162
+ crow = crow[None].expand(n_heads, crow.shape[0])
163
+ col = col[None].expand(n_heads, col.shape[0])
164
+ if return_dense:
165
+ mask_dense = mask_dense[None].expand(n_heads, *mask_dense.shape)
166
+ return (crow, col), block_mask_dense, mask_dense
167
+
168
+ with torch.no_grad():
169
+ N_BLOCK = triton.cdiv(N_CTX, BLOCK)
170
+ q_pos = torch.arange(N_BLOCK)[None, :, None]
171
+ k_pos = torch.arange(N_BLOCK)[None, None]
172
+ head_sliding_step = max(1, int(vert_stride / n_heads)) # if vert_stride <= n_heads, rotating the heads
173
+ mask_vert_strided = [(torch.arange(N_BLOCK) + h * head_sliding_step + 1) % vert_stride == 0 for h in range(n_heads)]
174
+ mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1)
175
+ block_mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).to(device).to(dtype)
176
+ N_BLOCK_Q = triton.cdiv(q_len, BLOCK)
177
+ block_mask_dense_output = block_mask_dense[:, -N_BLOCK_Q:]
178
+ if return_dense:
179
+ mask_dense = torch.kron(block_mask_dense, block_mask_dense.new_ones((BLOCK, BLOCK)))
180
+ causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(mask_dense)[-q_len:]
181
+ mask_dense = mask_dense[..., -q_len:, :N_CTX] * causal_mask[None]
182
+ return dense_to_crow_col(block_mask_dense_output), block_mask_dense, mask_dense
183
+ else:
184
+ return dense_to_crow_col(block_mask_dense_output), block_mask_dense, None
185
+
186
+
187
+ def get_sparse_attn_mask(q, N_CTX, *args, **kwargs):
188
+ return _get_sparse_attn_mask(q.size(1), q.size(2), N_CTX, q.dtype, q.device, *args, **kwargs)
189
+
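
The crow/col pair returned above is what the Triton kernels walk: for query-block row m, the active key-block columns are col[crow[m]:crow[m+1]]. A tiny worked example on a hand-written 3x3 block mask (not a real layout) illustrates the conversion:

import torch

# Toy 3x3 block mask: row 0 -> {0}, row 1 -> {0, 1}, row 2 -> {1, 2}
mask = torch.tensor([[1, 0, 0],
                     [1, 1, 0],
                     [0, 1, 1]], dtype=torch.float32)
csr = mask.to_sparse_csr()
crow, col = csr.crow_indices(), csr.col_indices()
print(crow.tolist())  # [0, 1, 3, 5]
print(col.tolist())   # [0, 0, 1, 1, 2]

# Active key-block columns for query-block row 2:
row = 2
print(col[crow[row]:crow[row + 1]].tolist())  # [1, 2]
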
190
+ ###########################################################
191
+ ###########################################################
192
+
193
+ ###########################################################
194
+ ###################### Training Kernels ###################
195
+ ###########################################################
196
+
197
+ # TODO: only apply loading/saving mask on the last iteration for EVEN_N_BLOCK, useful for 1st iteration of inference.
198
+ # Experiment failed inside loop.
199
+ # Another idea: only on saving? load even out of boundary (will it cause an illegal access error)?
200
+ @triton.jit
201
+ def _fwd_kernel(
202
+ Q, K, V, sm_scale,
203
+ layout_crow_ptr,
204
+ layout_col_ptr,
205
+ layout_crow_stride_h, layout_crow_stride_m,
206
+ layout_col_stride_h, layout_col_stride_m,
207
+ TMP, L, M, # NOTE: TMP is a scratchpad buffer to work around a compiler bug. TMP, L, M are assumed to have contiguous layouts
208
+ Out,
209
+ stride_qz, stride_qh, stride_qm, stride_qd,
210
+ stride_kz, stride_kh, stride_kn, stride_kd,
211
+ stride_vz, stride_vh, stride_vn, stride_vd,
212
+ stride_oz, stride_oh, stride_om, stride_od,
213
+ Z, H, N_CTX,
214
+ PAST_LEN,
215
+ Q_ROUNDED_LEN,
216
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
217
+ BLOCK_N: tl.constexpr,
218
+ EVEN_M_BLOCK: tl.constexpr,
219
+ EVEN_N_BLOCK: tl.constexpr,
220
+ INFERENCE: tl.constexpr,
221
+ NUM_DBLOCKS: tl.constexpr,
222
+ ):
223
+ Q_LEN = N_CTX - PAST_LEN
224
+ start_m = tl.program_id(0)
225
+ off_hz = tl.program_id(1)
226
+ off_h = off_hz % H
227
+ off_z = off_hz // H
228
+ Q += off_z * stride_qz + off_h * stride_qh
229
+ K += off_z * stride_kz + off_h * stride_kh
230
+ V += off_z * stride_vz + off_h * stride_vh
231
+ # initialize offsets
232
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
233
+ offs_n = tl.arange(0, BLOCK_N)
234
+ offs_d = tl.arange(0, BLOCK_DMODEL)
235
+ off_q = offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qd
236
+ # off_k = offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd
237
+ off_k = offs_n[None, :] * stride_kn + offs_d[:, None] * stride_kd
238
+ off_v = offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd
239
+ # Initialize pointers to Q, K, V
240
+ q_ptrs = Q + off_q
241
+ k_ptrs = K + off_k
242
+ v_ptrs = V + off_v
243
+ # initialize pointer to m and l
244
+ t_ptrs = TMP + off_hz * Q_ROUNDED_LEN + offs_m
245
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
246
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
247
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
248
+ if NUM_DBLOCKS >= 2:
249
+ acc2 = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
250
+
251
+ # load q: it will stay in SRAM throughout
252
+ if EVEN_M_BLOCK:
253
+ q = tl.load(q_ptrs)
254
+ if NUM_DBLOCKS >= 2:
255
+ q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd)
256
+ else:
257
+ q = tl.load(q_ptrs, mask=offs_m[:, None] < Q_LEN)
258
+ if NUM_DBLOCKS >= 2:
259
+ q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd, mask=offs_m[:, None] < Q_LEN)
260
+
261
+ layout_ptr = layout_crow_ptr + off_h * layout_crow_stride_h + start_m * layout_crow_stride_m
262
+ start_l = tl.load(layout_ptr).to(tl.int32)
263
+ end_l = tl.load(layout_ptr + layout_crow_stride_m).to(tl.int32)
264
+
265
+ # loop over k, v and update accumulator
266
+ for col_idx_idx in range(start_l, end_l):
267
+ col_idx = tl.load(layout_col_ptr + off_h * layout_col_stride_h + col_idx_idx * layout_col_stride_m).to(tl.int32)
268
+ start_n = col_idx * BLOCK_N
269
+ # -- compute qk ----
270
+ if EVEN_N_BLOCK:
271
+ k = tl.load(k_ptrs + start_n * stride_kn)
272
+ else:
273
+ k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_n[None, :] + start_n < N_CTX)
274
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
275
+ qk += tl.dot(q, k)
276
+
277
+ if NUM_DBLOCKS >= 2:
278
+ if EVEN_N_BLOCK:
279
+ k = tl.load(k_ptrs + start_n * stride_kn + BLOCK_DMODEL * stride_kd)
280
+ else:
281
+ k = tl.load(k_ptrs + start_n * stride_kn + BLOCK_DMODEL * stride_kd, mask=offs_n[None, :] + start_n < N_CTX)
282
+ qk += tl.dot(q2, k)
283
+
284
+ qk *= sm_scale
285
+ qk += tl.where(offs_m[:, None] + PAST_LEN >= (start_n + offs_n[None, :]), 0, float('-inf'))
286
+ # -- compute m_ij, p, l_ij
287
+ m_ij = tl.max(qk, 1)
288
+ p = tl.exp(qk - m_ij[:, None])
289
+ l_ij = tl.sum(p, 1)
290
+ # -- update m_i and l_i
291
+ m_i_new = tl.maximum(m_i, m_ij)
292
+ alpha = tl.exp(m_i - m_i_new)
293
+ beta = tl.exp(m_ij - m_i_new)
294
+ l_i_new = alpha * l_i + beta * l_ij
295
+ # -- update output accumulator --
296
+ # scale p
297
+ p_scale = beta / l_i_new
298
+ p = p * p_scale[:, None]
299
+ # scale acc
300
+ acc_scale = l_i / l_i_new * alpha
301
+ # tl.store(t_ptrs, acc_scale)
302
+ # acc_scale = tl.load(t_ptrs) # BUG: have to store and immediately load
303
+ acc = acc * acc_scale[:, None]
304
+ if NUM_DBLOCKS >= 2:
305
+ acc2 = acc2 * acc_scale[:, None]
306
+ p = p.to(Q.dtype.element_ty)
307
+ # update acc
308
+ if EVEN_N_BLOCK:
309
+ v = tl.load(v_ptrs + start_n * stride_vn)
310
+ else:
311
+ v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_n[:, None] + start_n < N_CTX)
312
+ acc += tl.dot(p, v)
313
+
314
+ if NUM_DBLOCKS >= 2:
315
+ if EVEN_N_BLOCK:
316
+ v = tl.load(v_ptrs + start_n * stride_vn + BLOCK_DMODEL * stride_vd)
317
+ else:
318
+ v = tl.load(v_ptrs + start_n * stride_vn + BLOCK_DMODEL * stride_vd, mask=offs_n[:, None] + start_n < N_CTX)
319
+ acc2 += tl.dot(p, v)
320
+
321
+ # update m_i and l_i
322
+ l_i = l_i_new
323
+ m_i = m_i_new
324
+
325
+ # rematerialize offsets to save registers
326
+ # start_m = tl.program_id(0)
327
+ # offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
328
+ # write back l and m
329
+ if not INFERENCE:
330
+ l_ptrs = L + off_hz * N_CTX + offs_m
331
+ m_ptrs = M + off_hz * N_CTX + offs_m
332
+ if EVEN_M_BLOCK:
333
+ tl.store(l_ptrs, l_i)
334
+ tl.store(m_ptrs, m_i)
335
+ else:
336
+ tl.store(l_ptrs, l_i, mask=offs_m < Q_LEN)
337
+ tl.store(m_ptrs, m_i, mask=offs_m < Q_LEN)
338
+ # initialize pointers to output
339
+ # offs_n = tl.arange(0, BLOCK_DMODEL)
340
+ off_o = off_z * stride_oz + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :] * stride_od
341
+ out_ptrs = Out + off_o
342
+ tl.store(out_ptrs, acc, mask=offs_m[:, None] < Q_LEN)
343
+ if NUM_DBLOCKS >= 2:
344
+ tl.store(out_ptrs + BLOCK_DMODEL * stride_od, acc2, mask=offs_m[:, None] < Q_LEN)
345
+
346
+
347
+ ## backward
348
+ @triton.heuristics(
349
+ {
350
+ 'EVEN_M_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_M'] == 0,
351
+ }
352
+ )
353
+ @triton.jit
354
+ def _bwd_preprocess(
355
+ Out, DO, L, # assume contiguous for Out, DO, L, NewDO, Delta layout.
356
+ NewDO, Delta,
357
+ N_CTX,
358
+ BLOCK_M: tl.constexpr, D_HEAD: tl.constexpr,
359
+ EVEN_M_BLOCK: tl.constexpr,
360
+ ):
361
+ off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
362
+ off_d = tl.arange(0, D_HEAD)
363
+ # load
364
+ if EVEN_M_BLOCK:
365
+ o = tl.load(Out + off_m[:, None] * D_HEAD + off_d[None, :]).to(tl.float32)
366
+ do = tl.load(DO + off_m[:, None] * D_HEAD + off_d[None, :]).to(tl.float32)
367
+ else:
368
+ o = tl.load(Out + off_m[:, None] * D_HEAD + off_d[None, :], mask=off_m[:, None] < N_CTX).to(tl.float32)
369
+ do = tl.load(DO + off_m[:, None] * D_HEAD + off_d[None, :], mask=off_m[:, None] < N_CTX).to(tl.float32)
370
+ denom = tl.load(L + off_m).to(tl.float32)
371
+ # compute
372
+ do = do / denom[:, None]
373
+ delta = tl.sum(o * do, axis=1)
374
+ # write-back
375
+ if EVEN_M_BLOCK:
376
+ tl.store(NewDO + off_m[:, None] * D_HEAD + off_d[None, :], do)
377
+ else:
378
+ tl.store(NewDO + off_m[:, None] * D_HEAD + off_d[None, :], do, mask=off_m[:, None] < N_CTX)
379
+ tl.store(Delta + off_m, delta)
380
+
381
+
382
+ # Does not suuport unequal seqlen(q) and seqlen(k)
383
+ @triton.heuristics(
384
+ {
385
+ 'EVEN_M_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_M'] == 0,
386
+ 'EVEN_N_BLOCK': lambda kwargs: kwargs['N_CTX'] % kwargs['BLOCK_N'] == 0,
387
+ }
388
+ )
389
+ @triton.jit
390
+ def _bwd_kernel(
391
+ Q, K, V, sm_scale,
392
+ layout_ccol_ptr,
393
+ layout_row_ptr,
394
+ layout_ccol_stride_h, layout_ccol_stride_m,
395
+ layout_row_stride_h, layout_row_stride_m,
396
+ Out, DO, # assume contiguous: Out, DO, DQ, DK, DV, L, M, D, seq(q) == seq(k), with stride_oz, stride_oh, stride_om, stride_od,
397
+ DQ, DK, DV,
398
+ L, M,
399
+ D,
400
+ stride_qz, stride_qh, stride_qm, stride_qd,
401
+ stride_kz, stride_kh, stride_kn, stride_kd,
402
+ stride_vz, stride_vh, stride_vn, stride_vd,
403
+ stride_oz, stride_oh, stride_om, stride_od,
404
+ # stride_dz, stride_dh, stride_dm, stride_dd,
405
+ Z, H, N_CTX,
406
+ num_block,
407
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
408
+ BLOCK_N: tl.constexpr,
409
+ EVEN_M_BLOCK: tl.constexpr,
410
+ EVEN_N_BLOCK: tl.constexpr,
411
+ NUM_DBLOCKS: tl.constexpr,
412
+ ):
413
+ start_n = tl.program_id(0)
414
+ off_hz = tl.program_id(1)
415
+ off_z = off_hz // H
416
+ off_h = off_hz % H
417
+ # offset pointers for batch/head
418
+ Q += off_z * stride_qz + off_h * stride_qh
419
+ K += off_z * stride_kz + off_h * stride_kh
420
+ V += off_z * stride_vz + off_h * stride_vh
421
+ DO += off_z * stride_oz + off_h * stride_oh
422
+ DQ += off_z * stride_oz + off_h * stride_oh
423
+ DK += off_z * stride_oz + off_h * stride_oh
424
+ DV += off_z * stride_oz + off_h * stride_oh
425
+ # Look like this loop can be parallelled
426
+ # for start_n in range(0, num_block):
427
+
428
+ offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
429
+ offs_m = tl.arange(0, BLOCK_M)
430
+ offs_d = tl.arange(0, BLOCK_DMODEL)
431
+ # initialize pointers to value-like data
432
+ k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kd)
433
+ v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :] * stride_vd)
434
+
435
+ # pointer to row-wise quantities in value-like data
436
+ D_ptrs = D + off_hz * N_CTX
437
+ m_ptrs = M + off_hz * N_CTX
438
+ # initialize dv and dk
439
+ dv = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
440
+ dk = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
441
+ # k and v stay in SRAM throughout
442
+ if EVEN_N_BLOCK:
443
+ k = tl.load(k_ptrs)
444
+ v = tl.load(v_ptrs)
445
+ else:
446
+ k = tl.load(k_ptrs, mask=offs_n[:, None] < N_CTX)
447
+ v = tl.load(v_ptrs, mask=offs_n[:, None] < N_CTX)
448
+
449
+ if NUM_DBLOCKS >= 2:
450
+ dv2 = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
451
+ dk2 = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32)
452
+ if EVEN_N_BLOCK:
453
+ k2 = tl.load(k_ptrs + BLOCK_DMODEL * stride_kd)
454
+ v2 = tl.load(v_ptrs + BLOCK_DMODEL * stride_vd)
455
+ else:
456
+ k2 = tl.load(k_ptrs + BLOCK_DMODEL * stride_kd, mask=offs_n[:, None] < N_CTX)
457
+ v2 = tl.load(v_ptrs + BLOCK_DMODEL * stride_vd, mask=offs_n[:, None] < N_CTX)
458
+
459
+ # loop over rows
460
+
461
+ layout_ptr = layout_ccol_ptr + off_h * layout_ccol_stride_h + start_n * layout_ccol_stride_m
462
+ start_l = tl.load(layout_ptr).to(tl.int32)
463
+ end_l = tl.load(layout_ptr + layout_ccol_stride_m).to(tl.int32)
464
+
465
+ for row_idx_idx in range(start_l, end_l):
466
+ row_idx = tl.load(layout_row_ptr + off_h * layout_row_stride_h + row_idx_idx * layout_row_stride_m).to(tl.int32)
467
+ start_m = row_idx * BLOCK_M
468
+
469
+ # offs_qm = start_m + tl.arange(0, BLOCK_M)
470
+ offs_m_curr = start_m + offs_m
471
+ q_ptrs = Q + (offs_m_curr[:, None] * stride_qm + offs_d[None, :] * stride_qd)
472
+ do_ptrs = DO + (offs_m_curr[:, None] * stride_om + offs_d[None, :] * stride_od)
473
+ dq_ptrs = DQ + (offs_m_curr[:, None] * stride_om + offs_d[None, :] * stride_od)
474
+
475
+ # load q, k, v, do on-chip
476
+ if EVEN_M_BLOCK:
477
+ q = tl.load(q_ptrs)
478
+ else:
479
+ q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < N_CTX)
480
+ # re-compute p = softmax(qk, dim=-1).T
481
+ # NOTE: `do` is pre-divided by `l`; no normalization here
482
+ qk = tl.dot(q, tl.trans(k))
483
+
484
+ if NUM_DBLOCKS >= 2:
485
+ if EVEN_M_BLOCK:
486
+ q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd)
487
+ else:
488
+ q2 = tl.load(q_ptrs + BLOCK_DMODEL * stride_qd, mask=offs_m_curr[:, None] < N_CTX)
489
+ qk += tl.dot(q2, tl.trans(k2))
490
+
491
+ qk += tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), 0, float('-inf'))
492
+
493
+ if EVEN_M_BLOCK:
494
+ m = tl.load(m_ptrs + offs_m_curr)
495
+ else:
496
+ m = tl.load(m_ptrs + offs_m_curr, mask=offs_m_curr < N_CTX)
497
+ p = tl.exp(qk * sm_scale - m[:, None])
498
+
499
+ # compute dv
500
+ if EVEN_M_BLOCK:
501
+ do = tl.load(do_ptrs)
502
+ else:
503
+ do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < N_CTX)
504
+
505
+ if NUM_DBLOCKS >= 2:
506
+ if EVEN_M_BLOCK:
507
+ do2 = tl.load(do_ptrs + BLOCK_DMODEL * stride_od)
508
+ else:
509
+ do2 = tl.load(do_ptrs + BLOCK_DMODEL * stride_od, mask=offs_m_curr[:, None] < N_CTX)
510
+
511
+ dv += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do)
512
+
513
+ if NUM_DBLOCKS >= 2:
514
+ dv2 += tl.dot(tl.trans(p.to(Q.dtype.element_ty)), do2)
515
+
516
+ # compute dp = dot(v, do)
517
+ if EVEN_M_BLOCK:
518
+ Di = tl.load(D_ptrs + offs_m_curr)
519
+ else:
520
+ Di = tl.load(D_ptrs + offs_m_curr, mask=offs_m_curr < N_CTX)
521
+ dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
522
+ dp += tl.dot(do, tl.trans(v))
523
+
524
+ if NUM_DBLOCKS >= 2:
525
+ dp += tl.dot(do2, tl.trans(v2))
526
+
527
+ # compute ds = p * (dp - delta[:, None]) * sm_scale (delta/Di is already folded into dp above)
528
+ ds = p * dp * sm_scale
529
+ # compute dk = dot(ds.T, q)
530
+ dk += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q)
531
+ if NUM_DBLOCKS >= 2:
532
+ dk2 += tl.dot(tl.trans(ds.to(Q.dtype.element_ty)), q2)
533
+
534
+ # compute dq
535
+ dq = tl.dot(ds.to(Q.dtype.element_ty), k)
536
+ if EVEN_M_BLOCK:
537
+ tl.atomic_add(dq_ptrs, dq)
538
+ else:
539
+ tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < N_CTX)
540
+
541
+ if NUM_DBLOCKS >= 2:
542
+ dq2 = tl.dot(ds.to(Q.dtype.element_ty), k2)
543
+ dq_ptrs2 = dq_ptrs + BLOCK_DMODEL * stride_od
544
+ if EVEN_M_BLOCK:
545
+ tl.atomic_add(dq_ptrs2, dq2)
546
+ else:
547
+ tl.atomic_add(dq_ptrs2, dq2, mask=offs_m_curr[:, None] < N_CTX)
548
+
549
+ # write-back
550
+ dv_ptrs = DV + (offs_n[:, None] * stride_om + offs_d[None, :] * stride_od)
551
+ dk_ptrs = DK + (offs_n[:, None] * stride_om + offs_d[None, :] * stride_od)
552
+ if EVEN_N_BLOCK:
553
+ tl.store(dv_ptrs, dv)
554
+ tl.store(dk_ptrs, dk)
555
+ else:
556
+ tl.store(dv_ptrs, dv, mask=offs_n[:, None] < N_CTX)
557
+ tl.store(dk_ptrs, dk, mask=offs_n[:, None] < N_CTX)
558
+
559
+ if NUM_DBLOCKS >= 2:
560
+ dv_ptrs2 = dv_ptrs + BLOCK_DMODEL * stride_od
561
+ dk_ptrs2 = dk_ptrs + BLOCK_DMODEL * stride_od
562
+ if EVEN_N_BLOCK:
563
+ tl.store(dv_ptrs2, dv2)
564
+ tl.store(dk_ptrs2, dk2)
565
+ else:
566
+ tl.store(dv_ptrs2, dv2, mask=offs_n[:, None] < N_CTX)
567
+ tl.store(dk_ptrs2, dk2, mask=offs_n[:, None] < N_CTX)
568
+
569
+
570
+
571
+ def _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N, num_warps=None, num_stages=1, inference=None, out=None):
572
+ '''
573
+ :param q, k, v: [batch, n_heads, seq_len, model_dim]. The length of q may differ from that of k/v.
574
+ :param layout_crow_indices, layout_col_indices: same as CSR.crow_indices and CSR.col_indices, used to represent a sparse tensor.
575
+ Each element represents a block, i.e., all elements in a block are either attended to, or not attended to at all.
576
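+
+ Illustrative example (a sketch of the layout convention, assuming standard CSR semantics):
+ for a 3x3 block pattern where every query block attends to all previous blocks,
+ dense = [[1, 0, 0], [1, 1, 0], [1, 1, 1]],
+ layout_crow_indices = [0, 1, 3, 6] and layout_col_indices = [0, 0, 1, 0, 1, 2],
+ i.e., block-row i attends to the block-columns layout_col_indices[layout_crow_indices[i]:layout_crow_indices[i+1]].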
+ '''
577
+ assert q.shape[-1] == k.shape[-1] == v.shape[-1]
578
+ assert k.shape[2] == v.shape[2]
579
+ o = out if out is not None else torch.empty_like(q).contiguous()
580
+ grid = (triton.cdiv(q.shape[2], BLOCK_M), q.shape[0] * q.shape[1])
581
+
582
+ q_rounded_len = grid[0] * BLOCK_M
583
+ tmp = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)
584
+
585
+ if inference is None:
586
+ inference = (not q.requires_grad) and (not k.requires_grad) and (not v.requires_grad)
587
+
588
+ if inference:
589
+ L, m = tmp, tmp # no need to create new tensors
590
+ else:
591
+ L = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)
592
+ m = torch.empty((q.shape[0] * q.shape[1], q_rounded_len), device=q.device, dtype=torch.float32)
593
+
594
+ if layout_col_indices.dim() == 1:
595
+ layout_crow_indices = layout_crow_indices[None].expand(q.shape[1] , -1)
596
+ layout_col_indices = layout_col_indices[None].expand(q.shape[1] , -1)
597
+
598
+ assert q.shape[-1] in [64, 128]
599
+ BLOCK_DMODEL = 64
600
+
601
+ if num_warps is None:
602
+ MIN_D = min(BLOCK_M, BLOCK_N, BLOCK_DMODEL)
603
+ num_warps = max(1, 2 ** int(math.log2(MIN_D / 16)))
604
+ # print(f'> {BLOCK_M=}, {BLOCK_N=}, {BLOCK_DMODEL=}, {num_warps=}, {num_stages=}')
605
+ else:
606
+ assert math.log2(num_warps) % 1 == 0, f'''"num_warps" should be power of 2, but got {num_warps}.'''
607
+
608
+ ## For debugging:
609
+ # print(f'>> {q.shape=}, {k.shape=}, {BLOCK_M=}, {BLOCK_N=}, {num_warps=}, {BLOCK_DMODEL=}, {q.stride()=}, {k.stride()=}')
610
+ # print(f'>> {layout_crow_indices=}\n{layout_col_indices=}\n {layout_crow_indices.stride()=}, {layout_crow_indices.stride()=}')
611
+ # print(f'> {q.shape=}, {k.shape=}, {layout_crow_indices.shape}, {layout_col_indices.shape}, {layout_crow_indices.stride()}, \
612
+ # {layout_col_indices.stride()}, {layout_crow_indices=}, {layout_col_indices=}')
613
+
614
+ _fwd_kernel[grid](
615
+ q, k, v, sm_scale,
616
+ layout_crow_indices,
617
+ layout_col_indices,
618
+ layout_crow_indices.stride(0), layout_crow_indices.stride(1),
619
+ layout_col_indices.stride(0), layout_col_indices.stride(1),
620
+ tmp, L, m,
621
+ o,
622
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
623
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
624
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
625
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
626
+ q.shape[0], q.shape[1], k.shape[2],
627
+ k.shape[2] - q.shape[2],
628
+ q_rounded_len,
629
+ BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
630
+ BLOCK_DMODEL=BLOCK_DMODEL,
631
+ EVEN_M_BLOCK=q.shape[2] % BLOCK_M == 0,
632
+ EVEN_N_BLOCK=k.shape[2] % BLOCK_N == 0 ,
633
+ INFERENCE=inference,
634
+ NUM_DBLOCKS=q.shape[-1] // BLOCK_DMODEL,
635
+ num_warps=num_warps,
636
+ num_stages=num_stages,
637
+ )
638
+ if inference:
639
+ L, m = None, None
640
+
641
+ ctx.save_for_backward(q, k, v, o, L, m, layout_crow_indices, layout_col_indices)
642
+ ctx.BLOCK_M = BLOCK_M
643
+ ctx.BLOCK_N = BLOCK_N
644
+ ctx.BLOCK_DMODEL = BLOCK_DMODEL
645
+ # ctx.BLOCK = BLOCK
646
+ ctx.grid = grid
647
+ ctx.sm_scale = sm_scale
648
+ ctx.num_warps = num_warps
649
+ ctx.num_stages = num_stages
650
+ return o
651
+
652
+
653
+ def _backward(ctx, do, layout_ccol_indices, layout_row_indices, dq=None, dk=None, dv=None):
654
+ # q, k, v, o, l, m = ctx.saved_tensors
655
+ q, k, v, o, l, m, layout_crow_indices, layout_col_indices = ctx.saved_tensors
656
+
657
+ ## the following is too slow to do online, so get it from the inputs, which are cached.
658
+ # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(ctx.layout_crow_indices, ctx.layout_col_indices))
659
+ # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(layout_crow_indices, layout_col_indices))
660
+
661
+ if not do.is_contiguous():
662
+ do = do.contiguous()
663
+ ## for debugging
664
+ # print(f'----> do is not contiguous: {do.stride()=}')
665
+ # raise ValueError(f'>>>> output grad is not contiguous: {do.stride()=}')
666
+
667
+ if not o.is_contiguous():
668
+ # TODO: currently only works with contiguous q/k/v.
669
+ raise ValueError(f'--> output is not contiguous: {o.stride()=}. This may be caused by q/k/v not being contiguous.')
670
+
671
+
672
+ if layout_ccol_indices.dim() == 1:
673
+ layout_ccol_indices = layout_ccol_indices[None].expand(q.shape[1], -1)
674
+ layout_row_indices = layout_row_indices[None].expand(q.shape[1], -1)
675
+
676
+ # do = do.contiguous()
677
+ dq = dq if dq is not None else torch.zeros_like(q, dtype=torch.float32)
678
+ dk = dk if dk is not None else torch.empty_like(k)
679
+ dv = dv if dv is not None else torch.empty_like(v)
680
+ do_scaled = torch.empty_like(do)
681
+ delta = torch.empty_like(l)
682
+
683
+ assert o.stride() == dq.stride() == dk.stride() == dv.stride() == do_scaled.stride()
684
+
685
+ _bwd_preprocess[(ctx.grid[0] * ctx.grid[1], )](
686
+ o, do, l,
687
+ do_scaled, delta,
688
+ k.shape[2],
689
+ BLOCK_M=ctx.BLOCK_M, D_HEAD=q.shape[-1],
690
+ )
691
+
692
+ grid = (triton.cdiv(q.shape[2], ctx.BLOCK_N), ctx.grid[1])
693
+
694
+ _bwd_kernel[grid](
695
+ q, k, v, ctx.sm_scale,
696
+ layout_ccol_indices,
697
+ layout_row_indices,
698
+ layout_ccol_indices.stride(0), layout_ccol_indices.stride(1),
699
+ layout_row_indices.stride(0), layout_row_indices.stride(1),
700
+ o, do_scaled,
701
+ dq, dk, dv,
702
+ l, m,
703
+ delta,
704
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
705
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
706
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
707
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
708
+ q.shape[0], q.shape[1], q.shape[2],
709
+ ctx.grid[0],
710
+ BLOCK_M=ctx.BLOCK_M,
711
+ BLOCK_N=ctx.BLOCK_N,
712
+ BLOCK_DMODEL=ctx.BLOCK_DMODEL,
713
+ NUM_DBLOCKS=q.shape[-1] // ctx.BLOCK_DMODEL,
714
+ num_warps=ctx.num_warps,
715
+ num_stages=1,
716
+ )
717
+ return dq, dk, dv, None, None, None
718
+
719
+
720
+ class _sparse_attention(torch.autograd.Function):
721
+
722
+ @staticmethod
723
+ def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
724
+ BLOCK = 128
725
+ # shape constraints
726
+ return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK, BLOCK)
727
+
728
+ @staticmethod
729
+ def backward(ctx, do):
730
+ # q, k, v, o, l, m = ctx.saved_tensors
731
+ q, k, v, o, l, m, layout_crow_indices, layout_col_indices = ctx.saved_tensors
732
+ # TODO: the following is very inefficient.
733
+ # layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(ctx.layout_crow_indices, ctx.layout_col_indices))
734
+ layout_ccol_indices, layout_row_indices = dense_to_ccol_row(crow_col_to_dense(layout_crow_indices, layout_col_indices))
735
+ return _backward(ctx, do, layout_ccol_indices, layout_row_indices)
736
+
737
+
738
+
739
+ # suppressed
740
+ class _sparse_attention_inference(_sparse_attention):
741
+ # TODO: does not work for now, as BLOCK_M cannot be 1: shapes for tl.dot cannot be smaller than 16.
742
+ @staticmethod
743
+ def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
744
+ BLOCK = 128
745
+ return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, 1, BLOCK)
746
+
747
+
748
+
749
+ def sparse_attention_factory(BLOCK_M=128, BLOCK_N=128, **kwargs):
750
+ class _sparse_attention_config(_sparse_attention):
751
+ @staticmethod
752
+ def forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale):
753
+ # shape constraints
754
+ return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N,
755
+ **kwargs
756
+ )
757
+ return _sparse_attention_config.apply
758
+
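+ # Usage sketch for `sparse_attention_factory` (illustrative only; assumes the CSR layout
+ # tensors were built from a block mask, e.g. via `dense_to_crow_col`):
+ # attn_fn = sparse_attention_factory(BLOCK_M=128, BLOCK_N=128)
+ # out = attn_fn(q, k, v, layout_crow_indices, layout_col_indices, sm_scale)
+ # where q/k/v are contiguous (batch, n_heads, seq_len, head_dim) CUDA tensors.
+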
759
+
760
+ @lru_cache(maxsize=8)
761
+ def get_local_strided_sparse_attention_op(
762
+ n_heads: int,
763
+ max_seq_len:int,
764
+ sparse_block_size: int=128,
765
+ local_blocks: int=4,
766
+ vert_stride: int=4,
767
+ homo_head: bool=False,
768
+ dtype=torch.bfloat16,
769
+ device='cuda',
770
+ active_head_range=None,
771
+ verbose=True,
772
+ **kwargs):
773
+ '''
774
+ :param n_heads: total number of attention heads (regardless of tensor/model parallel)
775
+ :param max_seq_len: max sequence length. Needs to be greater than or equal to the length of the sequences.
776
+ :param sparse_block_size: sparse block size. Defaults to 128.
777
+ :param local_blocks: number of nearest blocks to attend to. Defaults to 4, i.e., attend to the previous 4 x block_size tokens.
778
+ :param vert_stride: Defaults to 4, meaning that besides the local blocks, every 4th block is additionally attended to (the vertical/strided part of the pattern).
779
+ :param homo_head: whether all heads share the same sparse pattern.
780
+ :param active_head_range: tuple of start & end indices of the heads, e.g., (8, 16). Defaults to using all heads.
781
+ Mainly for tensor/model parallelism, where heads are split across different GPUs.
782
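+
+ Example (an illustrative sketch, assuming a CUDA device with Triton available):
+ attn_fn = get_local_strided_sparse_attention_op(n_heads=32, max_seq_len=4096,
+ sparse_block_size=64, local_blocks=8, vert_stride=8, device='cuda')
+ out = attn_fn(q, k, v, sm_scale) # q, k, v: (batch, n_heads, seq_len, head_dim)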
+ '''
783
+
784
+ if verbose:
785
+ print((f'> new block_sparse_attn op constructed with config: '
786
+ f'{n_heads=}, {max_seq_len=}, {sparse_block_size=}, {local_blocks=}, '
787
+ f'{vert_stride=}, {homo_head=}, {active_head_range=}, {kwargs=}'))
788
+ # assert math.log2(max_seq_len) % 2 == 0, f"max_seq_len should be power of 2 to be more efficient"
789
+ _, block_sparse_pattern, _ = _get_sparse_attn_mask(n_heads, max_seq_len, max_seq_len, dtype, device,
790
+ BLOCK=sparse_block_size, local_blocks=local_blocks,
791
+ vert_stride=vert_stride, homo_head=homo_head,
792
+ return_dense=False)
793
+ if (not homo_head) and (active_head_range is not None):
794
+ assert isinstance(active_head_range, tuple)
795
+ assert len(active_head_range) == 2, '"active_head_range" should be a tuple of start/end index of the heads.'
796
+ h_start, h_end = active_head_range
797
+ block_sparse_pattern = block_sparse_pattern[h_start:h_end]
798
+ # print(block_sparse_pattern)
799
+ return get_sparse_attn_op(block_sparse_pattern, sparse_block_size, **kwargs)
800
+
801
+
802
+ def get_sparse_attn_op(
803
+ sparse_pattern: torch.tensor,
804
+ sparse_block_size: int=128,
805
+ kernel_block_size=128,
806
+ qkv_format='q,k,v',
807
+ **kwargs):
808
+ '''
809
+ Create a block-sparse op with a fixed layout. This avoids the need to create the CSR layout and convert it to a CSC layout every time,
810
+ which is very inefficient (it uses Python loops on the CPU; PyTorch 1.13 supports CSR->CSC, which may help).
811
+
812
+ :param sparse_pattern: sparse pattern of the blocks. Should be `num_blocks(q) x num_blocks(k)` or `n_heads x num_blocks x num_blocks`.
813
+ This tensor should have lower-triangular matrices on the last 2 dimensions for causal attention
814
+ :param sparse_block_size: sparse block size. Default to 128
815
+ :param kernel_block_size: the tile/block size to launch a triton instance. Default to None, i.e., same as `sparse_block_size`
816
+ :param qkv_format: Choices=['q,k,v', 'q, kv', 'qkv'], i.e., separated q,k,v, or kv packed, or qkv packed. Currently, only 'q,k,v' is supported.
817
+
818
+ :param kwargs: keyword arguments passed to `_forward`
819
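+
+ Example (illustrative): a dense causal pattern over 4 blocks, shared by all heads,
+ sparse_pattern = torch.tril(torch.ones(4, 4))
+ attn_fn = get_sparse_attn_op(sparse_pattern, sparse_block_size=128)
+ out = attn_fn(q, k, v, sm_scale)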
+ '''
820
+ # assert qkv_format in ('q,k,v', 'q, kv', 'qkv') # to save from running `concat` at forward/backward
821
+
822
+ assert qkv_format == 'q,k,v'
823
+
824
+ if kernel_block_size is None:
825
+ kernel_block_size = sparse_block_size
826
+ else:
827
+ assert sparse_block_size % kernel_block_size == 0, f"The sparse block size must be a multiple of {kernel_block_size}."
828
+ assert kernel_block_size >= 16 and math.log2(kernel_block_size) % 1 == 0, f"kernel_block_size must be a power of 2 and at least 16, but {kernel_block_size} was given"
829
+
830
+
831
+ # print(f'>> {sparse_pattern.shape=}')
832
+ # print(f'{sparse_pattern=}')
833
+ if sparse_block_size // kernel_block_size > 1:
834
+ _mul = sparse_block_size // kernel_block_size
835
+ # need to consider if block_m and block_n are different
836
+ sparse_pattern = torch.kron(sparse_pattern, sparse_pattern.new_ones(_mul, _mul))
837
+ num_sparse_blocks = sparse_pattern.size(-1)
838
+ block_causal_mask = torch.arange(0, num_sparse_blocks)[:, None] >= torch.arange(0, num_sparse_blocks)[None]
839
+ sparse_pattern *= block_causal_mask.type_as(sparse_pattern)
840
+ # print(f'>> after: {sparse_pattern.shape=}')
841
+ # print(f'{sparse_pattern=}')
842
+
843
+ BLOCK_N = kernel_block_size
844
+ NUM_BLOCK = sparse_pattern.size(-1)
845
+ MAX_SEQ_LEN = kernel_block_size * NUM_BLOCK
846
+
847
+ grand_layout_crow_indices, grand_layout_col_indices = dense_to_crow_col(sparse_pattern)
848
+ # sparse csc layout for backward
849
+ grand_layout_ccol_indices, grand_layout_row_indices = dense_to_ccol_row(sparse_pattern)
850
+
851
+
852
+ # Cache the GPU backward layout. Limit the cache size to avoid OOM over time.
853
+ # For inference, only one entry needs to be cached, since the sequence length only increases;
854
+ # the cache is therefore effectively rebuilt every `block_size` steps.
855
+ # For training/finetuning, set it to 8 to increase the cache hit rate.
856
+ # Given an input, block_len is the same for all layers, so the cache is very helpful.
857
+
858
+ max_cache_size = 1 if kwargs.get('inference', False) else 8
859
+
860
+ @lru_cache(maxsize=max_cache_size)
861
+ def get_backward_layout_by_block_len(block_len):
862
+ assert block_len <= NUM_BLOCK
863
+ if block_len == NUM_BLOCK:
864
+ return (grand_layout_ccol_indices, grand_layout_row_indices)
865
+ return dense_to_ccol_row(sparse_pattern[..., :block_len, :block_len])
866
+
867
+ # for debugging
868
+ # if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
869
+ # print(f'> {sparse_pattern.cpu().tolist()=}')
870
+ # print('----')
871
+ # print(f'> {grand_layout_crow_indices.cpu().tolist()=}\n{grand_layout_col_indices.cpu().tolist()=}')
872
+
873
+
874
+ # q, k, v separated
875
+ class _q_k_v_sparse_attention(torch.autograd.Function):
876
+ @staticmethod
877
+ def forward(ctx, q, k, v, sm_scale):
878
+ # assert q.shape[2] == 1 or q.shape[2] == k.shape[2]
879
+ # shape constraints
880
+ MIN_BLOCK_SIZE = 16
881
+ assert BLOCK_N >= MIN_BLOCK_SIZE
882
+ BLOCK_M = 16 if q.shape[2] <= 16 else BLOCK_N # BLOCK_M has to be power of 2
883
+
884
+ # the following code only works for causal attention
885
+ K_BLOCKS = triton.cdiv(k.shape[2], kernel_block_size)
886
+ # Q_START_BLOCKS = K_BLOCKS - 1 if q.shape[2] == 1 else 0
887
+ Q_START_BLOCKS = K_BLOCKS - triton.cdiv(q.shape[2], BLOCK_N)
888
+ # print(Q_START_BLOCKS, K_BLOCKS)
889
+
890
+ layout_crow_indices = grand_layout_crow_indices[..., Q_START_BLOCKS:K_BLOCKS+1]
891
+ layout_col_indices = grand_layout_col_indices
892
+ # print(BLOCK_M, BLOCK_N, Q_START_BLOCKS, K_BLOCKS+1, layout_crow_indices, layout_col_indices)
893
+
894
+ return _forward(ctx, q, k, v, layout_crow_indices, layout_col_indices, sm_scale, BLOCK_M, BLOCK_N,
895
+ **kwargs
896
+ )
897
+ @staticmethod
898
+ def backward(ctx, do):
899
+ q, k = ctx.saved_tensors[:2]
900
+ assert q.shape[2] == k.shape[2], '> currently backward can only be done if q, k have same length. Contact @EricLin if you need it.'
901
+ # assume q, k have same length
902
+ block_len = triton.cdiv(do.shape[2], kernel_block_size)
903
+ backward_layout = get_backward_layout_by_block_len(block_len)
904
+ return _backward(ctx, do, *backward_layout)[:4]
905
+
906
+
907
+ def _q_k_v_sparse_attention_fn(*args):
908
+ return _q_k_v_sparse_attention.apply(*args)
909
+
910
+ _q_k_v_sparse_attention_fn.sparse_pattern = sparse_pattern
911
+ _q_k_v_sparse_attention_fn.grand_layout_crow_indices = grand_layout_crow_indices
912
+ _q_k_v_sparse_attention_fn.grand_layout_col_indices = grand_layout_col_indices
913
+ _q_k_v_sparse_attention_fn.grand_layout_ccol_indices = grand_layout_ccol_indices
914
+ _q_k_v_sparse_attention_fn.grand_layout_row_indices = grand_layout_row_indices
915
+
916
+ return _q_k_v_sparse_attention_fn
917
+
918
+ ###########################################################
919
+ ###########################################################
920
+
921
+ ###########################################################
922
+ ################ Inference Kernels ########################
923
+ ###########################################################
924
+
925
+ def blocksparse_flash_attn_padded_fwd(
926
+ q, k, v, # (batch, tokens, n_heads, head_size)
927
+ sm_scale,
928
+ sparse_layout,
929
+ *,
930
+ left_paddings = None,
931
+ seqlens = None,
932
+ block_size = 64,
933
+ max_seqlen = None
934
+ ):
935
+ '''
936
+ q, k, v: (batch, tokens, n_heads/n_kv_heads, head_size)
937
+ left_paddings: (batch, ), number of left paddings for each sample.
938
+ seqlens: can be used to specify right padding. No need to specify if left_paddings is used.
939
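+
+ Example (illustrative; `sparse_layout` is the (crow_indices, col_indices) pair of the
+ block-sparse pattern, e.g. produced by `dense_to_crow_col`):
+ out = blocksparse_flash_attn_padded_fwd(q, k, v, sm_scale, sparse_layout,
+ left_paddings=torch.tensor([0, 3]), block_size=64)
+ where q, k, v are (batch, tokens, n_heads, head_size); decoding uses q with tokens == 1.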
+ '''
940
+ batches, q_len, n_heads, head_size = q.shape
941
+ _, k_len, n_kv_heads, _ = k.shape
942
+
943
+
944
+ assert q.dim() == k.dim() == v.dim() == 4
945
+ assert q.size(2) % k.size(2) == 0
946
+ assert q.size(0) == k.size(0) and q.size(3) == k.size(3)
947
+ assert k.shape == v.shape # TODO: allow diff head_size for k, v
948
+ assert q_len == 1 or q_len == k_len, \
949
+ f'q length can only be 1 (decoding) or the same as k length (prefilling).'
950
+
951
+ q_k_ratio = q.size(2) // k.size(2)
952
+
953
+ if max_seqlen:
954
+ assert k.size(1) <= max_seqlen, f'k has seqlen {k.size(1)} while max sequence length is set to {max_seqlen}.'
955
+
956
+ # padded positions always produce zero output; a little slower than using empty
957
+ out = q.new_zeros(q.shape)
958
+
959
+ layout_crow_indices, layout_col_indices = sparse_layout
960
+ block_d = triton.next_power_of_2(head_size)
961
+
962
+ if left_paddings is not None:
963
+ assert left_paddings.shape == (batches,)
964
+ k_batch_starts = left_paddings.to(q.device, dtype=torch.int32).contiguous()
965
+ else:
966
+ k_batch_starts = torch.zeros((batches,), dtype=torch.int32, device=q.device)
967
+
968
+ if seqlens is not None:
969
+ k_batch_ends = k_batch_starts + seqlens.type_as(k_batch_starts)
970
+ assert k_batch_ends.max() <= k_len, f'seqlens (+left_paddings if any) exceeds seqlen.'
971
+ else:
972
+ k_batch_ends = torch.zeros_like(k_batch_starts) + k_len
973
+
974
+ if q_len == 1:
975
+ q_batch_starts = torch.zeros_like(k_batch_starts)
976
+ q_batch_ends = q_batch_starts + 1
977
+ else:
978
+ q_batch_starts = k_batch_starts
979
+ q_batch_ends = k_batch_ends
980
+
981
+ # switch to CPU to avoid too many kernel launches when iterating over
982
+ q_lens = (q_batch_ends - q_batch_starts).cpu()
983
+ n_blocks = (q_lens + block_size - 1) // block_size
984
+
985
+ q_batch_ids = torch.tensor([i for i, n in enumerate(n_blocks) for _ in range(n)],
986
+ dtype=q_batch_starts.dtype,
987
+ device=q_batch_starts.device)
988
+ q_start_sids = torch.tensor([i * block_size for n in n_blocks for i in range(n)],
989
+ dtype=q_batch_starts.dtype,
990
+ device=q_batch_starts.device)
991
+
992
+ grid = (len(q_start_sids), n_heads)
993
+
994
+ _fwd_kernel_batch_inference[grid](
995
+ q, k, v, out,
996
+ sm_scale,
997
+ q_batch_starts,
998
+ q_batch_ends,
999
+ k_batch_starts,
1000
+ k_batch_ends,
1001
+ q_batch_ids,
1002
+ q_start_sids,
1003
+
1004
+ *q.stride(),
1005
+ *k.stride(),
1006
+ *v.stride(),
1007
+ *out.stride(),
1008
+
1009
+ layout_crow_indices,
1010
+ layout_col_indices,
1011
+ *layout_crow_indices.stride(),
1012
+ *layout_col_indices.stride(),
1013
+
1014
+ q_k_ratio,
1015
+ HAS_BATCH_DIM = True,
1016
+ D_HEAD = head_size,
1017
+ BLOCK_M = block_size,
1018
+ BLOCK_N = block_size,
1019
+ BLOCK_D = block_d,
1020
+ BLOCK_M_LOADING = 16 if q_len == 1 else block_size, # smaller for decoding
1021
+ EVEN_D = block_d == head_size,
1022
+ num_warps = 1 if q_len == 1 else 4,
1023
+ num_stages = 3
1024
+ )
1025
+
1026
+ return out
1027
+
1028
+
1029
+ def blocksparse_flash_attn_varlen_fwd(
1030
+ q, k, v, # (#tokens, n_heads, head_size)
1031
+ cu_seqlens_k,
1032
+ cu_seqlens_q,
1033
+ sm_scale,
1034
+ sparse_layout,
1035
+ *,
1036
+ block_size=64,
1037
+ max_seqlen = None
1038
+ ):
1039
+ # split q to blocks
1040
+ _, n_heads, head_size = q.shape
1041
+ batch_size = cu_seqlens_k.size(0) - 1
1042
+
1043
+
1044
+ # print(f'> {q.shape=}, {k.shape=}')
1045
+ assert q.dim() == k.dim() == v.dim() == 3
1046
+ assert q.size(1) % k.size(1) == 0
1047
+ assert q.size(2) == k.size(2)
1048
+ assert k.shape == v.shape # TODO: allow diff head_size for k, v
1049
+ assert cu_seqlens_k.dim() == 1
1050
+
1051
+ q_k_ratio = q.size(1) // k.size(1)
1052
+
1053
+ if cu_seqlens_q is None:
1054
+ if q.size(0) == batch_size: # decoding only
1055
+ cu_seqlens_q = torch.arange(0, batch_size + 1,
1056
+ dtype=cu_seqlens_k.dtype,
1057
+ device=cu_seqlens_k.device)
1058
+ elif q.size(0) == k.size(0):
1059
+ cu_seqlens_q = cu_seqlens_k
1060
+ else:
1061
+ raise ValueError('cu_seqlens_q must be specified if it is a mix of prefilling and decoding.')
1062
+ else:
1063
+ assert cu_seqlens_k.size(0) == cu_seqlens_q.size(0)
1064
+
1065
+ # switch to CPU to avoid too many kernel launches when iterating over
1066
+ q_lens = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).cpu()
1067
+ k_lens = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).cpu()
1068
+
1069
+ assert torch.logical_or(q_lens == 1, k_lens == q_lens).all(), \
1070
+ 'length of q should either be 1 (decoding) or same as k (prefilling).'
1071
+
1072
+ if max_seqlen:
1073
+ assert k_lens.max() <= max_seqlen
1074
+
1075
+ n_blocks = (q_lens + block_size - 1) // block_size
1076
+
1077
+ q_batch_ids = torch.tensor([i for i, n in enumerate(n_blocks) for _ in range(n)],
1078
+ dtype=cu_seqlens_q.dtype,
1079
+ device=cu_seqlens_q.device)
1080
+ q_start_sids = torch.tensor([i * block_size for n in n_blocks for i in range(n)],
1081
+ dtype=cu_seqlens_q.dtype,
1082
+ device=cu_seqlens_q.device)
1083
+
1084
+
1085
+ out = q.new_empty(q.shape)
1086
+ cu_seqlens_q = cu_seqlens_q.contiguous()
1087
+ cu_seqlens_k = cu_seqlens_k.contiguous()
1088
+
1089
+ layout_crow_indices, layout_col_indices = sparse_layout
1090
+ block_d = triton.next_power_of_2(head_size)
1091
+
1092
+ decoding_only = (q_lens == 1).all()
1093
+
1094
+ grid = (len(q_start_sids), n_heads)
1095
+
1096
+ _fwd_kernel_batch_inference[grid](
1097
+ q, k, v, out,
1098
+ sm_scale,
1099
+ cu_seqlens_q[:-1],
1100
+ cu_seqlens_q[1:],
1101
+ cu_seqlens_k[:-1],
1102
+ cu_seqlens_k[1:],
1103
+ q_batch_ids,
1104
+ q_start_sids,
1105
+
1106
+ 0, *q.stride(),
1107
+ 0, *k.stride(),
1108
+ 0, *v.stride(),
1109
+ 0, *out.stride(),
1110
+
1111
+ layout_crow_indices,
1112
+ layout_col_indices,
1113
+ *layout_crow_indices.stride(),
1114
+ *layout_col_indices.stride(),
1115
+
1116
+ q_k_ratio,
1117
+ HAS_BATCH_DIM = False,
1118
+ D_HEAD = head_size,
1119
+ BLOCK_M = block_size,
1120
+ BLOCK_N = block_size,
1121
+ BLOCK_D = block_d,
1122
+ BLOCK_M_LOADING = 16 if decoding_only else block_size, # smaller for decoding
1123
+ EVEN_D = block_d == head_size,
1124
+ num_warps = 1 if decoding_only else 4,
1125
+ num_stages = 3
1126
+ )
1127
+
1128
+ return out
1129
+
1130
+
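+ # Usage sketch for the varlen kernel (illustrative only): two prefill sequences of
+ # lengths 5 and 7 packed along the token dimension,
+ # cu_seqlens_k = torch.tensor([0, 5, 12], dtype=torch.int32, device='cuda')
+ # out = blocksparse_flash_attn_varlen_fwd(q, k, v, cu_seqlens_k, None, sm_scale,
+ # sparse_layout, block_size=64)
+ # where q, k, v are (total_tokens, n_heads, head_size) tensors; passing None for
+ # cu_seqlens_q reuses cu_seqlens_k (prefilling) or builds a 0..batch range (decoding).
+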
1131
+ @triton.jit
1132
+ def _fwd_kernel_inner(
1133
+ acc, l_i, m_i,
1134
+ q, Q,
1135
+ k_block_col_idx,
1136
+ layout_col_ptr,
1137
+ layout_col_stride_h, layout_col_stride_m,
1138
+ k_ptrs,
1139
+ v_ptrs,
1140
+ off_h, offs_m, offs_n, offs_d,
1141
+ stride_kt, stride_vt,
1142
+ sm_scale,
1143
+ k_seqlen,
1144
+ past_len,
1145
+ LAST_K_BLOCK: tl.constexpr,
1146
+ BLOCK_M_LOADING: tl.constexpr,
1147
+ BLOCK_N: tl.constexpr,
1148
+ D_HEAD: tl.constexpr,
1149
+ EVEN_D: tl.constexpr,
1150
+ M_LT_N: tl.constexpr
1151
+ ):
1152
+ k_block_id = tl.load(layout_col_ptr + off_h * layout_col_stride_h + k_block_col_idx * layout_col_stride_m).to(tl.int32)
1153
+ start_n = k_block_id * BLOCK_N
1154
+ # -- compute qk ----
1155
+ if LAST_K_BLOCK:
1156
+ if EVEN_D:
1157
+ k = tl.load(k_ptrs + start_n * stride_kt,
1158
+ mask=offs_n[None, :] + start_n < k_seqlen)
1159
+ else:
1160
+ # mask = mask & (offs_d[:, ])
1161
+ k = tl.load(k_ptrs + start_n * stride_kt,
1162
+ mask=(offs_n[None, :] + start_n < k_seqlen) & (offs_d[:, None] < D_HEAD))
1163
+ else:
1164
+ if EVEN_D:
1165
+ k = tl.load(k_ptrs + start_n * stride_kt)
1166
+ else:
1167
+ k = tl.load(k_ptrs + start_n * stride_kt,
1168
+ mask=offs_d[:, None] < D_HEAD)
1169
+
1170
+
1171
+ qk = tl.zeros([BLOCK_M_LOADING, BLOCK_N], dtype=tl.float32)
1172
+ qk += tl.dot(q, k)
1173
+
1174
+ qk *= sm_scale
1175
+
1176
+ # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N
1177
+ if LAST_K_BLOCK | M_LT_N:
1178
+ qk += tl.where(offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), 0, float('-inf'))
1179
+
1180
+ # -- compute m_ij, p, l_ij
1181
+ m_ij = tl.max(qk, 1)
1182
+ p = tl.exp(qk - m_ij[:, None])
1183
+
1184
+ l_ij = tl.sum(p, 1)
1185
+ # -- update m_i and l_i
1186
+ m_i_new = tl.maximum(m_i, m_ij)
1187
+ alpha = tl.exp(m_i - m_i_new)
1188
+ beta = tl.exp(m_ij - m_i_new)
1189
+ l_i_new = alpha * l_i + beta * l_ij
1190
+ # -- update output accumulator --
1191
+ # scale p
1192
+ p_scale = beta / l_i_new
1193
+ p = p * p_scale[:, None]
1194
+ # scale acc
1195
+ acc_scale = l_i / l_i_new * alpha
1196
+ acc = acc * acc_scale[:, None]
1197
+
1198
+ p = p.to(Q.dtype.element_ty)
1199
+ # update acc
1200
+ if LAST_K_BLOCK:
1201
+ if EVEN_D:
1202
+ v = tl.load(v_ptrs + start_n * stride_vt,
1203
+ mask=offs_n[:, None] + start_n < k_seqlen)
1204
+ else:
1205
+ v = tl.load(v_ptrs + start_n * stride_vt,
1206
+ mask=(offs_n[:, None] + start_n < k_seqlen) & (offs_d[None, :] < D_HEAD))
1207
+ else:
1208
+ if EVEN_D:
1209
+ v = tl.load(v_ptrs + start_n * stride_vt)
1210
+ else:
1211
+ v = tl.load(v_ptrs + start_n * stride_vt,
1212
+ mask=offs_d[None, :] < D_HEAD)
1213
+
1214
+ acc += tl.dot(p, v)
1215
+ # update m_i and l_i
1216
+ l_i = l_i_new
1217
+ m_i = m_i_new
1218
+ return acc, l_i, m_i
1219
+
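+ # Reference sketch (plain PyTorch, illustrative only) of the streaming-softmax update that
+ # `_fwd_kernel_inner` performs per key block: the running max `m_i` and normalizer `l_i`
+ # are rescaled so that `acc` always holds the output normalized over the keys seen so far.
+ def _online_softmax_update_reference(acc, l_i, m_i, qk, v):
+ # qk: (M, N) scaled logits of the current key block; v: (N, D) values of that block
+ m_ij = qk.max(dim=-1).values
+ p = torch.exp(qk - m_ij[:, None])
+ l_ij = p.sum(dim=-1)
+ m_i_new = torch.maximum(m_i, m_ij)
+ alpha = torch.exp(m_i - m_i_new)
+ beta = torch.exp(m_ij - m_i_new)
+ l_i_new = alpha * l_i + beta * l_ij
+ # rescale the running accumulator and add the newly weighted values
+ acc = acc * (l_i / l_i_new * alpha)[:, None] + (p * (beta / l_i_new)[:, None]) @ v
+ return acc, l_i_new, m_i_new
+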
1220
+
1221
+ @triton.heuristics(
1222
+ {
1223
+ 'M_LT_N': lambda kwargs: kwargs['BLOCK_M'] < kwargs['BLOCK_N'],
1224
+ }
1225
+ )
1226
+ @triton.jit
1227
+ def _fwd_kernel_batch_inference(
1228
+ Q, K, V, Out,
1229
+
1230
+ sm_scale,
1231
+ q_batch_starts,
1232
+ q_batch_ends,
1233
+ k_batch_starts,
1234
+ k_batch_ends,
1235
+ q_batch_ids,
1236
+ q_start_sids,
1237
+
1238
+ stride_qb, stride_qt, stride_qh, stride_qd,
1239
+ stride_kb, stride_kt, stride_kh, stride_kd,
1240
+ stride_vb, stride_vt, stride_vh, stride_vd,
1241
+ stride_ob, stride_ot, stride_oh, stride_od,
1242
+
1243
+ layout_crow_ptr,
1244
+ layout_col_ptr,
1245
+ layout_crow_stride_h, layout_crow_stride_m,
1246
+ layout_col_stride_h, layout_col_stride_m,
1247
+
1248
+ q_k_ratio,
1249
+
1250
+ HAS_BATCH_DIM: tl.constexpr,
1251
+ D_HEAD: tl.constexpr,
1252
+ BLOCK_M: tl.constexpr,
1253
+ BLOCK_N: tl.constexpr,
1254
+ BLOCK_D: tl.constexpr,
1255
+ BLOCK_M_LOADING: tl.constexpr,
1256
+ EVEN_D: tl.constexpr,
1257
+ M_LT_N: tl.constexpr
1258
+ ):
1259
+ '''
1260
+ NOTATION:
1261
+ pid: position id
1262
+ sid: storage id
1263
+ sbid: storage block id
1264
+ pbid: position block id
1265
+ offs_m, offs_n: storage offsets of m-dim(q, row) and n-dim(k, col)
1266
+
1267
+ q and blocks in KV need to be contiguous
1268
+
1269
+ Arguments:
1270
+ kv_seq_lens: used to compute past_len
1271
+ kv_storage_offsets: similar to block_tables in vllm, except it is dynamic.
1272
+ TODO: fix this
1273
+
1274
+ TODO:
1275
+ Optimize grouped-attn
1276
+
1277
+ CUDA graph support issue
1278
+ 1. grid is dynamic: vLLM sets up multiple CUDA graphs in the decoding phase, with different max token sizes (16, 32, ...);
1279
+ since we mix prompt and decoding phases here, it can be more complex.
1280
+ Need to set up a different CUDA graph for each different (off_zm, off_z).
1281
+
1282
+ # indeed, q_batch_ids can be padded to maximum number of grid[0], i.e., assume all decoding
1283
+ therefore, cu_seqlens_q, kv_seq_lens
1284
+
1285
+ '''
1286
+ off_zm = tl.program_id(0)
1287
+ off_h = tl.program_id(1)
1288
+
1289
+ off_h_for_kv = off_h // q_k_ratio
1290
+ off_z = tl.load(q_batch_ids + off_zm).to(tl.int32) # [0, 0, 0, 1]
1291
+ q_start_sid = tl.load(q_start_sids + off_zm)
1292
+ start_m = q_start_sid // BLOCK_M
1293
+
1294
+ if HAS_BATCH_DIM:
1295
+ Q += off_z * stride_qb
1296
+ K += off_z * stride_kb
1297
+ V += off_z * stride_vb
1298
+ Out += off_z * stride_ob
1299
+
1300
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M_LOADING)
1301
+ offs_n = tl.arange(0, BLOCK_N)
1302
+ offs_d = tl.arange(0, BLOCK_D)
1303
+
1304
+ q_cu_start = tl.load(q_batch_starts + off_z).to(tl.int32)
1305
+ q_seqlen = tl.load(q_batch_ends + off_z).to(tl.int32) - q_cu_start
1306
+
1307
+ k_cu_start = tl.load(k_batch_starts + off_z).to(tl.int32)
1308
+ k_seqlen = tl.load(k_batch_ends + off_z).to(tl.int32) - k_cu_start
1309
+
1310
+ past_len = k_seqlen - q_seqlen
1311
+
1312
+ Q += q_cu_start * stride_qt + off_h * stride_qh
1313
+ K += k_cu_start * stride_kt + off_h_for_kv * stride_kh
1314
+ V += k_cu_start * stride_vt + off_h_for_kv * stride_vh
1315
+ Out += q_cu_start * stride_ot + off_h * stride_oh
1316
+
1317
+ q_pbid = (past_len + q_start_sid) // BLOCK_M
1318
+
1319
+ if EVEN_D:
1320
+ q = tl.load(Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
1321
+ mask=offs_m[:, None] < q_seqlen)
1322
+ else:
1323
+ q = tl.load(Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd,
1324
+ mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD),
1325
+ other=0)
1326
+
1327
+ sparse_crow_ptr = layout_crow_ptr + off_h * layout_crow_stride_h + q_pbid * layout_crow_stride_m
1328
+
1329
+ # TODO: load at once, supported in new Triton
1330
+ k_block_start = tl.load(sparse_crow_ptr).to(tl.int32)
1331
+ k_block_end = tl.load(sparse_crow_ptr + 1).to(tl.int32)
1332
+
1333
+ m_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) - float('inf')
1334
+ l_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32)
1335
+ acc = tl.zeros([BLOCK_M_LOADING, BLOCK_D], dtype=tl.float32)
1336
+
1337
+ k_ptrs = K + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd
1338
+ v_ptrs = V + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd
1339
+
1340
+ for k_block_col_idx in range(k_block_start, k_block_end - 1):
1341
+ acc, l_i, m_i = _fwd_kernel_inner(
1342
+ acc, l_i, m_i,
1343
+ q, Q,
1344
+ k_block_col_idx,
1345
+ layout_col_ptr,
1346
+ layout_col_stride_h, layout_col_stride_m,
1347
+ k_ptrs,
1348
+ v_ptrs,
1349
+ off_h, offs_m, offs_n, offs_d,
1350
+ stride_kt, stride_vt,
1351
+ sm_scale,
1352
+ k_seqlen,
1353
+ past_len,
1354
+ False,
1355
+ BLOCK_M_LOADING,
1356
+ BLOCK_N,
1357
+ D_HEAD,
1358
+ EVEN_D,
1359
+ M_LT_N
1360
+ )
1361
+
1362
+ acc, l_i, m_i = _fwd_kernel_inner(
1363
+ acc, l_i, m_i,
1364
+ q, Q,
1365
+ k_block_end - 1,
1366
+ layout_col_ptr,
1367
+ layout_col_stride_h, layout_col_stride_m,
1368
+ k_ptrs,
1369
+ v_ptrs,
1370
+ off_h, offs_m, offs_n, offs_d,
1371
+ stride_kt, stride_vt,
1372
+ sm_scale,
1373
+ k_seqlen,
1374
+ past_len,
1375
+ True,
1376
+ BLOCK_M_LOADING,
1377
+ BLOCK_N,
1378
+ D_HEAD,
1379
+ EVEN_D,
1380
+ M_LT_N
1381
+ )
1382
+
1383
+ # write output
1384
+ if EVEN_D:
1385
+ tl.store(Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, acc,
1386
+ mask=offs_m[:, None] < q_seqlen)
1387
+ else:
1388
+ tl.store(Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, acc,
1389
+ mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD))
1390
+
1391
+
1392
+ ###########################################################
1393
+ ###########################################################
1394
+
1395
+ ###########################################################
1396
+ ################## Testing Utilities ######################
1397
+ ###########################################################
1398
+
1399
+
1400
+ def torch_attention(q, k, v, attn_mask=None, sm_scale=None, block_attn_mask=None, block_size=128, do=None):
1401
+ '''
1402
+ q, k, v: shape=(batch, n_heads, seq, dim)
1403
+ '''
1404
+ # for verification
1405
+ if sm_scale is None:
1406
+ sm_scale = 1. / math.sqrt(float(q.size(-1))) # default softmax scale of 1/sqrt(head_dim)
1407
+
1408
+ if block_attn_mask is not None:
1409
+ assert attn_mask is None
1410
+ outs = []
1411
+ for s in range(0, q.size(2), block_size):
1412
+ e = min(s + block_size, q.size(2))
1413
+ q_block = q[:, :, s:e]
1414
+ attn = torch.einsum('bhmd,bhnd->bhmn', q_block, k[:, :, :e]).float() * sm_scale
1415
+ mask = block_attn_mask[..., s // block_size, : (s // block_size + 1)]
1416
+ mask = torch.kron(mask, torch.ones(block_size, block_size, device=mask.device))
1417
+ mask[..., :, s:].masked_fill_(torch.arange(0, block_size)[:, None] <= torch.arange(0, block_size)[None, :], 0)
1418
+ attn = attn.masked_fill((1 - mask).bool(), float('-inf'))
1419
+ attn = attn.softmax(-1)
1420
+ out = torch.einsum('bhmn,bhnd->bhmd', attn.type_as(v), v[:, :, :e])
1421
+ outs.append(out)
1422
+ torch_output = torch.cat(outs, dim=2)
1423
+ else:
1424
+ attn = torch.einsum('bhmd,bhnd->bhmn', q, k).float() * sm_scale
1425
+ # import ipdb; ipdb.set_trace()
1426
+ if attn_mask is not None:
1427
+ attn = attn.masked_fill((1 - attn_mask).bool(), float('-inf'))
1428
+ # print(f'> torch attn: {attn.exp().sum(-1)=}')
1429
+
1430
+ attn = attn.softmax(-1)
1431
+ if do is not None:
1432
+ dv = torch.einsum('bhqk,bhqd->bhkd', attn.type_as(do), do)
1433
+ print(f'> torch_attn computed dv: {dv=}')
1434
+ torch_output = torch.einsum('bhmn,bhnd->bhmd', attn.type_as(v), v)
1435
+ return torch_output
1436
+
1437
+ ###########################################################
1438
+ ###########################################################
1439
+
1440
+ ###########################################################
1441
+ #################### Unit Tests ###########################
1442
+ ###########################################################
1443
+
1444
+
1445
+ @pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(2, 8, 2048, 128), (1, 4, 4096, 64)])
1446
+ def test_op(Z, H, N_CTX, D_HEAD, Q_LEN=None, dtype=torch.bfloat16, homo_head=True, kernel_block_size=None, sparse_block_size=128, backward=True,
1447
+ sparse_attention_fn=None, local_blocks=4, vert_stride=4, sm_scale=None, max_length=None):
1448
+ Q_LEN = Q_LEN or N_CTX
1449
+ torch.manual_seed(20)
1450
+ q = torch.empty((Z, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
1451
+ k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
1452
+ v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5) # .requires_grad_()
1453
+
1454
+ if sm_scale is None:
1455
+ sm_scale = 1. / math.sqrt(D_HEAD)
1456
+
1457
+ # for debugging
1458
+ # print(f'>> {q.shape=}, {k.shape=}, {v.shape=}, {homo_head=}, {kernel_block_size=}, {sparse_block_size=}, {local_blocks=}, {vert_stride=}')
1459
+ sm_scale = 0.0078125
1460
+ if backward:
1461
+ q.requires_grad_(), k.requires_grad_(), v.requires_grad_()
1462
+
1463
+ # qkv = torch.empty((Z, N_CTX, 3*H*D_HEAD), dtype=dtype, device='cuda').normal_(mean=0, std=.5)
1464
+ # q = qkv[..., :H*D_HEAD]
1465
+ # k = qkv[..., H*D_HEAD:2*H*D_HEAD]
1466
+ # v = qkv[..., 2*H*D_HEAD:]
1467
+ # q = q.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
1468
+ # k = k.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
1469
+ # v = v.view(Z, N_CTX, H, -1).permute(0, 2, 1, 3)
1470
+
1471
+ # if Q_LEN and Q_LEN < N_CTX:
1472
+ # q = q[:, :, -Q_LEN:] # .contiguous()
1473
+
1474
+ # q = q.requires_grad_()
1475
+ # k = k.requires_grad_()
1476
+ # v = v.requires_grad_()
1477
+
1478
+ dout = torch.randn_like(q).contiguous()
1479
+
1480
+ # dout = torch.eye(N_CTX)[:, :D_HEAD][None, None].expand_as(q).type_as(q).contiguous()
1481
+ # print(dout)
1482
+
1483
+ mask_csr, _, mask_dense = get_sparse_attn_mask(q, N_CTX, BLOCK=sparse_block_size,
1484
+ local_blocks=local_blocks, vert_stride=vert_stride, homo_head=homo_head, return_dense=True)
1485
+
1486
+ if sparse_attention_fn is None:
1487
+ sparse_attention_fn = get_local_strided_sparse_attention_op(H, N_CTX,
1488
+ sparse_block_size=sparse_block_size,
1489
+ local_blocks=local_blocks,
1490
+ vert_stride=vert_stride,
1491
+ homo_head=homo_head,
1492
+ device=q.device,
1493
+ dtype=q.dtype,
1494
+ kernel_block_size=kernel_block_size)
1495
+ # reference implementation
1496
+ ref_out = torch_attention(q, k, v, mask_dense, sm_scale)
1497
+
1498
+ # lengths = torch.full((Z,), fill_value=N_CTX, device='cuda')
1499
+ # cu_seqlens = torch.zeros((Z + 1,), device='cuda', dtype=torch.int32)
1500
+ # cu_seqlens[1:] = lengths.cumsum(0)
1501
+ # # qkv = torch.randn((Z * N_CTX, 3, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1502
+
1503
+ # qkv_list = list(map(lambda x: x.permute(0, 2, 1, 3).contiguous().view(Z * N_CTX, 1, H, D_HEAD), [q, k, v]))
1504
+ # qkv = torch.cat(qkv_list, dim=1)
1505
+ # ref_out0 = flash_attn_func(qkv, cu_seqlens, dropout_p=0, max_s=N_CTX, softmax_scale=sm_scale, causal=True)
1506
+ # ref_out = ref_out0.view(Z, N_CTX, H, D_HEAD).permute(0, 2, 1, 3).contiguous()
1507
+
1508
+
1509
+ if backward:
1510
+ ref_out.backward(dout)
1511
+ ref_dv, v.grad = v.grad.clone(), None
1512
+ ref_dk, k.grad = k.grad.clone(), None
1513
+ ref_dq, q.grad = q.grad.clone(), None
1514
+
1515
+ tri_out = sparse_attention_fn(q, k, v, sm_scale)
1516
+
1517
+ decimal = 1 if dtype == torch.bfloat16 else 2
1518
+ assert torch.allclose(ref_out.cpu(), tri_out.cpu(), atol=1e-2, rtol=0), f'>> {ref_out[0, 0, :, 0].tolist()=}\n\n{tri_out[0, 0, :, 0].tolist()=}'
1519
+
1520
+ if backward:
1521
+ tri_out.backward(dout)
1522
+ tri_dv, v.grad = v.grad.clone(), None
1523
+ tri_dk, k.grad = k.grad.clone(), None
1524
+ tri_dq, q.grad = q.grad.clone(), None
1525
+
1526
+ if backward:
1527
+ assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=1e-2)
1528
+ assert torch.allclose(ref_dk, tri_dk, atol=1e-2, rtol=0)
1529
+ assert torch.allclose(ref_dq, tri_dq, atol=1e-2, rtol=0)
1530
+
1531
+ print(f'> test passed: {Z=}, {H=}, {N_CTX=}, {D_HEAD=}, {Q_LEN=}, {dtype=}, {homo_head=}, {sparse_block_size=}')
1532
+
1533
+ ###########################################################
1534
+
1535
+ if __name__ == '__main__':
1536
+
1537
+ GPU_TYPE = os.popen('nvidia-smi --query-gpu=name --format=csv | tail -n 1').read().strip()
1538
+ # print(GPU_TYPE)
1539
+ support_backward = True # 'A100' in GPU_TYPE. Wasn't supported in consumer A1000.
1540
+
1541
+ ###############
1542
+ # benchmarking
1543
+
1544
+ HAS_DENSE_TRITON_FLASH = False
1545
+ # try:
1546
+ # from triton.ops.flash_attention import attention as triton_attention
1547
+ # HAS_DENSE_TRITON_FLASH = True
1548
+ # except:
1549
+ # HAS_DENSE_TRITON_FLASH = False
1550
+ # print('> cannot import Trition flash attn')
1551
+
1552
+ try:
1553
+ from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_unpadded_func
1554
+ HAS_FLASH = True
1555
+ except BaseException:
1556
+ HAS_FLASH = False
1557
+ print('> cannot import flash_attn')
1558
+
1559
+
1560
+ # BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64
1561
+ BATCH, N_HEADS, N_CTX, D_HEAD = 4, 32, 4096, 128 # 6.7B model, with 4k len
1562
+ # BATCH, N_HEADS, N_CTX, D_HEAD = 4, 16, 4096, 128 # 204m model
1563
+
1564
+ BLOCK_SIZE = 64
1565
+ LOCAl_BLOCKS = 8 # 4
1566
+ VERT_STRIDE = 1 # 16 # 8
1567
+ HOMO_HEAD = False
1568
+ sparse_type = 'homo' if HOMO_HEAD else 'hetero'
1569
+ dtype = torch.bfloat16
1570
+
1571
+
1572
+ modes = ['fwd', 'bwd'] if support_backward else ['fwd']
1573
+
1574
+ configs = [triton.testing.Benchmark(
1575
+ x_names=['SEQ_LEN'],
1576
+ x_vals=[2**i for i in range(8, 16)],
1577
+ line_arg='provider',
1578
+ line_vals=(['triton'] if HAS_DENSE_TRITON_FLASH else []) + (['flash'] if HAS_FLASH else []) + ['triton_sparse'],
1579
+ line_names=(['Triton-Dense'] if HAS_DENSE_TRITON_FLASH else []) + (['Flash-Dense'] if HAS_FLASH else []) + ['Triton-Sparse'],
1580
+ styles=[('red', '-'), ('blue', '-'), ('green', '-')],
1581
+ ylabel='ms',
1582
+ plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-sparse-local{LOCAl_BLOCKS}-vert{VERT_STRIDE}-{sparse_type}-{dtype}-{mode}',
1583
+ args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': dtype, 'mode': mode}
1584
+ ) for mode in modes]
1585
+
1586
+
1587
+ @triton.testing.perf_report(configs)
1588
+ def bench_flash_attention(BATCH, H, SEQ_LEN, D_HEAD, mode, provider, dtype=torch.bfloat16, device='cuda', sparse_attention_fn=None):
1589
+ assert mode in ['fwd', 'bwd']
1590
+ warmup = 25
1591
+ rep = 100
1592
+ N_CTX = SEQ_LEN
1593
+ if provider == 'triton':
1594
+ q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1595
+ k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1596
+ v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1597
+ sm_scale = 1.3
1598
+ fn = lambda: triton_attention(q, k, v, sm_scale)
1599
+ if mode == 'bwd':
1600
+ o = fn()
1601
+ do = torch.randn_like(o)
1602
+ fn = lambda: o.backward(do, retain_graph=True)
1603
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
1604
+ return ms
1605
+ if provider == 'triton_sparse':
1606
+ q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1607
+ k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1608
+ v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1609
+ sm_scale = 1.3
1610
+ # q_pos = torch.arange(N_CTX // BLOCK, device='cuda')[:, None]
1611
+ # k_pos = torch.arange(N_CTX // BLOCK, device='cuda')[None]
1612
+ # local_blocks = 4 # num_block per attn, block_size is tied to BLOCK
1613
+ # vert_stride =N_CTX + 1 # 4
1614
+ # mask_vert_strided = torch.arange(N_CTX // BLOCK, device='cuda') % vert_stride == vert_stride - 1
1615
+ # mask_dense = ((q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | mask_vert_strided)).type_as(q)
1616
+ # mask = mask_dense.to_sparse_csr()
1617
+ # mask_csr, _ = get_sparse_attn_mask(q, N_CTX, BLOCK=BLOCK, local_blocks=LOCAl_BLOCKS, vert_stride=VERT_STRIDE, homo_head=HOMO_HEAD)
1618
+
1619
+ if sparse_attention_fn is None:
1620
+ # sparse_attention_fn = sparse_attention
1621
+ sparse_attention_fn = get_local_strided_sparse_attention_op(H, SEQ_LEN,
1622
+ local_blocks=LOCAl_BLOCKS,
1623
+ vert_stride=VERT_STRIDE,
1624
+ homo_head=HOMO_HEAD,
1625
+ sparse_block_size=BLOCK_SIZE,
1626
+ kernel_block_size=BLOCK_SIZE,
1627
+ device=q.device)
1628
+ # sparse_attention_fn = sparse_attention_factory(128, 128, num_warps=8)
1629
+
1630
+ # fn = lambda: sparse_attention_fn(q, k, v, mask_csr[0], mask_csr[1], sm_scale)
1631
+ fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
1632
+ if mode == 'bwd':
1633
+ o = fn()
1634
+ do = torch.randn_like(o)
1635
+ fn = lambda: o.backward(do, retain_graph=True)
1636
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
1637
+ return ms
1638
+ if provider == 'flash':
1639
+ lengths = torch.full((BATCH,), fill_value=N_CTX, device=device)
1640
+ cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32)
1641
+ cu_seqlens[1:] = lengths.cumsum(0)
1642
+ qkv = torch.randn((BATCH * N_CTX, 3, H, D_HEAD), dtype=dtype, device=device, requires_grad=True)
1643
+ fn = lambda: flash_attn_func(qkv, cu_seqlens, 0., N_CTX, causal=True)
1644
+ if mode == 'bwd':
1645
+ o = fn()
1646
+ do = torch.randn_like(o)
1647
+ fn = lambda: o.backward(do, retain_graph=True)
1648
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
1649
+ return ms
1650
+
1651
+ # if provider == 'torch':
1652
+ # q = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1653
+ # k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1654
+ # v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=True)
1655
+ # sm_scale = 1.3
1656
+ # causal_mask = torch.tril(torch.ones(N_CTX, N_CTX)).type_as(q)
1657
+ # fn = lambda: torch_attention(q, k, v, causal_mask, sm_scale)
1658
+ # ms = triton.testing.do_bench(fn, percentiles=None, warmup=warmup, rep=rep)
1659
+ # return ms
1660
+
1661
+
1662
+ BATCH, N_HEADS, N_CTX, D_HEAD, Q_LEN = 4, 32, 4096, 128, 1 # 6.7B model, with 4k len
1663
+
1664
+ BLOCK_SIZE = 64
1665
+ LOCAl_BLOCKS = 8 # 4
1666
+ VERT_STRIDE = 16 # 8
1667
+ HOMO_HEAD = False
1668
+ sparse_type = 'homo' if HOMO_HEAD else 'hetero'
1669
+ dtype = torch.bfloat16
1670
+ MAX_N_CTX = 8192
1671
+
1672
+ configs = [triton.testing.Benchmark(
1673
+ x_names=['PAST_LEN'],
1674
+ x_vals=[2**i - 1 for i in range(8, 14)],
1675
+ line_arg='provider',
1676
+ line_vals=['torch'] + (['flash'] if HAS_FLASH else []) + ['triton_sparse', 'triton_dense'],
1677
+ line_names=['Torch'] + (['Flash-Dense'] if HAS_FLASH else []) + ['Triton-Sparse', 'Triton-Dense'],
1678
+ styles=[('red', '-'), ('blue', '-'), ('green', '-'), ('cyan', '-')],
1679
+ ylabel='ms',
1680
+ plot_name=f'fused-attention-inference-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-sparse-local{LOCAl_BLOCKS}-vert{VERT_STRIDE}-{sparse_type}',
1681
+ args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'Q_LEN': Q_LEN, 'dtype': torch.float16, 'mode': mode}
1682
+ ) for mode in ['fwd']]
1683
+ @triton.testing.perf_report(configs)
1684
+ def bench_flash_attention_inference(BATCH, H, PAST_LEN, D_HEAD, Q_LEN, mode, provider, dtype=torch.bfloat16, device='cuda'):
1685
+ assert mode in ['fwd']
1686
+ warmup = 25
1687
+ rep = 100
1688
+ N_CTX = PAST_LEN + Q_LEN
1689
+ if provider == 'torch':
1690
+ q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1691
+ k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1692
+ v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1693
+ sm_scale = 1.3
1694
+ mask_csr, _, mask_dense = get_sparse_attn_mask(q, N_CTX, BLOCK=BLOCK_SIZE,
1695
+ local_blocks=LOCAl_BLOCKS, vert_stride=VERT_STRIDE, homo_head=HOMO_HEAD, return_dense=True)
1696
+
1697
+ fn = lambda: torch_attention(q, k, v, mask_dense, sm_scale=sm_scale, block_size=2048)
1698
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
1699
+ return ms
1700
+ if provider == 'triton_sparse':
1701
+ q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1702
+ k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1703
+ v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1704
+ sm_scale = 1.3
1705
+ sparse_attention_fn = get_local_strided_sparse_attention_op(H, MAX_N_CTX,
1706
+ local_blocks=LOCAl_BLOCKS,
1707
+ vert_stride=VERT_STRIDE,
1708
+ homo_head=HOMO_HEAD,
1709
+ sparse_block_size=BLOCK_SIZE,
1710
+ kernel_block_size=BLOCK_SIZE,
1711
+ device=q.device,
1712
+ inference=True)
1713
+
1714
+ fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
1715
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
1716
+ return ms
1717
+ if provider == 'triton_dense':
1718
+ q = torch.randn((BATCH, H, Q_LEN, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1719
+ k = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1720
+ v = torch.randn((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1721
+ sm_scale = 1.3
1722
+ sparse_attention_fn = get_local_strided_sparse_attention_op(H, MAX_N_CTX,
1723
+ local_blocks=1,
1724
+ vert_stride=1,
1725
+ homo_head=True,
1726
+ sparse_block_size=BLOCK_SIZE,
1727
+ kernel_block_size=BLOCK_SIZE,
1728
+ device=q.device,
1729
+ inference=True)
1730
+
1731
+ fn = lambda: sparse_attention_fn(q, k, v, sm_scale)
1732
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
1733
+ return ms
1734
+ if provider == 'flash':
1735
+ assert Q_LEN == 1
1736
+ lengths = torch.full((BATCH,), fill_value=N_CTX, device=device)
1737
+ cu_seqlens = torch.zeros((BATCH + 1,), device=device, dtype=torch.int32)
1738
+ cu_seqlens[1:] = lengths.cumsum(0)
1739
+ cu_seqlens_q = torch.arange(BATCH + 1, device=device, dtype=torch.int32)
1740
+
1741
+ # (total_q, nheads, headdim),
1742
+ q = torch.randn((BATCH, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1743
+ k = torch.randn((BATCH*N_CTX, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1744
+ v = torch.randn((BATCH*N_CTX, H, D_HEAD), dtype=dtype, device='cuda', requires_grad=False)
1745
+
1746
+ fn = lambda: flash_attn_unpadded_func(q, k, v, cu_seqlens_q, cu_seqlens, 1, N_CTX, dropout_p=0, softmax_scale=1.3, causal=False)
1747
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
1748
+ return ms
1749
+
1750
+
1751
+ test_op(1, 4, 512, 128, dtype=torch.float16, homo_head=False, backward=support_backward)
1752
+ # bench_flash_attention.run(save_path='.', print_data=True)
1753
+
1754
+ bench_flash_attention_inference.run(save_path='.', print_data=True)
1755
+ exit()
1756
+ # head_dim=64
1757
+ test_op(1, 2, 1024, 64, kernel_block_size=64, sparse_block_size=64,
1758
+ dtype=torch.bfloat16, homo_head=False, backward=support_backward)
1759
+ # uneven length, bf16
1760
+ test_op(1, 16, 224, 128, dtype=torch.bfloat16, homo_head=False, backward=False, sparse_block_size=128,
1761
+ kernel_block_size=64, local_blocks=8, vert_stride=8)
1762
+ test_op(3, 2, 2047, 128, homo_head=False, backward=False)
1763
+
1764
+ # diff kernel/sparse block size
1765
+ test_op(1, 16, 224, 128, dtype=torch.bfloat16, homo_head=False, backward=False, kernel_block_size=64)
1766
+ # inference
1767
+ # test_op(1, 4, 512 + 256, 128, Q_LEN=1, dtype=torch.bfloat16, homo_head=False, backward=support_backward)
1768
+
1769
+ # dense flash attn
1770
+ test_op(1, 2, 1024, 128, kernel_block_size=128, sparse_block_size=128, dtype=torch.bfloat16, homo_head=False,
1771
+ backward=support_backward, local_blocks=1, vert_stride=1)
1772
+
1773
+ # fp16
1774
+ test_op(1, 4, 512 + 256, 128, dtype=torch.float16, homo_head=False, backward=support_backward)
1775
+
1776
+ # longer sequence
1777
+ test_op(2, 4, 8192, 64, homo_head=False, backward=support_backward)
1778
+ test_op(2, 4, 8192, 128, dtype=torch.bfloat16, homo_head=False, backward=support_backward)
1779
+
1780
+ # homo head
1781
+ test_op(3, 2, 2048, 64, homo_head=True, dtype=torch.bfloat16, backward=False)
1782
+ test_op(3, 2, 2048, 64, homo_head=True, backward=support_backward)
1783
+
1784
+ # sparse_attention_fn = sparse_attention_factory(16, 128, num_warps=1, INFERENCE=True)
1785
+ # test_op(8, 1, 2047, 128, 1, backward=False, sparse_attention_fn=None)
1786
+ # test_op_inference(3, 2, 2048, 128, 2048)
1787
+ # test_op_inference(3, 2, 2047, 64, 2047)
1788
+ # test_op_inference(3, 2, 256, 64, 128)
1789
+ # test_op_inference(3, 2, 2048, 64, 1)
1790
+
1791
+ bench_flash_attention.run(save_path='.', print_data=True)
1792
+ # bench_flash_attention_inference.run(save_path='.', print_data=True)
1793
+
1794
+ # ========================
1795
+ # Some Benchmark Results #
1796
+ # ========================
1797
+
1798
+ # fused-attention-batch4-head48-d64-sparse-local4-vert4-hetero-fwd
1799
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
1800
+ # 0 256.0 0.057184 0.069646 0.052567
1801
+ # 1 512.0 0.131688 0.187658 0.110212
1802
+ # 2 1024.0 0.391844 0.524990 0.247875
1803
+ # 3 2048.0 1.305190 1.456685 0.596506
1804
+ # 4 4096.0 4.623019 4.968653 1.600277
1805
+ # 5 8192.0 17.513062 18.332262 4.802458
1806
+ # 6 16384.0 68.453377 70.337540 16.052908
1807
+ # 7 32768.0 270.655487 276.020233 57.938946
1808
+ # fused-attention-batch4-head48-d64-sparse-local4-vert4-hetero-bwd (num_warp=8):
1809
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
1810
+ # 0 256.0 0.190120 0.150313 0.181451
1811
+ # 1 512.0 0.406348 0.391767 0.391177
1812
+ # 2 1024.0 1.029704 1.182967 0.885741
1813
+ # 3 2048.0 2.985456 3.843399 2.040469
1814
+ # 4 4096.0 9.808897 13.073701 5.069609
1815
+ # 5 8192.0 34.995201 47.863808 13.948782
1816
+ # 6 16384.0 132.740097 182.579193 42.816513
1817
+ # 7 32768.0 542.223389 714.820618 147.053574
1818
+ # fused-attention-inference-batch4-head32-d128-sparse-local4-vert4-hetero:
1819
+ # PAST_LEN Torch-Dense Flash-Dense Triton-Sparse
1820
+ # 0 256.0 0.050949 0.032357 0.107513
1821
+ # 1 512.0 0.073624 0.050651 0.199086
1822
+ # 2 1024.0 0.107472 0.080379 0.245445
1823
+ # 3 2048.0 0.178423 0.129448 0.338259
1824
+ # 4 4096.0 0.327647 0.223106 0.517048
1825
+ # 5 8192.0 0.588423 0.411263 0.884606
1826
+ # 6 16384.0 1.098898 0.798941 1.611809
1827
+ # 7 32768.0 2.094537 1.594726 3.044160
+
+
+ # 6.7B
+ # fused-attention-batch4-head32-d128-sparse-local4-vert4-hetero-fwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.069208 0.082156 0.065097
+ # 1 512.0 0.138271 0.201393 0.144467
+ # 2 1024.0 0.391521 0.624614 0.322382
+ # 3 2048.0 1.268443 2.406325 0.784367
+ # 4 4096.0 4.455703 9.139097 2.100856
+ # 5 8192.0 16.764315 35.289600 6.328320
+ # 6 16384.0 65.221634 138.401794 21.069057
+ # 7 32768.0 257.251343 548.085754 76.111870
+ # fused-attention-batch4-head32-d128-sparse-local4-vert4-hetero-bwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.297118 0.266469 0.255255
+ # 1 512.0 0.672826 0.613685 0.552954
+ # 2 1024.0 1.718434 1.705066 1.251953
+ # 3 2048.0 4.936755 5.403875 2.927895
+ # 4 4096.0 15.911594 18.959362 7.436288
+ # 5 8192.0 55.357441 70.808578 21.140224
+ # 6 16384.0 208.188416 273.617920 68.018173
+ # 7 32768.0 806.037476 1081.453613 218.720261
+ # fused-attention-inference-batch4-head32-d128-sparse-local4-vert4-hetero:
+ # PAST_LEN Torch-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.050151 0.032337 0.107593
+ # 1 512.0 0.073409 0.051737 0.200200
+ # 2 1024.0 0.107533 0.082099 0.247067
+ # 3 2048.0 0.177259 0.128891 0.338510
+ # 4 4096.0 0.325866 0.223621 0.524842
+ # 5 8192.0 0.586926 0.408913 0.885490
+ # 6 16384.0 1.100834 0.793277 1.612271
+ # 7 32768.0 2.098851 1.595831 3.064544
+
+ # fused-attention-batch4-head32-d128-sparse-local4-vert8-hetero-fwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.066673 0.082037 0.065085
+ # 1 512.0 0.137379 0.201880 0.143473
+ # 2 1024.0 0.390675 0.624234 0.312046
+ # 3 2048.0 1.267739 2.406950 0.696045
+ # 4 4096.0 4.445138 9.136333 1.665788
+ # 5 8192.0 16.768614 35.265533 4.380486
+ # 6 16384.0 65.235970 138.393600 12.997633
+ # 7 32768.0 257.317902 550.442993 42.821121
+ # fused-attention-batch4-head32-d128-sparse-local4-vert8-hetero-bwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.296461 0.266581 0.254022
+ # 1 512.0 0.671427 0.613643 0.551283
+ # 2 1024.0 1.719918 1.704295 1.229982
+ # 3 2048.0 4.945305 5.403364 2.721906
+ # 4 4096.0 15.934293 18.960999 6.259371
+ # 5 8192.0 55.406593 70.832130 15.676929
+ # 6 16384.0 208.750595 275.004425 44.837891
+ # 7 32768.0 808.057861 1080.647705 141.856766
+ # fused-attention-inference-batch4-head32-d128-sparse-local4-vert8-hetero:
+ # PAST_LEN Torch-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.050739 0.032886 0.107837
+ # 1 512.0 0.073507 0.051996 0.200293
+ # 2 1024.0 0.106394 0.080679 0.240610
+ # 3 2048.0 0.177659 0.127660 0.287625
+ # 4 4096.0 0.326326 0.226971 0.377500
+ # 5 8192.0 0.586339 0.407367 0.559266
+ # 6 16384.0 1.102279 0.786221 0.920976
+ # 7 32768.0 2.097370 1.545090 1.644288
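+ # Comparing the vert4 and vert8 tables above at SEQ_LEN = 32768: doubling the vertical stride
+ # roughly halves the sparse kernel time (fwd 76.11 -> 42.82, inference 3.06 -> 1.64),
+ # as expected, since each query block then visits fewer key/value blocks.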
+
+
+ ################
+ ##### fp16 #####
+ ################
+
+ # fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-fwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.032518 0.035472 0.029939
+ # 1 512.0 0.054266 0.087841 0.054320
+ # 2 1024.0 0.133447 0.263090 0.102045
+ # 3 2048.0 0.384615 1.023293 0.201763
+ # 4 4096.0 1.300890 4.023936 0.449555
+ # 5 8192.0 4.774144 15.816704 1.150854
+ # 6 16384.0 18.220032 62.771198 3.356001
+ # 7 32768.0 71.405571 250.273788 10.976142
+ # fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-bwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.083342 0.069742 0.079496
+ # 1 512.0 0.159894 0.170995 0.151705
+ # 2 1024.0 0.386071 0.522407 0.331443
+ # 3 2048.0 1.067715 1.737333 0.715248
+ # 4 4096.0 3.382731 6.219520 1.597457
+ # 5 8192.0 11.857793 23.560448 3.879035
+ # 6 16384.0 44.422142 91.251709 10.626843
+ # 7 32768.0 175.011841 359.473145 32.340992
+
+
+ ################
+ ##### bf16 #####
+ ################
+
+ # fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-fwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.037636 0.035902 0.031512
+ # 1 512.0 0.058591 0.087229 0.058125
+ # 2 1024.0 0.143337 0.263919 0.108443
+ # 3 2048.0 0.414458 1.025985 0.214114
+ # 4 4096.0 1.390841 4.020010 0.480550
+ # 5 8192.0 5.067938 15.808171 1.230874
+ # 6 16384.0 19.442280 62.765057 3.597274
+ # 7 32768.0 75.501572 250.443771 11.768959
+ # fused-attention-batch4-head16-d64-sparse-local4-vert8-hetero-bwd:
+ # SEQ_LEN Triton-Dense Flash-Dense Triton-Sparse
+ # 0 256.0 0.084404 0.070663 0.082613
+ # 1 512.0 0.161510 0.172882 0.157661
+ # 2 1024.0 0.388954 0.526047 0.339855
+ # 3 2048.0 1.075814 1.736057 0.732420
+ # 4 4096.0 3.401622 6.221376 1.636039
+ # 5 8192.0 11.915136 23.483391 3.968725
+ # 6 16384.0 44.660225 91.302910 10.857130
+ # 7 32768.0 175.038467 359.048187 32.778240
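+ # The fp16 and bf16 tables above are nearly identical, so switching between the two half-precision
+ # types barely changes kernel time for this shape; at SEQ_LEN = 32768 the sparse fwd pass (~11) is
+ # roughly 6.5x faster than Triton-Dense (~71-75) and over 20x faster than Flash-Dense (~250).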