diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..e3eb456a457edebf8c4564a930dadd63884ef6be 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +flashattention_logo.png filter=lfs diff=lfs merge=lfs -text diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..e35a781665eafa7421c30241962ef8e49588bffc --- /dev/null +++ b/AUTHORS @@ -0,0 +1 @@ +Tri Dao, trid@cs.stanford.edu \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..e546cfc1327df2b91a891fa22a7c7c0cabee3290 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,91 @@ +# Inspired by https://github.com/anibali/docker-pytorch/blob/master/dockerfiles/1.10.0-cuda11.3-ubuntu20.04/Dockerfile +# ARG COMPAT=0 +ARG PERSONAL=0 +# FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 as base-0 +FROM nvcr.io/nvidia/pytorch:22.12-py3 as base + +ENV HOST docker +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 +# https://serverfault.com/questions/683605/docker-container-time-timezone-will-not-reflect-changes +ENV TZ America/Los_Angeles +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +# git for installing dependencies +# tzdata to set time zone +# wget and unzip to download data +# [2021-09-09] TD: zsh, stow, subversion, fasd are for setting up my personal environment. +# [2021-12-07] TD: openmpi-bin for MPI (multi-node training) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + curl \ + ca-certificates \ + sudo \ + less \ + htop \ + git \ + tzdata \ + wget \ + tmux \ + zip \ + unzip \ + zsh stow subversion fasd \ + && rm -rf /var/lib/apt/lists/* + # openmpi-bin \ + +# Allow running runmpi as root +# ENV OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 + +# # Create a non-root user and switch to it +# RUN adduser --disabled-password --gecos '' --shell /bin/bash user \ +# && echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user +# USER user + +# All users can use /home/user as their home directory +ENV HOME=/home/user +RUN mkdir -p /home/user && chmod 777 /home/user +WORKDIR /home/user + +# Set up personal environment +# FROM base-${COMPAT} as env-0 +FROM base as env-0 +FROM env-0 as env-1 +# Use ONBUILD so that the dotfiles dir doesn't need to exist unless we're building a personal image +# https://stackoverflow.com/questions/31528384/conditional-copy-add-in-dockerfile +ONBUILD COPY dotfiles ./dotfiles +ONBUILD RUN cd ~/dotfiles && stow bash zsh tmux && sudo chsh -s /usr/bin/zsh $(whoami) +# nvcr pytorch image sets SHELL=/bin/bash +ONBUILD ENV SHELL=/bin/zsh + +FROM env-${PERSONAL} as packages + +# Disable pip cache: https://stackoverflow.com/questions/45594707/what-is-pips-no-cache-dir-good-for +ENV PIP_NO_CACHE_DIR=1 + +# # apex and pytorch-fast-transformers take a while to compile so we install them first +# TD [2022-04-28] apex is already installed. 
In case we need a newer commit: +# RUN pip install --upgrade --force-reinstall --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" --global-option="--fmha" --global-option="--fast_layer_norm" --global-option="--xentropy" git+https://github.com/NVIDIA/apex.git#egg=apex + +# xgboost conflicts with deepspeed +RUN pip uninstall -y xgboost && DS_BUILD_UTILS=1 DS_BUILD_FUSED_LAMB=1 pip install deepspeed==0.7.7 + +# General packages that we don't care about the version +# zstandard to extract the_pile dataset +# psutil to get the number of cpu physical cores +# twine to upload package to PyPI +RUN pip install pytest matplotlib jupyter ipython ipdb gpustat scikit-learn spacy munch einops opt_einsum fvcore gsutil cmake pykeops zstandard psutil h5py twine gdown \ + && python -m spacy download en_core_web_sm +# hydra +RUN pip install hydra-core==1.3.1 hydra-colorlog==1.2.0 hydra-optuna-sweeper==1.2.0 pyrootutils rich +# Core packages +RUN pip install transformers==4.25.1 datasets==2.8.0 pytorch-lightning==1.8.6 triton==2.0.0.dev20221202 wandb==0.13.7 timm==0.6.12 torchmetrics==0.10.3 +# torchmetrics 0.11.0 broke hydra's instantiate + +# For MLPerf +RUN pip install git+https://github.com/mlcommons/logging.git@2.1.0 + +# Install FlashAttention +RUN pip install flash-attn==2.6.3 + +# Install CUDA extensions for fused dense +RUN pip install git+https://github.com/Dao-AILab/flash-attention@v2.6.3#subdirectory=csrc/fused_dense_lib diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5860e4b33f3d9d85fc636137c559331d51783a5b --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
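The Dockerfile above pins `flash-attn==2.6.3` and builds the fused-dense CUDA extension from the same v2.6.3 tag. A quick sanity check along these lines (an illustrative sketch, assuming the image built cleanly and a GPU is visible inside the container) confirms that both installs resolved:

```python
# Run inside the built container; a sketch, not part of the Dockerfile itself.
import torch
import flash_attn

assert torch.cuda.is_available(), "expected an NVIDIA GPU to be visible in the container"
print("torch:", torch.__version__, "| flash-attn:", flash_attn.__version__)  # expect 2.6.3

# The extension built from csrc/fused_dense_lib; the module name is the one that
# flash_attn.ops.fused_dense imports, and an ImportError here means the CUDA build failed.
import fused_dense_lib  # noqa: F401
```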
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..021b4d0f7d3fbf523d3b0d4934f2a1f46781cb50 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,11 @@ +recursive-include csrc *.cu +recursive-include csrc *.h +recursive-include csrc *.cuh +recursive-include csrc *.cpp +recursive-include csrc *.hpp + +recursive-include flash_attn *.cu +recursive-include flash_attn *.h +recursive-include flash_attn *.cuh +recursive-include flash_attn *.cpp +recursive-include flash_attn *.hpp diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..79aefc7973fb6163b84dc149012f5101b2d399af --- /dev/null +++ b/Makefile @@ -0,0 +1,9 @@ + +clean_dist: + rm -rf dist/* + +create_dist: clean_dist + python setup.py sdist + +upload_package: create_dist + twine upload dist/* diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6bb429e123a17fa027648c4238fe404a42895f86 --- /dev/null +++ b/README.md @@ -0,0 +1,231 @@ +# Optimized Transformer implementation +This repo contains examples of how FlashAttention can be integrated into a model +(e.g., GPT, ViT) and trained end-to-end. We also provide optimized +implementations of other layers (e.g., MLP, LayerNorm, cross-entropy loss, +rotary embedding). Overall this speeds up training by 3-5x compared to the +baseline implementation from Huggingface, reaching up to 189 TFLOPs/sec per A100, +equivalent to 60.6\% model FLOPs utilization (we don't need any activation +checkpointing). All without changing the model architecture (i.e., no +approximation). + +Goals: +- Performance: we optimize for model speed and memory, especially on 1-node + (e.g., with 8 A100s). +- Flexibility: we provide optimized building blocks (MLP, attention, LayerNorm), + and the model code illustrates how these components can be put together. + The training code also aims to be model- & task-agnostic. + +Non-goals (and other resources): +- Support as many models as possible: Huggingface's + [transformers](https://github.com/huggingface/transformers) and + [timm](https://github.com/rwightman/pytorch-image-models/) are great for this. +- Large-scale distributed training: our codebase has been used for multi-GPU and multi-node + training for models up to 2.7B parameters. However, if you're looking for large-scale distributed + training techniques (e.g., pipeline parallelism, tensor parallelism), + check out [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/) and + [DeepSpeed](https://github.com/microsoft/deepspeed). +- Inference: we currently focus on training (this might change in the future). + If you want fast inference, take a look at + [FasterTransformer](https://github.com/NVIDIA/FasterTransformer). +- Production: this codebase was written during several research projects to validate ideas + on speeding up ML models. + +## Model Components + +The GPT model is implemented +[here](https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/models/gpt.py). 
+And here's an example to construct the GPT3-1.3B model with rotary embedding: +```python +from transformers.models.gpt2.configuration_gpt2 import GPT2Config +from flash_attn.models.gpt import GPTLMHeadModel + +seqlen = 2048 +hidden_dim = 2048 +nheads = 16 +n_layer = 24 +rotary_emb_fraction = 0.5 +config = GPT2Config(vocab_size=50257, n_positions=seqlen, n_embd=hidden_dim, + n_layer=n_layer, n_head=nheads, + scale_attn_by_inverse_layer_idx=True, + rotary_emb_fraction=rotary_emb_fraction, + use_flash_attn=True, fused_mlp=True, + fused_bias_fc=True, fused_dropout_add_ln=True, + pad_vocab_size_multiple=8) +model = GPTLMHeadModel(config) +``` + +We provide the following optimized components: + +1. FlashAttention: fast and memory-efficient exact attention. This makes +attention much faster and saves a lot of activation memory. As a result we don't need +to use any activation checkpointing. +```sh +pip install flash-attn +``` + +2. Fused matmul + bias (forward and backward), and fused matmul + bias + gelu +(forward and backward), adapted from Apex's +[FusedDense](https://github.com/NVIDIA/apex/tree/master/apex/fused_dense). We +make it work for bfloat16. For best performance, you should use CUDA >= 11.8. CuBLAS versions before +this doesn't have the best matmul + bias + gelu performance for bfloat16. +```sh +cd ../csrc/fused_dense_lib && pip install . +``` +3. Optimized cross-entropy loss, adapted from Apex's +[Xentropy](https://github.com/NVIDIA/apex/tree/master/apex/contrib/xentropy). We make it work for bfloat16 and support in-place backward to save memory. +```sh +cd ../csrc/xentropy && pip install . +``` +4. Fused rotary embedding: +```sh +cd ../csrc/rotary && pip install . +``` +5. Fused dropout + residual + LayerNorm, adapted from Apex's +[FastLayerNorm](https://github.com/NVIDIA/apex/tree/master/apex/contrib/layer_norm). We add dropout and residual, and make it work for both pre-norm and post-norm architecture. +This supports dimensions divisible by 8, up to 6144. +```sh +cd ../csrc/layer_norm && pip install . +``` + +## Training + +We also provide here training scripts to train GPT2 on Openwebtext and GPT3 on +The Pile as examples. Feel free to use the model in your own training setup as +well. + +We use [Hydra](https://hydra.cc/) for configuration, +[Pytorch-Lightning](https://github.com/Lightning-AI/lightning) for training, and +[Wandb](https://wandb.ai/) for logging. + +We use the template from `https://github.com/ashleve/lightning-hydra-template`. +Please read the instructions there to understand the repo structure. + +### Requirements + +Python 3.8+, Pytorch 1.12+, torchvision, einops, timm, hydra-core, +hydra-colorlog, python-dotenv, rich, pytorch-lightning, triton, flash-attn. +We recommend CUDA 11.8 (e.g., using the Nvidia's Pytorch Docker image from https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) + +We provide a Dockerfile that lists all the required packages. + +### Dataset preparation + +Running the training command would automatically download the datasets +(Openwebtext, Pile), tokenize with the GPT2 tokenizer, concatenate all the +tokens, then save this cache to disk. Alternatively, you can also prepare the +datasets as a separate step. + +The cached datasets are saved to `${DATA_DIR}/openwebtext` and +`${DATA_DIR}/the_pile`. If `${DATA_DIR}` is not set, they will be saved to +`./data/{openwebtext,the_pile}`. 
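+As a bridge between the optimized components above and the training scripts below, here is
+a minimal sketch of one forward/backward pass using the optimized cross-entropy loss. It
+assumes the `model` and `seqlen` from the GPT3-1.3B example, a CUDA device, bf16 weights,
+and random token ids, so treat it as an illustration rather than part of the training code.
+
+```python
+import torch
+
+from flash_attn.losses.cross_entropy import CrossEntropyLoss
+
+# `model` and `seqlen` come from the GPTLMHeadModel construction example above.
+model = model.to(device="cuda", dtype=torch.bfloat16)
+input_ids = torch.randint(0, 50257, (2, seqlen), device="cuda")
+
+logits = model(input_ids).logits  # (batch, seqlen, padded vocab size)
+loss_fn = CrossEntropyLoss(inplace_backward=True)  # backward reuses the logits buffer
+loss = loss_fn(
+    logits[:, :-1].reshape(-1, logits.shape[-1]),  # predict token t+1 from tokens up to t
+    input_ids[:, 1:].reshape(-1),
+)
+loss.backward()
+```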
+ +- Openwebtext: +```sh +export PYTHONPATH=$PWD:$PYTHONPATH +pytest -q -s tests/datamodules/test_language_modeling_hf.py -k "openwebtext" +``` +This takes around 1h on a 64-core CPU. The processed dataset has size 17GB. + +- The Pile: +```sh +export PYTHONPATH=$PWD:$PYTHONPATH +pytest -q -s tests/datamodules/test_language_modeling_hf.py -k "pile" +``` +This takes around 20h on a 64-core CPU. The processed dataset has size 699GB. + +### GPT2 training on Openwebtext +To train GPT2 on Openwebtext with 8 GPUs: +```sh +python run.py experiment=owt/gpt2s-flash trainer.devices=8 # 125M +python run.py experiment=owt/gpt2m-flash trainer.devices=8 # 355M +python run.py experiment=owt/gpt2l-flash trainer.devices=8 # 760M +python run.py experiment=owt/gpt2xl-flash trainer.devices=8 # 1.6B +``` +The default parameters are set for 8 x A100 80GB. + +To train with bf16 instead of fp16, add `trainer.precision=bf16`. + +### GPT3 training on The Pile +To train GPT3 on The Pile with 8 GPUs: +```sh +python run.py experiment=pile/gpt3s-flash trainer.devices=8 # 125M +python run.py experiment=pile/gpt3m-flash trainer.devices=8 # 355M +python run.py experiment=pile/gpt3l-flash trainer.devices=8 # 760M +python run.py experiment=pile/gpt3xl-flash trainer.devices=8 # 1.3B +python run.py experiment=pile/gpt3-2.7B-flash-hdim128 trainer.devices=8 # 2.7B +``` +The default parameters are set for 8 x A100 80GB. We train with bf16 by default. + +To train with rotary embedding, run the experiments `pile/gpt3{s,m,l,xl}-flash-rotary`. + +### Training options + +**Gradient accumulation**: to adjust device batch size to fit into GPU memory +(the global batch size stays the same, and gradient accumulation is calculated +automatically), set `datamodule.batch_size=blah`. + +**Multi-node**: to train on multiple nodes, add `trainer.num_nodes=blah`. + +**Speed benchmarking**: to print out iteration time, add `+callbacks.speed_monitor.verbose=True`. + +**Resumable training**: set a name to the run, and then set `resume=True` when +you resume. Training will restart at exactly the same batch. +```sh +python run.py experiment=pile/gpt3s-flash trainer.devices=8 name=pile-gpt3s-flash resume=True +``` + +## Training speed + +We measure the wallclock training speed on one node with 8 x A100 80GB SXM4 80GB (400W) with NVLink. + +FLOPs are calculated using the formula from the [Megatron-LM +paper](https://arxiv.org/abs/2104.04473) (Section 5.1), except we scale by 3/4 +to get the model FLOPs (instead of hardware FLOPs with activation +checkpointing). + + +### GPT2 (sequence length 1024) + +![GPT2 speedup](../assets/gpt2_training_efficiency.jpg) + +The implementation in this repo (FlashAttention) is 3-4x faster than the +baseline implementation from Huggingface. + +### GPT3 (sequence length 2048) + +![GPT3 speedup](../assets/gpt3_training_efficiency.jpg) + +The implementation in this repo (FlashAttention) is 3-5x faster than the +baseline implementation from Huggingface. + +For the GPT3-2.7B model, we set head dimension to 128 (instead of 80) for better efficiency. + +We include here more details on the training speed with FlashAttention on 8 x +A100 80GB. 
+ +| Model | Batch size (tokens) | Through put (tokens/sec) | Hours / 1B tokens | +| --------- | ------------------- | ------------------------ | ----------------- | +| GPT3-125M | 0.5M | 1310k | 0.21 | +| GPT3-355M | 0.5M | 503k | 0.55 | +| GPT3-760M | 0.5M | 245k | 1.13 | +| GPT3-1.3B | 1M | 169k | 1.64 | +| GPT3-2.7B | 1M | 85k | 3.27 | + +As an example, this means that one can train a GPT3-1.3B model on 26B tokens +(compute-optimal according to Chinchilla scaling) in about 43 hours on 8 x A100. + +## Training quality + +We include here the loss curve for GPT2 on Openwebtext, trained for 200B tokens. +For GPT2, the runs with FlashAttention yield the same loss curve as the runs +with the baseline implementation from Huggingface for 125M and 355M models. For +larger models the baseline implementation just takes too long. + +![GPT2 training curve](../assets/gpt2_training_curve.jpg) + +We include here the loss curve for GPT3 on The Pile, trained for 400B tokens. +The 125M, 355M, 760M models have batch size 512k tokens so this translates to +800k training steps, while the 1.3B and 2.7B models have batch size 1M tokens, +which translates to 400k training steps. + +![GPT3 training curve](../assets/gpt3_training_curve.jpg) diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2e33087c53f36e50efce2ba54b90d9a7562262d5 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +__version__ = "3.0.0.b1" diff --git a/acc.yaml b/acc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe7a63f5c162d0bded2b19e4e227c1eb4147442e --- /dev/null +++ b/acc.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +acc: + _target_: src.metrics.accuracy.AccuracyMine diff --git a/acc_ignore_index.yaml b/acc_ignore_index.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03364aa1af55795a2b92efec04e46037b72c5761 --- /dev/null +++ b/acc_ignore_index.yaml @@ -0,0 +1,4 @@ +# @package eval.metrics +acc: + _target_: torchmetrics.Accuracy + ignore_index: -100 diff --git a/acctop5.yaml b/acctop5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f798ae0cb6d42a9f0651f928f23c13bc4ad16c1 --- /dev/null +++ b/acctop5.yaml @@ -0,0 +1,4 @@ +# @package eval.metrics +acctop5: + _target_: src.metrics.accuracy.AccuracyMine + top_k: 5 diff --git a/activations.py b/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..b00063b6bd497e10a70a201cfe246178174aad67 --- /dev/null +++ b/activations.py @@ -0,0 +1,135 @@ +# Copied from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/model/layers/activations.py +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 + +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) +@torch.jit.script +def bias_gelu(y, bias): + x = bias + y + return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=y.dtype) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, y, bias): + """Assume that y has shape (B, D) and bias has shape (D)""" + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) + grad_y = ff * g + return grad_y.to(dtype=y.dtype), grad_y.sum(dim=(0), dtype=bias.dtype) + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(input, bias) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, input, bias) + return tmp, tmp + + +bias_gelu_impl = GeLUFunction.apply + +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) +@torch.jit.script +def gelu_fwd(x): + return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=x.dtype) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def gelu_bwd(g, x): + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) + return (ff * g).to(dtype=x.dtype) + + +class FastGeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input): + ctx.save_for_backward(input) + return gelu_fwd(input) + + @staticmethod + def backward(ctx, grad_output): + (input,) = ctx.saved_tensors + tmp = gelu_bwd(grad_output, input) + return tmp + + +fast_gelu_impl = FastGeLUFunction.apply + + +@torch.jit.script +def relu_bwd(g, x): + return torch.where(x >= 0, g, 0.0).to(dtype=x.dtype) + + +@torch.jit.script +def sqrelu_fwd(x): + r = F.relu(x) + return (r * r).to(dtype=x.dtype) + + +@torch.jit.script +def sqrelu_bwd(g, x): + return (2.0 * g * F.relu(x)).to(dtype=x.dtype) + + +swiglu_fwd_codestring = """ +template T swiglu_fwd(T x, T y) { + return float(x) * float(y) / (1.0f + ::exp(-float(x))); +} +""" +swiglu_bwd_codestring = """ +template T swiglu_bwd(T x, T y, T g, T& dx, T& dy) { + float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); + dx = x_sigmoid * (1 + float(x) * (1.0f - x_sigmoid)) * float(g) * float(y); + dy = float(x) * x_sigmoid * float(g); +} +""" +swiglu_fwd = torch.cuda.jiterator._create_jit_fn(swiglu_fwd_codestring) +swiglu_bwd = torch.cuda.jiterator._create_multi_output_jit_fn(swiglu_bwd_codestring, num_outputs=2) + + +class SwiGLUFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, y): + ctx.save_for_backward(x, y) + return swiglu_fwd(x, y) + + @staticmethod + def backward(ctx, dout): + x, y = ctx.saved_tensors + return swiglu_bwd(x, y, dout) + +swiglu = SwiGLUFunction.apply diff --git a/adam.yaml b/adam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8821d74ced32918343f78ecb55d13c5102cf5ec --- /dev/null +++ b/adam.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: torch.optim.Adam diff --git a/adamw-apex-distributed.yaml b/adamw-apex-distributed.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..b7a5136ebe88ae64012cdf070dbc661be6577818 --- /dev/null +++ b/adamw-apex-distributed.yaml @@ -0,0 +1,3 @@ +# @package train.optimizer +_target_: apex.contrib.optimizers.distributed_fused_adam.DistributedFusedAdam +adam_w_mode: True diff --git a/adamw-apex-zero.yaml b/adamw-apex-zero.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f19d7a0445db380a3038fe141a885f3090f90fbb --- /dev/null +++ b/adamw-apex-zero.yaml @@ -0,0 +1,7 @@ +# @package train.optimizer +_target_: torch.distributed.optim.ZeroRedundancyOptimizer +_recursive_: True +optimizer_class: + _target_: apex.optimizers.FusedAdam + _partial_: True + adam_w_mode: True diff --git a/adamw-apex.yaml b/adamw-apex.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fdbf90fdfecac98313b6f51562b21f460521143c --- /dev/null +++ b/adamw-apex.yaml @@ -0,0 +1,3 @@ +# @package train.optimizer +_target_: apex.optimizers.FusedAdam +adam_w_mode: True diff --git a/adamw-zero.yaml b/adamw-zero.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66ea2fd03150e3811fa9387815ab8adcb2921697 --- /dev/null +++ b/adamw-zero.yaml @@ -0,0 +1,7 @@ +# @package train.optimizer +_target_: torch.distributed.optim.ZeroRedundancyOptimizer +_recursive_: True +optimizer_class: + _target_: torch.optim.__getattribute__ + _args_: + - "AdamW" diff --git a/adamw.yaml b/adamw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02252ec1cec06b2b06ef99fdeb9e324c1035738d --- /dev/null +++ b/adamw.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: torch.optim.AdamW diff --git a/alibi.h b/alibi.h new file mode 100644 index 0000000000000000000000000000000000000000..e714233e7eb8553f0c68d2a4ce02a4784dcfbd6b --- /dev/null +++ b/alibi.h @@ -0,0 +1,74 @@ +#include + +#include + +#include +#include + +#include "utils.h" + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Alibi { + + const float alibi_slope; + const int max_seqlen_k, max_seqlen_q; + + __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q) + : alibi_slope(alibi_slope) + , max_seqlen_k(max_seqlen_k) + , max_seqlen_q(max_seqlen_q) { + }; + + + template + __forceinline__ __device__ void apply_alibi(Tensor &tensor, + const int col_idx_offset_, + const int row_idx_offset, + const int warp_row_stride) { + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout::rank == 2, "Only support 2D Tensor"); + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + if constexpr (Is_causal) { // Simpler, we add the same bias vector to all rows + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; + } + } + } + } else { // Bias depends on both row_idx and col_idx + #pragma unroll + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; + #pragma unroll + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const 
int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + } + } + } + } + } + } + +}; + +} // namespace flash diff --git a/all_params.yaml b/all_params.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24a0b50486d3a579a15dd1b14e7aa45f3747aa88 --- /dev/null +++ b/all_params.yaml @@ -0,0 +1,49 @@ +_target_: pytorch_lightning.Trainer + +# default values for all trainer parameters +checkpoint_callback: True +default_root_dir: null +gradient_clip_val: 0.0 +process_position: 0 +num_nodes: 1 +num_processes: 1 +gpus: null +auto_select_gpus: False +tpu_cores: null +log_gpu_memory: null +overfit_batches: 0.0 +track_grad_norm: -1 +check_val_every_n_epoch: 1 +fast_dev_run: False +accumulate_grad_batches: 1 +max_epochs: 1 +min_epochs: 1 +max_steps: null +min_steps: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +val_check_interval: 1.0 +flush_logs_every_n_steps: 100 +log_every_n_steps: 50 +accelerator: null +sync_batchnorm: False +precision: 32 +weights_summary: "top" +weights_save_path: null +num_sanity_val_steps: 2 +truncated_bptt_steps: null +resume_from_checkpoint: null +profiler: null +benchmark: False +deterministic: False +reload_dataloaders_every_epoch: False +auto_lr_find: False +replace_sampler_ddp: True +terminate_on_nan: False +auto_scale_batch_size: False +prepare_data_per_node: True +plugins: null +amp_backend: "native" +amp_level: "O2" +move_metrics_to_cpu: False diff --git a/baichuan.py b/baichuan.py new file mode 100644 index 0000000000000000000000000000000000000000..97d030782187afdfa22b9ad0a9a264b9f6c0a95e --- /dev/null +++ b/baichuan.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023, GGGGGGXY, Tri Dao. + +import math +import json +import re +from pathlib import Path + +from collections import OrderedDict + +import torch +import torch.nn.functional as F + +from einops import rearrange +from transformers import GPT2Config, AutoConfig, PretrainedConfig + + +def remap_state_dict_hf_baichuan(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^model.", "transformer.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + + # Word embedding + def key_mapping_emb(key): + return re.sub( + r"^transformer.embed_tokens.", + "transformer.embeddings.word_embeddings.", + key, + ) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict[ + "transformer.embeddings.word_embeddings.weight" + ] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # Need to recompute vocab_size since Baichuan shards the word embeddings and output embeddings + # differently. 
+ vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.", + r"transformer.layers.\1.norm1.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for l in range(config.n_layer): + w1 = state_dict.pop(f"transformer.layers.{l}.mlp.gate_proj.weight") + w3 = state_dict.pop(f"transformer.layers.{l}.mlp.up_proj.weight") + # Our ordering is different + state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat( + [w3, w1], dim=0 + ) + + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).mlp.down_proj.", + r"transformer.layers.\1.mlp.fc2.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).self_attn.W_pack.", + r"transformer.layers.\1.mixer.Wqkv.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attn.o_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + for l in range(config.n_layer): + # pop rotary_emb.inv_freq from state dict + state_dict.pop(f"transformer.layers.{l}.self_attn.rotary_emb.inv_freq", None) + return state_dict + + +def baichuan_config_to_gpt2_config(baichuan_config: PretrainedConfig) -> GPT2Config: + # HACK: the config doesn't have say whether it's rotary or alibi. + # So we have to infer from the hidden size (7B -> rotary, 13B -> alibi). + # HACK: the config doesn't have say whether it uses norm head. + # So we have to infer from the vocab size + # (v1, vocab size 64k, no norm head; v2, vocab size 128k, norm head). 
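+    # Concretely: Baichuan-7B uses hidden_size 4096 (so rotary below), while Baichuan-13B
+    # uses 5120 (so ALiBi), which is what the 5000 threshold distinguishes.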
+ use_rotary = baichuan_config.hidden_size < 5000 + return GPT2Config( + vocab_size=baichuan_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=baichuan_config.hidden_size, + n_layer=baichuan_config.num_hidden_layers, + n_head=baichuan_config.num_attention_heads, + n_inner=baichuan_config.intermediate_size, + activation_function="swiglu", # Hardcode since HF calls it 'silu' + # baichuan doesn't have dropout, idk if it's because they only release the inference code + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=baichuan_config.rms_norm_eps, + initializer_range=baichuan_config.initializer_range, + bos_token_id=baichuan_config.bos_token_id, + eos_token_id=baichuan_config.eos_token_id, + # These are new arguments not in the original GPT2Config + pad_token_id=baichuan_config.pad_token_id, # Idk if this does anything + rms_norm=True, + rotary_emb_fraction=1.0 if use_rotary else 0.0, + rotary_emb_interleaved=False, + use_alibi=not use_rotary, + use_flash_attn=not use_rotary, # Alibi code path requires flash_attn + tie_word_embeddings=False, + norm_head=baichuan_config.vocab_size > 70000, + qkv_proj_bias=False, + out_proj_bias=False, + mlp_fc1_bias=False, + mlp_fc2_bias=False, + ) diff --git a/base.yaml b/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48de7a20d2d1d31e5c7166d618d16b1595d85395 --- /dev/null +++ b/base.yaml @@ -0,0 +1,82 @@ +# @package _global_ +defaults: + - override /trainer: default # choose trainer from 'configs/trainer/' + - override /model: null + - override /datamodule: openwebtext + # FusedAdam from apex speeds up the optimizer step a bit, for GPT2-small time + # per global step (i.e. batch size 512) on 8 A100s goes from 376ms to 368ms. + # For GPT2-medium time per global goes from 997ms to 972ms. + - override /optimizer: adamw-apex + - override /scheduler: linear-warmup + - override /callbacks: [default, norm-monitor] + - override /metrics: [perplexity, num-tokens] + - override /logger: wandb + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +task: + _target_: src.tasks.seq.SequenceLMModel + +seed: 1111 + +trainer: + accelerator: gpu + devices: 8 + num_nodes: 1 + accumulate_grad_batches: ${div_up:${train.global_batch_size}, ${eval:${trainer.devices} * ${datamodule.batch_size} * ${trainer.num_nodes}}} + max_steps: 400000 + val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}} + check_val_every_n_epoch: null # We don't care about epoch boundary + precision: 16 + gradient_clip_val: 1.0 + strategy: null + +datamodule: + batch_size: 16 # Per GPU + batch_size_eval: ${.batch_size} # Fused dense only support batch size at most 64k + max_length: 1024 + fault_tolerant: True + ddp: ${eval:"${trainer.devices} > 1"} + +train: + gpu_mem: ${eval:"round(float(__import__('subprocess').check_output('nvidia-smi -i 0 --query-gpu=memory.total --format=csv,noheader,nounits', shell=True).strip().decode()) / 1000)"} + global_batch_size: 512 + optimizer: + lr: 6e-4 + weight_decay: 0.1 + optimizer_param_grouping: + bias_weight_decay: False + normalization_weight_decay: False + scheduler: + num_warmup_steps: ${eval:0.01 * ${trainer.max_steps}} + num_training_steps: ${trainer.max_steps} + loss_fn: + # This is faster and uses less memory than torch.nn.CrossEntropyLoss. + # It's also more numerically stable if we're using DeepSpeed 16 bits. 
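+    # inplace_backward writes the gradient of the logits into the logits tensor itself,
+    # so the (batch * seqlen, vocab)-sized gradient buffer is not materialized separately.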
+ _target_: flash_attn.losses.cross_entropy.CrossEntropyLoss + inplace_backward: True # to save memory + +eval: + log_on_step: True # 1 training epoch takes too long, we want to see metrics per train step + +callbacks: + model_checkpoint: + monitor: val/loss + mode: min + save_top_k: 3 + save_last: True + every_n_train_steps: 1000 + dirpath: ${work_dir}/checkpoints/${oc.select:name,''} + filename: step_{step} + auto_insert_metric_name: False + model_checkpoint_progress: + _target_: src.callbacks.model_checkpoint.ModelCheckpointMine + fault_tolerant: True + every_n_train_steps: 50000 + save_last: False + save_top_k: -1 # Save all the checkpoints + dirpath: ${..model_checkpoint.dirpath} + filename: progress_step_{step} + auto_insert_metric_name: False + early_stopping: null diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..15b30405f209921189b75f7307814876350e7317 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,268 @@ +# Copyright (c) 2023, Tri Dao. +""" Useful functions for writing test code. """ + +import torch +import torch.utils.benchmark as benchmark + + +def benchmark_forward( + fn, *inputs, repeats=10, desc="", verbose=True, amp=False, amp_dtype=torch.float16, **kwinputs +): + """Use Pytorch Benchmark on the forward pass of an arbitrary function.""" + if verbose: + print(desc, "- Forward pass") + + def amp_wrapper(*inputs, **kwinputs): + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + fn(*inputs, **kwinputs) + + t = benchmark.Timer( + stmt="fn_amp(*inputs, **kwinputs)", + globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def benchmark_backward( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the backward pass of an arbitrary function.""" + if verbose: + print(desc, "- Backward pass") + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + y = fn(*inputs, **kwinputs) + if type(y) is tuple: + y = y[0] + if grad is None: + grad = torch.randn_like(y) + else: + if grad.shape != y.shape: + raise RuntimeError("Grad shape does not match output shape") + + def f(*inputs, y, grad): + # Set .grad to None to avoid extra operation of gradient accumulation + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + y.backward(grad, retain_graph=True) + + t = benchmark.Timer( + stmt="f(*inputs, y=y, grad=grad)", + globals={"f": f, "inputs": inputs, "y": y, "grad": grad}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def benchmark_combined( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.""" + if verbose: + print(desc, "- Forward + Backward pass") + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + y = fn(*inputs, **kwinputs) + if type(y) is tuple: + y = y[0] + if grad is None: + grad = torch.randn_like(y) + else: + if grad.shape != y.shape: + raise RuntimeError("Grad shape does not match output shape") + + def f(grad, *inputs, **kwinputs): + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + y = fn(*inputs, 
**kwinputs) + if type(y) is tuple: + y = y[0] + y.backward(grad, retain_graph=True) + + t = benchmark.Timer( + stmt="f(grad, *inputs, **kwinputs)", + globals={"f": f, "fn": fn, "inputs": inputs, "grad": grad, "kwinputs": kwinputs}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def benchmark_fwd_bwd( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.""" + return ( + benchmark_forward( + fn, + *inputs, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + benchmark_backward( + fn, + *inputs, + grad=grad, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + ) + + +def benchmark_all( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.""" + return ( + benchmark_forward( + fn, + *inputs, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + benchmark_backward( + fn, + *inputs, + grad=grad, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + benchmark_combined( + fn, + *inputs, + grad=grad, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + ) + + +def pytorch_profiler( + fn, + *inputs, + trace_filename=None, + backward=False, + amp=False, + amp_dtype=torch.float16, + cpu=False, + verbose=True, + **kwinputs, +): + """Wrap benchmark functions in Pytorch profiler to see CUDA information.""" + if backward: + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + out = fn(*inputs, **kwinputs) + if type(out) is tuple: + out = out[0] + g = torch.randn_like(out) + for _ in range(30): # Warm up + if backward: + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + out = fn(*inputs, **kwinputs) + if type(out) is tuple: + out = out[0] + # Backward should be done outside autocast + if backward: + out.backward(g, retain_graph=True) + activities = ([torch.profiler.ProfilerActivity.CPU] if cpu else []) + [ + torch.profiler.ProfilerActivity.CUDA + ] + with torch.profiler.profile( + activities=activities, + record_shapes=True, + # profile_memory=True, + with_stack=True, + ) as prof: + if backward: + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + out = fn(*inputs, **kwinputs) + if type(out) is tuple: + out = out[0] + if backward: + out.backward(g, retain_graph=True) + if verbose: + # print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=50)) + print(prof.key_averages().table(row_limit=50)) + if trace_filename is not None: + prof.export_chrome_trace(trace_filename) + + +def benchmark_memory(fn, *inputs, desc="", verbose=True, **kwinputs): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.synchronize() + fn(*inputs, **kwinputs) + torch.cuda.synchronize() + mem = torch.cuda.max_memory_allocated() / ((2**20) * 1000) + if verbose: + print(f"{desc} max memory: {mem}GB") + torch.cuda.empty_cache() + return mem diff --git 
a/benchmark_alibi.py b/benchmark_alibi.py new file mode 100644 index 0000000000000000000000000000000000000000..55da356270238af214e35bce1a6b479c83a51bef --- /dev/null +++ b/benchmark_alibi.py @@ -0,0 +1,275 @@ +# Copyright (c) 2024, Sanghun Cho, Tri Dao. + +import pickle +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat +from flash_attn.layers.rotary import apply_rotary_emb + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func, flash_attn_func + +try: + import xformers.ops as xops +except ImportError: + xops = None + + +def generate_cos_sin(seqlen, rotary_dim, device, dtype): + assert rotary_dim % 2 == 0 + angle = torch.rand(seqlen * 2, rotary_dim // 2, device=device) * 2 * math.pi + cos = torch.cos(angle).to(dtype=dtype) + sin = torch.sin(angle).to(dtype=dtype) + return cos, sin + + +def flash_rotary(q, k, v, cos, sin, causal=False): + # corrected by @tridao comments + q = apply_rotary_emb( + q, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True + ) + k = apply_rotary_emb( + k, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True + ) + + return flash_attn_func(q, k, v, causal=causal) + + +def attn_bias_from_alibi_slopes( + slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False +): + batch, nheads = slopes.shape + device = slopes.device + slopes = rearrange(slopes, "b h -> b h 1 1") + if causal: + return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes + else: + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + relative_pos = torch.abs(row_idx + sk - sq - col_idx) + return -slopes * relative_pos.to(dtype=slopes.dtype) + + +def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + + +def attention_pytorch(q, k, v, dropout_p=0.0, causal=True, attn_bias=None): + """ + Arguments: + q, k, v: (batch_size, seqlen, nheads, head_dim) + dropout_p: float + attn_bias: (batch_size, nheads, seqlen, seqlen) or (1, nheads, seqlen, seqlen) + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, nheads, d = q.shape + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + if attn_bias is not None: + scores = rearrange(attn_bias, 'b h t s -> (b h) t s') + else: + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=1.0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, 
seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=q.dtype) + + +def time_fwd_bwd(func, *args, **kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + + +repeats = 30 +device = 'cuda' +dtype = torch.float16 + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [64, 128] +dim = 2048 +dropout_p = 0.0 + +methods = (["fa2_alibi", "torch"] + + (["xformers"] if xops is not None else []) + + ["sdpa"] + + ["fa2_baseline"] + + ["fa2_rotary"]) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + # alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + alibi_slopes = torch.rand(1, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal).to(dtype) + attn_bias = repeat(attn_bias, "1 ... -> b ...", b=batch_size) + f, b = time_fwd_bwd( + flash_attn_func, + q, k, v, + dropout_p, + causal=causal, + # alibi_slopes=alibi_slopes, + alibi_slopes=None, + repeats=repeats, + verbose=False + ) + time_f[config, "fa2_baseline"] = f + time_b[config, "fa2_baseline"] = b + + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + flash_attn_func, + q, k, v, + dropout_p, + causal=causal, + alibi_slopes=rearrange(alibi_slopes, "1 h -> h"), + # alibi_slopes=None, + repeats=repeats, + verbose=False + ) + time_f[config, "fa2_alibi"] = f + time_b[config, "fa2_alibi"] = b + + try: + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, + q, k, v, + dropout_p, + causal=causal, + attn_bias=attn_bias, + repeats=repeats, + verbose=False + ) + except: # Skip if OOM + f, b = float('nan'), float('nan') + time_f[config, "torch"] = f + time_b[config, "torch"] = b + + # F.sdpa doesn't currently (torch 2.1) dispatch to flash-attn but just to be safe + with torch.backends.cuda.sdp_kernel(enable_flash=False): + q_pt = q.detach().requires_grad_(True).transpose(1, 2) + k_pt = k.detach().requires_grad_(True).transpose(1, 2) + v_pt = v.detach().requires_grad_(True).transpose(1, 2) + f, b = time_fwd_bwd( + F.scaled_dot_product_attention, + q_pt, k_pt, v_pt, + attn_mask=attn_bias, + dropout_p=dropout_p, + is_causal=causal, + repeats=repeats, + verbose=False + ) + time_f[config, "sdpa"] = f + time_b[config, "sdpa"] = b + + if xops is not None: + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + if causal: + attn_bias_xops = xops.LowerTriangularMask().add_bias(attn_bias.expand(-1, -1, seqlen, -1).to(dtype=q.dtype)) + # NotImplementedError: No operator found for `memory_efficient_attention_backward` 
with inputs: + # `flshattB@v2.3.6` is not supported because: + # attn_bias type is + # `cutlassB` is not supported because: + # attn_bias type is + attn_bias_xops = attn_bias_xops.materialize((batch_size, nheads, seqlen, seqlen), dtype=q.dtype, device=device) + else: + attn_bias_xops = attn_bias.to(dtype=q.dtype) + f, b = time_fwd_bwd( + xops.memory_efficient_attention, + q, k, v, + attn_bias_xops, + dropout_p, + repeats=repeats, + verbose=False + ) + time_f[config, "xformers"] = f + time_b[config, "xformers"] = b + + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + cos, sin = generate_cos_sin(seqlen, headdim, device, dtype) + f, b = time_fwd_bwd( + flash_rotary, + q, k, v, + cos, sin, + causal, + repeats=repeats, + verbose=False + ) + time_f[config, "fa2_rotary"] = f + time_b[config, "fa2_rotary"] = b + + print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + csv_output = "" + csv_output += f"{causal},{headdim},{batch_size},{seqlen}," + for method in methods: + time_f_b[config, method] = time_f[config, method] + time_b[config, method] + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + speed_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"), + time_b[config, method] + ) + speed_f_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"), + time_f_b[config, method] + ) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, " + f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, " + f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s" + ) + csv_output += f"{speed_f[config, method]:.2f},{speed_b[config, method]:.2f},{speed_f_b[config, method]:.2f}," + print(csv_output) diff --git a/benchmark_attn.py b/benchmark_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..74d2ce3c264cdb73536c791958ba3c180b9dd5be --- /dev/null +++ b/benchmark_attn.py @@ -0,0 +1,314 @@ +from functools import partial +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +import time + +try: + import cudnn +except ImportError: + cudnn = None + + +from einops import rearrange, repeat + +# from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.flash_attn_interface import flash_attn_func +from flash_attn_interface import flash_attn_func as flash_attn_func_v3, flash_attn_varlen_func as flash_attn_varlen_func_v3 + +# Need to install triton nightly: +# pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly + +try: + from triton_fused_attention import attention as triton_attention +except ImportError: + triton_attention = None + +def flops(batch, nheads, seqlen_q, seqlen_k, headdim, causal=False, mode='fwd'): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + + +def convert_to_cudnn_type(torch_type): + if torch_type == torch.float16: + return cudnn.data_type.HALF + elif torch_type == torch.bfloat16: + return 
cudnn.data_type.BFLOAT16 + elif torch_type == torch.float32: + return cudnn.data_type.FLOAT + elif torch_type == torch.int32: + return cudnn.data_type.INT32 + elif torch_type == torch.int64: + return cudnn.data_type.INT64 + else: + raise ValueError("Unsupported tensor data type.") + + +def cudnn_sdpa_setup(q, k, v, grad, o, stats, causal=False, varlen=False, seqlens=None): + b, nheads, seqlen_q, headdim = q.shape + _, nheads_kv, seqlen_k, _ = k.shape + assert v.shape == (b, nheads_kv, seqlen_k, headdim) + assert cudnn is not None, 'CUDNN is not available' + q_gpu, k_gpu, v_gpu = q, k, v + o_gpu, stats_gpu = o, stats + graph_forward = cudnn.pygraph( + io_data_type=convert_to_cudnn_type(q.dtype), + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + q_forward = graph_forward.tensor_like(q_gpu.detach()) + k_forward = graph_forward.tensor_like(k_gpu.detach()) + v_forward = graph_forward.tensor_like(v_gpu.detach()) + + seqlens_reshaped = seqlens if varlen else None + seq_len_q = graph_forward.tensor_like(seqlens_reshaped.detach()) if varlen else None + seq_len_kv = graph_forward.tensor_like(seqlens_reshaped.detach()) if varlen else None + + o_forward, stats_forward = graph_forward.sdpa( + name="sdpa", + q=q_forward, + k=k_forward, + v=v_forward, + is_inference=False, + attn_scale=1.0 / math.sqrt(headdim), + use_causal_mask=causal, + use_padding_mask=varlen, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + ) + + o_forward.set_output(True).set_dim(o_gpu.shape).set_stride(o_gpu.stride()) + stats_forward.set_output(True).set_data_type(cudnn.data_type.FLOAT) + + graph_forward.validate() + graph_forward.build_operation_graph() + graph_forward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph_forward.check_support() + graph_forward.build_plans() + + variant_pack_forward = { + q_forward: q_gpu, + k_forward: k_gpu, + v_forward: v_gpu, + o_forward: o_gpu, + stats_forward: stats_gpu, + seq_len_q: seqlens_reshaped, + seq_len_kv: seqlens_reshaped, + } + + dQ_gpu = torch.empty_like(q_gpu) + dK_gpu = torch.empty_like(k_gpu) + dV_gpu = torch.empty_like(v_gpu) + dO_gpu = grad + + graph_backward = cudnn.pygraph( + io_data_type=cudnn.data_type.HALF, + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + + q_backward = graph_backward.tensor_like(q_gpu.detach()) + k_backward = graph_backward.tensor_like(k_gpu.detach()) + v_backward = graph_backward.tensor_like(v_gpu.detach()) + o_backward = graph_backward.tensor_like(o_gpu.detach()) + dO_backward = graph_backward.tensor_like(dO_gpu.detach()) + stats_backward = graph_backward.tensor_like(stats_gpu.detach()) + seq_len_q = graph_backward.tensor_like(seqlens_reshaped.detach()) if varlen else None + seq_len_kv = graph_backward.tensor_like(seqlens_reshaped.detach()) if varlen else None + + dQ_backward, dK_backward, dV_backward = graph_backward.sdpa_backward( + name="sdpa_backward", + q=q_backward, + k=k_backward, + v=v_backward, + o=o_backward, + dO=dO_backward, + stats=stats_backward, + attn_scale=1.0 / math.sqrt(headdim), + use_causal_mask=causal, + use_padding_mask=varlen, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + ) + + dQ_backward.set_output(True).set_dim(dQ_gpu.size()).set_stride(dQ_gpu.stride()) + dK_backward.set_output(True).set_dim(dK_gpu.size()).set_stride(dK_gpu.stride()) + dV_backward.set_output(True).set_dim(dV_gpu.size()).set_stride(dV_gpu.stride()) + + graph_backward.validate() + graph_backward.build_operation_graph() + 
graph_backward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph_backward.check_support() + graph_backward.build_plans() + + variant_pack_backward = { + q_backward: q_gpu, + k_backward: k_gpu, + v_backward: v_gpu, + o_backward: o_gpu, + dO_backward: dO_gpu, + stats_backward: stats_gpu, + dQ_backward: dQ_gpu, + dK_backward: dK_gpu, + dV_backward: dV_gpu, + seq_len_q: seqlens_reshaped, + seq_len_kv: seqlens_reshaped, + } + + workspace = torch.empty( + max(graph_forward.get_workspace_size(), graph_backward.get_workspace_size()), + device="cuda", dtype=torch.uint8 + ) + + def run_fwd(*args, **kwargs): + graph_forward.execute(variant_pack_forward, workspace) + return o_gpu, stats_gpu + + def run_bwd(*args, **kwargs): + graph_backward.execute(variant_pack_backward, workspace) + return dQ_gpu, dK_gpu, dV_gpu + + return run_fwd, run_bwd + + +torch.manual_seed(0) +repeats = 100 +dropout_p = 0.0 +causal = False +dtype = torch.float16 +device = 'cuda' +verbose = False +batch_size = 2 +# seqlen = 2048 +seqlen = 8192 +# seqlen = 4096 +# seqlen = 2047 +dim = 2048 +# headdim = 128 +# headdim = 64 +headdim = 256 + +for mode in ['fwd', 'bwd']: +# for mode in ['bwd']: + for headdim in [64, 128, 256]: + # for headdim in [128]: + for seqlen in [1024, 2048, 4096, 8192, 16384, 32768]: + # for seqlen in [8192]: + nheads = dim // headdim + # nheads = 24 + # headdim = 64 + # batch_size = 64 + # seqlen = 512 + # nheads = 8 + # headdim = 128 + # nheads = 16 + # headdim = 128 + nheads_kv = nheads + # nheads_kv = 1 + + qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen, nheads_kv, headdim, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen, nheads_kv, headdim, device=device, dtype=dtype, requires_grad=True) + q_t = q.transpose(1, 2).contiguous().detach().requires_grad_() + k_t = k.transpose(1, 2).contiguous().detach().requires_grad_() + v_t = k.transpose(1, 2).contiguous().detach().requires_grad_() + grad = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype) + grad_t = grad.transpose(1, 2).contiguous() + o_t = torch.empty_like(q.transpose(1, 2)) + stats = torch.empty(batch_size, nheads, seqlen, 1, dtype=torch.float32, device=q.device) + + bench_fn = benchmark_forward if mode == 'fwd' else partial(benchmark_backward, grad=grad) + + for causal in [False, True]: + # for causal in [True]: + print(f"\n### {mode = }, {batch_size = }, {headdim = }, {seqlen = }, {causal = } ###") + # For var-seq-len + lens = torch.full([q.shape[0]], seqlen, dtype=torch.int32) + seqlens_cudnn = lens.reshape(batch_size, 1, 1, 1).contiguous().cuda() + cu_seqlens = torch.cat([torch.tensor([0], dtype=torch.int32), torch.cumsum(lens, dim=0, dtype=torch.int32)]).cuda() + if headdim <= 128 and cudnn is not None: + cudnn_sdpa_fwd, cudnn_sdpa_bwd = cudnn_sdpa_setup(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), grad.transpose(1, 2), o_t, stats, causal=causal) + cudnn_sdpa_fwd_varlen, cudnn_sdpa_bwd_varlen = cudnn_sdpa_setup(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), grad.transpose(1, 2), o_t, stats, causal=causal, varlen=True, seqlens=seqlens_cudnn) + f = flops(batch_size, nheads, seqlen, seqlen, headdim, causal=causal, mode=mode) + ref_o = flash_attn_func(q, k, v, dropout_p, causal=causal) + _, m0 = bench_fn(flash_attn_func, q, k, v, dropout_p, 
causal=causal, repeats=repeats, verbose=verbose, desc='Fav2') + if mode == 'bwd': + ref_dv, v.grad = v.grad.clone(), None + ref_dk, k.grad = k.grad.clone(), None + ref_dq, q.grad = q.grad.clone(), None + # pytorch_profiler(flash_attn_func, q, k, v, dropout_p, causal=causal, backward=False) + if headdim <= 128: + if triton_attention is not None and nheads_kv == nheads: + if mode == 'fwd': + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + _, m3 = benchmark_forward(triton_attention, q_t, k_t, v_t, causal, 1 / math.sqrt(headdim), repeats=repeats, verbose=verbose, desc='Triton') + # TODO: fix Triton numeric errors. + # if mode == 'bwd': + # dv, v_t.grad = v_t.grad.clone(), None + # dk, k_t.grad = k_t.grad.clone(), None + # dq, q_t.grad = q_t.grad.clone(), None + # torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05) + # torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05) + # torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05) + if cudnn is not None: + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + if mode == 'fwd': + _, m2 = benchmark_forward(cudnn_sdpa_fwd, repeats=repeats, verbose=verbose, desc='CuDNN') + _, m2_var = benchmark_forward(cudnn_sdpa_fwd_varlen, repeats=repeats, verbose=verbose, desc='CuDNN') + cudnn_sdpa_fwd() + torch.testing.assert_close(ref_o, o_t.transpose(1, 2), atol=0.05, rtol=0.05) + cudnn_sdpa_fwd_varlen() + torch.testing.assert_close(ref_o, o_t.transpose(1, 2), atol=0.05, rtol=0.05) + else: + cudnn_sdpa_fwd() + _, m2 = benchmark_forward(cudnn_sdpa_bwd, repeats=repeats, verbose=verbose, desc='CuDNN') + _, m2_var = benchmark_forward(cudnn_sdpa_bwd_varlen, repeats=repeats, verbose=verbose, desc='CuDNN') + dq, dk, dv = cudnn_sdpa_bwd() + torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05) + dq, dk, dv = cudnn_sdpa_bwd_varlen() + torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05) + # pytorch_profiler(cudnn_sdpa, backward=False) + + if headdim <= 128 or mode == 'fwd': + time.sleep(1) + _, m1 = bench_fn(flash_attn_func_v3, q, k, v, causal=causal, repeats=repeats, verbose=verbose, desc='Fav3') + q_var = q.reshape(-1, q.shape[-2], q.shape[-1]) + k_var = k.reshape(-1, k.shape[-2], k.shape[-1]) + v_var = v.reshape(-1, v.shape[-2], v.shape[-1]) + time.sleep(1) + if mode == 'bwd': + dv, v.grad = v.grad.clone(), None + dk, k.grad = k.grad.clone(), None + dq, q.grad = q.grad.clone(), None + torch.testing.assert_close(ref_dv, dv, atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dk, dk, atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dq, dq, atol=0.05, rtol=0.05) + + bench_var_fn = bench_fn + if mode == 'bwd': + grad_var = grad.reshape(-1, grad.shape[-2], grad.shape[-1]) + bench_var_fn = partial(benchmark_backward, grad=grad_var) + _, m1_var = bench_var_fn(flash_attn_varlen_func_v3, q_var, k_var, v_var, cu_seqlens, cu_seqlens, seqlen, seqlen, causal=causal, repeats=repeats, verbose=verbose, desc='Fav3 var len') + + # pytorch_profiler(flash_attn_func_v3, q, k, v, causal=causal, backward=False) + print(f'Fav2: {m0.mean * 1e3:.3f}ms, {(f / m0.mean * 1e-12):.1f} 
TFLOPS') + if headdim <= 128: + if mode == 'fwd' and triton_attention is not None and nheads_kv == nheads: + print(f'Triton: {m3.mean * 1e3:.3f}ms, {(f / m3.mean * 1e-12):.1f} TFLOPS') + if cudnn is not None: + print(f'CuDNN: {m2.mean * 1e3:.3f}ms, {(f / m2.mean * 1e-12):.1f} TFLOPS') + print(f'CuDNN varlen: {m2_var.mean * 1e3:.3f}ms, {(f / m2_var.mean * 1e-12):.1f} TFLOPS') + if headdim <= 128 or mode == 'fwd': + print(f'Fav3: {m1.mean * 1e3:.3f}ms, {(f / m1.mean * 1e-12):.1f} TFLOPS') + print(f'Fav3 varlen: {m1_var.mean * 1e3:.3f}ms, {(f / m1_var.mean * 1e-12):.1f} TFLOPS') + \ No newline at end of file diff --git a/benchmark_causal.py b/benchmark_causal.py new file mode 100644 index 0000000000000000000000000000000000000000..6c4797c83e0cc5c100d991a9d847f1e9b4351002 --- /dev/null +++ b/benchmark_causal.py @@ -0,0 +1,225 @@ +from functools import partial +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +# from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func +# # from flash_attn.triton.fused_attention import attention as attention +# from flash_attn.flash_attn_triton import flash_attn_qkvpacked_func +# from flash_attn.flash_attn_triton_og import attention as attention_og + +# from triton.ops.flash_attention import attention as attention_triton + +from flash_attn import flash_attn_qkvpacked_func, flash_attn_kvpacked_func + +try: + from flash_attn.fused_softmax import scaled_upper_triang_masked_softmax +except ImportError: + scaled_upper_triang_masked_softmax = None + + +def attention_pytorch(qkv, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=qkv.dtype) + + +def attention_megatron(qkv): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size 
* nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + attention = scaled_upper_triang_masked_softmax(scores, None, scale=1.0) + output = torch.einsum('bhts,bshd->bthd', attention, v) + return output.to(dtype=qkv.dtype) + + +torch.manual_seed(0) +repeats = 30 +batch_size = 8 +seqlen = 2048 +nheads = 12 +headdim = 128 +# nheads = 24 +# headdim = 64 +# batch_size = 64 +# seqlen = 512 +# nheads = 8 +# headdim = 128 +dropout_p = 0.0 +causal = True +dtype = torch.float16 +device = 'cuda' + +qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) +cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, + device=qkv.device) + +qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True) +# benchmark_all(flash_attn_varlen_qkvpacked_func, qkv_unpad, +# cu_seqlens, seqlen, dropout_p, causal=causal, repeats=repeats, desc='FlashAttention') +# pytorch_profiler(flash_attn_varlen_qkvpacked_func, qkv_unpad, +# cu_seqlens, seqlen, dropout_p, causal=causal, backward=True) +benchmark_forward(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, desc='Fav2') +pytorch_profiler(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, backward=False) + +# for dropout_p in [0.1, 0.0]: +# for causal in [False, True]: +# print(f"### {dropout_p = }, {causal = } ###") +# pytorch_profiler(fav2_qkvpacked_func, qkv, dropout_p, causal=causal, backward=True) + + +# nheads_k = 2 +# q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True) +# kv = torch.randn(batch_size, seqlen, 2, nheads_k, headdim, device=device, dtype=dtype, +# requires_grad=True) +# if fav2_kvpacked_func is not None: +# benchmark_all(fav2_kvpacked_func, q, kv, dropout_p, causal=causal, repeats=repeats, desc='Fav2') +# pytorch_profiler(fav2_kvpacked_func, q, kv, dropout_p, causal=causal, backward=True) + +# dropout_p = 0.0 +# causal = False +# benchmark_all(attention_pytorch, qkv, dropout_p, causal=causal, +# repeats=repeats, desc='PyTorch Attention') + +# benchmark_all(flash_attn_qkvpacked_func, qkv, None, causal, repeats=repeats, desc='FlashAttention Triton') +# pytorch_profiler(flash_attn_qkvpacked_func, qkv, None, causal, backward=True) + +# q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype, +# requires_grad=True) for _ in range(3)] +# benchmark_all(attention_og, q, k, v, 1.0, repeats=repeats, desc='FlashAttention Triton OG') +# # pytorch_profiler(attention, q, k, v, 1.0, backward=True) + +# if scaled_upper_triang_masked_softmax is not None: +# benchmark_all(attention_megatron, qkv, repeats=repeats, desc='Megatron Attention') + +# from src.ops.fftconv import fftconv_func + +# dim = nheads * headdim +# u = torch.randn(batch_size, dim, seqlen, device=device, dtype=dtype, requires_grad=True) +# k = torch.randn(dim, seqlen, device=device, requires_grad=True) +# D = torch.randn(dim, device=device, requires_grad=True) +# benchmark_all(fftconv_func, u, k, D, repeats=repeats, desc='FFTConv') +# pytorch_profiler(fftconv_func, u, k, D, backward=True) +# pytorch_profiler(torch.fft.rfft, u.float()) + +flops = 4 * batch_size * seqlen ** 2 * nheads * headdim +ideal_a100_time = flops / 312 / 1e9 +print(f"Ideal A100 fwd time: {ideal_a100_time:.3f}ms, bwd time: {ideal_a100_time * 2.5:.3f}ms") +exit(0) + + +def time_fwd_bwd(func, *args, 
**kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [64, 128] +dim = 2048 +dropout_p = 0.0 + +time_f = {} +time_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + nheads = dim // headdim + qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, + device=qkv.device) + qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True) + f, b = time_fwd_bwd( + flash_attn_varlen_qkvpacked_func, qkv_unpad, cu_seqlens, seqlen, dropout_p, + causal=causal, repeats=repeats, verbose=False + ) + time_f[(causal, headdim, batch_size, seqlen), "Flash"] = f + time_b[(causal, headdim, batch_size, seqlen), "Flash"] = b + + qkv = qkv.detach().requires_grad_(True) + f, b = time_fwd_bwd( + fav2_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[(causal, headdim, batch_size, seqlen), "Flash2"] = f + time_b[(causal, headdim, batch_size, seqlen), "Flash2"] = b + + # q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype, + # requires_grad=True) for _ in range(3)] + # # Try both values of sequence_parallel and pick the faster one + # f, b = time_fwd_bwd( + # attention_triton, q, k, v, causal, headdim**(-0.5), + # False, repeats=repeats, verbose=False + # ) + # _, b0 = time_fwd_bwd( + # attention_triton, q, k, v, causal, headdim**(-0.5), + # True, repeats=repeats, verbose=False + # ) + # time_f[(causal, headdim, batch_size, seqlen), "Triton"] = f + # time_b[(causal, headdim, batch_size, seqlen), "Triton"] = min(b, b0) + + if seqlen <= 8 * 1024: + qkv = qkv.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + else: + f, b = float('nan'), float('nan') + time_f[(causal, headdim, batch_size, seqlen), "Pytorch"] = f + time_b[(causal, headdim, batch_size, seqlen), "Pytorch"] = b + + # q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + # requires_grad=True) for _ in range(3)] + # import xformers.ops as xops + # f, b = time_fwd_bwd( + # xops.memory_efficient_attention, q, k, v, + # attn_bias=xops.LowerTriangularMask() if causal else None, + # op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp) + # ) + # time_f[(causal, headdim, batch_size, seqlen), "xformers"] = f + # time_b[(causal, headdim, batch_size, seqlen), "xformers"] = b + + +import pickle +with open('flash2_attn_time_h100.plk', 'wb') as fp: + pickle.dump((time_f, time_b), fp, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/benchmark_flash_attention.py b/benchmark_flash_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..341ae4b213901d8b5406143bf6c8abb50bc422bd --- /dev/null +++ b/benchmark_flash_attention.py @@ -0,0 +1,180 @@ +# Install the newest triton version with +# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python" +import pickle +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import 
benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func + +try: + from triton.ops.flash_attention import attention as attention_triton +except ImportError: + attention_triton = None + +try: + import xformers.ops as xops +except ImportError: + xops = None + + +def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + + +def attention_pytorch(qkv, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=qkv.dtype) + + +def time_fwd_bwd(func, *args, **kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + + +repeats = 30 +device = 'cuda' +dtype = torch.float16 + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [64, 128] +dim = 2048 +dropout_p = 0.0 + +methods = (["Flash2", "Pytorch"] + + (["Triton"] if attention_triton is not None else []) + + (["xformers.c"] if xops is not None else []) + + (["xformers.f"] if xops is not None else [])) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + f, b = time_fwd_bwd( + flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[config, "Flash2"] = f + time_b[config, "Flash2"] = b + + try: + qkv = qkv.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + except: # Skip if OOM + f, b = float('nan'), float('nan') + time_f[config, "Pytorch"] = f + time_b[config, "Pytorch"] = b + + if attention_triton is not None: + q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + # Try both values of sequence_parallel and pick the faster one + try: + f, b = time_fwd_bwd( + attention_triton, 
q, k, v, causal, headdim**(-0.5), + False, repeats=repeats, verbose=False + ) + except: + f, b = float('nan'), float('inf') + try: + _, b0 = time_fwd_bwd( + attention_triton, q, k, v, causal, headdim**(-0.5), + True, repeats=repeats, verbose=False + ) + except: + b0 = float('inf') + time_f[config, "Triton"] = f + time_b[config, "Triton"] = min(b, b0) if min(b, b0) < float('inf') else float('nan') + + if xops is not None: + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + f, b = time_fwd_bwd( + xops.memory_efficient_attention, q, k, v, + attn_bias=xops.LowerTriangularMask() if causal else None, + op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp) + ) + time_f[config, "xformers.c"] = f + time_b[config, "xformers.c"] = b + + if xops is not None: + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + f, b = time_fwd_bwd( + xops.memory_efficient_attention, q, k, v, + attn_bias=xops.LowerTriangularMask() if causal else None, + op=(xops.fmha.flash.FwOp, xops.fmha.flash.BwOp) + ) + time_f[config, "xformers.f"] = f + time_b[config, "xformers.f"] = b + + print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + for method in methods: + time_f_b[config, method] = time_f[config, method] + time_b[config, method] + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + speed_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"), + time_b[config, method] + ) + speed_f_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"), + time_f_b[config, method] + ) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, " + f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, " + f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s" + ) + + +# with open('flash2_attn_time.plk', 'wb') as fp: +# pickle.dump((speed_f, speed_b, speed_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/benchmark_flash_attention_fp8.py b/benchmark_flash_attention_fp8.py new file mode 100644 index 0000000000000000000000000000000000000000..6f700a31b005a13cceaa6873d27b6fef8aba908a --- /dev/null +++ b/benchmark_flash_attention_fp8.py @@ -0,0 +1,333 @@ +# Install the newest triton version with +# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python" +import pickle +import math +import time +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func +from flash_attn_interface import flash_attn_func + +try: + from triton_fused_attention import attention as attention_triton +except ImportError: + attention_triton = None + +try: + import xformers.ops as xops +except ImportError: + xops = None + +try: + import cudnn +except ImportError: + cudnn = None + + +def convert_to_cudnn_type(torch_type): + if torch_type == torch.float16: + return cudnn.data_type.HALF + elif torch_type == torch.bfloat16: + return cudnn.data_type.BFLOAT16 + elif torch_type == torch.float32: + return cudnn.data_type.FLOAT + elif torch_type == torch.int32: + return cudnn.data_type.INT32 + elif torch_type 
== torch.int64: + return cudnn.data_type.INT64 + elif torch_type == torch.float8_e4m3fn: + return cudnn.data_type.FP8_E4M3 + elif torch_type == torch.float8_e4m3fn: + return cudnn.data_type.FP8_E5M2 + else: + raise ValueError("Unsupported tensor data type.") + +def cudnn_spda_setup(qkv, seqlen_q, seqlen_k, causal=False): + b, _, _, nheads, headdim = qkv.shape + assert cudnn is not None, 'CUDNN is not available' + o_gpu = torch.zeros(b, seqlen_q, nheads, headdim, dtype=qkv.dtype, device=qkv.device) + o_gpu_transposed = torch.as_strided( + o_gpu, + [b, nheads, seqlen_q, headdim], + [nheads * seqlen_q * headdim, headdim, nheads * headdim, 1], + ) + stats_gpu = torch.empty(b, nheads, seqlen_q, 1, dtype=torch.float32, device=qkv.device) + amax_s_gpu = torch.empty(1, 1, 1, 1, dtype=torch.float32, device=qkv.device) + amax_o_gpu = torch.empty(1, 1, 1, 1, dtype=torch.float32, device=qkv.device) + graph = cudnn.pygraph( + io_data_type=convert_to_cudnn_type(qkv.dtype), + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + new_q = torch.as_strided( + qkv, + [b, nheads, seqlen_q, headdim], + [seqlen_q * nheads * headdim * 3, headdim, headdim * nheads * 3, 1], + storage_offset=0, + ) + q = graph.tensor( + name = "Q", + dim = list(new_q.shape), + stride = list(new_q.stride()), + data_type=convert_to_cudnn_type(qkv.dtype) + ) + new_k = torch.as_strided( + qkv, + [b, nheads, seqlen_k, headdim], + [seqlen_k * nheads * headdim * 3, headdim, headdim * nheads * 3, 1], + storage_offset=nheads * headdim, + ) + k = graph.tensor( + name = "K", + dim = list(new_k.shape), + stride = list(new_k.stride()), + data_type=convert_to_cudnn_type(qkv.dtype) + ) + new_v = torch.as_strided( + qkv, + [b, nheads, seqlen_k, headdim], + [seqlen_k * nheads * headdim * 3, headdim, headdim * nheads * 3, 1], + storage_offset=nheads * headdim * 2, + ) + v = graph.tensor( + name = "V", + dim = list(new_v.shape), + stride = list(new_v.stride()), + data_type=convert_to_cudnn_type(qkv.dtype) + ) + + def get_default_scale_tensor(): + return graph.tensor( + dim = [1, 1, 1, 1], + stride = [1, 1, 1, 1], + data_type=cudnn.data_type.FLOAT + ) + + default_scale_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float32, device="cuda") + descale_q = get_default_scale_tensor() + descale_k = get_default_scale_tensor() + descale_v = get_default_scale_tensor() + descale_s = get_default_scale_tensor() + scale_s = get_default_scale_tensor() + scale_o = get_default_scale_tensor() + + o, _, amax_s, amax_o = graph.sdpa_fp8( + q=q, + k=k, + v=v, + descale_q=descale_q, + descale_k=descale_k, + descale_v=descale_v, + descale_s=descale_s, + scale_s=scale_s, + scale_o=scale_o, + is_inference=True, + attn_scale=1.0 / math.sqrt(headdim), + use_causal_mask=causal, + name="sdpa", + ) + + o.set_output(True).set_dim(o_gpu_transposed.shape).set_stride(o_gpu_transposed.stride()) + + amax_s.set_output(False).set_dim(amax_s_gpu.shape).set_stride(amax_s_gpu.stride()) + amax_o.set_output(False).set_dim(amax_o_gpu.shape).set_stride(amax_o_gpu.stride()) + # stats.set_output(True).set_data_type(cudnn.data_type.FLOAT) + + graph.validate() + graph.build_operation_graph() + graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph.check_support() + graph.build_plans() + + variant_pack = { + q: new_q, + k: new_k, + v: new_v, + descale_q: default_scale_gpu, + descale_k: default_scale_gpu, + descale_v: default_scale_gpu, + descale_s: default_scale_gpu, + scale_s: default_scale_gpu, + scale_o: default_scale_gpu, + o: 
o_gpu_transposed, + amax_s: amax_s_gpu, + amax_o: amax_o_gpu, + } + + workspace = torch.empty(graph.get_workspace_size(), device="cuda", dtype=torch.uint8) + + def run(*args, **kwargs): + graph.execute(variant_pack, workspace) + return o_gpu, amax_o_gpu + + return run + + +def attention_pytorch(qkv, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=qkv.dtype) + +def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + +def time_fwd(func, *args, **kwargs): + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + time_f = benchmark_forward(func, *args, **kwargs) + return time_f[1].mean + + +torch.manual_seed(0) + +repeats = 30 +device = 'cuda' +# dtype = torch.float16 +dtype = torch.float8_e4m3fn + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4224), (2, 8448), (1, 8448 * 2)] +# bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 8192 * 2)] +# bs_seqlen_vals = [(4, 4096), (2, 8192), (1, 8192 * 2), (4, 4224), (2, 8448), (1, 8448 * 2)] +# bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048)] +causal_vals = [False, True] +headdim_vals = [128] +dim = 2048 +# dim = 256 +dropout_p = 0.0 + +methods = (["Pytorch", "Flash3", "cuDNN"] + # + (["Triton"] if attention_triton is not None else []) + # + (["xformers.c"] if xops is not None else []) + # + (["xformers.f"] if xops is not None else []) + ) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + torch.cuda.empty_cache() + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=torch.float16, requires_grad=False) for _ in range(3)] + + qkv = torch.stack([q, k, v], dim=2) + qkv = qkv.to(torch.float16) + f = time_fwd(attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False) + time_f[config, "Pytorch"] = f + res_baseline = attention_pytorch(qkv, dropout_p, causal=causal) + + if attention_triton is not None: + q_transposed = 
q.transpose(1, 2).contiguous().to(torch.float8_e4m3fn) + k_transposed = k.transpose(1, 2).contiguous().to(torch.float8_e4m3fn) + v_transposed = v.transpose(1, 2).contiguous().permute(0, 1, 3, 2).to(torch.float8_e4m3fn) + scale = 1 / math.sqrt(headdim) + f = time_fwd( + attention_triton, q_transposed, k_transposed, v_transposed, + causal, scale, repeats=5, verbose=False, desc='Triton' + ) + f = time_fwd( + attention_triton, q_transposed, k_transposed, v_transposed, + causal, scale, repeats=repeats, verbose=False, desc='Triton' + ) + time_f[config, "Triton"] = f + res = attention_triton( + q_transposed, k_transposed, v_transposed.permute(0, 1, 3, 2), + causal, scale + ).half().transpose(1, 2) + torch.testing.assert_close(res, res_baseline, atol=0.5, rtol=0.5) + + # out = torch.empty_like(q) + q, k, v = q.to(dtype), k.to(dtype), v.to(dtype) + f = time_fwd(flash_attn_func, q, k, v, causal=causal, repeats=repeats, verbose=False) + + # res = flash_attn_func(q, k, v, causal=causal) + # torch.testing.assert_close(res.half(), res_baseline, atol=0.05, rtol=0.05) + + time_f[config, "Flash3"] = f + + if cudnn is not None: + qkv_fp8 = qkv.to(dtype) + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + f = time_fwd( + cudnn_spda_setup( + qkv_fp8, seqlen, seqlen, + causal=causal + ), + repeats=repeats, verbose=False + ) + time_f[config, "cuDNN"] = f + # res, amax_o = cudnn_spda_setup( + # qkv_fp8, seqlen, seqlen, + # causal=causal + # )() + # res = res.half() + # TODO: CUDNN has numerics issues when + # num_heads=16, dim=128, seq_len=1024, batch_size=2 + # or larger sizes. + # res_cpu = res.cpu().reshape(-1) + # res_baseline_cpu = res_baseline.cpu().reshape(-1) + # print(amax_o) + # print(res) + # print(res_baseline) + # for i in range(len(res_cpu)): + # item = res_cpu[i] + # item_baseline = res_baseline_cpu[i] + # if abs(item - item_baseline) > 0.5: + # print(i) + # print(item) + # print(item_baseline) + # torch.testing.assert_close(res, res_baseline, atol=0.05, rtol=0.05) + + print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + for method in methods: + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + #print (time_f[config,method]) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, {time_f[config, method] * 1e3} ms, " + ) + + +# with open('flash3_attn_time.plk', 'wb') as fp: +# pickle.dump((time_f, time_b, time_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/benchmark_gemm.py b/benchmark_gemm.py new file mode 100644 index 0000000000000000000000000000000000000000..6a7dc7bd79170c07a30ee3bf1266d4f8dc61b430 --- /dev/null +++ b/benchmark_gemm.py @@ -0,0 +1,43 @@ +import time +import torch +import torch.utils.benchmark as benchmark + +from triton.testing import do_bench + + +def benchmark_forward(fn, *inputs, repeats=10, desc='', verbose=True, **kwinputs): + """Use Pytorch Benchmark on the forward pass of an arbitrary function.""" + if verbose: + print(desc, '- Forward pass') + t = benchmark.Timer( + stmt='fn(*inputs, **kwinputs)', + globals={'fn': fn, 'inputs': inputs, 'kwinputs': kwinputs}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +torch.manual_seed(0) +repeats = 30 +dtype = torch.float16 +device = 'cuda' +verbose = False +m, n = 8192, 8192 + +tflops_matmul = {} +tflops_matmul1 = {} +for k in [512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 
5632, 6144, 6656, 7168, 7680, 8192]: + a = torch.randn(m, k, device=device, dtype=dtype) + b = torch.randn(n, k, device=device, dtype=dtype).transpose(-1, -2) + nFLOPS_matmul = 2 * m * n * k + time.sleep(2) # to reduce power throttling + timing = benchmark_forward(torch.matmul, a, b, desc='cuBLAS', verbose=verbose, repeats=repeats)[1] + tflops_matmul[k] = nFLOPS_matmul / timing.mean * 1e-12 + print(f'[torch.utils.benchmark] cuBLAS, {m = }, {n = }, {k = }: {timing.mean * 1e3:.3f}ms, {tflops_matmul[k]:.1f} TFLOPS') + time.sleep(2) # to reduce power throttling + ms = do_bench(lambda: torch.matmul(a, b), warmup=10, rep=repeats) + tflops_matmul1[k] = nFLOPS_matmul / ms * 1e-9 + print(f'[triton.test.do_bench] cuBLAS, {m = }, {n = }, {k = }: {ms:.3f}ms, {tflops_matmul1[k]:.1f} TFLOPS') diff --git a/bert.py b/bert.py new file mode 100644 index 0000000000000000000000000000000000000000..33d6935202a1b99393ef34d56a6b4fa0e188ab57 --- /dev/null +++ b/bert.py @@ -0,0 +1,764 @@ +# Copyright (c) 2022, Tri Dao. +# This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation. +# https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py +# https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py + +# Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py + +import logging +import re +from collections import OrderedDict +from collections.abc import Sequence +from functools import partial +from typing import Any, Mapping + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import BertConfig, PretrainedConfig +from transformers.models.bert.modeling_bert import ( + BaseModelOutputWithPoolingAndCrossAttentions, + BertForPreTrainingOutput, +) + +from flash_attn.bert_padding import ( + index_first_axis, + index_first_axis_residual, + pad_input, + unpad_input, +) +from flash_attn.modules.block import Block +from flash_attn.modules.embedding import BertEmbeddings +from flash_attn.modules.mha import MHA +from flash_attn.modules.mlp import FusedMLP, Mlp +from flash_attn.utils.pretrained import state_dict_from_pretrained + +try: + from flash_attn.ops.fused_dense import FusedDense +except ImportError: + FusedDense = None + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn +except ImportError: + layer_norm_fn = None + + +try: + from flash_attn.losses.cross_entropy import CrossEntropyLoss +except ImportError: + CrossEntropyLoss = None + + +logger = logging.getLogger(__name__) + + +def create_mixer_cls(config, cross_attn=False, return_residual=False): + use_flash_attn = getattr(config, "use_flash_attn", False) + fused_bias_fc = getattr(config, "fused_bias_fc", False) + rotary_kwargs = {} + if config.position_embedding_type == "rotary": + rotary_kwargs["rotary_emb_dim"] = getattr(config, "rotary_emb_dim", config.hidden_size) + rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0) + rotary_kwargs["rotary_emb_scale_base"] = getattr(config, "rotary_emb_scale_base", None) + rotary_kwargs["rotary_emb_interleaved"] = getattr(config, "rotary_emb_interleaved", False) + mixer_cls = partial( + MHA, + num_heads=config.num_attention_heads, + cross_attn=cross_attn, + dropout=config.attention_probs_dropout_prob, + causal=False, + fused_bias_fc=fused_bias_fc, + use_flash_attn=use_flash_attn, + 
return_residual=return_residual, + **rotary_kwargs, + ) + return mixer_cls + + +def create_mlp_cls(config, layer_idx=None, return_residual=False): + inner_dim = config.intermediate_size + fused_mlp = getattr(config, "fused_mlp", False) + if fused_mlp: + assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], ( + "fused_mlp only " "supports approximate gelu" + ) + if not fused_mlp: + approximate = ( + "tanh" + if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"] + else "none" + ) + mlp_cls = partial( + Mlp, + hidden_features=inner_dim, + activation=partial(F.gelu, approximate=approximate), + return_residual=return_residual, + ) + else: + if FusedMLP is None: + raise ImportError("fused_dense is not installed") + mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0) + # mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer + if isinstance(mlp_checkpoint_lvl, Sequence): + assert layer_idx is not None + mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx] + mlp_cls = partial( + FusedMLP, + hidden_features=inner_dim, + checkpoint_lvl=mlp_checkpoint_lvl, + return_residual=return_residual, + ) + return mlp_cls + + +def create_block(config, layer_idx=None): + last_layer_subset = getattr(config, "last_layer_subset", False) + cross_attn = last_layer_subset and layer_idx == config.num_hidden_layers - 1 + # TD [2022-12-19]: For cross attention (last layer), we actually want to return the + # residual x_kv, not residual x. But it's annoying to change the API (and it only affects + # one layer) so we just choose not to return residual in this case. + return_residual = not cross_attn + mixer_cls = create_mixer_cls(config, cross_attn, return_residual=return_residual) + mlp_cls = create_mlp_cls(config, layer_idx, return_residual=return_residual) + norm_cls = partial(nn.LayerNorm, eps=config.layer_norm_eps) + block = Block( + config.hidden_size, + mixer_cls, + mlp_cls, + norm_cls=norm_cls, + prenorm=False, + resid_dropout1=config.hidden_dropout_prob, + resid_dropout2=config.hidden_dropout_prob, + fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False), + return_residual=return_residual, + ) + return block + + +# https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748 +def _init_weights(module, initializer_range=0.02): + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, std=initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + if module.padding_idx is not None: + nn.init.zeros_(module.weight[module.padding_idx]) + + +class BertEncoder(nn.Module): + def __init__(self, config: BertConfig): + super().__init__() + self.use_flash_attn = getattr(config, "use_flash_attn", False) + self.layers = nn.ModuleList( + [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)] + ) + + def forward(self, hidden_states, key_padding_mask=None, subset_mask=None): + """If subset_mask is not None, we only want output for the subset of the sequence. + This means that we only compute the last layer output for these tokens. 
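+        In the padded (non-FlashAttention) path the subset is simply selected from the
+        full last-layer output; in the unpadded FlashAttention path, every layer except
+        the last runs on the full sequence, and the last layer runs as cross-attention
+        with only the subset tokens as queries and the full sequence as keys/values.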
+ subset_mask: (batch, seqlen), dtype=torch.bool + """ + if key_padding_mask is None or not self.use_flash_attn: + mixer_kwargs = ( + {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None + ) + for layer in self.layers: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + if subset_mask is not None: + hidden_states = hidden_states[subset_mask] + else: + batch, seqlen = hidden_states.shape[:2] + hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input( + hidden_states, key_padding_mask + ) + mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch} + if subset_mask is None: + for layer in self.layers: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + hidden_states = pad_input(hidden_states, indices, batch, seqlen) + else: + for layer in self.layers[:-1]: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + if key_padding_mask is not None: + subset_idx = torch.nonzero( + subset_mask[key_padding_mask], as_tuple=False + ).flatten() + subset_seqlens = (subset_mask & key_padding_mask).sum(dim=-1, dtype=torch.int32) + subset_cu_seqlens = F.pad( + torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0) + ) + else: + subset_idx = torch.nonzero(subset_mask, as_tuple=False).flatten() + subset_seqlens = subset_mask.sum(dim=-1, dtype=torch.int32) + subset_cu_seqlens = F.pad( + torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0) + ) + hidden_states_subset, hidden_states = index_first_axis_residual( + hidden_states, subset_idx + ) + # It's ok to set max_seqlen_q to be much larger + mixer_kwargs = { + "x_kv": hidden_states, + "cu_seqlens": subset_cu_seqlens, + "max_seqlen": max_seqlen_in_batch, + "cu_seqlens_k": cu_seqlens, + "max_seqlen_k": max_seqlen_in_batch, + } + hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs) + return hidden_states + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + self.dense = linear_cls(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states, pool=True): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
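+        # When pool=False the caller is expected to have already gathered exactly the
+        # rows to pool (e.g. the CLS positions selected in BertModel.forward's
+        # masked-token path), so no slicing is applied here.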
+ first_token_tensor = hidden_states[:, 0] if pool else hidden_states + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False) + if self.fused_dropout_add_ln and layer_norm_fn is None: + raise ImportError("Triton is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + self.dense = linear_cls(config.hidden_size, config.hidden_size) + approximate = ( + "tanh" + if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"] + else "none" + ) + self.transform_act_fn = nn.GELU(approximate=approximate) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + if not self.fused_dropout_add_ln: + hidden_states = self.layer_norm(hidden_states) + else: + hidden_states = layer_norm_fn( + hidden_states, self.layer_norm.weight, self.layer_norm.bias, eps=self.layer_norm.eps + ) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = linear_cls(config.hidden_size, config.vocab_size, bias=True) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(nn.Module): + """An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + @classmethod + def from_pretrained(cls, model_name, config, *inputs, **kwargs): + """ + Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a path or url to a pretrained model archive containing: + . 
`bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + # Instantiate model. + model = cls(config, *inputs, **kwargs) + load_return = model.load_state_dict( + remap_state_dict(state_dict_from_pretrained(model_name), config), strict=False + ) + logger.info(load_return) + return model + + +class BertModel(BertPreTrainedModel): + def __init__(self, config: BertConfig, add_pooling_layer=True): + super().__init__(config) + self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + if config.vocab_size % self.pad_vocab_size_multiple != 0: + config.vocab_size += self.pad_vocab_size_multiple - ( + config.vocab_size % self.pad_vocab_size_multiple + ) + self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False) + if self.fused_dropout_add_ln and layer_norm_fn is None: + raise ImportError("Triton is not installed") + assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"] + + self.embeddings = BertEmbeddings( + config.hidden_size, + config.vocab_size, + config.max_position_embeddings, + config.type_vocab_size, + padding_idx=config.pad_token_id, + ) + self.emb_drop = nn.Dropout(config.hidden_dropout_prob) + self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.apply(partial(_init_weights, initializer_range=config.initializer_range)) + + def forward( + self, + input_ids, + position_ids=None, + token_type_ids=None, + attention_mask=None, + masked_tokens_mask=None, + ): + """If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining), + we only want the output for the masked tokens. This means that we only compute the last + layer output for these tokens. + masked_tokens_mask: (batch, seqlen), dtype=torch.bool + """ + hidden_states = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids + ) + # TD [2022-12:18]: Don't need to force residual in fp32 + # BERT puts embedding LayerNorm before embedding dropout. + if not self.fused_dropout_add_ln: + hidden_states = self.emb_ln(hidden_states) + else: + hidden_states = layer_norm_fn( + hidden_states, self.emb_ln.weight, self.emb_ln.bias, eps=self.emb_ln.eps + ) + hidden_states = self.emb_drop(hidden_states) + + if masked_tokens_mask is not None: + batch_size, seqlen = input_ids.shape[:2] + # We also need the first column for the CLS token + first_col_mask = torch.zeros( + batch_size, seqlen, dtype=torch.bool, device=input_ids.device + ) + first_col_mask[:, 0] = True + subset_mask = masked_tokens_mask | first_col_mask + else: + subset_mask = None + + sequence_output = self.encoder( + hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask + ) + + if masked_tokens_mask is None: + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + else: + # TD [2022-03-01]: the indexing here is very tricky. 
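+            # At this point sequence_output only holds the rows selected by subset_mask
+            # (the masked tokens plus the first/CLS column). subset_mask[attention_mask]
+            # re-expresses that selection over the unpadded tokens, and indexing
+            # first_col_mask / masked_tokens_mask with it splits the subset rows back
+            # into the CLS rows (fed to the pooler) and the masked-token rows (fed to
+            # the MLM head).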
+ if attention_mask is not None: + subset_idx = subset_mask[attention_mask] + pool_input = sequence_output[first_col_mask[attention_mask][subset_idx]] + sequence_output = sequence_output[masked_tokens_mask[attention_mask][subset_idx]] + else: + pool_input = sequence_output[first_col_mask[subset_mask]] + sequence_output = sequence_output[masked_tokens_mask[subset_mask]] + pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + +class BertForPreTraining(BertPreTrainedModel): + def __init__(self, config: BertConfig): + super().__init__(config) + # If dense_seq_output, we only need to pass the hidden states for the masked out tokens + # (around 15%) to the classifier heads. + self.dense_seq_output = getattr(config, "dense_seq_output", False) + # If last_layer_subset, we only need the compute the last layer for a subset of tokens + # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction). + self.last_layer_subset = getattr(config, "last_layer_subset", False) + if self.last_layer_subset: + assert self.dense_seq_output, "last_layer_subset requires dense_seq_output" + use_xentropy = getattr(config, "use_xentropy", False) + if use_xentropy and CrossEntropyLoss is None: + raise ImportError("xentropy_cuda is not installed") + loss_cls = ( + nn.CrossEntropyLoss + if not use_xentropy + else partial(CrossEntropyLoss, inplace_backward=True) + ) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + self.mlm_loss = loss_cls(ignore_index=0) + self.nsp_loss = loss_cls(ignore_index=-1) + + # Initialize weights and apply final processing + self.apply(partial(_init_weights, initializer_range=config.initializer_range)) + self.tie_weights() + + def tie_weights(self): + self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight + + def forward( + self, + input_ids, + position_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + next_sentence_label=None, + ): + """ + If labels are provided, they must be 0 for masked out tokens (as specified in the attention + mask). + Outputs: + if `labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
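+        Note: when config.dense_seq_output is set and labels are provided, the masked
+        language modeling logits are returned only for the masked positions (flattened),
+        not for the full [batch_size, sequence_length] grid.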
+ + """ + masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None + outputs = self.bert( + input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask.bool() if attention_mask is not None else None, + masked_tokens_mask=masked_tokens_mask, + ) + sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output + if self.dense_seq_output and labels is not None: + masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten() + if not self.last_layer_subset: + sequence_output = index_first_axis( + rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx + ) + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + if ( + self.dense_seq_output and labels is not None + ): # prediction_scores are already flattened + masked_lm_loss = self.mlm_loss( + prediction_scores, labels.flatten()[masked_token_idx] + ) + else: + masked_lm_loss = self.mlm_loss( + rearrange(prediction_scores, "... v -> (...) v"), + rearrange(labels, "... -> (...)"), + ) + next_sentence_loss = self.nsp_loss( + rearrange(seq_relationship_score, "... t -> (...) t"), + rearrange(next_sentence_label, "... -> (...)"), + ) + total_loss = masked_lm_loss.float() + next_sentence_loss.float() + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + ) + + +def remap_state_dict(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a Huggingface BERT model to be flash_attn compatible. + """ + + # LayerNorm + def key_mapping_ln_gamma_beta(key): + key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key) + key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key) + return key + + state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items()) + + # Layers + def key_mapping_layers(key): + return re.sub(r"^bert.encoder.layer.", "bert.encoder.layers.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key) + key = re.sub( + r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)", + r"bert.encoder.layers.\1.norm1.\2", + key, + ) + key = re.sub( + r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)", + r"bert.encoder.layers.\1.norm2.\2", + key, + ) + key = re.sub( + r"^cls.predictions.transform.LayerNorm.(weight|bias)", + r"cls.predictions.transform.layer_norm.\1", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)", + r"bert.encoder.layers.\1.mlp.fc1.\2", + key, + ) + key = re.sub( + r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)", + r"bert.encoder.layers.\1.mlp.fc2.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + last_layer_subset = getattr(config, "last_layer_subset", False) + for d in range(config.num_hidden_layers): + Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight") + Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight") + Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight") + bq 
= state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias") + bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias") + bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias") + if not (last_layer_subset and d == config.num_hidden_layers - 1): + state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat( + [Wq, Wk, Wv], dim=0 + ) + state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0) + else: + state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq + state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0) + state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq + state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0) + + def key_mapping_attn(key): + return re.sub( + r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)", + r"bert.encoder.layers.\1.mixer.out_proj.\2", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + def key_mapping_decoder_bias(key): + return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key) + + state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items()) + + # Word embedding + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + if pad_vocab_size_multiple > 1: + word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"] + state_dict["bert.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0]) + ) + decoder_weight = state_dict["cls.predictions.decoder.weight"] + state_dict["cls.predictions.decoder.weight"] = F.pad( + decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0]) + ) + # If the vocab was padded, we want to set the decoder bias for those padded indices to be + # strongly negative (i.e. the decoder shouldn't predict those indices). + # TD [2022-05-09]: I don't think it affects the MLPerf training. + decoder_bias = state_dict["cls.predictions.decoder.bias"] + state_dict["cls.predictions.decoder.bias"] = F.pad( + decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0 + ) + + return state_dict + + +def inv_remap_state_dict(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a flash_attn model to be Huggingface BERT compatible. + + This function is meant to be the inverse of remap_state_dict. 
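+    Note: when pad_vocab_size_multiple > 1 this assumes config.orig_vocab_size holds the
+    original (pre-padding) vocabulary size, which is used to un-pad the embedding and
+    decoder weights.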
+ """ + # Word embedding + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + if pad_vocab_size_multiple > 1: + word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"] + decoder_weight = state_dict["cls.predictions.decoder.weight"] + decoder_bias = state_dict["cls.predictions.decoder.bias"] + # unpad embeddings + state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings[ + : config.orig_vocab_size, : + ] + state_dict["cls.predictions.decoder.weight"] = decoder_weight[: config.orig_vocab_size, :] + state_dict["cls.predictions.decoder.bias"] = decoder_bias[: config.orig_vocab_size] + + for d in range(config.num_hidden_layers): + last_layer_subset = getattr(config, "last_layer_subset", False) + if not last_layer_subset or d != (config.num_hidden_layers - 1): + Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight") + Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias") + state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wqkv_weights[ + : Wqkv_weights.shape[0] // 3, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wqkv_weights[ + Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wqkv_weights[ + 2 * Wqkv_weights.shape[0] // 3 :, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wqkv_biases[ + : Wqkv_biases.shape[0] // 3 + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wqkv_biases[ + Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3 + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wqkv_biases[ + 2 * Wqkv_biases.shape[0] // 3 : + ] + else: + Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight") + Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight") + Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias") + Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias") + state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wq_weight + state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wkv_weights[ + : Wkv_weights.shape[0] // 2, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wkv_weights[ + Wkv_weights.shape[0] // 2 :, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias + state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[ + : Wkv_biases.shape[0] // 2 + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wkv_biases[ + Wkv_biases.shape[0] // 2 : + ] + + def inv_key_mapping_ln(key): + key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key) + key = re.sub( + r"bert.encoder.layers.(\d+).norm1.(weight|bias)", + r"bert.encoder.layers.\1.attention.output.LayerNorm.\2", + key, + ) + key = re.sub( + r"bert.encoder.layers.(\d+).norm2.(weight|bias)", + r"bert.encoder.layers.\1.output.LayerNorm.\2", + key, + ) + key = re.sub( + r"cls.predictions.transform.layer_norm.(weight|bias)", + r"cls.predictions.transform.LayerNorm.\1", + key, + ) + return key + + def inv_key_mapping_ln_gamma_beta(key): + key = re.sub(r"LayerNorm.weight$", "LayerNorm.gamma", key) + key = re.sub(r"LayerNorm.bias$", "LayerNorm.beta", key) + return key + + def inv_key_mapping_layers(key): + return re.sub(r"bert.encoder.layers.", "bert.encoder.layer.", key) + + def inv_key_mapping_mlp(key): + key = re.sub( + 
r"bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)", + r"bert.encoder.layer.\1.intermediate.dense.\2", + key, + ) + key = re.sub( + r"bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)", + r"bert.encoder.layer.\1.output.dense.\2", + key, + ) + return key + + def inv_key_mapping_attn(key): + return re.sub( + r"bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)", + r"bert.encoder.layer.\1.attention.output.dense.\2", + key, + ) + + def inv_key_mapping_decoder_bias(key): + return re.sub(r"cls.predictions.decoder.bias", "cls.predictions.bias", key) + + state_dict = OrderedDict((inv_key_mapping_ln(key), value) for key, value in state_dict.items()) + state_dict = OrderedDict( + (inv_key_mapping_ln_gamma_beta(key), value) for key, value in state_dict.items() + ) + state_dict = OrderedDict( + (inv_key_mapping_layers(key), value) for key, value in state_dict.items() + ) + state_dict = OrderedDict((inv_key_mapping_mlp(key), value) for key, value in state_dict.items()) + state_dict = OrderedDict( + (inv_key_mapping_attn(key), value) for key, value in state_dict.items() + ) + state_dict = OrderedDict( + (inv_key_mapping_decoder_bias(key), value) for key, value in state_dict.items() + ) + + return state_dict diff --git a/bert_padding.py b/bert_padding.py new file mode 100644 index 0000000000000000000000000000000000000000..1d447d3f660e1a6ddd7e7f6fb7d1ae4241bfec73 --- /dev/null +++ b/bert_padding.py @@ -0,0 +1,213 @@ +# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat + + +class IndexFirstAxis(torch.autograd.Function): + @staticmethod + def forward(ctx, input, indices): + ctx.save_for_backward(indices) + assert input.ndim >= 2 + ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] + second_dim = other_shape.numel() + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. + # return input[indices] + return torch.gather( + rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim) + ).reshape(-1, *other_shape) + + @staticmethod + def backward(ctx, grad_output): + (indices,) = ctx.saved_tensors + assert grad_output.ndim >= 2 + other_shape = grad_output.shape[1:] + grad_output = rearrange(grad_output, "b ... -> b (...)") + grad_input = torch.zeros( + [ctx.first_axis_dim, grad_output.shape[1]], + device=grad_output.device, + dtype=grad_output.dtype, + ) + # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. + # grad_input[indices] = grad_output + grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output) + return grad_input.reshape(ctx.first_axis_dim, *other_shape), None + + +index_first_axis = IndexFirstAxis.apply + + +class IndexPutFirstAxis(torch.autograd.Function): + @staticmethod + def forward(ctx, values, indices, first_axis_dim): + ctx.save_for_backward(indices) + assert indices.ndim == 1 + assert values.ndim >= 2 + output = torch.zeros( + first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype + ) + # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. + output[indices] = values + # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values) + return output + + @staticmethod + def backward(ctx, grad_output): + (indices,) = ctx.saved_tensors + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. 
+ grad_values = grad_output[indices] + # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1])) + return grad_values, None, None + + +index_put_first_axis = IndexPutFirstAxis.apply + + +class IndexFirstAxisResidual(torch.autograd.Function): + @staticmethod + def forward(ctx, input, indices): + ctx.save_for_backward(indices) + assert input.ndim >= 2 + ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] + second_dim = other_shape.numel() + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. + output = input[indices] + # We don't want to reshape input (b ... -> b (...)) since it could change the channel_last + # memory format to channel_first. In other words, input might not be contiguous. + # If we don't detach, Pytorch complains about output being a view and is being modified inplace + return output, input.detach() + + @staticmethod + def backward(ctx, grad_output, grad_residual): + (indices,) = ctx.saved_tensors + assert grad_output.ndim >= 2 + other_shape = grad_output.shape[1:] + assert grad_residual.shape[1:] == other_shape + grad_input = grad_residual + # grad_input[indices] += grad_output + indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1))) + indices = indices.expand_as(grad_output) + grad_input.scatter_add_(0, indices, grad_output) + return grad_input.reshape(ctx.first_axis_dim, *other_shape), None + + +index_first_axis_residual = IndexFirstAxisResidual.apply + + +def unpad_input(hidden_states, attention_mask): + """ + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid. + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. + max_seqlen_in_batch: int + """ + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the + # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim + # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to + # index with integer indices. Moreover, torch's index is a bit slower than it needs to be, + # so we write custom forward and backward to make it a bit faster. + return ( + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length): + """ + Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model). + The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286). 
+ + For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is: + ``` + [ + [2, 3, 0, 0, 0, 0], + [3, 2, 0, 0, 0, 0], + [6, 0, 0, 0, 0, 0] + ] + ``` + , which refers to the 3D-attention mask: + ``` + [ + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 0, 0], + [0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1] + ] + ] + ```. + + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none. + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. + max_seqlen_in_batch: int + """ + length = attention_mask_in_length.sum(dim=-1) + seqlen = attention_mask_in_length.size(-1) + attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1) + real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten() + seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx] + indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the + # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim + # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to + # index with integer indices. Moreover, torch's index is a bit slower than it needs to be, + # so we write custom forward and backward to make it a bit faster. + return ( + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def pad_input(hidden_states, indices, batch, seqlen): + """ + Arguments: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence. + batch: int, batch size for the padded sequence. + seqlen: int, maximum sequence length for the padded sequence. + Return: + hidden_states: (batch, seqlen, ...) + """ + dim = hidden_states.shape[-1] + # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype) + # output[indices] = hidden_states + output = index_put_first_axis(hidden_states, indices, batch * seqlen) + return rearrange(output, "(b s) ... 
-> b s ...", b=batch) diff --git a/bigcode.py b/bigcode.py new file mode 100644 index 0000000000000000000000000000000000000000..234944d4d6907fb3e1b0c2c3c315a2bee29d7775 --- /dev/null +++ b/bigcode.py @@ -0,0 +1,233 @@ +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig + + +def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a Huggingface BigCode model to be flash_attn compatible. + """ + + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key) + + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.wte.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub( + r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", + r"transformer.layers.\1.norm\2.\3", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.h.(\d+).mlp.c_fc.weight", + r"transformer.layers.\1.mlp.fc1.weight", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).mlp.c_proj.weight", + r"transformer.layers.\1.mlp.fc2.weight", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).mlp.c_fc.bias", + r"transformer.layers.\1.mlp.fc1.bias", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).mlp.c_proj.bias", + r"transformer.layers.\1.mlp.fc2.bias", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # TODO: add support for multi-head attention + assert config.multi_query, "Only multi-query attention is supported" + + # Attention + for d in range(config.num_hidden_layers): + embed_dim = config.n_embd + head_dim = embed_dim // config.n_head + + c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight") + # with multi-query attention, the weights have shape (embed_dim, embed_dim + head_dim + head_dim) + # see https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L112 + # see also https://github.com/ggerganov/ggml/blob/dd1d575956e54c5bdc07632f25506b3b1884dbd2/examples/starcoder/convert-hf-to-ggml.py#L183 + # ((n_head + 2) * head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim) + q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0) + # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) + k = torch.tile(k, (config.n_head, 1)) + v = torch.tile(v, (config.n_head, 1)) + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0) + + # same deal with the bias + c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias") + # ((n_head + 2) 
* head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim) + q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0) + # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) + k = torch.tile(k, (config.n_head,)) + v = torch.tile(v, (config.n_head,)) + state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0) + + def key_mapping_attn(key): + key = re.sub( + r"^transformer.h.(\d+).attn.c_proj.weight", + r"transformer.layers.\1.mixer.out_proj.weight", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).attn.c_proj.bias", + r"transformer.layers.\1.mixer.out_proj.bias", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a flash_attn model to be Huggingface BigCode compatible. + + This function is meant to be the inverse of remap_state_dict_hf_bigcode. + """ + + # Word embedding and position embeddings + def inv_key_mapping_pos_emb(key): + return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key) + + state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + + word_embeddings = word_embeddings[:, : config.vocab_size] + state_dict["transformer.wte.weight"] = word_embeddings + state_dict["lm_head.weight"] = word_embeddings + + # LayerNorm + def inv_key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub( + r"^transformer.layers.(\d+).norm(1|2).(weight|bias)", + r"transformer.h.\1.ln_\2.\3", + key, + ) + return key + + state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLPs + def inv_key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc1.weight", + r"transformer.h.\1.mlp.c_fc.weight", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc2.weight", + r"transformer.h.\1.mlp.c_proj.weight", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc1.bias", + r"transformer.h.\1.mlp.c_fc.bias", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc2.bias", + r"transformer.h.\1.mlp.c_proj.bias", + key, + ) + return key + + state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for d in range(config.num_hidden_layers): + embed_dim = config.n_embd + head_dim = embed_dim // config.n_head + + Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight") + q, k, v = torch.split( + Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0 + ) + c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0) + state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight + + # Same deal with the bias + Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias") + q, k, v = torch.split( + Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0 + ) + c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0) + state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias + + def inv_key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).mixer.out_proj.weight", + r"transformer.h.\1.attn.c_proj.weight", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mixer.out_proj.bias", + 
r"transformer.h.\1.attn.c_proj.bias", + key, + ) + return key + + state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config: + return GPT2Config( + activation_function=bigcode_config.activation_function, + attn_pdrop=bigcode_config.attn_pdrop, + bos_token_id=bigcode_config.bos_token_id, + embd_pdrop=bigcode_config.embd_pdrop, + eos_token_id=bigcode_config.eos_token_id, + initializer_range=bigcode_config.initializer_range, + layer_norm_epsilon=bigcode_config.layer_norm_epsilon, + max_batch_size=bigcode_config.max_batch_size, + max_sequence_length=bigcode_config.max_sequence_length, + model_type=bigcode_config.model_type, + multi_query=bigcode_config.multi_query, + n_embd=bigcode_config.n_embd, + n_head=bigcode_config.n_head, + n_inner=bigcode_config.n_inner, + n_layer=bigcode_config.n_layer, + n_positions=bigcode_config.n_positions, + resid_pdrop=bigcode_config.resid_pdrop, + scale_attn_weights=bigcode_config.scale_attn_weights, + summary_activation=bigcode_config.summary_activation, + summary_first_dropout=bigcode_config.summary_first_dropout, + summary_proj_to_labels=bigcode_config.summary_proj_to_labels, + summary_type=bigcode_config.summary_type, + summary_use_proj=bigcode_config.summary_use_proj, + use_cache=bigcode_config.use_cache, + vocab_size=bigcode_config.vocab_size, + ) diff --git a/block.py b/block.py new file mode 100644 index 0000000000000000000000000000000000000000..be8e8b864b600220068c2ec16aba5e2f1a81c121 --- /dev/null +++ b/block.py @@ -0,0 +1,397 @@ +# Copyright (c) 2024, Tri Dao. + +from functools import partial +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torchvision.ops import StochasticDepth + +from flash_attn.modules.mha import MHA +from flash_attn.modules.mlp import Mlp + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm +except ImportError: + layer_norm_fn, RMSNorm = None, None + + +class Block(nn.Module): + def __init__( + self, + dim, + mixer_cls=None, + mlp_cls=None, + norm_cls=nn.LayerNorm, + dropout_cls=nn.Dropout, + prenorm=True, + resid_dropout1=0.0, + resid_dropout2=0.0, + drop_path1=0.0, + drop_path2=0.0, + fused_dropout_add_ln=False, + return_residual=False, + residual_in_fp32=False, + sequence_parallel=False, + mark_shared_params=False, + ): + """ + For prenorm=True, this Block has a slightly different structure compared to a regular + prenorm Transformer block. + The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add. + [Ref: https://arxiv.org/abs/2002.04745] + Here we have: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, returning both + the hidden_states (output of the MLP) and the residual. + This is for performance reasons, as we can fuse the dropout, add and LayerNorm. + The residual needs to be provided (except for the very first block). + + For prenorm=False, this Block has the same structure as a regular postnorm Transformer + block: MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add -> LN. + + return_residual: whether each of the sub-layers (mixer and mlp) will return the residual. + This is for performance reason: for post-norm architecture, returning the input allows us + to fuse the backward of nn.Linear with the residual connection. 
+ """ + super().__init__() + self.prenorm = prenorm + self.fused_dropout_add_ln = fused_dropout_add_ln + self.return_residual = return_residual + self.residual_in_fp32 = residual_in_fp32 + if self.residual_in_fp32: + assert self.prenorm, "residual_in_fp32 is only compatible with prenorm=True" + if mixer_cls is None: + mixer_cls = partial(MHA, num_heads=dim // 64) + if mlp_cls is None: + mlp_cls = partial(Mlp, hidden_features=4 * dim) + self.mixer = mixer_cls(dim) + self.dropout1 = dropout_cls(resid_dropout1) + self.drop_path1 = StochasticDepth(drop_path1, mode="row") + self.norm1 = norm_cls(dim) + self.mlp = mlp_cls(dim) + if not isinstance(self.mlp, nn.Identity): + self.dropout2 = dropout_cls(resid_dropout2) + self.drop_path2 = StochasticDepth(drop_path2, mode="row") + self.norm2 = norm_cls(dim) + + if self.fused_dropout_add_ln: + assert layer_norm_fn is not None, "Triton is not installed" + assert isinstance(self.norm1, (nn.LayerNorm, RMSNorm)) and isinstance( + self.dropout1, nn.Dropout + ) + + # TD [2023-01-07]: TODO: During training, if sequence_parallel is False and dropout != 0.0, + # then the input to each worker in the tensor parallel group will be different. + # This would produce wrong outputs? Somehow we'd need to sync the RNG state across workers. + # For now this is not an issue because we always use sequence_parallel=True during training + # and only use sequence_parallel=False during inference. + + # Mark the norm parameters as "sequence_parallel" so that we run all-reduce on their grads. + if sequence_parallel: + for p in self.norm1.parameters(): + p._sequence_parallel = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._sequence_parallel = True + # Mark the norm parameters as "shared_params" so that we sync their values at init. + if mark_shared_params: + for p in self.norm1.parameters(): + p._shared_params = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._shared_params = True + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + + def forward( + self, + hidden_states: Tensor, + residual: Optional[Tensor] = None, + mixer_subset=None, + mixer_kwargs=None, + ): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). + residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual)) + mixer_subset: for cross-attention only. If not None, will take a subset of x + before applying the query projection. Useful for e.g., ViT where we only care + about the CLS token in the last layer. 
+ """ + if self.prenorm: + if not self.fused_dropout_add_ln: + dropped = self.drop_path1(self.dropout1(hidden_states)) + residual = (dropped + residual) if residual is not None else dropped + hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + if self.drop_path1.p == 0 or not self.training: + rowscale1 = None + else: + rowscale1 = self.drop_path1( + torch.ones( + hidden_states.shape[:-1], + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + ) + hidden_states, residual = layer_norm_fn( + hidden_states, + self.norm1.weight, + self.norm1.bias, + residual=residual, + eps=self.norm1.eps, + dropout_p=self.dropout1.p if self.training else 0.0, + rowscale=rowscale1, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + is_rms_norm=isinstance(self.norm1, RMSNorm) + ) + if mixer_kwargs is None: + mixer_kwargs = {} + if mixer_subset is not None: + mixer_kwargs["mixer_subset"] = mixer_subset + hidden_states = self.mixer(hidden_states, **mixer_kwargs) + if mixer_subset is not None: + residual = residual[:, mixer_subset] + if not isinstance(self.mlp, nn.Identity): + if not self.fused_dropout_add_ln: + dropped = self.drop_path2(self.dropout2(hidden_states)) + residual = (dropped + residual) if residual is not None else dropped + hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + if self.drop_path2.p == 0 or not self.training: + rowscale2 = None + else: + rowscale2 = self.drop_path2( + torch.ones( + hidden_states.shape[:-1], + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + ) + hidden_states, residual = layer_norm_fn( + hidden_states, + self.norm2.weight, + self.norm2.bias, + residual=residual, + eps=self.norm2.eps, + dropout_p=self.dropout2.p if self.training else 0.0, + rowscale=rowscale2, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + is_rms_norm=isinstance(self.norm2, RMSNorm) + ) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + else: + assert residual is None + mixer_out = self.mixer( + hidden_states, **(mixer_kwargs if mixer_kwargs is not None else {}) + ) + if self.return_residual: # mixer out is actually a pair here + mixer_out, hidden_states = mixer_out + if not self.fused_dropout_add_ln: + hidden_states = self.norm1( + (self.drop_path1(self.dropout1(mixer_out)) + hidden_states).to( + dtype=self.norm1.weight.dtype + ) + ) + else: + if self.drop_path1.p == 0 or not self.training: + rowscale1 = None + else: + rowscale1 = self.drop_path1( + torch.ones( + mixer_out.shape[:-1], device=mixer_out.device, dtype=mixer_out.dtype + ) + ) + hidden_states = layer_norm_fn( + mixer_out, + self.norm1.weight, + self.norm1.bias, + residual=hidden_states, + eps=self.norm1.eps, + dropout_p=self.dropout1.p if self.training else 0.0, + rowscale=rowscale1, + prenorm=False, + is_rms_norm=isinstance(self.norm1, RMSNorm) + ) + if not isinstance(self.mlp, nn.Identity): + mlp_out = self.mlp(hidden_states) + if self.return_residual: # mlp out is actually a pair here + mlp_out, hidden_states = mlp_out + if not self.fused_dropout_add_ln: + hidden_states = self.norm2( + (self.drop_path2(self.dropout2(mlp_out)) + hidden_states).to( + dtype=self.norm2.weight.dtype + ) + ) + else: + if self.drop_path2.p == 0 or not self.training: + rowscale2 = None + else: + rowscale2 = self.drop_path2( + torch.ones( + mlp_out.shape[:-1], device=mlp_out.device, dtype=mlp_out.dtype 
+ ) + ) + hidden_states = layer_norm_fn( + mlp_out, + self.norm2.weight, + self.norm2.bias, + residual=hidden_states, + eps=self.norm2.eps, + dropout_p=self.dropout2.p if self.training else 0.0, + rowscale=rowscale2, + prenorm=False, + is_rms_norm=isinstance(self.norm2, RMSNorm) + ) + return hidden_states + + +class ParallelBlock(nn.Module): + """The attention (mixer) and MLP blocks are done in parallel, similar to GPT-J, GPT-NeoX, + and PaLM. + """ + + def __init__( + self, + dim, + mixer_cls=None, + mlp_cls=None, + norm_cls=nn.LayerNorm, + dropout_cls=nn.Dropout, + resid_dropout1=0.0, + resid_dropout2=0.0, + tied_norm=False, + fused_dropout_add_ln=False, + residual_in_fp32=False, + sequence_parallel=False, + mark_shared_params=False, + ): + """ + This Block has a slightly different structure compared to a regular + prenorm Transformer block. + The standard block is: LN -> MHA / MLP -> Dropout -> Add. + [Ref: https://arxiv.org/abs/2002.04745] + Here we have: Dropout -> Add -> LN -> MHA / MLP, returning both + the hidden_states (output1 of the MHA / MLP) and the residual. + This is for performance reasons, as we can fuse the dropout, add and LayerNorm. + The residual needs to be provided (except for the very first block). + """ + super().__init__() + self.tied_norm = tied_norm + self.fused_dropout_add_ln = fused_dropout_add_ln + self.residual_in_fp32 = residual_in_fp32 + if mixer_cls is None: + mixer_cls = partial(MHA, num_heads=dim // 64) + if mlp_cls is None: + mlp_cls = partial(Mlp, hidden_features=4 * dim) + self.mixer = mixer_cls(dim) + self.dropout1 = dropout_cls(resid_dropout1) + self.norm1 = norm_cls(dim) + self.mlp = mlp_cls(dim) + self.dropout2 = dropout_cls(resid_dropout2) + if not self.tied_norm: + self.norm2 = norm_cls(dim) + + if self.fused_dropout_add_ln: + assert layer_norm_fn is not None, "Triton is not installed" + assert isinstance(self.norm1, (nn.LayerNorm, RMSNorm)) and isinstance( + self.dropout1, nn.Dropout + ) + + # TD [2023-01-07]: TODO: During training, if sequence_parallel is False and dropout != 0.0, + # then the input to each worker in the tensor parallel group will be different. + # This would produce wrong outputs? Somehow we'd need to sync the RNG state across workers. + # For now this is not an issue because we always use sequence_parallel=True during training + # and only use sequence_parallel=False during inference. + + # Mark the norm parameters as "sequence_parallel" so that we run all-reduce on their grads. + if sequence_parallel: + for p in self.norm1.parameters(): + p._sequence_parallel = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._sequence_parallel = True + # Mark the norm parameters as "shared_params" so that we sync their values at init. + if mark_shared_params: + for p in self.norm1.parameters(): + p._shared_params = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._shared_params = True + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + + def forward( + self, + hidden_states1: Tensor, + hidden_states2: Optional[Tensor] = None, + residual: Optional[Tensor] = None, + mixer_kwargs=None, + ): + r"""Pass the input through the encoder layer. + + Args: + hidden_states1: the output of the previous attention (mixer) or embedding layer. + hidden_states2: the output of the previous MLP layer (if None, will use hidden_states1). + residual. 
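        Dataflow of the non-fused path in plain PyTorch (tied_norm and the fused
        dropout + add + LayerNorm path omitted; a sketch of the structure, not this class's code):

            def parallel_block(hidden_states1, hidden_states2, residual,
                               norm1, norm2, mixer, mlp, dropout1, dropout2):
                dropped = dropout1(hidden_states1)
                if hidden_states2 is not None:
                    dropped = dropped + dropout2(hidden_states2)
                residual = dropped + residual if residual is not None else dropped
                return mixer(norm1(residual)), mlp(norm2(residual)), residual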
+ """ + # TODO: Ideally we should only do the allgather / allreduce once for + # the Linear to MLP & Attention + if not self.fused_dropout_add_ln: + dropped1 = self.dropout1(hidden_states1) + # For the very 1st block, we only want 1 dropout, not two different dropouts + if hidden_states2 is not None: + dropped2 = self.dropout2(hidden_states2) + residual = ( + (residual + dropped1 + dropped2) + if residual is not None + else dropped1 + dropped2 + ) + else: + residual = (residual + dropped1) if residual is not None else dropped1 + hidden_states1 = self.norm1(residual.to(dtype=self.norm1.weight.dtype)) + hidden_states2 = ( + self.norm2(residual.to(dtype=self.norm2.weight.dtype)) + if not self.tied_norm + else hidden_states1 + ) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + weight2, bias2 = ( + (self.norm2.weight, self.norm2.bias) if not self.tied_norm else (None, None) + ) + hidden_states1, *rest, residual = layer_norm_fn( + hidden_states1, + self.norm1.weight, + self.norm1.bias, + residual=residual, + x1=hidden_states2, + weight1=weight2, + bias1=bias2, + eps=self.norm1.eps, + dropout_p=self.dropout1.p if self.training else 0.0, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + is_rms_norm=isinstance(self.norm1, RMSNorm) + ) + if self.tied_norm: + hidden_states2 = hidden_states1 + else: + hidden_states2, = rest + if mixer_kwargs is None: + mixer_kwargs = {} + hidden_states1 = self.mixer(hidden_states1, **mixer_kwargs) + hidden_states2 = self.mlp(hidden_states2) + return hidden_states1, hidden_states2, residual diff --git a/block_info.h b/block_info.h new file mode 100644 index 0000000000000000000000000000000000000000..3a23a1e1f26da48dae1032b6fe36b208b43da2ab --- /dev/null +++ b/block_info.h @@ -0,0 +1,46 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +namespace flash { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BlockInfo { + + template + __device__ BlockInfo(const Params ¶ms, const int bidb) + : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb]) + , sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb]) + , actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q) + // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]. + // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K. + , seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb])) + , actual_seqlen_k(params.seqused_k ? params.seqused_k[bidb] : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew)) + { + } + + template + __forceinline__ __device__ index_t q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const { + return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride; + } + + template + __forceinline__ __device__ index_t k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const { + return sum_s_k == -1 ? 
bidb * batch_stride : uint32_t(sum_s_k) * row_stride; + } + + const int sum_s_q; + const int sum_s_k; + const int actual_seqlen_q; + // We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise actual_seqlen_k is set to 0. + const int seqlen_k_cache; + const int actual_seqlen_k; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/btlm.py b/btlm.py new file mode 100644 index 0000000000000000000000000000000000000000..295e12062320be819dd835de4a866607650431b2 --- /dev/null +++ b/btlm.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023, Tri Dao. + +import math +import json +import re +from pathlib import Path + +from collections import OrderedDict + +import torch +import torch.nn.functional as F + +from einops import rearrange +from transformers import GPT2Config, AutoConfig, PretrainedConfig + + +def remap_state_dict_hf_btlm(state_dict, config): + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key) + + if "transformer.wpe.weight" in state_dict: + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.wte.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub(r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for d in range(config.num_hidden_layers): + W1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.weight") + W3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.weight") + state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = torch.cat([W1.t(), W3.t()], dim=0) + b1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.bias") + b3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.bias") + state_dict[f"transformer.layers.{d}.mlp.fc1.bias"] = torch.cat([b1, b3], dim=0) + W2 = state_dict.pop(f"transformer.h.{d}.mlp.c_proj.weight") + state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t() + + def key_mapping_mlp(key): + key = re.sub(r"^transformer.h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for d in range(config.num_hidden_layers): + Wqkv = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t() + Wout = state_dict.pop(f"transformer.h.{d}.attn.c_proj.weight") + state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t() + state_dict.pop(f"transformer.relative_pe.slopes") # We don't store the Alibi slopes + + def key_mapping_attn(key): + key = re.sub(r"^transformer.h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key) + key = re.sub( + 
r"^transformer.h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def btlm_config_to_gpt2_config(btlm_config: PretrainedConfig) -> GPT2Config: + return GPT2Config( + vocab_size=btlm_config.vocab_size, + n_positions=0 if btlm_config.position_embedding_type == "alibi" else btlm_config.n_positions, + n_embd=btlm_config.hidden_size, + n_layer=btlm_config.num_hidden_layers, + n_head=btlm_config.num_attention_heads, + n_inner=btlm_config.n_inner, + activation_function=btlm_config.activation_function, + resid_pdrop=btlm_config.resid_pdrop, + embd_pdrop=btlm_config.embd_pdrop, + attn_pdrop=btlm_config.attn_pdrop, + layer_norm_epsilon=btlm_config.layer_norm_epsilon, + initializer_range=btlm_config.initializer_range, + bos_token_id=btlm_config.bos_token_id, + eos_token_id=btlm_config.eos_token_id, + # These are new arguments not in the original GPT2Config + use_alibi=btlm_config.position_embedding_type == "alibi", + use_flash_attn=btlm_config.position_embedding_type == "alibi", # Alibi code path requires flash_attn + mup_width_scale=btlm_config.mup_width_scale, + mup_embeddings_multiplier=btlm_config.mup_embeddings_scale, + mup_output_multiplier=btlm_config.mup_output_alpha, + mup_scale_qk_dot_by_d=btlm_config.mup_scale_qk_dot_by_d, + mlp_multiple_of=1, + ) diff --git a/causality-monitor.yaml b/causality-monitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fbac5b68e91761f1384876ca6ceb6cfa04981cca --- /dev/null +++ b/causality-monitor.yaml @@ -0,0 +1,2 @@ +causality-monitor: + _target_: src.callbacks.causality_monitor.CausalityMonitor \ No newline at end of file diff --git a/comet.yaml b/comet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ac99f46ca6d2d81e1bd70c230648ec5547c0df0 --- /dev/null +++ b/comet.yaml @@ -0,0 +1,7 @@ +# https://www.comet.ml + +comet: + _target_: pytorch_lightning.loggers.comet.CometLogger + api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable + project_name: "template-tests" + experiment_name: ${name} diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7c8f510f6ef919dee609792bcd457b249d17104 --- /dev/null +++ b/config.yaml @@ -0,0 +1,50 @@ +# @package _global_ + +# specify here default training configuration +defaults: + - _self_ + - trainer: default + - optimizer: adamw + - scheduler: null + - task: sequence-model + - model: null + - datamodule: null + - callbacks: default # set this to null if you don't want to use callbacks + - metrics: null + - logger: null # set logger here or use command line (e.g. 
`python run.py logger=wandb`) + + - mode: default + + - experiment: null + - hparams_search: null + + # enable color logging + - override hydra/hydra_logging: colorlog + - override hydra/job_logging: colorlog + +# path to original working directory +# hydra hijacks working directory by changing it to the current log directory, +# so it's useful to have this path as a special variable +# https://hydra.cc/docs/next/tutorials/basic/running_your_app/working_directory +work_dir: ${hydra:runtime.cwd} + +# path to folder with data +data_dir: ${work_dir}/data/ + +# pretty print config at the start of the run using Rich library +print_config: True + +# disable python warnings if they annoy you +ignore_warnings: True + +# check performance on test set, using the best model achieved during training +# lightning chooses best model based on metric specified in checkpoint callback +test_after_training: True + +resume: False + +# seed for random number generators in pytorch, numpy and python.random +seed: null + +# name of the run, accessed by loggers +name: null diff --git a/cosine-warmup-timm.yaml b/cosine-warmup-timm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2bbbec01f0f13fd102b92196e52b34898b081da --- /dev/null +++ b/cosine-warmup-timm.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: src.optim.timm_lr_scheduler.TimmCosineLRScheduler diff --git a/cosine-warmup.yaml b/cosine-warmup.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afaf0f618ecc41c9182c7df34717c973aa4af974 --- /dev/null +++ b/cosine-warmup.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: transformers.get_cosine_schedule_with_warmup diff --git a/cross_entropy.py b/cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..1782338132ab7e797f6d4998aa5a78f84a217504 --- /dev/null +++ b/cross_entropy.py @@ -0,0 +1,318 @@ +# Copyright (c) 2023, Tri Dao. + +from typing import Tuple, Optional, Union + +import torch + +import triton +import triton.language as tl + +# `all_gather_into_tensor` and `reduce_scatter_tensor` are new placeholders for +# `_all_gather_base` and `_reduce_scatter_base`. They require the most recent +# version of PyTorch. The following 2 lines are for backward compatibility with +# older PyTorch. +if "all_gather_into_tensor" not in dir(torch.distributed): + torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base + + +@triton.heuristics( + { + "HAS_SMOOTHING": lambda args: args["smoothing"] > 0.0, + } +) +@triton.jit +def cross_entropy_fwd_kernel( + loss_ptr, # data ptrs + lse_ptr, + z_loss_ptr, + logits_ptr, + labels_ptr, + smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, # Useful for tensor parallel when each rank only has a subset of classes + n_cols, # shapes + n_rows, + logits_row_stride, # strides + BLOCK_SIZE: tl.constexpr, + HAS_SMOOTHING: tl.constexpr, + # if SPLIT (e.g. 
tensor parallel), don't include the LSE in the loss since it's not the final LSE + SPLIT: tl.constexpr, +): + row_idx = tl.program_id(0) + col_block_idx = tl.program_id(1) + logits_ptr = logits_ptr + row_idx * logits_row_stride.to(tl.int64) + col_offsets = col_block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + label_idx = tl.load(labels_ptr + row_idx) + logits = tl.load(logits_ptr + col_offsets, mask=col_offsets < n_cols, other=-float("inf")).to( + tl.float32 + ) * logit_scale + max_logits = tl.max(logits, 0) + if HAS_SMOOTHING: + sum_logits = tl.sum(tl.where(col_offsets < n_cols, logits, 0.0), 0) + lse = tl.log(tl.sum(tl.exp(logits - max_logits), 0)) + max_logits + tl.store(lse_ptr + col_block_idx * n_rows + row_idx, lse) + if label_idx == ignore_index: + loss = 0.0 + z_loss = 0.0 + else: + label_idx -= class_start_idx + if label_idx >= col_block_idx * BLOCK_SIZE and label_idx < min( + n_cols, (col_block_idx + 1) * BLOCK_SIZE + ): + logits_label = tl.load(logits_ptr + label_idx) * logit_scale + if HAS_SMOOTHING: + loss = ( + (lse if not SPLIT else 0.0) + - smoothing * sum_logits / total_classes + - (1 - smoothing) * logits_label + ) + else: + loss = (lse if not SPLIT else 0.0) - logits_label + else: + # If label is out of bounds, we set the CE loss to 0.0. But we still want the smoothing loss + if HAS_SMOOTHING: + loss = smoothing * ((lse if not SPLIT else 0.0) - sum_logits / total_classes) + else: + loss = 0.0 + if not SPLIT: + z_loss = lse_square_scale * lse * lse + loss += z_loss + else: + z_loss = 0.0 + tl.store(loss_ptr + col_block_idx * n_rows + row_idx, loss) + if not SPLIT: + tl.store(z_loss_ptr + col_block_idx * n_rows + row_idx, z_loss) + + +@triton.heuristics( + { + "HAS_SMOOTHING": lambda args: args["smoothing"] > 0.0, + } +) +@triton.jit +def cross_entropy_bwd_kernel( + dlogits_ptr, # data ptrs + dloss_ptr, + logits_ptr, + lse_ptr, + labels_ptr, + smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, # Useful for tensor parallel when each rank only has a subset of classes + n_cols, # shapes + logits_row_stride, # strides + dlogits_row_stride, + dloss_row_stride, + BLOCK_SIZE: tl.constexpr, + HAS_SMOOTHING: tl.constexpr, +): + row_idx = tl.program_id(0) + col_block_idx = tl.program_id(1) + logits_ptr = logits_ptr + row_idx * logits_row_stride.to(tl.int64) + dlogits_ptr = dlogits_ptr + row_idx * dlogits_row_stride.to(tl.int64) + col_offsets = col_block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + label_idx = tl.load(labels_ptr + row_idx) + if label_idx != ignore_index: + dloss = tl.load(dloss_ptr + row_idx * dloss_row_stride) + else: + dloss = 0.0 + logits = tl.load(logits_ptr + col_offsets, mask=col_offsets < n_cols, other=-float("inf")).to( + tl.float32 + ) * logit_scale + lse = tl.load(lse_ptr + row_idx) + probs = tl.exp(logits - lse) + probs += 2.0 * lse_square_scale * lse * probs + label_idx -= class_start_idx + if HAS_SMOOTHING: + smooth_positive = 1.0 - smoothing + smooth_negative = smoothing / total_classes + probs = tl.where(col_offsets == label_idx, probs - (1 - smoothing), probs) - smooth_negative + else: + probs = tl.where(col_offsets == label_idx, probs - 1.0, probs) + tl.store(dlogits_ptr + col_offsets, (dloss * logit_scale) * probs, mask=col_offsets < n_cols) + + +class CrossEntropyLoss(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + logits, + labels, + smoothing=0.0, + logit_scale=1.0, + lse_square_scale=0.0, + ignore_index=-100, + inplace_backward=False, + process_group=None, + ): + n_rows, 
n_cols = logits.shape + assert labels.shape == (n_rows,) + world_size = 1 if process_group is None else torch.distributed.get_world_size(process_group) + total_classes = world_size * n_cols + rank = 0 if process_group is None else torch.distributed.get_rank(process_group) + class_start_idx = rank * n_cols + + if logits.stride(-1) != 1: + logits = logits.contiguous() + # Set these similar to https://github.com/openai/triton/blob/main/python/tutorials/02-fused-softmax.py + MAX_BLOCK_SIZE = 64 * 1024 + BLOCK_SIZE = min(triton.next_power_of_2(n_cols), MAX_BLOCK_SIZE) + num_warps = ( + 4 + if BLOCK_SIZE < 2048 + else (8 if BLOCK_SIZE < 8192 else (16 if BLOCK_SIZE < 128 * 1024 else 32)) + ) + # We may split the lse computation across multiple blocks, then do a reduction + # lse(local_lse) to get the final LSE. This is faster for large n_cols (e.g., > 64k) + # where having just one thread block processing more than 64k elements is slow. + split = world_size > 1 or n_cols > MAX_BLOCK_SIZE + n_splits = (n_cols + BLOCK_SIZE - 1) // BLOCK_SIZE + loss_shape = (n_splits, n_rows) if n_splits > 1 else (n_rows,) + losses = torch.empty(*loss_shape, dtype=torch.float, device=logits.device) + lse = torch.empty(*loss_shape, dtype=torch.float, device=logits.device) + z_losses = torch.empty(*loss_shape, dtype=torch.float, device=logits.device) + # Need this, otherwise Triton tries to launch from cuda:0 and we get + # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?) + with torch.cuda.device(logits.device.index): + cross_entropy_fwd_kernel[(n_rows, n_splits)]( + losses, # data ptrs + lse, + z_losses, + logits, + labels, + smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, + n_cols, # shapes + n_rows, + logits.stride(0), # strides + BLOCK_SIZE=BLOCK_SIZE, # constants + num_warps=num_warps, + SPLIT=split, + ) + + if split: + # If there's no smoothing, if labels are in the vocab of this partition, losses contains + # - predicted logit, and 0 otherwise. + # If there's smoothing=0.1, for labels in the vocab of this partition, losses contains + # -0.9 * predicted logit - 0.1 * sum logit / total_classes. + # For labels not in the vocab of this partition, losses contains + # -0.1 * sum logit / total_classes. + if n_splits > 1: + lse = torch.logsumexp(lse, dim=0) + losses = losses.sum(dim=0) + if world_size > 1: + lse_allgather = torch.empty(world_size, n_rows, dtype=lse.dtype, device=lse.device) + torch.distributed.all_gather_into_tensor(lse_allgather, lse, group=process_group) + handle_losses = torch.distributed.all_reduce( + losses, op=torch.distributed.ReduceOp.SUM, group=process_group, async_op=True + ) + lse = torch.logsumexp(lse_allgather, dim=0) + handle_losses.wait() + # After the allreduce, if there's no smoothing, the total losses are - predicted_logit, + # we just have to add the (global) lse. + # If there's smoothing=0.1, the total losses are + # -0.9 * predicted_logit - 0.1 * sum logit / total_classes. + # Again, we just have to add the (global) lse. 
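            # i.e. with no smoothing, losses becomes lse - logits[label] (up to logit_scale):
            # the usual -log_softmax cross entropy, over the full (possibly sharded) vocabulary.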
+ losses += lse + if lse_square_scale != 0.0: + z_losses = lse_square_scale * lse.square() + z_losses.masked_fill_(labels == ignore_index, 0.0) + losses += z_losses + else: + z_losses = torch.zeros_like(losses) + losses.masked_fill_(labels == ignore_index, 0.0) + + ctx.save_for_backward(logits, lse, labels) + ctx.mark_non_differentiable(z_losses) + ctx.smoothing = smoothing + ctx.logit_scale = logit_scale + ctx.lse_square_scale = lse_square_scale + ctx.ignore_index = ignore_index + ctx.total_classes = total_classes + ctx.class_start_idx = class_start_idx + ctx.inplace_backward = inplace_backward + + return losses, z_losses + + @staticmethod + def backward(ctx, grad_losses, grad_z_losses): + del grad_z_losses # z_losses are only for logging. + + logits, lse, labels = ctx.saved_tensors + dlogits = logits if ctx.inplace_backward else torch.empty_like(logits) + n_rows, n_cols = logits.shape + BLOCK_SIZE = min(triton.next_power_of_2(n_cols), 4 * 1024) + num_warps = 4 if BLOCK_SIZE < 2048 else (8 if BLOCK_SIZE < 8192 else 16) + grid = lambda META: (n_rows, triton.cdiv(n_cols, META["BLOCK_SIZE"])) # noqa + # Need this, otherwise Triton tries to launch from cuda:0 and we get + # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?) + with torch.cuda.device(logits.device.index): + cross_entropy_bwd_kernel[grid]( + dlogits, # data ptrs + grad_losses, + logits, + lse, + labels, + ctx.smoothing, + ctx.logit_scale, + ctx.lse_square_scale, + ctx.ignore_index, + ctx.total_classes, + ctx.class_start_idx, + n_cols, # shapes + logits.stride(0), # strides + dlogits.stride(0), + grad_losses.stride(0), + BLOCK_SIZE=BLOCK_SIZE, # constants + num_warps=num_warps, + ) + return dlogits, None, None, None, None, None, None, None, None + +def cross_entropy_loss( + logits: torch.Tensor, + labels: torch.Tensor, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + lse_square_scale: float = 0.0, + ignore_index=-100, + inplace_backward: bool = False, + process_group=None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Arguments: + logits: (batch, vocab_size) + labels: (batch,) + label_smoothing: float + logit_scale: float. Multiply logits by this scale before calculating the loss. + lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss. + This is also referred to as "z-loss". + ignore_index: int. If labels == ignore_index, the loss is set to 0.0. + inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits. + This saves memory. + process_group: if not None, we're doing Tensor Parallel: each process is responsible for + one part of the vocab. The loss will be aggregated across processes. + Returns: + losses: (batch,), float + z_losses: (batch,), float + """ + return CrossEntropyLoss.apply( + logits, + labels, + label_smoothing, + logit_scale, + lse_square_scale, + ignore_index, + inplace_backward, + process_group, + ) diff --git a/csv.yaml b/csv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f917e89ca1902e8ef89b524a9f2affb8bc402e3 --- /dev/null +++ b/csv.yaml @@ -0,0 +1,8 @@ +# csv logger built in lightning + +csv: + _target_: pytorch_lightning.loggers.csv_logs.CSVLogger + save_dir: "." 
+ name: "csv/" + version: ${name} + prefix: "" diff --git a/cuda_bf16_fallbacks.cuh b/cuda_bf16_fallbacks.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f5641f61609172090da1c8e77e43f9f4694ccca0 --- /dev/null +++ b/cuda_bf16_fallbacks.cuh @@ -0,0 +1,257 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_fallbacks.cuh +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuda_bf16_wrapper.h" +#include + +namespace fastertransformer { + +#ifdef ENABLE_BF16 +inline __device__ float2 bf1622float2(const __nv_bfloat162 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float2 f_val; + f_val.x = __low2float(val); + f_val.y = __high2float(val); + return f_val; +#else + return __bfloat1622float2(val); +#endif +} + +inline __device__ int16_t bf1622int16(__nv_bfloat162 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float2 f_val; + f_val.x = max(min(__low2float(val), 127.f), -128.f); + f_val.y = max(min(__high2float(val), 127.f), -128.f); + union { int8_t int8[2]; int16_t int16; }; + int8[0] = static_cast(static_cast(f_val.x)); + int8[1] = static_cast(static_cast(f_val.y)); + return int16; +#else + val = __hmin2(val, make_bfloat162(127., 127.)); + val = __hmax2(val, make_bfloat162(-128., -128.)); + union { int8_t int8[2]; int16_t int16; }; + int8[0] = static_cast(static_cast(val.x)); + int8[1] = static_cast(static_cast(val.y)); + return int16; +#endif +} + +inline __device__ __nv_bfloat162 float22bf162(const float2 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __floats2bfloat162_rn(val.x, val.y); +#else + return __float22bfloat162_rn(val); +#endif +} + +inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + __nv_bfloat162 val2; + val2.x = val; + val2.y = val; + return val2; +#else + return __bfloat162bfloat162(val); +#endif +} + +inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl + fyl, fxh + fyh); +#else + return __hadd2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) ); +#else + return __hadd(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl - fyl, 
fxh - fyh); +#else + return __hsub2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) ); +#else + return __hsub(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl * fyl, fxh * fyh); +#else + return __hmul2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) ); +#else + return __hmul(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh, fzl, fzh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + fzl = __low2float(z); + fzh = __high2float(z); + return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh); +#else + return __hfma2(x, y, z); +#endif +} + +inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z)); +#else + return __hfma(x, y, z); +#endif +} + +inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh; + fxl = __low2float(x); + fxh = __high2float(x);; + return __floats2bfloat162_rn(expf(fxl), expf(fxh)); +#else + return h2exp(x); +#endif +} + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); }; +inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); }; + +inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} + +#endif + +inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c)); +#else + return a + b + c; +#endif +} + +inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d)); +#else + return (__nv_bfloat16)((float)a + (float)b + (float)c + (float)d); +#endif +} + +inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch); +#else + return a + b + c; +#endif +} + +inline __device__ 
__nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c)); +#else + return a * b * c; +#endif +} + +inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch); +#else + return a * b * c; +#endif +} + +inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch, fdl, fdh; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + fdl = __low2float(d); + fdh = __high2float(d); + return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh); +#else + return a * b * c + d; +#endif +} + +#endif // ENABLE_BF16 + +} // namespace fastertransformer diff --git a/cuda_bf16_wrapper.h b/cuda_bf16_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..efb6e798730879bc2cd16088b2091991862a6074 --- /dev/null +++ b/cuda_bf16_wrapper.h @@ -0,0 +1,23 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_wrapper.h +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef ENABLE_BF16 +#include +#endif diff --git a/ddp.yaml b/ddp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c9544407298d1e839b0c7851cc2cce4a0f01cc3 --- /dev/null +++ b/ddp.yaml @@ -0,0 +1,6 @@ +defaults: + - default.yaml + +accelerator: gpu +devices: 4 +strategy: ddp diff --git a/debug.yaml b/debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2335c981ab1217d3e8a9f489c0d3c1af168ec47 --- /dev/null +++ b/debug.yaml @@ -0,0 +1,27 @@ +# @package _global_ + +# run in debug mode with: +# `python run.py mode=debug` + +defaults: + - override /trainer: debug.yaml + +debug_mode: True + +hydra: + # sets level of all command line loggers to 'DEBUG' + verbose: True + + # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ + # sets level of only chosen command line loggers to 'DEBUG' + # verbose: [src.train, src.utils.utils] + + # sets output paths for all file logs to 'logs/debug/' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/multirun_${now:%Y-%m-%d_%H-%M-%S} + subdir: ${hydra.job.num} + +# disable rich config printing, since it will be already printed by hydra when `verbose: True` +print_config: False diff --git a/decoder_masked_multihead_attention.cu b/decoder_masked_multihead_attention.cu new file mode 100644 index 0000000000000000000000000000000000000000..13306f76868e7c46321998513b3a49634edf9e2c --- /dev/null +++ b/decoder_masked_multihead_attention.cu @@ -0,0 +1,149 @@ +// Adapted from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "decoder_masked_multihead_attention.h" +#include "decoder_masked_multihead_attention_utils.h" +#include "cuda_bf16_wrapper.h" +#include +#include +#include + +#include "decoder_masked_multihead_attention_template.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, DO_CROSS_ATTENTION, stream) \ + size_t smem_sz = mmha::smem_size_in_bytes(params, THDS_PER_VALUE, THDS_PER_BLOCK); \ + auto kernel = mmha::masked_multihead_attention_kernel; \ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_sz); \ + dim3 grid(params.nnz_head_idx == nullptr ? params.num_heads : params.nnz_heads, params.batch_size); \ + kernel<<>>(params) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// !!! 
Specialize the launcher for Cross attention +template +void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream) +{ + constexpr int THREADS_PER_VALUE = Dh_MAX * sizeof(T) / 16; + constexpr bool DO_CROSS_ATTENTION = std::is_same>::value; + int tlength = (DO_CROSS_ATTENTION) ? params.memory_max_len : params.timestep; + // printf("tlength, CROSS_ATTENTION = %d, %d\n", tlength, DO_CROSS_ATTENTION); + if (tlength < 32) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, DO_CROSS_ATTENTION, stream); + } + else if (tlength < 2048) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, DO_CROSS_ATTENTION, stream); + } + else { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, DO_CROSS_ATTENTION, stream); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#undef MMHA_LAUNCH_KERNEL + +template +void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream) +{ + switch (params.hidden_size_per_head) { + case 32: + mmha_launch_kernel(params, stream); + break; + case 48: + mmha_launch_kernel(params, stream); + break; + case 64: + mmha_launch_kernel(params, stream); + break; + case 80: + mmha_launch_kernel(params, stream); + break; + case 96: + mmha_launch_kernel(params, stream); + break; + case 128: + mmha_launch_kernel(params, stream); + break; + case 160: + mmha_launch_kernel(params, stream); + break; + case 192: + mmha_launch_kernel(params, stream); + break; + case 224: + mmha_launch_kernel(params, stream); + break; + case 256: + mmha_launch_kernel(params, stream); + break; + default: + assert(false); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream) +{ + multihead_attention_<__nv_bfloat16, Masked_multihead_attention_params<__nv_bfloat16>>(params, stream); +} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream) +{ + multihead_attention_<__nv_bfloat16, Cross_multihead_attention_params<__nv_bfloat16>>(params, stream); +} +#endif + 
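For orientation, here is a minimal host-side sketch of how these entry points are typically driven for one decoding step. The struct fields and the dispatch behaviour come from decoder_masked_multihead_attention.h below; the wrapper name run_single_step_attention, the buffer arguments, and the chosen defaults (no beam search, no biases, no rotary embedding, fp32 element type) are illustrative assumptions rather than code from this commit.

#include <cmath>
#include <cuda_runtime.h>
#include "decoder_masked_multihead_attention.h"

// Hedged sketch: q/k/v hold the current step's projections (batch_size x num_heads x head_dim
// elements each), k_cache/v_cache are the persistent caches (at least
// batch_size * memory_max_len * num_heads * head_dim elements), out receives this step's output.
void run_single_step_attention(const float* q, const float* k, const float* v,
                               float* k_cache, float* v_cache, float* out,
                               int batch_size, int num_heads, int head_dim,
                               int memory_max_len, int timestep, cudaStream_t stream)
{
    Masked_multihead_attention_params<float> params;  // self-attention variant; unset fields default to 0/nullptr
    params.q = q;
    params.k = k;
    params.v = v;
    params.k_cache = k_cache;
    params.v_cache = v_cache;
    params.out = out;
    params.batch_size = batch_size;
    params.beam_width = 1;                            // no beam search in this sketch
    params.num_heads = num_heads;
    params.num_heads_kv = num_heads;                  // plain MHA; lower this for MQA/GQA
    params.num_heads_q_kv_ratio = 1;
    params.hidden_size_per_head = head_dim;           // must match a case in multihead_attention_ (32..256)
    params.memory_max_len = memory_max_len;
    params.timestep = timestep;
    params.inv_sqrt_dh = 1.0f / std::sqrt(static_cast<float>(head_dim));
    masked_multihead_attention(params, stream);       // dispatches on hidden_size_per_head, then block size on sequence length
}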
+//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/decoder_masked_multihead_attention.h b/decoder_masked_multihead_attention.h new file mode 100644 index 0000000000000000000000000000000000000000..3c79f88b856efbc6dd50bfb61747675727de402b --- /dev/null +++ b/decoder_masked_multihead_attention.h @@ -0,0 +1,192 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention.h +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuda_bf16_wrapper.h" +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \ + exit(1); \ + } \ + } while (0) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The structure of parameters for the masked multihead attention kernel. +// +// We use the following terminology to describe the different dimensions. +// +// B: Batch size (number of sequences), +// L: Sequence length, +// D: Hidden dimension, +// H: Number of heads, +// Dh: Hidden dimension per head - Dh = D / H. + +template +struct Multihead_attention_params_base { + + // The output buffer. Dimensions B x D. + T* out = nullptr; + + // The input Qs and the associated bias. Dimensions B x D and D, resp. + const T *q = nullptr, *q_bias = nullptr; + // The input Ks and the associated bias. Dimensions B x D and D, resp. + const T *k = nullptr, *k_bias = nullptr; + // The input Vs and the associated bias. Dimensions B x D and D, resp. + const T *v = nullptr, *v_bias = nullptr; + + // The cache for the Ks. The size must be at least B x L x D. + T* k_cache = nullptr; + // The cache for the Vs. The size must be at least B x L x D. + T* v_cache = nullptr; + // The indirections to use for cache when beam sampling. + const int* cache_indir = nullptr; + + // Stride to handle the case when KQV is a single buffer + int stride_q = 0; + int stride_k = 0; + int stride_v = 0; + + // The batch size. + int batch_size = 0; + // The beam width + int beam_width = 0; + // The sequence length. + int memory_max_len = 0; + // The number of heads (H). + int num_heads = 0; + int num_heads_kv = 0; + int num_heads_q_kv_ratio = 0; + // The hidden dimension per head (Dh). + int hidden_size_per_head = 0; + // The per-head latent space reserved for rotary embeddings. + int rotary_embedding_dim = 0; + bool neox_rotary_style = false; + float rotary_base = 0.0f; + // The maximum length of input sentences. + int max_input_length = 0; + // The current timestep. 
TODO(bhsueh) Check that do we only this param in cross attention? + int timestep = 0; + // The current timestep of each sentences (support different timestep for different sentences) + + // The 1.f / sqrt(Dh). Computed on the host. + float inv_sqrt_dh = 0.0f; + + // Used when we have some input context like gpt + const int* total_padding_tokens = nullptr; + + const bool* masked_tokens = nullptr; + const int* prefix_prompt_lengths = nullptr; + int max_prefix_prompt_length = 0; + + const T* relative_attention_bias = nullptr; + int relative_attention_bias_stride = 0; + // The slope per head of linear position bias to attention score (H). + const T* linear_bias_slopes = nullptr; + + const T* ia3_key_weights = nullptr; + const T* ia3_value_weights = nullptr; + const int* ia3_tasks = nullptr; + + const float* qkv_scale_out = nullptr; + const float* attention_out_scale = nullptr; + int int8_mode = 0; + + const T *rotary_cos = nullptr; + const T *rotary_sin = nullptr; + + const int *nnz_head_idx = nullptr; + int nnz_heads = 0; +}; + +template +struct Multihead_attention_params: public Multihead_attention_params_base { + // output cross attentions + float* cross_attention_out = nullptr; + int max_decoder_seq_len = 0; + bool is_return_cross_attentions = false; + + // allows to exist attention eary + bool* finished = nullptr; + + // required in case of cross attention + // will need it here till if constexpr in c++17 + int* memory_length_per_sample = nullptr; + + // required in case of masked attention with different length + const int* length_per_sample = nullptr; +}; + +template +struct Multihead_attention_params: public Multihead_attention_params_base { + // output cross attentions + float* cross_attention_out = nullptr; + int max_decoder_seq_len = 0; + bool is_return_cross_attentions = false; + + // allows to exist attention eary + bool* finished = nullptr; + + // required in case of cross attention + int* memory_length_per_sample = nullptr; + + // required in case of masked attention with different length + const int* length_per_sample = nullptr; +}; + +template +using Masked_multihead_attention_params = Multihead_attention_params; + +template +using Cross_multihead_attention_params = Multihead_attention_params; + +template +struct outputCrossAttentionParam { + // max decoder output length + int max_decoder_seq_len = 0; + T* cross_attention_out = nullptr; + bool is_return_cross_attentions = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream); +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream); +#ifdef ENABLE_BF16 +void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream); +#endif +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream); +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream); +#ifdef ENABLE_BF16 +void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream); +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/decoder_masked_multihead_attention_template.hpp b/decoder_masked_multihead_attention_template.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..2ae1b2425b87c89b8b38f2b1d5c6707229495037 --- /dev/null +++ b/decoder_masked_multihead_attention_template.hpp @@ -0,0 +1,1619 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "decoder_masked_multihead_attention.h" +#include "decoder_masked_multihead_attention_utils.h" +#include "cuda_bf16_wrapper.h" +#include "cuda_bf16_fallbacks.cuh" +#include +#include +#include + +// #define MMHA_USE_HMMA_FOR_REDUCTION + +// Below are knobs to extend FP32 accumulation for higher FP16 accuracy + +// Does not seem to affect the accuracy that much +#define MMHA_USE_FP32_ACUM_FOR_FMA + +// Seems to slightly improve the accuracy +#define MMHA_USE_FP32_ACUM_FOR_OUT + +#if 0 && defined(MMHA_USE_FP32_ACUM_FOR_OUT) + // Does not seem to improve the accuracy + //#define MMHA_USE_FP32_ACUM_FOR_LOGITS +#endif + +namespace mmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// We use the following terminology to describe the different dimensions. +// +// B: Batch size (number of sequences), +// L: Sequence length, +// D: Hidden dimension, +// H: Number of heads, +// Dh: Hidden dimension per head - Dh = D / H. +// +// The different kernels assign a threadblock for B x H pair. The grid has size (1, B, H). We use +// 64, 128 and 256 threads per block. +// +// Each threadblock loads Dh values from Q and its associated bias. The kernels run a loop to +// compute Q * K^T where K is loaded from a cache buffer -- except for the current timestep. The +// cache buffer helps with memory accesses and contains keys with bias. +// +// The layout of the cache buffer for the keys is [B, H, Dh/x, L, x] where x == 8 for FP16 and +// x == 4 for FP32 where the fastest moving dimension (contiguous data) is the rightmost one. The +// values for x are chosen to create chunks of 16 bytes. +// +// The different kernels use 1, 2 or 4 threads per key (THREADS_PER_KEY). The size of the LDGs +// depends on the number of threads per key. Each thread sums Dh / THREADS_PER_KEY elements. At +// the end of each iteration of the Q * K^T loop, we perform a reduction between lanes using an +// HMMA instruction (Tensor Core). Each Q * K^T valuey is stored in shared memory in FP32. +// +// After that loop, a parallel softmax is computed across the different Q * K^T values stored in +// shared memory. +// +// The kernel ends with a loop over the values in V. We use THREADS_PER_VALUE to control how many +// timesteps are computed by loop iteration. As with the keys, the values are read from a cache +// except for the current timestep. 
The layout of the cache buffer for the values is much simpler +// as it is [B, H, L, Dh]. +// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qk_vec_ { +}; + +template<> +struct Qk_vec_ { + using Type = float; +}; +template<> +struct Qk_vec_ { + using Type = float2; +}; +template<> +struct Qk_vec_ { + using Type = float4; +}; +template<> +struct Qk_vec_ { + using Type = float4; +}; +template<> +struct Qk_vec_ { + using Type = uint32_t; +}; +template<> +struct Qk_vec_ { + using Type = uint32_t; +}; +template<> +struct Qk_vec_ { + using Type = uint2; +}; +template<> +struct Qk_vec_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct Qk_vec_<__nv_bfloat16, 32> { + using Type = __nv_bfloat162; +}; +template<> +struct Qk_vec_<__nv_bfloat16, 64> { + using Type = __nv_bfloat162; +}; +template<> +struct Qk_vec_<__nv_bfloat16, 128> { + using Type = bf16_4_t; +}; +template<> +struct Qk_vec_<__nv_bfloat16, 256> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct K_vec_ { +}; + +template<> +struct K_vec_ { + using Type = float; +}; +template<> +struct K_vec_ { + using Type = float2; +}; +template<> +struct K_vec_ { + using Type = float4; +}; +template<> +struct K_vec_ { + using Type = uint32_t; +}; +template<> +struct K_vec_ { + using Type = uint2; +}; +template<> +struct K_vec_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct K_vec_<__nv_bfloat16, 4> { + using Type = __nv_bfloat162; +}; +template<> +struct K_vec_<__nv_bfloat16, 2> { + using Type = bf16_4_t; +}; +template<> +struct K_vec_<__nv_bfloat16, 1> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct V_vec_ { +}; + +template<> +struct V_vec_ { + using Type = float; +}; +template<> +struct V_vec_ { + using Type = float2; +}; +template<> +struct V_vec_ { + using Type = float4; +}; +template<> +struct V_vec_ { + using Type = uint32_t; +}; +template<> +struct V_vec_ { + using Type = uint2; +}; +template<> +struct V_vec_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct V_vec_<__nv_bfloat16, 2> { + using Type = __nv_bfloat162; +}; +template<> +struct V_vec_<__nv_bfloat16, 4> { + using Type = bf16_4_t; +}; +template<> +struct V_vec_<__nv_bfloat16, 8> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA +template +struct Qk_vec_acum_fp32_ { +}; + +template<> +struct Qk_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float4; +}; +// template<> struct Qk_vec_acum_fp32_ { using Type = float; }; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat16> { + using Type = float; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; + 
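+// Orientation note on the *_vec_ traits above: each one maps an element type plus a width
+// parameter to a packed vector type so that Q, K and V are loaded several elements at a time
+// with wide, aligned accesses (for K_vec_, the THREADS_PER_KEY threads cooperating on one key
+// together cover a 16-byte chunk per load). The kernel further below consumes them roughly as
+//     using Qk_vec = typename Qk_vec_<T, Dh_MAX>::Type;          // current-timestep Q/K loads
+//     using K_vec  = typename K_vec_<T, THREADS_PER_KEY>::Type;  // key-loop loads
+//     using V_vec  = typename V_vec_<T, V_VEC_SIZE>::Type;       // value-loop loads
+// while the Qk_vec_acum_fp32_ specializations above (and the K/V analogues below) widen those
+// packed types into float containers when the MMHA_USE_FP32_ACUM_FOR_* knobs are enabled.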
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct K_vec_acum_fp32_ { +}; + +template<> +struct K_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float4; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float8_; +}; +template<> +struct K_vec_acum_fp32_<__nv_bfloat16> { + using Type = float; +}; +template<> +struct K_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float8_; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT +template +struct V_vec_acum_fp32_ { +}; + +template<> +struct V_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float4; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float8_; +}; +#ifdef ENABLE_BF16 +template<> +struct V_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float8_; +}; +#endif // ENABLE_BF16 +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec (&k)[N]) +{ +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using K_vec_acum = typename K_vec_acum_fp32_::Type; +#else + using K_vec_acum = K_vec; +#endif + // Compute the parallel products for Q*K^T (treat vector lanes separately). + K_vec_acum qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = fma(q[ii], k[ii], qk_vec); + } + + // Finalize the reduction across lanes. 
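+// sum(qk_vec) below first collapses the vector lanes of the accumulator into a single float;
+// the shfl_xor butterfly then adds the partial sums of the THREADS_PER_KEY threads that share
+// this key, so each of those threads returns the complete Q*K^T value for the timestep.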
+ float qk = sum(qk_vec); +#pragma unroll + for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qk_dot { + template + static inline __device__ float dot(const K_vec (&q)[N], const K_vec (&k)[N]) + { + return qk_dot_(q, k); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 hmma_fp32(const uint2& a, uint32_t b) +{ + float4 c; + float zero = 0.f; + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6}, \n" + " {%7, %7, %7, %7}; \n" + + : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w) + : "r"(a.x) "r"(a.y), "r"(b), "f"(zero)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float qk_hmma_dot_(const uint32_t (&q)[N], const uint32_t (&k)[N]) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750 +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using K_vec_acum = typename K_vec_acum_fp32_::Type; +#else + using K_vec_acum = uint32_t; +#endif + K_vec_acum qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = fma(q[ii], k[ii], qk_vec); + } +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + uint32_t qk_vec_ = float2_to_half2(qk_vec); + return hmma_fp32(make_uint2(qk_vec_, 0u), 0x3c003c00u).x; +#else + return hmma_fp32(make_uint2(qk_vec, 0u), 0x3c003c00u).x; +#endif +#else + return 0.f; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Qk_dot { + template + static inline __device__ float dot(const uint32_t (&q)[N], const uint32_t (&k)[N]) + { +#if __CUDA_ARCH__ >= 750 && defined(MMHA_USE_HMMA_FOR_REDUCTION) + return qk_hmma_dot_(q, k); +#else + return qk_dot_<4>(q, k); +#endif // defined MMHA_USE_HMMA_FOR_REDUCTION + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float block_sum(float* red_smem, float sum) +{ + + // Decompose the thread index into warp / lane. + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + +// Compute the sum per warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Warp leaders store the data to shared memory. + if (lane == 0) { + red_smem[warp] = sum; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The warps compute the final sums. + if (lane < WARPS_PER_BLOCK) { + sum = red_smem[lane]; + } + +// Parallel reduction inside the warp. +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Broadcast to other threads. 
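+// After the two reduction phases, lane 0 of every warp holds the block-wide total; the
+// __shfl_sync below broadcasts lane 0's value so every thread in the block returns the same sum.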
+ return __shfl_sync(uint32_t(-1), sum, 0); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(float& dst, float src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint16_t& dst, float src) +{ + dst = float_to_half(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint32_t& dst, float2 src) +{ + dst = float2_to_half2(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ void convert_from_float(__nv_bfloat16& dst, float src) +{ + dst = __float2bfloat16(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(__nv_bfloat162& dst, float2 src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst = __float22bfloat162_rn(src); +#else + dst = __floats2bfloat162_rn(src.x, src.y); +#endif +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint2& dst, Float4_ src) +{ + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint2& dst, float4 src) +{ + convert_from_float(dst, Float4_{make_float2(src.x, src.y), make_float2(src.z, src.w)}); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint4& dst, Float8_ src) +{ + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); + dst.z = float2_to_half2(src.z); + dst.w = float2_to_half2(src.w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ void convert_from_float(bf16_4_t& dst, Float4_ src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); +#else + dst.x = __floats2bfloat162_rn(src.x.x, src.x.y); + dst.y = __floats2bfloat162_rn(src.y.x, src.y.y); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(bf16_4_t& dst, float4 src) +{ + convert_from_float(dst, Float4_{make_float2(src.x, src.y), make_float2(src.z, src.w)}); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(bf16_8_t& dst, Float8_ src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); + dst.z = __float22bfloat162_rn(src.z); + dst.w = __float22bfloat162_rn(src.w); +#else + dst.x = __floats2bfloat162_rn(src.x.x, src.x.y); + dst.y = __floats2bfloat162_rn(src.y.x, src.y.y); + dst.z = __floats2bfloat162_rn(src.z.x, src.z.y); + dst.w = __floats2bfloat162_rn(src.w.x, src.w.y); +#endif +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
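The Float4_ and Float8_ containers consumed by the converters above are the FP32 accumulator types, presumably supplied by the included decoder_masked_multihead_attention_utils.h rather than defined here. The member accesses in those converters (src.x passed to float2_to_half2, src.x.x / src.w.y in the bfloat16 paths) imply a layout along the lines of this sketch:

// Layout sketch only, to make the conversions above readable; not a definition from this commit.
struct Float4_ { float2 x, y; };        // four floats as two float2 halves -> uint2 / bf16_4_t
struct Float8_ { float2 x, y, z, w; };  // eight floats as four float2 halves -> uint4 / bf16_8_t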
+inline __device__ void convert_from_float(float2& dst, float2 src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(float4& dst, float4 src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float convert_to_float(float4 u) +{ + return u.x; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float convert_to_float(uint4 u) +{ + float2 tmp = half2_to_float2(u.x); + return tmp.x; +} + +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float cast_to_float(float u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(float2 u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 cast_to_float(float4 u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ cast_to_float(Float4_ u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ cast_to_float(Float8_ u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(uint32_t u) +{ + return half2_to_float2(u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ cast_to_float(uint2 u) +{ + Float4_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + return tmp; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ cast_to_float(uint4 u) +{ + Float8_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + tmp.z = half2_to_float2(u.z); + tmp.w = half2_to_float2(u.w); + return tmp; +} + +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float float_from_int8(int8_t u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 float_from_int8(int16_t u) +{ + union { + int16_t int16; + int8_t int8[2]; + }; + int16 = u; + return make_float2(int8[0], int8[1]); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 float_from_int8(int32_t u) +{ + union { + int32_t int32; + int8_t int8[4]; + }; + int32 = u; + return make_float4(int8[0], int8[1], int8[2], int8[3]); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// clang-format off +inline __device__ Float8_ float_from_int8(int64_t u) +{ + union { + int64_t int64; + int16_t int16[4]; + }; + int64 = u; + return Float8_ {float_from_int8(int16[0]), + float_from_int8(int16[1]), + float_from_int8(int16[2]), + float_from_int8(int16[3])}; +} +// clang-format on + 
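To make the int8 path concrete: the kernel's int8_mode == 2 branches further below widen the quantized Q/K/V with float_from_int8 and rescale with the per-tensor factors params.qkv_scale_out[0..2], e.g. convert_from_float(q, mul(q_scaling, float_from_int8(q_quant))). A hedged sketch of that dequantization step, using a hypothetical helper name:

// Hypothetical helper (not part of this file): widen four packed int8 lanes and apply the
// per-tensor scale, mirroring the int8_mode == 2 loads in the kernel below.
inline __device__ float4 dequant_int8x4(int32_t packed, float scale)
{
    float4 f = float_from_int8(packed);  // defined just above: unpacks 4 x int8 into a float4
    f.x *= scale;
    f.y *= scale;
    f.z *= scale;
    f.w *= scale;
    return f;
}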
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int8_t cast_to_int8(float val) +{ + union { + int8_t int8[2]; + int16_t int16; + }; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val)); + return int8[0]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int32_t cast_to_int8(float4 val) +{ + union { + int8_t int8[4]; + int32_t int32; + }; + int8[0] = cast_to_int8(val.x); + int8[1] = cast_to_int8(val.y); + int8[2] = cast_to_int8(val.z); + int8[3] = cast_to_int8(val.w); + return int32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int64_t cast_to_int8(Float8_ val) +{ + union { + int8_t int8[8]; + int64_t int64; + }; + int8[0] = cast_to_int8(val.x.x); + int8[1] = cast_to_int8(val.x.y); + int8[2] = cast_to_int8(val.y.x); + int8[3] = cast_to_int8(val.y.y); + int8[4] = cast_to_int8(val.z.x); + int8[5] = cast_to_int8(val.z.y); + int8[6] = cast_to_int8(val.w.x); + int8[7] = cast_to_int8(val.w.y); + return int64; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ __host__ T div_up(T m, T n) +{ + return (m + n - 1) / n; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline size_t smem_size_in_bytes(const Multihead_attention_params& params, + int threads_per_value, + int threads_per_block) +{ + // The amount of shared memory needed to store the Q*K^T values in float. + const int max_timesteps = min(params.timestep, params.memory_max_len); + size_t qk_sz = (DO_CROSS_ATTENTION) ? div_up(params.memory_max_len + 1, 4) * 16 : div_up(max_timesteps + 1, 4) * 16; + + // The extra memory needed if we are not using floats for the final logits. + size_t logits_sz = 0; +#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS + if (sizeof(T) != 4) { + // TDOD + logits_sz = (DO_CROSS_ATTENTION) ? div_up(params.memory_max_len + 1, 4) * 4 * sizeof(T) : + div_up(max_timesteps + 1, 4) * 4 * sizeof(T); + } +#endif + + // The total size needed during softmax. + size_t softmax_sz = qk_sz + logits_sz; + + // The number of partial rows to reduce in the final reduction. + int rows_per_red = threads_per_block / threads_per_value; + // The amount of storage needed to finalize the outputs. + size_t red_sz = rows_per_red * params.hidden_size_per_head * sizeof(T) / 2; + + size_t transpose_rotary_size = 0; + if (params.rotary_embedding_dim > 0 && params.neox_rotary_style) { + transpose_rotary_size = 2 * params.rotary_embedding_dim * sizeof(T); + } + + // The max. + return max(max(softmax_sz, red_sz), transpose_rotary_size); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ constexpr uint32_t shfl_mask(int threads) +{ + return threads == 32 ? uint32_t(-1) : (1u << threads) - 1u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template< + // The type of the inputs. Supported types: float and half. + typename T, + // The hidden dimension per head. + int Dh, + int Dh_MAX, + // The number of threads per key. + int THREADS_PER_KEY, + // The number of threads per value. + int THREADS_PER_VALUE, + // The number of threads in a threadblock. 
+ int THREADS_PER_BLOCK, + bool DO_CROSS_ATTENTION> +__global__ void masked_multihead_attention_kernel(Multihead_attention_params params) +{ + + // Make sure the hidden dimension per head is a multiple of the number of threads per key. + static_assert(Dh_MAX % THREADS_PER_KEY == 0, ""); + // Make sure the hidden dimension per head is a multiple of the number of threads per value. + static_assert(Dh_MAX % THREADS_PER_VALUE == 0, ""); + + // The size of a warp. + constexpr int WARP_SIZE = 32; + // The number of warps in a threadblock. + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // Use smem_size_in_bytes (above) to determine the amount of shared memory. + extern __shared__ char smem_[]; + + // The shared memory for the Q*K^T values and partial logits in softmax. + float* qk_smem = reinterpret_cast(smem_); + + // The shared memory for the logits. For FP32, that's the same buffer as qk_smem. + char* logits_smem_ = smem_; +#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS + if (sizeof(T) != 4) { + // TODO - change to tlength + const int max_timesteps = min(params.timestep, params.memory_max_len); + logits_smem_ += + (DO_CROSS_ATTENTION) ? div_up(params.memory_max_len + 1, 4) * 16 : div_up(max_timesteps + 1, 4) * 16; + } + T* logits_smem = reinterpret_cast(logits_smem_); +#else + float* logits_smem = reinterpret_cast(logits_smem_); +#endif + + // The shared memory to do the final reduction for the output values. Reuse qk_smem. + T* out_smem = reinterpret_cast(smem_); + + // The shared memory buffers for the block-wide reductions. One for max, one for sum. + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + // A vector of Q or K elements for the current timestep. + using Qk_vec = typename Qk_vec_::Type; + + // Use alignment for safely casting the shared buffers as Qk_vec. + // Shared memory to store Q inputs. + __shared__ __align__(sizeof(Qk_vec)) T q_smem[Dh_MAX]; + + // This is one of the reasons we should have a separate kernel for cross attention + __shared__ __align__(sizeof(Qk_vec)) T bias_smem[DO_CROSS_ATTENTION ? Dh_MAX : 1]; + + // A vector of Q or K elements for the current timestep. + using Qk_vec = typename Qk_vec_::Type; + // The number of elements per vector. + constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); + // Make sure the hidden size per head is a multiple of the vector size. + static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); + // We will use block wide reduction if needed + // static_assert(Dh_MAX / QK_VEC_SIZE <= WARP_SIZE, ""); + // The number of vectors per warp. + constexpr int QK_VECS_PER_WARP = Dh_MAX / QK_VEC_SIZE; + + // The layout of the cache is [B, H, Dh/x, L, x] with x == 4/8 for FP32/FP16. Since each thread + // owns x elements, we have to decompose the linear index into chunks of x values and the posi- + // tion of the thread in that chunk. + + // The number of elements in a chunk of 16B (that's the x in the above formula). + constexpr int QK_ELTS_IN_16B = 16 / sizeof(T); + // The number of K vectors in 16B. + constexpr int QK_VECS_IN_16B = 16 / sizeof(Qk_vec); + + // The batch/beam idx + const int bi = blockIdx.y; + if (params.finished != nullptr && params.finished[bi] == true) { + return; + } + // The beam idx + const int beami = bi % params.beam_width; + // The "beam-aware" batch idx + const int bbi = bi / params.beam_width; + // The head. + // const int hi = blockIdx.x; + const int hi = params.nnz_head_idx == nullptr ? 
blockIdx.x : params.nnz_head_idx[blockIdx.x]; + const int hi_kv = hi / params.num_heads_q_kv_ratio; + // Combine the batch and the head indices. + const int bhi = bi * params.num_heads + hi; + const int bhi_kv = bi * params.num_heads_kv + hi_kv; + // Combine the "beam-aware" batch idx and the head indices. + const int bbhi = bbi * params.beam_width * params.num_heads_kv + hi_kv; + // The thread in the block. + const int tidx = threadIdx.x; + + const bool handle_kv = !DO_CROSS_ATTENTION || (DO_CROSS_ATTENTION && params.timestep == 0); + + // While doing the product Q*K^T for the different keys we track the max. + float qk_max = -FLT_MAX; + + float qk = 0.0F; + + int q_base_offset = (params.stride_q == 0) ? bhi * Dh : bi * params.stride_q + hi * Dh; + int k_base_offset = (params.stride_k == 0) ? bhi_kv * Dh : bi * params.stride_k + hi_kv * Dh; + int v_base_offset = (params.stride_v == 0) ? bhi_kv * Dh : bi * params.stride_v + hi_kv * Dh; + + const size_t bi_seq_len_offset = bi * params.memory_max_len; + + // int tlength = (DO_CROSS_ATTENTION)? params.memory_length_per_sample[bi] - 1 : params.timestep; + int tlength = (DO_CROSS_ATTENTION) ? params.memory_length_per_sample[bi] - 1 : + (params.length_per_sample == nullptr) ? + params.timestep : + params.length_per_sample[bi] + params.max_prefix_prompt_length; + const int first_step = max(0, tlength + 1 - params.memory_max_len); + const int tlength_circ = tlength % params.memory_max_len; + + // First QK_VECS_PER_WARP load Q and K + the bias values for the current timestep. + const bool is_masked = tidx >= QK_VECS_PER_WARP; + + // The offset in the Q and K buffer also accounts for the batch. + int q_offset = q_base_offset + tidx * QK_VEC_SIZE; + int k_offset = k_base_offset + tidx * QK_VEC_SIZE; + // The offset in the bias buffer. + int q_bias_offset = hi * Dh + tidx * QK_VEC_SIZE; + int k_bias_offset = hi_kv * Dh + tidx * QK_VEC_SIZE; + + const bool do_ia3 = handle_kv && params.ia3_tasks != nullptr; + const int ia3_task_id = do_ia3 ? params.ia3_tasks[bbi] : 0; + + // Trigger the loads from the Q and K buffers. + Qk_vec q; + zero(q); + if (!is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh)) { + if (params.int8_mode == 2) { + using Packed_Int8_t = typename packed_type::value>::type; + using Packed_Float_t = typename packed_type::value>::type; + const auto q_scaling = params.qkv_scale_out[0]; + const auto q_quant = + *reinterpret_cast(&reinterpret_cast(params.q)[q_offset]); + + convert_from_float(q, mul(q_scaling, float_from_int8(q_quant))); + } + else { + q = *reinterpret_cast(¶ms.q[q_offset]); + } + } + + Qk_vec k; + zero(k); + if (DO_CROSS_ATTENTION) { + // The 16B chunk written by the thread. + int co = tidx / QK_VECS_IN_16B; + // The position of the thread in that 16B chunk. + int ci = tidx % QK_VECS_IN_16B * QK_VEC_SIZE; + + // Two chunks are separated by L * x elements. A thread write QK_VEC_SIZE elements. + int offset = bhi_kv * params.memory_max_len * Dh + co * params.memory_max_len * QK_ELTS_IN_16B + + // params.timestep*QK_ELTS_IN_16B + + tlength * QK_ELTS_IN_16B + ci; + k = !is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) ? 
+ *reinterpret_cast(¶ms.k_cache[offset]) : + k; + } + else { + if (!is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh)) { + if (params.int8_mode == 2) { + using Packed_Int8_t = typename packed_type::value>::type; + using Packed_Float_t = typename packed_type::value>::type; + const auto k_scaling = params.qkv_scale_out[1]; + const auto k_quant = + *reinterpret_cast(&reinterpret_cast(params.k)[k_offset]); + + convert_from_float(k, mul(k_scaling, float_from_int8(k_quant))); + } + else { + k = *reinterpret_cast(¶ms.k[k_offset]); + } + } + } + + // Trigger the loads from the Q and K bias buffers. + Qk_vec q_bias; + zero(q_bias); + q_bias = (!is_masked && Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) && params.q_bias != nullptr ? + *reinterpret_cast(¶ms.q_bias[q_bias_offset]) : + q_bias; + + Qk_vec k_bias; + zero(k_bias); + if (handle_kv) { + k_bias = !is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) && params.k_bias != nullptr ? + *reinterpret_cast(¶ms.k_bias[k_bias_offset]) : + k_bias; + } + + // Computes the Q/K values with bias. + q = add(q, q_bias); + if (handle_kv) { + k = add(k, k_bias); + } + if (do_ia3 && !is_masked) { + k = mul( + k, + *reinterpret_cast( + ¶ms.ia3_key_weights[(ia3_task_id * params.num_heads + hi) * Dh + tidx * QK_VEC_SIZE])); + } + + // Padded len + const int padd_len = (params.total_padding_tokens == nullptr) ? 0 : params.total_padding_tokens[bi]; + if (params.rotary_embedding_dim > 0 && !params.neox_rotary_style) { + if (handle_kv) { + if (params.rotary_cos == nullptr) { + apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, tlength - padd_len, params.rotary_base); + } else { + apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, tlength - padd_len, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + } + else { + if (params.rotary_cos == nullptr) { + apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, tlength - padd_len, params.rotary_base); + } else { + apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, tlength - padd_len, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + } + } + else if (params.rotary_embedding_dim > 0 && params.neox_rotary_style) { + const bool do_rotary = !is_masked && QK_VEC_SIZE * tidx < params.rotary_embedding_dim; + + T* q_smem = reinterpret_cast(smem_); + T* k_smem = q_smem + params.rotary_embedding_dim; + + const int half_rotary_dim = params.rotary_embedding_dim / 2; + const int half_idx = (tidx * QK_VEC_SIZE) / half_rotary_dim; + const int intra_half_idx = (tidx * QK_VEC_SIZE) % half_rotary_dim; + const int smem_pitch = half_rotary_dim; // TODO: adjust for bank conflicts + + assert(half_rotary_dim % QK_VEC_SIZE == 0); + + if (do_rotary) { + *reinterpret_cast(q_smem + half_idx * smem_pitch + intra_half_idx) = q; + + if (handle_kv) { + *reinterpret_cast(k_smem + half_idx * smem_pitch + intra_half_idx) = k; + } + } + + __syncthreads(); + + const int transpose_idx = half_idx * (half_rotary_dim / 2) + intra_half_idx / 2; + constexpr int tidx_factor = (QK_VEC_SIZE > 1) ? 
QK_VEC_SIZE / 2 : 1; + if (do_rotary) { + mmha::vec_from_smem_transpose(q, q_smem, transpose_idx, smem_pitch); + + if (handle_kv) { + mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); + + if (params.rotary_cos == nullptr) { + mmha::apply_rotary_embedding( + q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength - padd_len, params.rotary_base); + } else { + mmha::apply_rotary_embedding( + q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength - padd_len, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + + mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); + } + else { + if (params.rotary_cos == nullptr) { + mmha::apply_rotary_embedding( + q, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength, params.rotary_base); + } else { + mmha::apply_rotary_embedding( + q, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + } + mmha::write_smem_transpose(q, q_smem, transpose_idx, smem_pitch); + } + + __syncthreads(); + + if (do_rotary) { + q = *reinterpret_cast(q_smem + half_idx * smem_pitch + intra_half_idx); + if (handle_kv) { + k = *reinterpret_cast(k_smem + half_idx * smem_pitch + intra_half_idx); + } + } + + __syncthreads(); + } + + if (!is_masked) { + // Store the Q values to shared memory. + *reinterpret_cast(&q_smem[tidx * QK_VEC_SIZE]) = q; + + // Store Dh values of k_bias into smem, since will need to add later + // if params.timestep == 0 + if (DO_CROSS_ATTENTION && params.timestep == 0) { + *reinterpret_cast(&bias_smem[tidx * QK_VEC_SIZE]) = k_bias; + } + + // Write the K values to the global memory cache. + // + // NOTE: The stores are uncoalesced as we have multiple chunks of 16B spread across the memory + // system. We designed it this way as it allows much better memory loads (and there are many + // more loads) + the stores are really "write and forget" since we won't need the ack before + // the end of the kernel. There's plenty of time for the transactions to complete. + + // The 16B chunk written by the thread. + int co = tidx / QK_VECS_IN_16B; + // The position of the thread in that 16B chunk. + int ci = tidx % QK_VECS_IN_16B * QK_VEC_SIZE; + + // Two chunks are separated by L * x elements. A thread write QK_VEC_SIZE elements. + int offset = bhi_kv * params.memory_max_len * Dh + co * params.memory_max_len * QK_ELTS_IN_16B + + // params.timestep*QK_ELTS_IN_16B + + tlength_circ * QK_ELTS_IN_16B + ci; + + if (handle_kv && hi % params.num_heads_q_kv_ratio == 0) { + // Trigger the stores to global memory. + if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { + *reinterpret_cast(¶ms.k_cache[offset]) = k; + } + } + + // Compute \sum_i Q[i] * K^T[i] for the current timestep. +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using Qk_vec_acum = typename Qk_vec_acum_fp32_::Type; +#else + using Qk_vec_acum = Qk_vec; +#endif + qk = dot(q, k); + if (QK_VECS_PER_WARP <= WARP_SIZE) { +#pragma unroll + for (int mask = QK_VECS_PER_WARP / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(shfl_mask(QK_VECS_PER_WARP), qk, mask); + } + } + } + + if (QK_VECS_PER_WARP > WARP_SIZE) { + constexpr int WARPS_PER_RED = (QK_VECS_PER_WARP + WARP_SIZE - 1) / WARP_SIZE; + qk = block_sum(&red_smem[WARPS_PER_RED], qk); + } + + // Store that value in shared memory. Keep the Q*K^T value in register for softmax. 
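+// After the preceding shuffles every participating thread holds the same qk for the current
+// timestep, so a single representative (tidx == 0) applies the 1/sqrt(Dh) scaling and the
+// optional relative-attention bias, then seeds qk_max and writes qk_smem[tlength - first_step].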
+ if (tidx == 0) { + // Normalize qk. + qk *= params.inv_sqrt_dh; + if (params.relative_attention_bias != nullptr) { + qk = add(qk, + params.relative_attention_bias[hi * params.relative_attention_bias_stride + * params.relative_attention_bias_stride + + (tlength - padd_len) * params.relative_attention_bias_stride + + (tlength - padd_len)]); + } + // We don't need to apply the linear position bias here since qi - ki = 0 yields the position bias 0. + + qk_max = qk; + qk_smem[tlength - first_step] = qk; + // qk_smem[params.timestep] = qk; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The type of queries and keys for the math in the Q*K^T product. + using K_vec = typename K_vec_::Type; + // The number of elements per vector. + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(T); + // Make sure the hidden size per head is a multiple of the vector size. + static_assert(Dh_MAX % K_VEC_SIZE == 0, ""); + // The number of elements per thread. + constexpr int K_ELTS_PER_THREAD = Dh_MAX / THREADS_PER_KEY; + // The number of vectors per thread. + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + + // The position the first key loaded by each thread from the cache buffer (for this B * H). + int ko = tidx / THREADS_PER_KEY; + // The position of the thread in the chunk of keys. + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + + static_assert(Dh_MAX == THREADS_PER_KEY * K_VEC_SIZE * K_VECS_PER_THREAD); + + // Load the Q values from shared memory. The values are reused during the loop on K. + K_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vec[ii] = *reinterpret_cast(&q_smem[ki + ii * THREADS_PER_KEY * K_VEC_SIZE]); + } + + K_vec k_bias_vec[DO_CROSS_ATTENTION ? K_VECS_PER_THREAD : 1]; + if (DO_CROSS_ATTENTION && params.timestep == 0) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + k_bias_vec[ii] = *reinterpret_cast(&bias_smem[ki + ii * THREADS_PER_KEY * K_VEC_SIZE]); + } + } + + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + // The base pointer for the key in the cache buffer. + T* k_cache = ¶ms.k_cache[bhi_kv * params.memory_max_len * Dh + ki]; + // Base pointer for the beam's batch, before offsetting with indirection buffer + T* k_cache_batch = ¶ms.k_cache[bbhi * params.memory_max_len * Dh + ki]; + + // Pick a number of keys to make sure all the threads of a warp enter (due to shfl_sync). + // int ti_end = div_up(params.timestep, K_PER_WARP) * K_PER_WARP; + int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + // prefix prompt length if has + const int prefix_prompt_length = (params.prefix_prompt_lengths == nullptr) ? 0 : params.prefix_prompt_lengths[bi]; + + // Iterate over the keys/timesteps to compute the various (Q*K^T)_{ti} values. + const bool has_beams = params.cache_indir != nullptr; + const int* beam_indices = has_beams ? ¶ms.cache_indir[bi_seq_len_offset] : nullptr; + + for (int ti = first_step + ko; ti < ti_end; ti += K_PER_ITER) { + const int ti_circ = ti % params.memory_max_len; + + // The keys loaded from the key cache. 
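+// Each thread gathers K_VECS_PER_THREAD vectors of the key at timestep ti, reading circular
+// cache slot ti_circ through the beam-aware base pointer (offset by the indirection buffer when
+// beam search is active); chunks that fall outside Dh are replaced by k_vec_zero.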
+ K_vec k[K_VECS_PER_THREAD]; + K_vec k_vec_zero; + zero(k_vec_zero); +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.memory_max_len + ti_circ; + // if( ti < params.timestep ) { + const bool within_bounds = (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.memory_max_len); + if (ti < tlength) { + if (!within_bounds) { + k[ii] = k_vec_zero; + } + else { + if (has_beams) { + const int beam_offset = beam_indices[ti_circ] * params.num_heads * params.memory_max_len * Dh; + k[ii] = *reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B]); + } + else { + k[ii] = *reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]); + } + } + // add bias and update k_cache + if (DO_CROSS_ATTENTION && params.timestep == 0) { + k[ii] = add(k[ii], k_bias_vec[ii]); + + if (do_ia3) { + k[ii] = mul( + k[ii], + *reinterpret_cast( + ¶ms.ia3_key_weights[(ia3_task_id * params.num_heads + hi) * Dh + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE])); + } + + if (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.memory_max_len) { + *reinterpret_cast(&k_cache[jj * QK_ELTS_IN_16B]) = k[ii]; + } + } + } + } + + // Perform the dot product and normalize qk. + // + // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! + float qk = Qk_dot::dot(q_vec, k) * params.inv_sqrt_dh; + bool is_mask = (params.masked_tokens != nullptr) && params.masked_tokens[bi_seq_len_offset + ti]; + + // Store the product to shared memory. There's one qk value per timestep. Update the max. + // if( ti < params.timestep && tidx % THREADS_PER_KEY == 0 ) { + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + if (params.relative_attention_bias != nullptr) { + qk = add(qk, + params.relative_attention_bias[hi * params.relative_attention_bias_stride + * params.relative_attention_bias_stride + + tlength * params.relative_attention_bias_stride + ti]); + } + if (params.linear_bias_slopes != nullptr) { + // Apply the linear position bias: (ki - qi) * slope[hi]. + // The padding token locates between the input context and the generated tokens. + // We need to remove the number of padding tokens in the distance computation. + // ti : 0 1 2 3 4 5 6 7 8 9(tlength) + // token: i i i i p p p o o o where i=input, p=pad, o=output. + // e.g. ti = 2, dist = (9 - 3) - 2 = 4. + int max_context_length = params.max_prefix_prompt_length + params.max_input_length; + float dist = (ti < max_context_length ? ti + padd_len : ti) - tlength; + + qk += mul(params.linear_bias_slopes[hi], dist); + } + qk_max = is_mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = qk; + } + } + +// Perform the final reduction to compute the max inside each warp. +// +// NOTE: In a group of THREADS_PER_KEY threads, the leader already has the max value for the +// group so it's not needed to run the reduction inside the group (again). +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + const int warp = tidx / WARP_SIZE; + const int lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? 
red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + // Compute the logits and start the sum. + float sum = 0.f; + // for( int ti = tidx; ti <= params.timestep; ti += THREADS_PER_BLOCK ) { + for (int ti = first_step + tidx; ti <= tlength; ti += THREADS_PER_BLOCK) { + bool is_mask = (params.masked_tokens != nullptr) && params.masked_tokens[bi_seq_len_offset + ti]; + float logit = is_mask ? 0.f : __expf(qk_smem[ti - first_step] - qk_max); + sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + sum = block_sum(&red_smem[WARPS_PER_BLOCK], sum); + + // Normalize the logits. + float inv_sum = __fdividef(1.f, sum + 1.e-6f); + // for( int ti = tidx; ti <= params.timestep; ti += THREADS_PER_BLOCK ) { + const size_t cross_attention_out_offset = + params.is_return_cross_attentions ? + bhi * params.max_decoder_seq_len * params.memory_max_len + params.timestep * params.memory_max_len : + 0; + for (int ti = first_step + tidx; ti <= tlength; ti += THREADS_PER_BLOCK) { + float logit = qk_smem[ti - first_step] * inv_sum; + if (params.is_return_cross_attentions) { + params.cross_attention_out[cross_attention_out_offset + ti] = logit; + } + convert_from_float(logits_smem[ti - first_step], logit); + } + + // Put Values part below so we leverage __syncthreads + // from the previous step + + // The number of elements per vector. + constexpr int V_VEC_SIZE = Dh_MAX / THREADS_PER_VALUE; + // A vector of V elements for the current timestep. + using V_vec = typename V_vec_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + + // The base pointer for the value in the cache buffer. + T* v_cache = ¶ms.v_cache[bhi_kv * params.memory_max_len * Dh + vi]; + // Base pointer for the beam's batch, before offsetting with indirection buffer + T* v_cache_batch = ¶ms.v_cache[bbhi * params.memory_max_len * Dh + vi]; + + // The number of values processed per iteration of the loop. + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + // One group of threads computes the product(s) for the current timestep. + V_vec v_bias; + zero(v_bias); + // if( vo == params.timestep % V_PER_ITER ) { + if (Dh == Dh_MAX || vi < Dh) { + if (handle_kv) { + if (vo == tlength % V_PER_ITER) { + // Trigger the loads from the V bias buffer. + if (params.v_bias != nullptr) { + v_bias = *reinterpret_cast(¶ms.v_bias[hi_kv * Dh + vi]); + } + if (DO_CROSS_ATTENTION) { + *reinterpret_cast(&bias_smem[vi]) = v_bias; + } + } + } + } + + // From previous, before values, step + // Also make sure the logits are in shared memory. + __syncthreads(); + + // Values continued +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + using V_vec_acum = typename V_vec_acum_fp32_::Type; +#else + using V_vec_acum = V_vec; +#endif + // The partial outputs computed by each thread. + V_vec_acum out; + zero(out); + + // Loop over the timesteps to compute the partial outputs. 
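    // Each group of THREADS_PER_VALUE threads owns the V_VEC_SIZE hidden dimensions starting at vi
    // and walks the timesteps with stride V_PER_ITER (vo selects its first timestep), so the block
    // accumulates out[vi] = sum_ti softmax(qk)[ti] * V[ti][vi] as per-group partial sums that are
    // merged across groups in the reduction further below.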
+ // for( int ti = vo; ti < params.timestep; ti += V_PER_ITER ) { + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + const int ti_circ = ti % params.memory_max_len; + + // Fetch offset based on cache_indir when beam sampling + const int beam_src = (params.cache_indir != nullptr) ? params.cache_indir[bi_seq_len_offset + ti_circ] : 0; + const int beam_offset = beam_src * params.num_heads * params.memory_max_len * Dh; + // Load the values from the cache. + V_vec v = *reinterpret_cast(&v_cache_batch[beam_offset + ti_circ * Dh]); + if (DO_CROSS_ATTENTION && params.timestep == 0) { + v = add(v, *reinterpret_cast(&bias_smem[vi])); + if (do_ia3) { + v = mul( + v, + *reinterpret_cast( + ¶ms.ia3_value_weights[(ia3_task_id * params.num_heads + hi) * Dh + vi])); + } + *reinterpret_cast(&v_cache[ti * Dh]) = v; + } + // Load the logits from shared memory. +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + float logit = logits_smem[ti - first_step]; + out = fma(logit, cast_to_float(v), out); +#else + T logit = logits_smem[ti - first_step]; + + // Update the partial sums. + out = fma(logit, v, out); +#endif + } + } + + // One group of threads computes the product(s) for the current timestep. + // if( vo == params.timestep % V_PER_ITER ) { + if (vo == tlength % V_PER_ITER && (Dh == Dh_MAX || vi < Dh)) { + + V_vec v; + if (DO_CROSS_ATTENTION) { + v = *reinterpret_cast(&v_cache[tlength * Dh]); + } + else { + // Trigger the loads from the V buffer. + const auto v_offset = v_base_offset + vi; + if (params.int8_mode == 2) { + using Packed_Int8_t = typename packed_type::value>::type; + using Packed_Float_t = typename packed_type::value>::type; + const auto v_scaling = params.qkv_scale_out[2]; + const auto v_quant = + *reinterpret_cast(&reinterpret_cast(params.v)[v_offset]); + + convert_from_float(v, mul(v_scaling, float_from_int8(v_quant))); + } + else { + v = *reinterpret_cast(¶ms.v[v_offset]); + } + // Trigger the loads from the V bias buffer. + // V_vec v_bias = *reinterpret_cast(¶ms.v_bias[hi*Dh + vi]); + } + + // Compute the V values with bias. + if (handle_kv) { + v = add(v, v_bias); + + if (do_ia3) { + v = mul( + v, + *reinterpret_cast( + ¶ms.ia3_value_weights[(ia3_task_id * params.num_heads + hi) * Dh + vi])); + } + + // Store the values with bias back to global memory in the cache for V. + if (hi % params.num_heads_q_kv_ratio == 0) { + //*reinterpret_cast(&v_cache[params.timestep*Dh]) = v; + *reinterpret_cast(&v_cache[tlength_circ * Dh]) = v; + } + } + + // Initialize the output value with the current timestep. +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + // out = fma(logits_smem[params.timestep], cast_to_float(v), out); + out = fma(logits_smem[tlength - first_step], cast_to_float(v), out); +#else + // out = fma(logits_smem[params.timestep], v, out); + out = fma(logits_smem[tlength - first_step], v, out); +#endif + } + + // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. 
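            // (Tree reduction: at each step the upper half of the remaining groups, vo in
            // [midpoint, active_groups), parks its partial sum in out_smem; after the barrier the
            // lower half adds it to its own partial. After log2(V_PER_ITER) halvings, group
            // vo == 0 holds the fully reduced output vector for this batch/head.)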
+        if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) {
+#ifdef MMHA_USE_FP32_ACUM_FOR_OUT
+            convert_from_float(*reinterpret_cast<V_vec*>(&out_smem[(vo - midpoint) * Dh + vi]), out);
+#else
+            *reinterpret_cast<V_vec*>(&out_smem[(vo - midpoint) * Dh + vi]) = out;
+#endif
+        }
+        __syncthreads();
+
+        // The bottom warps update their values.
+        if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) {
+            out = add(*reinterpret_cast<V_vec*>(&out_smem[vo * Dh + vi]), out);
+        }
+        __syncthreads();
+        }
+    }
+
+    // Output the final values.
+    if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) {
+#ifdef MMHA_USE_FP32_ACUM_FOR_OUT
+        if (params.int8_mode == 2) {
+            using Packed_Int8_t = typename packed_type<int8_t, num_elems<V_vec_acum>::value>::type;
+            out = mul<V_vec_acum, float, V_vec_acum>(*params.attention_out_scale, out);
+            *reinterpret_cast<Packed_Int8_t*>(&(reinterpret_cast<int8_t*>(params.out)[bhi * Dh + vi])) =
+                cast_to_int8(out);
+        }
+        else {
+            convert_from_float(*reinterpret_cast<V_vec*>(&params.out[bhi * Dh + vi]), out);
+        }
+#else
+        // TODO: support int8_mode?
+        *reinterpret_cast<V_vec*>(&params.out[bhi * Dh + vi]) = out;
+#endif
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace mmha
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, typename KERNEL_PARAMS_TYPE>
+void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream);
diff --git a/decoder_masked_multihead_attention_utils.h b/decoder_masked_multihead_attention_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..98875aba9b8c42e53b9a107e28cb785c44f534e5
--- /dev/null
+++ b/decoder_masked_multihead_attention_utils.h
@@ -0,0 +1,2017 @@
+// Downloaded from FasterTransformer v5.2.1
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include "cuda_bf16_wrapper.h" +#include "cuda_bf16_fallbacks.cuh" +#include + +using namespace fastertransformer; + +namespace mmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Float8_ { + float2 x; + float2 y; + float2 z; + float2 w; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Float4_ { + float2 x; + float2 y; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +struct bf16_4_t { + __nv_bfloat162 x; + __nv_bfloat162 y; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct bf16_8_t { + __nv_bfloat162 x; + __nv_bfloat162 y; + __nv_bfloat162 z; + __nv_bfloat162 w; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct num_elems; +template<> +struct num_elems { + static constexpr int value = 1; +}; +template<> +struct num_elems { + static constexpr int value = 2; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 8; +}; + +template<> +struct num_elems { + static constexpr int value = 2; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 8; +}; + +#ifdef ENABLE_BF16 +template<> +struct num_elems<__nv_bfloat162> { + static constexpr int value = 2; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 8; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct packed_type; +template +struct packed_type { + using type = T; +}; +template<> +struct packed_type { + using type = int16_t; +}; +template<> +struct packed_type { + using type = int32_t; +}; +template<> +struct packed_type { + using type = int64_t; +}; + +template<> +struct packed_type { + using type = float2; +}; +template<> +struct packed_type { + using type = float4; +}; +template<> +struct packed_type { + using type = Float8_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float add(float a, float b) +{ + return a + b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 add(float2 a, float2 b) +{ + float2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 add(float4 a, float4 b) +{ + float4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) +{ + return a + b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) +{ + return bf16hadd2(a, b); +} + 
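The `num_elems`/`packed_type` specializations near the top of this header (and most `reinterpret_cast`/`mul` calls in this dump) have had their explicit template arguments stripped by the flattening. A minimal host-side sketch of the trait idiom they implement, with specializations assumed from the visible `value` constants rather than copied verbatim:

```cpp
// Minimal host-side sketch of the num_elems / packed_type trait idiom used above.
// The concrete specializations here are assumptions inferred from the visible
// values (e.g. "value = 2" for a 2-lane vector), not a verbatim copy of the header.
#include <stdint.h>

struct float2_ { float x, y; };          // stand-ins for the CUDA vector types
struct float4_ { float x, y, z, w; };

template<typename T> struct num_elems          { static constexpr int value = 1; };
template<>           struct num_elems<float2_> { static constexpr int value = 2; };
template<>           struct num_elems<float4_> { static constexpr int value = 4; };

// packed_type<T, N>::type is "N lanes of T packed into one register-sized value".
template<typename T, int N> struct packed_type;
template<typename T> struct packed_type<T, 1>      { using type = T; };
template<>           struct packed_type<int8_t, 2> { using type = int16_t; };
template<>           struct packed_type<int8_t, 4> { using type = int32_t; };
template<>           struct packed_type<float, 2>  { using type = float2_; };
template<>           struct packed_type<float, 4>  { using type = float4_; };

static_assert(num_elems<float4_>::value == 4, "four float lanes");
static_assert(sizeof(packed_type<int8_t, 4>::type) == 4, "four int8 lanes fit one int32_t");

int main() { return 0; }
```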
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) +{ + bf16_4_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t add(bf16_8_t a, bf16_8_t b) +{ + bf16_8_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint16_t add(uint16_t a, uint16_t b) +{ + uint16_t c; + asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t add(uint32_t a, uint32_t b) +{ + uint32_t c; + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint2 add(uint2 a, uint2 b) +{ + uint2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint4 add(uint4 a, uint4 b) +{ + uint4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint16_t float_to_half(float f) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#if 0 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // Is it better? 
+ float zero = 0.f; + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f)); +#else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f)); +#endif + return tmp.u16[0]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t float2_to_half2(float2 f) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x)); +#else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); +#endif + return tmp.u32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float half_to_float(uint16_t h) +{ + float f; + asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); + return f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 half2_to_float2(uint32_t v) +{ + uint16_t lo, hi; + asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); + return make_float2(half_to_float(lo), half_to_float(hi)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float add(float a, uint16_t b) +{ + return a + half_to_float(b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ float add(float a, __nv_bfloat16 b) +{ + return a + __bfloat162float(b); +} +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 add(uint32_t a, float2 fb) +{ + float2 fa = half2_to_float2(a); + return add(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ add(uint2 a, Float4_ fb) +{ + Float4_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ add(uint4 a, Float8_ fb) +{ + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t h0_h0(uint16_t a) +{ + uint32_t b; + asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); + return b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(float a, float b, float c) +{ + return a * b + c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(float2 a, float2 b, float2 c) +{ + float2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(float a, float2 b, float2 c) +{ + float2 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ 
float4 fma(float4 a, float4 b, float4 c) +{ + float4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 fma(float a, float4 b, float4 c) +{ + float4 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) +{ + Float4_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) +{ + Float8_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ float2 add(__nv_bfloat162 a, float2 fb) +{ + float2 fa = bf1622float2(a); + return add(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ add(bf16_4_t a, Float4_ fb) +{ + Float4_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) +{ + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t d; + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) +{ + return fma(h0_h0(a), b, c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) +{ + uint2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) +{ + uint32_t s = h0_h0(a); + uint2 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) +{ + uint4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) +{ + uint32_t s = h0_h0(a); + uint4 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + 
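The `uint16_t`/`uint32_t`/`uint2`/`uint4` overloads above treat their arguments as packed fp16 payloads and lean on inline PTX (`fma.rn.f16x2`, with `h0_h0` broadcasting a scalar half into both lanes). A sketch of the same scalar-times-packed-pair FMA written with `cuda_fp16.h` intrinsics instead, purely for readability (the helper name is hypothetical and not part of this header):

```cuda
#include <cuda_fp16.h>

// Sketch only: the header keeps halves in uint16_t/uint32_t and uses raw PTX; this
// hypothetical helper expresses the same operation with standard intrinsics.
__device__ __half2 fma_scalar_half2(__half a, __half2 b, __half2 c)
{
    // __half2half2(a) duplicates a into both lanes, like h0_h0(); __hfma2 is the
    // per-lane fused multiply-add, like the fma.rn.f16x2 instruction above.
    return __hfma2(__half2half2(a), b, c);
}
```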
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(uint16_t a, uint16_t b, float fc) +{ + float fa = half_to_float(a); + float fb = half_to_float(b); + return fa * fb + fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(uint32_t a, uint32_t b, float2 fc) +{ + float2 fa = half2_to_float2(a); + float2 fb = half2_to_float2(b); + return fma(fa, fb, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(uint16_t a, uint32_t b, float2 fc) +{ + return fma(h0_h0(a), b, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(uint2 a, uint2 b, Float4_ fc) +{ + Float4_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(uint16_t a, uint2 b, Float4_ fc) +{ + uint32_t s = h0_h0(a); + Float4_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(uint4 a, uint4 b, Float8_ fc) +{ + Float8_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + fd.z = fma(a.z, b.z, fc.z); + fd.w = fma(a.w, b.w, fc.w); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) +{ + uint32_t s = h0_h0(a); + Float8_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + fd.z = fma(s, b.z, fc.z); + fd.w = fma(s, b.w, fc.w); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) +{ + return bf16hfma2(a, b, c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) +{ + return bf16hfma2(bf162bf162(a), b, c); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) +{ + bf16_4_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t fma(__nv_bfloat16 a, bf16_4_t b, bf16_4_t c) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t fma(bf16_8_t a, bf16_8_t b, bf16_8_t c) +{ + bf16_8_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t fma(__nv_bfloat16 a, bf16_8_t b, bf16_8_t c) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t d; + d.x = 
fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(__nv_bfloat16 a, __nv_bfloat16 b, float fc) +{ + return __bfloat162float(a) * __bfloat162float(b) + fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(__nv_bfloat162 a, __nv_bfloat162 b, float2 fc) +{ + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return fma(fa, fb, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(__nv_bfloat16 a, __nv_bfloat162 b, float2 fc) +{ + return fma(bf162bf162(a), b, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(bf16_4_t a, bf16_4_t b, Float4_ fc) +{ + Float4_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(__nv_bfloat16 a, bf16_4_t b, Float4_ fc) +{ + __nv_bfloat162 s = bf162bf162(a); + Float4_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(bf16_8_t a, bf16_8_t b, Float8_ fc) +{ + Float8_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + fd.z = fma(a.z, b.z, fc.z); + fd.w = fma(a.w, b.w, fc.w); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) +{ + __nv_bfloat162 s = bf162bf162(a); + Float8_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + fd.z = fma(s, b.z, fc.z); + fd.w = fma(s, b.w, fc.w); + return fd; +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ Acc mul(A a, B b) +{ + return a * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(float a, float b) +{ + return a * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(float2 a, float2 b) +{ + float2 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(float a, float2 b) +{ + float2 c; + c.x = a * b.x; + c.y = a * b.y; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float4 mul(float4 a, float4 b) +{ + float4 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float4 mul(float a, float4 b) +{ + float4 c; + c.x = a * b.x; + c.y = a * b.y; + c.z = a * b.z; + c.w = a * b.w; + return c; +} + 
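Throughout this dump the explicit template arguments of `mul<...>` were stripped; upstream, the first template parameter selects the accumulator/return type so callers can request a widened result. A small host-side sketch of that calling convention, using stand-in types (an illustration of the pattern, not the header's exact code):

```cpp
// The first template argument of mul<Acc, A, B> picks the accumulator/return type,
// e.g. a float2 product of a scalar and a packed pair. Stand-in types and the
// explicit <...> arguments are assumptions for illustration.
#include <cstdio>

struct float2_ { float x, y; };   // host stand-in for CUDA's float2

template<typename Acc, typename A, typename B>
Acc mul(A a, B b);

template<>
float2_ mul<float2_, float, float2_>(float a, float2_ b)
{
    return {a * b.x, a * b.y};    // lane-wise scale
}

int main()
{
    float2_ r = mul<float2_, float, float2_>(0.5f, {2.0f, 4.0f});
    std::printf("%g %g\n", r.x, r.y);   // prints: 1 2
    return 0;
}
```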
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(float a, Float8_ b) +{ + Float8_ c; + c.x = make_float2(a * b.x.x, a * b.x.y); + c.y = make_float2(a * b.y.x, a * b.y.y); + c.z = make_float2(a * b.z.x, a * b.z.y); + c.w = make_float2(a * b.w.x, a * b.w.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint16_t mul(uint16_t a, uint16_t b) +{ + uint16_t c; + asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint32_t mul(uint32_t a, uint32_t b) +{ + uint32_t c; + asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint32_t mul(uint16_t a, uint32_t b) +{ + return mul(h0_h0(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint2 mul(uint2 a, uint2 b) +{ + uint2 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint2 mul(uint16_t a, uint2 b) +{ + uint32_t s = h0_h0(a); + uint2 c; + c.x = mul(s, b.x); + c.y = mul(s, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint4 mul(uint4 a, uint4 b) +{ + uint4 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + c.z = mul(a.z, b.z); + c.w = mul(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint4 mul(uint16_t a, uint4 b) +{ + uint32_t s = h0_h0(a); + uint4 c; + c.x = mul(s, b.x); + c.y = mul(s, b.y); + c.z = mul(s, b.z); + c.w = mul(s, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(uint16_t a, uint16_t b) +{ + float fa = half_to_float(a); + float fb = half_to_float(b); + return fa * fb; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(uint16_t a, float b) +{ + return half_to_float(a) * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(uint32_t a, uint32_t b) +{ + float2 fa = half2_to_float2(a); + float2 fb = half2_to_float2(b); + return mul(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(uint16_t a, uint32_t b) +{ + return mul(h0_h0(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(uint2 a, uint2 b) +{ + Float4_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ 
mul(uint16_t a, uint2 b) +{ + uint32_t s = h0_h0(a); + Float4_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(uint4 a, uint4 b) +{ + Float8_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + fc.z = mul(a.z, b.z); + fc.w = mul(a.w, b.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(uint16_t a, uint4 b) +{ + uint32_t s = h0_h0(a); + Float8_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + fc.z = mul(s, b.z); + fc.w = mul(s, b.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +template<> +inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __hmul(a, b); +#else + return bf16hmul(a, b); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) +{ + return bf16hmul2(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) +{ + return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) +{ + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) +{ + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.z, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) +{ + float fa = (float)a; + float fb = (float)b; + return 
fa * fb; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(__nv_bfloat16 a, float b) +{ + return __bfloat162float(a) * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) +{ + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return mul(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) +{ + return mul(bf162bf162(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) +{ + Float4_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + Float4_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) +{ + Float8_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + fc.z = mul(a.z, b.z); + fc.w = mul(a.w, b.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + Float8_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + fc.z = mul(s, b.z); + fc.w = mul(s, b.w); + return fc; +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float v) +{ + return v; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float2 v) +{ + return v.x + v.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float4 v) +{ + return v.x + v.y + v.z + v.w; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ float sum(__nv_bfloat162 v) +{ + float2 vf = bf1622float2(v); + return vf.x + vf.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(bf16_4_t v) +{ + return sum(v.x) + sum(v.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(bf16_8_t v) +{ + return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w); +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint16_t v) +{ + return half_to_float(v); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint32_t v) +{ + float2 tmp = half2_to_float2(v); + return tmp.x + tmp.y; +} + 
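The `sum()` overloads above and the `dot()` templates defined just below compose as dot(a, b) = sum(mul(a, b)): a lane-wise product followed by a horizontal reduction to a single float, which is how each thread's piece of the Q*K^T product is formed. A tiny host-side sketch of that composition (stand-in names, illustration only):

```cpp
// dot(a, b) == sum(mul(a, b)): multiply lane-wise, then reduce to one float.
// float4_ and the helper names are stand-ins for illustration.
#include <cstdio>

struct float4_ { float x, y, z, w; };

static float4_ mul4(float4_ a, float4_ b) { return {a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w}; }
static float   sum4(float4_ v)            { return v.x + v.y + v.z + v.w; }
static float   dot4(float4_ a, float4_ b) { return sum4(mul4(a, b)); }

int main()
{
    float4_ q{1.f, 2.f, 3.f, 4.f}, k{4.f, 3.f, 2.f, 1.f};
    std::printf("%g\n", dot4(q, k));   // 1*4 + 2*3 + 3*2 + 4*1 = 20
    return 0;
}
```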
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint2 v) +{ + uint32_t c = add(v.x, v.y); + return sum(c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint4 v) +{ +#if 1 + uint32_t c = add(v.x, v.y); + c = add(c, v.z); + c = add(c, v.w); +#else + uint32_t c = add(v.x, v.y); + uint32_t d = add(v.z, v.w); + c = add(c, d); +#endif + return sum(c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(Float4_ v) +{ + return v.x.x + v.x.y + v.y.x + v.y.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(Float8_ v) +{ + return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float dot(T a, T b) +{ + return sum(mul(a, b)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float dot(T a, T b) +{ + return sum(mul(a, b)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void zero(uint16_t& dst) +{ + dst = uint16_t(0); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void zero(T& dst) +{ + constexpr int WORDS = sizeof(T) / 4; + union { + T raw; + uint32_t words[WORDS]; + } tmp; +#pragma unroll + for (int ii = 0; ii < WORDS; ++ii) { + tmp.words[ii] = 0u; + } + dst = tmp.raw; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 rotary_embedding_coefficient(const int zid, const int rot_embed_dim, const int t_step, const float base) +{ + const float pos_idx_inv_freq = t_step / pow(base, zid / (float)rot_embed_dim); + return {cos(pos_idx_inv_freq), sin(pos_idx_inv_freq)}; +} + +inline __device__ float2 rotary_embedding_transform(const float2 v, const float2 coef) +{ + float2 rot_v; + rot_v.x = coef.x * v.x - coef.y * v.y; + rot_v.y = coef.x * v.y + coef.y * v.x; + return rot_v; +} + +inline __device__ uint32_t rotary_embedding_transform(const uint32_t v, const float2 coef) +{ + float2 fv = half2_to_float2(v); + float2 rot_fv = rotary_embedding_transform(fv, coef); + return float2_to_half2(rot_fv); +} + +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat162 rotary_embedding_transform(const __nv_bfloat162 v, const float2 coef) +{ + float2 fv = bf1622float2(v); + float2 rot_fv = rotary_embedding_transform(fv, coef); + return __floats2bfloat162_rn(rot_fv.x, rot_fv.y); +} +#endif + +inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float& q, float& k, int zid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); +} + +inline 
__device__ void apply_rotary_embedding(float2& q, float2& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q_.x = rotary_embedding_transform(q_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q_.y = rotary_embedding_transform(q_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + Float4_& k_ = *reinterpret_cast(&k); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q_.x = rotary_embedding_transform(q_.x, coef0); + k_.x = rotary_embedding_transform(k_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q_.y = rotary_embedding_transform(q_.y, coef1); + k_.y = rotary_embedding_transform(k_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint2& q, uint2& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = 
rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} + +#ifdef ENABLE_BF16 +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, 
coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} +#endif // ENABLE_BF16 + +template +inline __device__ float2 rotary_embedding_coefficient(const int zid, const int t_step, const T* rotary_cos, const T* rotary_sin) +{ + // zid is the index of the dimension (0, 2, 4, ..., rotary_dim). + // rotary_cos/sin stores those at index 0, 1, 2, ..., rotary_dim / 2. + return {float(rotary_cos[zid / 2]), float(rotary_sin[zid / 2])}; +} + +// fp16 is special because we use uint16_t for reading the data, for backward compatibility. +template <> +inline __device__ float2 rotary_embedding_coefficient(const int zid, const int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + // zid is the index of the dimension (0, 2, 4, ..., rotary_dim). + // rotary_cos/sin stores those at index 0, 1, 2, ..., rotary_dim / 2. 
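    // (The on-the-fly overloads above rotate each (even, odd) pair of the head dimension by
    //  theta = t_step * base^(-zid / rot_embed_dim), i.e. x' = x*cos(theta) - y*sin(theta),
    //  y' = y*cos(theta) + x*sin(theta). The table-based overloads below are identical except
    //  that cos(theta) and sin(theta) are read from the precomputed rotary_cos / rotary_sin
    //  buffers instead of being recomputed from pow/cos/sin.)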
+ return {float(reinterpret_cast(rotary_cos)[zid / 2]), + float(reinterpret_cast(rotary_sin)[zid / 2])}; +} + +inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float& q, float& k, int zid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(float2& q, float2& k, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q_.x = rotary_embedding_transform(q_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q_.y = rotary_embedding_transform(q_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + Float4_& k_ = *reinterpret_cast(&k); + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q_.x = rotary_embedding_transform(q_.x, coef0); + k_.x = rotary_embedding_transform(k_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q_.y = rotary_embedding_transform(q_.y, coef1); + k_.y = rotary_embedding_transform(k_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, 
t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint2& q, uint2& k, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} + +#ifdef ENABLE_BF16 +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, 
rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} +#endif // ENABLE_BF16 + +template +__device__ __inline__ void vec_from_smem_transpose(Vec_T& vec, T* smem, int transpose_idx, int smem_pitch); + +template<> +__device__ __inline__ void vec_from_smem_transpose(float& vec, float* smem, int transpose_idx, int smem_pitch) +{ + return; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint32_t& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u16[0] = smem[transpose_idx]; + tmp.u16[1] = smem[smem_pitch + transpose_idx]; + + vec = tmp.u32; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint2& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp_1, tmp_2; + tmp_1.u32 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u32 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + union { + uint2 u32x2; + uint16_t 
u16[4]; + } tmp_3; + tmp_3.u16[0] = tmp_1.u16[0]; + tmp_3.u16[1] = tmp_2.u16[0]; + tmp_3.u16[2] = tmp_1.u16[1]; + tmp_3.u16[3] = tmp_2.u16[1]; + + vec = tmp_3.u32x2; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint4& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint64_t u64; + uint16_t u16[4]; + } tmp_1, tmp_2; + tmp_1.u64 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u64 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + union { + uint4 u32x4; + uint16_t u16[8]; + } tmp_3; + tmp_3.u16[0] = tmp_1.u16[0]; + tmp_3.u16[1] = tmp_2.u16[0]; + tmp_3.u16[2] = tmp_1.u16[1]; + tmp_3.u16[3] = tmp_2.u16[1]; + tmp_3.u16[4] = tmp_1.u16[2]; + tmp_3.u16[5] = tmp_2.u16[2]; + tmp_3.u16[6] = tmp_1.u16[3]; + tmp_3.u16[7] = tmp_2.u16[3]; + + vec = tmp_3.u32x4; +} + +#ifdef ENABLE_BF16 +template<> +__device__ __inline__ void +vec_from_smem_transpose(bf16_4_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + __nv_bfloat16 bf16[2]; + } tmp_1, tmp_2; + tmp_1.u32 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u32 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + vec.x = __nv_bfloat162{tmp_1.bf16[0], tmp_2.bf16[0]}; + vec.y = __nv_bfloat162{tmp_1.bf16[1], tmp_2.bf16[1]}; +} + +template<> +__device__ __inline__ void +vec_from_smem_transpose(bf16_8_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + union { + uint64_t u64; + __nv_bfloat16 bf16[4]; + } tmp_1, tmp_2; + tmp_1.u64 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u64 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + vec.x = __nv_bfloat162{tmp_1.bf16[0], tmp_2.bf16[0]}; + vec.y = __nv_bfloat162{tmp_1.bf16[1], tmp_2.bf16[1]}; + vec.z = __nv_bfloat162{tmp_1.bf16[2], tmp_2.bf16[2]}; + vec.w = __nv_bfloat162{tmp_1.bf16[3], tmp_2.bf16[3]}; +} +#endif // ENABLE_BF16 + +template<> +__device__ __inline__ void vec_from_smem_transpose(float4& vec, float* smem, int transpose_idx, int smem_pitch) +{ + vec.x = smem[transpose_idx]; + vec.z = smem[transpose_idx + 1]; + vec.y = smem[smem_pitch + transpose_idx]; + vec.w = smem[smem_pitch + transpose_idx + 1]; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint32_t& vec, half* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + half u16[2]; + } tmp; + tmp.u16[0] = smem[transpose_idx]; + tmp.u16[1] = smem[smem_pitch + transpose_idx]; + + vec = tmp.u32; +} + +#ifdef ENABLE_BF16 +template<> +__device__ __inline__ void +vec_from_smem_transpose(__nv_bfloat162& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + vec.x = smem[transpose_idx]; + vec.y = smem[smem_pitch + transpose_idx]; +} +#endif + +template<> +__device__ __inline__ void vec_from_smem_transpose(float2& vec, float* smem, int transpose_idx, int smem_pitch) +{ + vec.x = smem[transpose_idx]; + vec.y = smem[smem_pitch + transpose_idx]; +} + +template +__device__ __inline__ void write_smem_transpose(const Vec_T& vec, T* smem, int transpose_idx, int smem_pitch); + +template<> +__device__ __inline__ void write_smem_transpose(const float& vec, float* smem, int transpose_idx, int smem_pitch) +{ + return; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint4& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint64_t u64; + uint16_t u16[4]; + } tmp_1, tmp_2; + + union { + uint4 u32x4; + uint16_t u16[8]; + } tmp_3; + tmp_3.u32x4 = vec; + tmp_1.u16[0] = tmp_3.u16[0]; + tmp_2.u16[0] = tmp_3.u16[1]; + tmp_1.u16[1] = tmp_3.u16[2]; + 
tmp_2.u16[1] = tmp_3.u16[3]; + tmp_1.u16[2] = tmp_3.u16[4]; + tmp_2.u16[2] = tmp_3.u16[5]; + tmp_1.u16[3] = tmp_3.u16[6]; + tmp_2.u16[3] = tmp_3.u16[7]; + + *reinterpret_cast(&smem[transpose_idx]) = tmp_1.u64; + *reinterpret_cast(&smem[smem_pitch + transpose_idx]) = tmp_2.u64; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint2& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp_1, tmp_2; + + union { + uint2 u32x2; + uint16_t u16[4]; + } tmp_3; + tmp_3.u32x2 = vec; + tmp_1.u16[0] = tmp_3.u16[0]; + tmp_2.u16[0] = tmp_3.u16[1]; + tmp_1.u16[1] = tmp_3.u16[2]; + tmp_2.u16[1] = tmp_3.u16[3]; + + *reinterpret_cast(&smem[transpose_idx]) = tmp_1.u32; + *reinterpret_cast(&smem[smem_pitch + transpose_idx]) = tmp_2.u32; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint32_t& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u32 = vec; + + smem[transpose_idx] = tmp.u16[0]; + smem[smem_pitch + transpose_idx] = tmp.u16[1]; +} + +template<> +__device__ __inline__ void write_smem_transpose(const float4& vec, float* smem, int transpose_idx, int smem_pitch) +{ + smem[transpose_idx] = vec.x; + smem[transpose_idx + 1] = vec.z; + smem[smem_pitch + transpose_idx] = vec.y; + smem[smem_pitch + transpose_idx + 1] = vec.w; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint32_t& vec, half* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + half u16[2]; + } tmp; + + tmp.u32 = vec; + smem[transpose_idx] = tmp.u16[0]; + smem[smem_pitch + transpose_idx] = tmp.u16[1]; +} + +#ifdef ENABLE_BF16 +template<> +__device__ __inline__ void +write_smem_transpose(const __nv_bfloat162& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + smem[transpose_idx] = vec.x; + smem[smem_pitch + transpose_idx] = vec.y; +} + +template<> +__device__ __inline__ void +write_smem_transpose(const bf16_4_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + write_smem_transpose(reinterpret_cast(vec), reinterpret_cast(smem), transpose_idx, smem_pitch); +} + +template<> +__device__ __inline__ void +write_smem_transpose(const bf16_8_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + write_smem_transpose(reinterpret_cast(vec), reinterpret_cast(smem), transpose_idx, smem_pitch); +} +#endif + +template<> +__device__ __inline__ void write_smem_transpose(const float2& vec, float* smem, int transpose_idx, int smem_pitch) +{ + smem[transpose_idx] = vec.x; + smem[smem_pitch + transpose_idx] = vec.y; +} + +} // namespace mmha diff --git a/default.yaml b/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09b2aebd54031b6e080638cbed3605d0ef3fe83e --- /dev/null +++ b/default.yaml @@ -0,0 +1,45 @@ +# rich_progress_bar: +# _target_: pytorch_lightning.callbacks.RichProgressBar + +rich_model_summary: + _target_: pytorch_lightning.callbacks.RichModelSummary + +model_checkpoint: + _target_: pytorch_lightning.callbacks.ModelCheckpoint + monitor: "val/acc" # name of the logged metric which determines when model is improving + mode: "max" # can be "max" or "min" + save_top_k: 1 # save k best models (determined by above metric) + save_last: True # additionally always save model from last epoch + verbose: False + dirpath: ${oc.env:CHECKPOINT_DIR,checkpoints}/${oc.select:name,''} + filename: "epoch_{epoch:03d}" + auto_insert_metric_name: False + +early_stopping: + _target_: 
pytorch_lightning.callbacks.EarlyStopping + monitor: "val/acc" # name of the logged metric which determines when model is improving + mode: "max" # can be "max" or "min" + patience: 100 # how many epochs of not improving until training stops + min_delta: 0 # minimum change in the monitored metric needed to qualify as an improvement + +learning_rate_monitor: + _target_: pytorch_lightning.callbacks.LearningRateMonitor + logging_interval: step + +speed_monitor: + _target_: src.callbacks.speed_monitor.SpeedMonitor + intra_step_time: True + inter_step_time: True + epoch_time: True + +loss_scale_monitor: + _target_: src.callbacks.loss_scale_monitor.LossScaleMonitor + +params_log: + _target_: src.callbacks.params_log.ParamsLog + total_params_log: True + trainable_params_log: True + non_trainable_params_log: True + +gpu_affinity: + _target_: src.callbacks.gpu_affinity.GpuAffinity diff --git a/distributed.py b/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..74c55279645cd0fd687584bc1b7374c8c3c73e56 --- /dev/null +++ b/distributed.py @@ -0,0 +1,144 @@ +from typing import Optional + +import torch +from torch import Tensor +from torch.distributed import ProcessGroup + +# `all_gather_into_tensor` and `reduce_scatter_tensor` are new placeholders for +# `_all_gather_base` and `_reduce_scatter_base`. They require the most recent +# version of PyTorch. The following 4 lines are for backward compatibility with +# older PyTorch. +if "all_gather_into_tensor" not in dir(torch.distributed): + torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base +if "reduce_scatter_tensor" not in dir(torch.distributed): + torch.distributed.reduce_scatter_tensor = torch.distributed._reduce_scatter_base + + +# Raw operation, does not support autograd, but does support async +def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + output = torch.empty( + world_size * input_.shape[0], *input_.shape[1:], dtype=input_.dtype, device=input_.device + ) + handle = torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) + return output, handle + + +# Raw operation, does not support autograd, but does support async +def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + ) + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) + return output, handle + + +# Raw operation, does not support autograd, but does support async +def all_reduce_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + input_ = input_.contiguous() + handle = torch.distributed.all_reduce(input_, group=process_group, async_op=async_op) + return input_, handle + + +class AllGatherFunc(torch.autograd.Function): + """Gather the input from sequence parallel region and concatenate.""" + + @staticmethod + def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor: + ctx.process_group = process_group + output, _ = all_gather_raw(input_, process_group) + return output + + @staticmethod + def backward(ctx, grad_output: Tensor): + grad_input, _ = reduce_scatter_raw(grad_output, ctx.process_group) + return 
grad_input, None + + +# Supports autograd, but does not support async +all_gather = AllGatherFunc.apply + + +class ReduceScatterFunc(torch.autograd.Function): + """Reduce scatter the input from the sequence parallel region and concatenate.""" + + @staticmethod + def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor: + ctx.process_group = process_group + output, _ = reduce_scatter_raw(input_, process_group) + return output + + @staticmethod + def backward(ctx, grad_output: Tensor): + grad_input, _ = all_gather_raw(grad_output, ctx.process_group) + return grad_input, None + + +# Supports autograd, but does not support async +reduce_scatter = ReduceScatterFunc.apply + + +class AllReduceFunc(torch.autograd.Function): + """Gather the input from sequence parallel region and concatenate.""" + + @staticmethod + def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor: + ctx.process_group = process_group + output, _ = all_reduce_raw(input_, process_group) + return output + + @staticmethod + def backward(ctx, grad_output: Tensor): + return grad_output, None + + +# Supports autograd, but does not support async +all_reduce = AllReduceFunc.apply + + +def sync_shared_params(model: torch.nn.Module, process_group: ProcessGroup): + # We want to iterate over parameters with _shared_params=True in the same order, + # as different ranks might have different number of parameters (e.g., only rank 0 has bias). + pamams_shared = { + name: p for name, p in model.named_parameters() if getattr(p, "_shared_params", False) + } + for _, p in sorted(pamams_shared.items()): + with torch.no_grad(): + # Broadcast needs src to be global rank, not group rank + torch.distributed.broadcast( + p, src=torch.distributed.get_global_rank(process_group, 0), group=process_group + ) + + +# Ref: https://github.com/NVIDIA/Megatron-LM/blob/52e636888cccc41e931251c417a7181fc36de926/megatron/optimizer/optimizer.py#L256 +def allreduce_sequence_parallel_grad(model: torch.nn.Module, process_group: ProcessGroup): + # We want to iterate over parameters with _sequence_parallel=True in the same order, + # as different ranks might have different number of parameters (e.g., only rank 0 has bias). + params_seqparallel = { + name: p for name, p in model.named_parameters() if getattr(p, "_sequence_parallel", False) + } + grads = [p.grad for _, p in sorted(params_seqparallel.items())] + if grads: + with torch.no_grad(): + coalesced = torch._utils._flatten_dense_tensors(grads) + torch.distributed.all_reduce(coalesced, group=process_group) + for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + +def get_dim_for_local_rank(dim: int, world_size: int, local_rank: int, multiple_of: int = 1) -> int: + """Get the dim for the local rank derived from splitting dim on world_size processes. + + The split may not be even across the world_size processes. + """ + multiple = dim // multiple_of + div = multiple // world_size + mod = multiple % world_size + local_multiple = div + int(local_rank < mod) + return local_multiple * multiple_of diff --git a/dropout.h b/dropout.h new file mode 100644 index 0000000000000000000000000000000000000000..4882f97d93832debd4fa51bf3a1d3f4eae916fc5 --- /dev/null +++ b/dropout.h @@ -0,0 +1,94 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "philox.cuh" +#include "utils.h" + +namespace flash { + +struct Dropout { + + const unsigned long long seed, offset; + const uint8_t p_dropout_in_uint8_t; + + __forceinline__ __device__ Dropout(const unsigned long long seed, const unsigned long long offset, + const uint8_t p_dropout_in_uint8_t, + const int bid, const int hid, const int tid, const int nheads) + : seed(seed) + , offset(offset + (bid * nheads + hid) * 32 + tid % 32) + , p_dropout_in_uint8_t(p_dropout_in_uint8_t) { + } + + template + __forceinline__ __device__ void apply_dropout(Tensor &tensor_, + int block_row_start, int block_col_start, int block_row_stride) { + // convert shape from (4, MMA_M, MMA_N) to (8, MMA_M, MMA_N / 2) + Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_dropout(tensor_.layout())); + using T = typename Engine::value_type; + auto encode_dropout = [](bool keep, T val) { + return keep ? val : (encode_dropout_in_sign_bit ? -val : T(0)); + }; + static_assert(decltype(size<2>(tensor))::value % 2 == 0); + const uint16_t p_dropout_8bit_in_uint16_t = uint16_t(p_dropout_in_uint8_t); + const uint32_t p_dropout_8bit_in_uint32_t = (uint32_t(p_dropout_8bit_in_uint16_t) << 16) | uint32_t(p_dropout_8bit_in_uint16_t); + // if (cute::thread0()) { printf("threshold2 = 0x%x\n", p_dropout_8bit_in_uint32_t); } + #pragma unroll + for (int m = 0; m < size<1>(tensor); ++m, block_row_start += block_row_stride) { + uint2 rowcol = make_uint2(block_row_start, block_col_start); + #pragma unroll + for (int n = 0; n < size<2>(tensor) / 2; ++n, ++rowcol.y) { + // if (cute::thread(32, 0)) { printf("m = %d, n = %d, row = %d, col = %d\n", m, n, int(rowcol.x), int(rowcol.y));} + uint4 random_uint4 = flash::philox(seed, reinterpret_cast(rowcol), offset); + // if (cute::thread0()) { printf("philox = %u, %d, %d, %d\n", random_uint4.x, random_uint4.y, random_uint4.z, random_uint4.w);} + uint8_t (&rnd_8)[16] = reinterpret_cast(random_uint4); + // Special implementation for 16-bit types: we duplicate the threshold to the + // low and high 16 bits of a 32-bit value, then use the f16x2 comparison instruction + // to get a mask. The low 16 bits of the mask will be either 0xffff or 0x0000, + // and the high 16 bits will be either 0xffff or 0x0000, depending on whether + // the random value is less than the threshold. + // We then do a bit-wise AND between the mask and the original value (in 32-bit). + // We're exploiting the fact that floating point comparison is equivalent to integer + // comparison, since we're comparing unsigned integers whose top 8-bits are zero. 
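+            // Added commentary (hedged, not in the original source): as a concrete example,
+            // suppose the keep-probability is encoded as the byte 204 (~0.8 * 255). A random
+            // byte 37 widens to the bit pattern 0x0025; "set.le.u32.f16x2" against the
+            // duplicated threshold 0x00CC yields a 0xFFFF half-mask, so the AND below keeps
+            // that element. A byte like 240 (0x00F0 > 0x00CC) yields 0x0000 and the element
+            // is dropped. The generic path further down compares the raw bytes directly.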
+ if (!encode_dropout_in_sign_bit + && (std::is_same::value || std::is_same::value)) { + uint16_t rnd_16[16]; + #pragma unroll + for (int i = 0; i < 16; i++) { rnd_16[i] = uint16_t(rnd_8[i]); } + uint32_t (&rnd_32)[8] = reinterpret_cast(rnd_16); + #pragma unroll + for (int j = 0; j < 2; j++) { + Tensor tensor_uint32 = recast(tensor(_, m, n * 2 + j)); + // if (cute::thread0()) { printf("random = 0x%x, 0x%x, 0x%x, 0x%x\n", rnd_32[j * 4 + 0], rnd_32[j * 4 + 1], rnd_32[j * 4 + 2], rnd_32[j * 4 + 3]); } + // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); } + #pragma unroll + for (int i = 0; i < 4; i++) { + uint32_t mask; + asm volatile("set.le.u32.f16x2 %0, %1, %2;\n" : "=r"(mask) : "r"(rnd_32[j * 4 + i]), "r"(p_dropout_8bit_in_uint32_t)); + tensor_uint32(i) &= mask; + } + // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); } + } + } else { + #pragma unroll + for (int j = 0; j < 2; j++) { + #pragma unroll + for (int i = 0; i < 8; i++) { + tensor(i, m, n * 2 + j) = encode_dropout(rnd_8[j * 8 + i] <= p_dropout_in_uint8_t, tensor(i, m, n * 2 + j)); + } + Tensor tensor_uint32 = recast(tensor(_, m, n * 2 + j)); + // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); } + } + } + // // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // // printf("n = %d, ph Philox: %u, %u, %u, %u\n", n, rnd_8.x, rnd_8.y, rnd_8.z, rnd_8.w); + // // } + } + } + } + +}; + +} // namespace flash diff --git a/ema.yaml b/ema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5586db26b4ff8e8484dafc89fd47716f75ca4c4 --- /dev/null +++ b/ema.yaml @@ -0,0 +1,4 @@ +ema: + _target_: src.callbacks.ema.EMACallback + decay: ??? + use_num_updates: False diff --git a/embedding.py b/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..33587d09413dbab5edccfa3806fca829a6f9f9da --- /dev/null +++ b/embedding.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022, Tri Dao. 
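+#
+# Hedged usage sketch (added commentary; the concrete sizes are assumptions, only the
+# class and argument names come from this file):
+#
+#     emb = GPT2Embeddings(embed_dim=768, vocab_size=50257, max_position_embeddings=1024)
+#     hidden_states = emb(input_ids)        # (batch, seqlen, 768)
+#
+# When word_embed_proj_dim is given (OPT-350m style), tokens are first embedded to that
+# smaller dimension and then projected up to embed_dim by a bias-free nn.Linear.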
+ +import torch +import torch.nn as nn +from einops import rearrange +from torch import Tensor + +from flash_attn.utils.distributed import all_reduce, reduce_scatter + + +class GPT2Embeddings(nn.Module): + def __init__( + self, + embed_dim, + vocab_size, + max_position_embeddings, + padding_idx=None, + word_embed_proj_dim=None, + device=None, + dtype=None, + ): + """ + If max_position_embeddings <= 0, there's no position embeddings + If word_embe_proj_dim is not None (e.g., OPT-350m), we embed to that dimension + the project up to embed_dim + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + if word_embed_proj_dim is None: + self.word_embeddings = nn.Embedding( + vocab_size, embed_dim, padding_idx=padding_idx, **factory_kwargs + ) + self.project_in = None + else: + self.word_embeddings = nn.Embedding( + vocab_size, word_embed_proj_dim, padding_idx=padding_idx, **factory_kwargs + ) + self.project_in = nn.Linear( + word_embed_proj_dim, embed_dim, bias=False, **factory_kwargs + ) + self.max_position_embeddings = max_position_embeddings + if self.max_position_embeddings > 0: + self.position_embeddings = nn.Embedding( + max_position_embeddings, embed_dim, **factory_kwargs + ) + + def forward(self, input_ids, position_ids=None): + """ + input_ids: (batch, seqlen) + position_ids: (batch, seqlen) + """ + batch_size, seqlen = input_ids.shape + embeddings = self.word_embeddings(input_ids) + if self.project_in is not None: + embeddings = self.project_in(embeddings) + if self.max_position_embeddings > 0: + if position_ids is None: + position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + return embeddings + + +class BertEmbeddings(nn.Module): + def __init__( + self, + embed_dim, + vocab_size, + max_position_embeddings, + type_vocab_size, + padding_idx=None, + device=None, + dtype=None, + ): + """ + If max_position_embeddings <= 0, there's no position embeddings + If type_vocab_size <= 0, there's no token type embeddings + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.word_embeddings = nn.Embedding( + vocab_size, embed_dim, padding_idx=padding_idx, **factory_kwargs + ) + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + if self.max_position_embeddings > 0: + self.position_embeddings = nn.Embedding( + max_position_embeddings, embed_dim, **factory_kwargs + ) + if self.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(type_vocab_size, embed_dim, **factory_kwargs) + + def forward(self, input_ids, position_ids=None, token_type_ids=None): + """ + input_ids: (batch, seqlen) + position_ids: (batch, seqlen) + token_type_ids: (batch, seqlen) + """ + batch_size, seqlen = input_ids.shape + embeddings = self.word_embeddings(input_ids) + if self.max_position_embeddings > 0: + if position_ids is None: + position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + if self.type_vocab_size > 0: + if token_type_ids is None: + token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = embeddings + token_type_embeddings + return embeddings + + +class VocabParallelEmbedding(nn.Embedding): + def __init__(self, num_embeddings, 
*args, process_group=None, padding_idx=None, **kwargs): + self.process_group = process_group + if process_group is not None: + world_size = torch.distributed.get_world_size(process_group) + if num_embeddings % world_size != 0: + raise ValueError( + f"num_embeddings ({num_embeddings}) must be divisible by " + f"world_size ({world_size})" + ) + if world_size > 1 and padding_idx is not None: + raise RuntimeError("ParallelEmbedding does not support padding_idx") + else: + world_size = 1 + super().__init__(num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs) + + def forward(self, input: Tensor) -> Tensor: + if self.process_group is None: + return super().forward(input) + else: + rank = torch.distributed.get_rank(self.process_group) + vocab_size = self.num_embeddings + vocab_start_index, vocab_end_index = rank * vocab_size, (rank + 1) * vocab_size + # Create a mask of valid vocab ids (1 means it needs to be masked). + input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index) + input = input - vocab_start_index + input[input_ids_mask] = 0 + embeddings = super().forward(input) + embeddings[input_ids_mask] = 0.0 + return embeddings + + +class ColumnParallelEmbedding(nn.Embedding): + def __init__(self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs): + self.process_group = process_group + if process_group is not None: + world_size = torch.distributed.get_world_size(process_group) + if embedding_dim % world_size != 0: + raise ValueError( + f"embedding_dim ({embedding_dim}) must be divisible by " + f"world_size ({world_size})" + ) + else: + world_size = 1 + super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs) + + +class ParallelGPT2Embeddings(nn.Module): + def __init__( + self, + embed_dim, + vocab_size, + max_position_embeddings, + process_group, + padding_idx=None, + sequence_parallel=True, + device=None, + dtype=None, + ): + """ + If max_position_embeddings <= 0, there's no position embeddings + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.process_group = process_group + self.sequence_parallel = sequence_parallel + self.word_embeddings = VocabParallelEmbedding( + vocab_size, + embed_dim, + padding_idx=padding_idx, + process_group=process_group, + **factory_kwargs, + ) + self.max_position_embeddings = max_position_embeddings + if self.max_position_embeddings > 0: + self.position_embeddings = ColumnParallelEmbedding( + max_position_embeddings, embed_dim, process_group=process_group, **factory_kwargs + ) + + def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False): + """ + input_ids: (batch, seqlen) + position_ids: (batch, seqlen) + """ + batch_size, seqlen = input_ids.shape + world_size = torch.distributed.get_world_size(self.process_group) + embeddings = self.word_embeddings(input_ids) + if self.max_position_embeddings > 0: + if position_ids is None: + position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) + position_embeddings = self.position_embeddings(position_ids) + if world_size <= 1: + embeddings = embeddings + position_embeddings + else: + partition_dim = self.position_embeddings.embedding_dim + rank = torch.distributed.get_rank(self.process_group) + embeddings[ + ..., rank * partition_dim : (rank + 1) * partition_dim + ] += position_embeddings + if combine_batch_seqlen_dim: + embeddings = rearrange(embeddings, "b s d -> (b s) d") + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return embeddings if 
world_size <= 1 else reduce_fn(embeddings, self.process_group) diff --git a/epilogue_bwd_sm90_tma.hpp b/epilogue_bwd_sm90_tma.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b6741120ef5c41bb4f2bb389245d200d07923a69 --- /dev/null +++ b/epilogue_bwd_sm90_tma.hpp @@ -0,0 +1,270 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +struct CollectiveEpilogueBwd { + + using TileShape_MNK = TileShape_MNK_; + using Element = Element_; + static constexpr int NumEpilogueThreads = NumEpilogueThreads_; + static constexpr bool Varlen = Varlen_; + + using GmemTiledCopydKVTMA = cute::SM90_TMA_STORE; + + // These are for storing the output tensor without TMA (e.g., for setting output to zero) + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(get<2>(TileShape_MNK{}) % kGmemElemsPerLoad == 0, "Headdim must be a multiple of kGmemElemsPerLoad"); + static constexpr int kHeadDim = get<2>(TileShape_MNK{}); + static constexpr int kGmemThreadsPerRow = cutlass::gcd(kHeadDim / kGmemElemsPerLoad, NumEpilogueThreads); + static_assert(NumEpilogueThreads % kGmemThreadsPerRow == 0, "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopydKV = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>>{})); // Val layout, 8 or 16 vals per store + + using SmemLayoutAtomdKVTMA = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutdKVTMA = decltype(tile_to_shape(SmemLayoutAtomdKVTMA{}, select<1, 2>(TileShape_MNK{}))); + + // If we don't use TMA + static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : (kHeadDim % 32 == 0 ? 32 : 16); + static constexpr int kSwizzle = kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 
2 : 1); + using SmemLayoutAtomdKVSTG = + decltype(composition(Swizzle{}, + Layout, Int>, + Stride, _1>>{})); + + using SmemLayoutAtomdKV = std::conditional_t; + using SmemLayoutdKV = decltype(tile_to_shape(SmemLayoutAtomdKV{}, select<1, 2>(TileShape_MNK{}))); + + using SmemCopyAtomdKV = Copy_Atom; + + struct TensorStorage : cute::aligned_struct<128> { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + + using ShapedKV = cute::Shape; // (seqlen_q, d, head, batch) + using StridedKV = cute::Stride; + using LayoutdKV = cute::Layout; + + using TMA_dKV = decltype(make_tma_copy( + GmemTiledCopydKVTMA{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapedKV{}, StridedKV{}), + SmemLayoutdKVTMA{}, + select<1, 2>(TileShape_MNK{}), + _1{})); // no mcast for dKV + + // Host side kernel arguments + struct Arguments { + Element* ptr_dK; + ShapedKV const shape_dK; + StridedKV const stride_dK; + Element* ptr_dV; + StridedKV const stride_dV; + int const* cu_seqlens = nullptr; + }; + + // Device side kernel params + struct Params { + Element* ptr_dK; + ShapedKV const shape_dK; + StridedKV const stride_dK; + Element* ptr_dV; + StridedKV const stride_dV; + TMA_dKV tma_store_dK, tma_store_dV; + int const* cu_seqlens = nullptr; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + if constexpr (Varlen) { + assert (args.cu_seqlens != nullptr); + } + Tensor mdK = make_tensor(make_gmem_ptr(args.ptr_dK), args.shape_dK, args.stride_dK); + Tensor mdV = make_tensor(make_gmem_ptr(args.ptr_dV), args.shape_dK, args.stride_dV); + TMA_dKV tma_store_dK = make_tma_copy( + GmemTiledCopydKVTMA{}, + mdK, + SmemLayoutdKVTMA{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for dKV + TMA_dKV tma_store_dV = make_tma_copy( + GmemTiledCopydKVTMA{}, + mdV, + SmemLayoutdKVTMA{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for dKV + return {args.ptr_dK, args.shape_dK, args.stride_dK, args.ptr_dV, args.stride_dV, + tma_store_dK, tma_store_dV, args.cu_seqlens}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& params) { + if constexpr (!Varlen) { + cute::prefetch_tma_descriptor(params.tma_store_dK.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_store_dV.get_tma_descriptor()); + } + } + + template + CUTLASS_DEVICE void + store(Params const& params, + FrgTensorO const& tdKrdK, + FrgTensorO const& tdVrdV, + SharedStorage& shared_storage, + TiledMma tiled_mma, + int thread_idx, + cute::tuple const& block_coord + ) { + + auto [n_block, bidh, bidb] = block_coord; + Tensor sdK = make_tensor(make_smem_ptr(shared_storage.epilogue.smem_dk.data()), SmemLayoutdKV{}); + Tensor sdV = make_tensor(make_smem_ptr(shared_storage.epilogue.smem_dv.data()), SmemLayoutdKV{}); + auto smem_tiled_copy_dKV = make_tiled_copy_C(SmemCopyAtomdKV{}, tiled_mma); + auto smem_thr_copy_dKV = smem_tiled_copy_dKV.get_thread_slice(thread_idx); + + Tensor tdVrdV_out = flash::convert_type(tdVrdV); + Tensor tdKrdK_out = flash::convert_type(tdKrdK); + Tensor taccdKrdK = smem_thr_copy_dKV.retile_S(tdKrdK_out); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(tdVrdV_out); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(sdK); // ((Atom,AtomNum),PIPE_M,PIPE_N) + Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(sdV); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Make sure all WGs have finished reading K and V + + 
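// Added commentary (hedged): the KVEmpty named barrier below ensures every MMA warpgroup
+      // is done reading the K/V tiles before dK/dV are written to shared memory (the comment
+      // above suggests that smem space is reused). The async fence plus EpilogueBarrier then
+      // hand the tiles to a single warp that issues the TMA stores on the non-varlen path;
+      // the varlen path instead copies smem -> registers -> gmem with predicated stores.
+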
cutlass::arch::NamedBarrier::sync(NumEpilogueThreads, static_cast(BwdNamedBarriers::KVEmpty) /*id*/); + cute::copy(smem_tiled_copy_dKV, taccdVrdV, taccdVsdV); + cute::copy(smem_tiled_copy_dKV, taccdKrdK, taccdKsdK); + if constexpr (!Varlen) { + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumEpilogueThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + + Tensor mdK = params.tma_store_dK.get_tma_tensor(params.shape_dK); + Tensor mdV = params.tma_store_dV.get_tma_tensor(params.shape_dK); + Tensor gdK = local_tile(mdK(_, _, bidh, bidb), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + Tensor gdV = local_tile(mdV(_, _, bidh, bidb), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + auto block_tma_dK = params.tma_store_dK.get_slice(_0{}); + auto block_tma_dV = params.tma_store_dV.get_slice(_0{}); + Tensor tdKgdK = block_tma_dK.partition_D(gdK); // (TMA, TMA_M, TMA_K) + Tensor tdKsdK = block_tma_dK.partition_S(sdK); // (TMA, TMA_M, TMA_K) + Tensor tdVgdV = block_tma_dV.partition_D(gdV); // (TMA, TMA_M, TMA_K) + Tensor tdVsdV = block_tma_dV.partition_S(sdV); // (TMA, TMA_M, TMA_K) + int warp_idx_sync = __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0); + if (warp_idx_sync == NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) { + cutlass::arch::NamedBarrier::sync(NumEpilogueThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + int const lane_predicate = cute::elect_one_sync(); + if (lane_predicate) { + cute::copy(params.tma_store_dV, tdVsdV, tdVgdV); + cute::copy(params.tma_store_dK, tdKsdK, tdKgdK); + tma_store_arrive(); + } + } + + } else { + cutlass::arch::NamedBarrier::sync(NumEpilogueThreads, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + bool const is_varlen = params.cu_seqlens != nullptr; + int const offset = !is_varlen ? 0 : params.cu_seqlens[bidb]; + int const seqlen = !is_varlen ? get<0>(params.shape_dK) : params.cu_seqlens[bidb + 1] - params.cu_seqlens[bidb]; + + Tensor mdK = make_tensor(make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdK = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdK), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + Tensor mdV = make_tensor(make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdV = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdV), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + + GmemTiledCopydKV gmem_tiled_copy_dKV; + auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx); + Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV); + Tensor tdKVsdV = gmem_thr_copy_dKV.partition_S(sdV); // (TMA, TMA_M, TMA_K) + Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK); + Tensor tdKVsdK = gmem_thr_copy_dKV.partition_S(sdK); // (TMA, TMA_M, TMA_K) + Tensor tdKVrdV = make_fragment_like(tdKVgdV); + Tensor tdKVrdK = make_fragment_like(tdKVgdK); + cute::copy(gmem_tiled_copy_dKV, tdKVsdV, tdKVrdV); + cute::copy(gmem_tiled_copy_dKV, tdKVsdK, tdKVrdK); + // Construct identity layout for gdKV + Tensor cdKV = cute::make_identity_tensor(select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV); + Tensor tdKVpdKV = make_tensor(make_shape(size<2>(tdKVgdV))); + #pragma unroll + for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK); } + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_dKV, tdKVrdV, tdKVgdV, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + flash::copy( + gmem_tiled_copy_dKV, tdKVrdK, tdKVgdK, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + } + } + + CUTLASS_DEVICE void + store_tail() { + if constexpr (!Varlen) { tma_store_wait<0>(); } + } + + // Write 0 to dK and dV + CUTLASS_DEVICE void + store_zero( + Params const& params, + int thread_idx, + cute::tuple const& block_coord + ) { + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + auto [n_block, bidh, bidb] = block_coord; + bool const is_varlen = Varlen && params.cu_seqlens != nullptr; + int const offset = !is_varlen ? 0 : params.cu_seqlens[bidb]; + int const seqlen = !is_varlen ? get<0>(params.shape_dK) : params.cu_seqlens[bidb + 1] - offset; + + Tensor mdK = make_tensor(make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdK = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdK), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + Tensor mdV = make_tensor(make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdV = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdV), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + + GmemTiledCopydKV gmem_tiled_copy_dKV; + auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx); + Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK); + Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV); + Tensor tdKVrdKV = make_fragment_like(tdKVgdK); + clear(tdKVrdKV); + // Construct identity layout for gdKV + Tensor cdKV = cute::make_identity_tensor(select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV); + Tensor tdKVpdKV = make_tensor(make_shape(size<2>(tdKVgdK))); + #pragma unroll + for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK); } + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_dKV, tdKVrdKV, tdKVgdK, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + flash::copy( + gmem_tiled_copy_dKV, tdKVrdKV, tdKVgdV, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + } + +}; + +} // namespace flash diff --git a/epilogue_fwd_sm90_tma.hpp b/epilogue_fwd_sm90_tma.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5133c55fc1e7e93629bd13833ff6235921cec239 --- /dev/null +++ b/epilogue_fwd_sm90_tma.hpp @@ -0,0 +1,296 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "utils.h" + +namespace flash { + +using namespace cute; + +// template +template +struct CollectiveEpilogueFwd { + + using Element = typename Ktraits::OutputType; + static constexpr int kBlockM = Ktraits::kBlockM; + static constexpr int kBlockN = Ktraits::kBlockN; + static constexpr int kHeadDim = Ktraits::kHeadDim; + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kNWarps = Ktraits::kNWarps; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr bool Is_WS = kNWarps >= 12; + + static constexpr int NumCopyThreads = !Is_WS ? 
0 : cutlass::NumThreadsPerWarpGroup; + static constexpr int NumMmaThreads = kNThreads - NumCopyThreads; + + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + + using SmemCopyAtomO = Copy_Atom; + using SharedStorage = cute::array_aligned>; + + using GmemTiledCopyOTMA = cute::SM90_TMA_STORE; + using TMA_O = decltype(make_tma_copy( + GmemTiledCopyOTMA{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + typename Seqlen_traits::ShapeT{}, + typename Seqlen_traits::StrideT{} + ), + SmemLayoutO{}, + select<0, 2>(TileShape_MNK{}), + _1{})); // no mcast for O + + // These are for storing the output tensor without TMA (e.g., for setting output to zero and var-seq-len) + static constexpr int kNumVecElem = ceil_div(128, sizeof_bits_v); + static_assert(kHeadDim % kNumVecElem == 0); + static constexpr int kNumThreadsPerRow = kHeadDim / kNumVecElem; + static_assert(NumMmaThreads % kNumThreadsPerRow == 0); + static constexpr int kNumRows = NumMmaThreads / kNumThreadsPerRow; + using TiledCopyOAtom = cute::Copy_Atom, Element>; + using TiledCopyOThrLayout = decltype(cute::make_layout( + cute::make_shape(Int{}, Int{}), + LayoutRight{})); + using TiledCopyOValLayout = decltype(cute::make_layout( + cute::make_shape(_1{}, Int{}), + LayoutRight{})); + using TiledCopyO = decltype(make_tiled_copy( + TiledCopyOAtom{}, + TiledCopyOThrLayout{}, // Thr layout + TiledCopyOValLayout{} // Val layout + )); + + // used for rmem -> smem O copy in fp8 kernel to undo column permutation + using ThreadLayoutrO = Layout, _4, _1>, + Stride<_4, _32, _1, _0>>; + using ValueLayoutrO = Layout, Int>, + Stride<_0, _2, Stride<_4, _1>, _8>>; + using TiledCopyrO = decltype(make_tiled_copy(Copy_Atom, Element>{}, + ThreadLayoutrO{}, ValueLayoutrO{})); + using TiledCopyShaperO = Shape<_8, Int, _16, Int>; + using SmemLayoutrO = decltype(composition(SmemLayoutO{}, Layout{})); + + // Host side kernel arguments + struct Arguments { + Element* ptr_O; + typename Seqlen_traits::LayoutT const layout_O; + float* ptr_LSE; + typename Seqlen_traits::LayoutLseT const layout_LSE; + }; + + // Device side kernel params + struct Params { + Element* ptr_O; + typename Seqlen_traits::LayoutT const layout_O; + float* ptr_LSE; + typename Seqlen_traits::LayoutLseT const layout_LSE; + TMA_O tma_store_O; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + Tensor mO = make_tensor(make_gmem_ptr(args.ptr_O), args.layout_O); + TMA_O tma_store_O = make_tma_copy( + GmemTiledCopyOTMA{}, + mO, + SmemLayoutO{}, + select<0, 2>(TileShape_MNK{}), + _1{}); // no mcast for O + return {args.ptr_O, args.layout_O, args.ptr_LSE, args.layout_LSE, tma_store_O}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& epilogue_params) { + if constexpr (!Seqlen_traits::kUseVarSeqLen) { + cute::prefetch_tma_descriptor(epilogue_params.tma_store_O.get_tma_descriptor()); + } + } + + template + CUTLASS_DEVICE void + store(Params const& epilogue_params, + FrgTensorO const& tOrO, + FrgTensorLSE const& lse, + SharedStorage& shared_storage, + TiledMma tiled_mma, + int thread_idx, + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q + ) { + + auto [m_block, bidh, bidb] = block_coord; + Tensor sO = 
make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{}); + auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma); + auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor tOrO_out = flash::convert_type(tOrO); + Tensor taccOrO = smem_thr_copy_O.retile_S(tOrO_out); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor taccOsO = smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Make sure all WGs have finished reading V + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); + cute::copy(smem_tiled_copy_O, taccOrO, taccOsO); + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb)(_, m_block); + Tensor caccO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor taccOcO = thread_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) + static_assert(decltype(size<0, 0>(taccOcO))::value == 2); + static_assert(decltype(size<0, 1>(taccOcO))::value == 2); + // taccOcO has shape ((2, 2, V), MMA_M, MMA_K), we only take only the row indices. + Tensor taccOcO_row = taccOcO(make_coord(_0{}, _, _0{}), _, _0{}); + CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M + if (get<1>(taccOcO_row(_0{})) == 0) { + #pragma unroll + for (int mi = 0; mi < size(lse); ++mi) { + const int row = get<0>(taccOcO_row(mi)); + if (row < seqlen_traits_q.actual_seq_len - m_block * kBlockM) { gLSE(row) = lse(mi); } + } + } + + int write_warp_idx = kNWarps - 1; + if (cutlass::canonical_warp_idx_sync() == write_warp_idx) { + cutlass::arch::NamedBarrier::sync( + NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier + ); + } + TiledCopyO gmem_tiled_copy_O; + flash::write_O( + epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, + epilogue_params.layout_O, select<0, 2>(TileShape_MNK{}), sO, + m_block, bidh, bidb, seqlen_traits_q, write_warp_idx + ); + } + + template + CUTLASS_DEVICE void + store_fp8(Params const& epilogue_params, + FrgTensorO const& tOrO, + FrgTensorLSE const& lse, + SharedStorage& shared_storage, + TiledMma tiled_mma, + int thread_idx, + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q + ) { + // using SmemLayoutrO = typename Ktraits::SmemLayoutrO; + // using TiledCopyrO = typename Ktraits::TiledCopyrO; + auto [m_block, bidh, bidb] = block_coord; + + TiledCopyrO rmem_tiled_copy_O; + Tensor sOacc = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutrO{}); + auto rmem_thr_copy_O = rmem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor taccOsO = rmem_thr_copy_O.partition_D(sOacc); + Tensor tOrO_out = flash::convert_type(tOrO); // Element is Ktraits::OutputType + Tensor taccOrO = make_tensor(tOrO_out.data(), shape(taccOsO)); + + // Make sure all WGs have finished reading V + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); + cute::copy(rmem_tiled_copy_O, taccOrO, taccOsO); + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + 
cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb)(_, m_block); + Tensor caccO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor taccOcO = thread_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) + static_assert(decltype(size<0, 0>(taccOcO))::value == 2); + static_assert(decltype(size<0, 1>(taccOcO))::value == 2); + // taccOcO has shape ((2, 2, V), MMA_M, MMA_K), we only take only the row indices. + Tensor taccOcO_row = taccOcO(make_coord(_0{}, _, _0{}), _, _0{}); + CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M + int const seqlen_q = [&] { + if constexpr(Seqlen_traits::kUseVarSeqLen) { return seqlen_traits_q.actual_seq_len; } + else { return shape<2>(epilogue_params.layout_LSE); } + }(); + if (get<1>(taccOcO_row(_0{})) == 0) { + #pragma unroll + for (int mi = 0; mi < size(lse); ++mi) { + const int row = get<0>(taccOcO_row(mi)); + if (row < seqlen_q - m_block * kBlockM) { gLSE(row) = lse(mi); } + } + } + + int write_warp_idx = kNWarps - 1; + if (cutlass::canonical_warp_idx_sync() == write_warp_idx) { + cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + } + TiledCopyO gmem_tiled_copy_O; + Tensor sO = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{}); + flash::write_O( + epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, + epilogue_params.layout_O, select<0, 2>(TileShape_MNK{}), sO, + m_block, bidh, bidb, seqlen_traits_q, write_warp_idx + ); + } + + CUTLASS_DEVICE void + store_tail() { + tma_store_wait<0>(); + } + + // Write 0 to output and -inf to LSE + template + CUTLASS_DEVICE void + store_zero( + Params const& epilogue_params, + SharedStorage& shared_storage, + int thread_idx, + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q + ) { + auto [m_block, bidh, bidb] = block_coord; + Tensor mO = make_tensor(make_gmem_ptr(epilogue_params.ptr_O), epilogue_params.layout_O); + Tensor gO = seqlen_traits_q.get_local_tile_tensor( + mO, select<0, 2>(TileShape_MNK{}), bidh, bidb + )(_, _, m_block); // (M, K) + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb)(_, m_block); + + TiledCopyO gmem_tiled_copy_O; + auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx); + Tensor tOgO = gmem_thr_copy_O.partition_D(gO); + Tensor tOrO = make_fragment_like(tOgO); + clear(tOrO); + // Construct identity layout for sO + Tensor cO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tOcO = gmem_thr_copy_O.partition_D(cO); + Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); + #pragma unroll + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(epilogue_params.layout_O.shape()); } + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, get<0>(epilogue_params.layout_O.shape()) - m_block * kBlockM + ); + static_assert(kBlockM <= NumMmaThreads); + if (thread_idx < get<0>(epilogue_params.layout_LSE.shape()) 
- m_block * kBlockM) { gLSE(thread_idx) = -INFINITY; } + } + +}; + +} // namespace flash diff --git a/exp.yaml b/exp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..032aaa9432899c83d41d292a07c0eb3c3edb7c7e --- /dev/null +++ b/exp.yaml @@ -0,0 +1,17 @@ +# @package _global_ + +# run in experiment mode with: +# `python run.py mode=exp name=experiment_name` + +experiment_mode: True + +# allows for custom naming of the experiment +name: ??? + +hydra: + # sets output paths for all file logs to `logs/experiment/name' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/experiments/${name} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/experiments/${name} + subdir: ${hydra.job.num} diff --git a/falcon.py b/falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..4b02ec7727740eaa9ca70a7f0ca64df94fff4c3a --- /dev/null +++ b/falcon.py @@ -0,0 +1,143 @@ +# Copyright (c) 2023, Tri Dao. + +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from einops import rearrange +from transformers import FalconConfig, GPT2Config + + +def remap_state_dict_hf_falcon(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^transformer.h.", "transformer.layers.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + # Word embedding + def key_mapping_emb(key): + return re.sub( + r"^transformer.word_embeddings.", "transformer.embeddings.word_embeddings.", key + ) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
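+        # Added commentary (hedged, illustrative numbers): with vocab_size=50257 and
+        # pad_vocab_size_multiple=8 the padded size is 50264, so 7 zero rows are appended
+        # to lm_head.weight (and 7 zeros to lm_head.bias below) so it matches the padded
+        # word-embedding matrix.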
+ state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + output_embeddings_bias = state_dict.pop("lm_head.bias") + state_dict["lm_head.bias"] = F.pad( + output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + key = re.sub(r"^transformer.layers.(\d+).ln_attn.", r"transformer.layers.\1.norm1.", key) + key = re.sub(r"^transformer.layers.(\d+).ln_mlp.", r"transformer.layers.\1.norm2.", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).self_attention.query_key_value.", + r"transformer.layers.\1.mixer.Wqkv.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attention.dense.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", 1) + headdim = config.hidden_size // n_head + for l in range(config.n_layer): + # The weights are stored in a different layout compared to our implementation + Wqkv = rearrange( + state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight"), + "(group ratio headdim) ... -> group ratio headdim ...", + ratio=n_head // n_head_kv + 2, + headdim=headdim, + ) + Wq = rearrange(Wqkv[:, :-2], "group ratio headdim ... -> (group ratio headdim) ...") + Wk = rearrange(Wqkv[:, [-2]], "group ratio headdim ... -> (group ratio headdim) ...") + Wv = rearrange(Wqkv[:, [-1]], "group ratio headdim ... -> (group ratio headdim) ...") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + + return state_dict + + +def falcon_config_to_gpt2_config(falcon_config: FalconConfig) -> GPT2Config: + # The 40b config uses "n_head_kv" instead of "num_kv_heads" + n_head_kv = getattr( + falcon_config, + "n_head_kv", + 1 if getattr(falcon_config, "multi_query", False) else falcon_config.n_head, + ) + # HACK: the 40b config has 2 LN per layer instead of 1, but that's not reflected in the config. 
+ # So we have to infer it from the number of heads in the key/value block + parallel_block_tied_norm = n_head_kv == 1 + return GPT2Config( + vocab_size=falcon_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=falcon_config.hidden_size, + n_layer=falcon_config.n_layer, + n_head=falcon_config.n_head, + n_inner=falcon_config.hidden_size * 4, + activation_function="gelu", + resid_pdrop=falcon_config.hidden_dropout, + embd_pdrop=0.0, # There doesn't seem to be any embedding dropout + attn_pdrop=falcon_config.attention_dropout, + layer_norm_epsilon=falcon_config.layer_norm_epsilon, + initializer_range=falcon_config.initializer_range, + bos_token_id=falcon_config.bos_token_id, + eos_token_id=falcon_config.eos_token_id, + # These are new arguments not in the original GPT2Config + parallel_block=falcon_config.parallel_attn, + n_head_kv=n_head_kv, + parallel_block_tied_norm=parallel_block_tied_norm, + rotary_emb_fraction=1.0, + rotary_emb_interleaved=False, + tie_word_embeddings=True, + qkv_proj_bias=falcon_config.bias, + out_proj_bias=falcon_config.bias, + mlp_fc1_bias=falcon_config.bias, + mlp_fc2_bias=falcon_config.bias, + lm_head_bias=False, + ) diff --git a/flash.h b/flash.h new file mode 100644 index 0000000000000000000000000000000000000000..c8ba8f22cf2b335c13f7a665d7f62a0621ef9426 --- /dev/null +++ b/flash.h @@ -0,0 +1,184 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include + +#include "cutlass/fast_math.h" // For cutlass::FastDivmod + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Qkv_params { + using index_t = int64_t; + // The QKV matrices. + void *__restrict__ q_ptr; + void *__restrict__ k_ptr; + void *__restrict__ v_ptr; + + // The stride between rows of the Q, K and V matrices. + index_t q_batch_stride; + index_t k_batch_stride; + index_t v_batch_stride; + index_t q_row_stride; + index_t k_row_stride; + index_t v_row_stride; + index_t q_head_stride; + index_t k_head_stride; + index_t v_head_stride; + + // The number of heads. + int h, h_k; + // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be + // different from nheads (query). + int h_h_k_ratio; // precompute h / h_k, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Flash_fwd_params : public Qkv_params { + + // The O matrix (output). + void * __restrict__ o_ptr; + void * __restrict__ oaccum_ptr; + + // The stride between rows of O. + index_t o_batch_stride; + index_t o_row_stride; + index_t o_head_stride; + + // The pointer to the P matrix. + void * __restrict__ p_ptr; + + // The pointer to the softmax sum. + void * __restrict__ softmax_lse_ptr; + void * __restrict__ softmax_lseaccum_ptr; + + // The dimensions. + int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q, total_k; + + // The scaling factors for the kernel. + float scale_softmax; + float scale_softmax_log2; + uint32_t scale_softmax_log2_half2; + + // array of length b+1 holding starting offset of each sequence. + int * __restrict__ cu_seqlens_q; + int * __restrict__ cu_seqlens_k; + + // If provided, the actual length of each k sequence. + int * __restrict__ seqused_k; + + int *__restrict__ blockmask; + + // The K_new and V_new matrices. 
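+    // (Note, inferred from the surrounding fields rather than from the kernels: these hold new
+    //  key/value tokens to be appended to an existing KV cache; their strides below are in elements
+    //  and mirror the Q/K/V strides above, so element (b, s, h, d) of K_new would sit at offset
+    //  b * knew_batch_stride + s * knew_row_stride + h * knew_head_stride + d from knew_ptr.)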
+ void * __restrict__ knew_ptr; + void * __restrict__ vnew_ptr; + + // The stride between rows of the Q, K and V matrices. + index_t knew_batch_stride; + index_t vnew_batch_stride; + index_t knew_row_stride; + index_t vnew_row_stride; + index_t knew_head_stride; + index_t vnew_head_stride; + + // The cos and sin matrices for rotary embedding. + void * __restrict__ rotary_cos_ptr; + void * __restrict__ rotary_sin_ptr; + + // The indices to index into the KV cache. + int * __restrict__ cache_batch_idx; + + // Paged KV cache + int * __restrict__ block_table; + index_t block_table_batch_stride; + int page_block_size; + + // The dropout probability (probability of keeping an activation). + float p_dropout; + // uint32_t p_dropout_in_uint; + // uint16_t p_dropout_in_uint16_t; + uint8_t p_dropout_in_uint8_t; + + // Scale factor of 1 / (1 - p_dropout). + float rp_dropout; + float scale_softmax_rp_dropout; + + // Local window size + int window_size_left, window_size_right; + + // Pointer to the RNG seed (idx 0) and offset (idx 1). + uint64_t * rng_state; + + bool is_bf16; + bool is_e4m3; + bool is_causal; + + // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]. + // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K. + bool is_seqlens_k_cumulative; + + bool is_rotary_interleaved; + + int num_splits; // For split-KV version + + void * __restrict__ alibi_slopes_ptr; + index_t alibi_slopes_batch_stride; + + bool unpadded_lse; // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q]. + + int * __restrict__ tile_count_semaphore; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Flash_bwd_params : public Flash_fwd_params { + + // The dO and dQKV matrices. + void *__restrict__ do_ptr; + void *__restrict__ dq_ptr; + void *__restrict__ dk_ptr; + void *__restrict__ dv_ptr; + + // To accumulate dQ + void *__restrict__ dq_accum_ptr; + void *__restrict__ dk_accum_ptr; + void *__restrict__ dv_accum_ptr; + + // // To accumulate dK and dV in case we're splitting the bwd along seqlen_q + // dimension void *__restrict__ dk_accum_ptr; void *__restrict__ + // dv_accum_ptr; + + // The stride between rows of the dO, dQ, dK and dV matrices. + // TD [2022-04-16]: We're using 32-bit indexing to save registers. + // The code probably won't work for arrays larger than 2GB. + index_t do_batch_stride; + index_t do_row_stride; + index_t do_head_stride; + index_t dq_batch_stride; + index_t dk_batch_stride; + index_t dv_batch_stride; + index_t dq_row_stride; + index_t dk_row_stride; + index_t dv_row_stride; + index_t dq_head_stride; + index_t dk_head_stride; + index_t dv_head_stride; + + // The pointer to the softmax d sum. 
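+    // (As in the FlashAttention backward derivation: this stores D = rowsum(dO * O) per query row,
+    //  which enters the softmax gradient as dS = P * (dP - D) elementwise, with dP = dO V^T.)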
+    void *__restrict__ dsoftmax_sum;
+    void *__restrict__ softmax_lse_log2_ptr;
+
+    int *__restrict__ dq_semaphore;
+
+    bool deterministic;
+    index_t dq_accum_split_stride;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, int Headdim> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
diff --git a/flash2_a100_fwd_bwd_benchmark.png b/flash2_a100_fwd_bwd_benchmark.png
new file mode 100644
index 0000000000000000000000000000000000000000..f529197bec8d885219f5cb44533c7ee2db50d667
Binary files /dev/null and b/flash2_a100_fwd_bwd_benchmark.png differ
diff --git a/flash2_h100_fwd_bwd_benchmark.png b/flash2_h100_fwd_bwd_benchmark.png
new file mode 100644
index 0000000000000000000000000000000000000000..41779e0afd3144bfed055b29aaddcc305d6e94c8
Binary files /dev/null and b/flash2_h100_fwd_bwd_benchmark.png differ
diff --git a/flash3_fp16_fwd.png b/flash3_fp16_fwd.png
new file mode 100644
index 0000000000000000000000000000000000000000..403d13944d1b884890d8d37c45f946f8d89848e6
Binary files /dev/null and b/flash3_fp16_fwd.png differ
diff --git a/flash_api.cpp b/flash_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b12e93505d83bbec32a4ec270429901e61e735e
--- /dev/null
+++ b/flash_api.cpp
@@ -0,0 +1,952 @@
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+// Include these 2 headers instead of torch/extension.h since we don't need all of the torch headers.
+#include <torch/python.h>
+#include <torch/nn/functional.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cutlass/numeric_types.h>
+
+#include "flash.h"
+#include "static_switch.h"
+
+#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+
+void set_params_fprop(Flash_fwd_params &params,
+                      // sizes
+                      const size_t b,
+                      const size_t seqlen_q,
+                      const size_t seqlen_k,
+                      const size_t seqlen_q_rounded,
+                      const size_t seqlen_k_rounded,
+                      const size_t h,
+                      const size_t h_k,
+                      const size_t d,
+                      const size_t d_rounded,
+                      // device pointers
+                      const at::Tensor q,
+                      const at::Tensor k,
+                      const at::Tensor v,
+                      at::Tensor out,
+                      void *cu_seqlens_q_d,
+                      void *cu_seqlens_k_d,
+                      void *seqused_k,
+                      void *p_d,
+                      void *softmax_lse_d,
+                      float p_dropout,
+                      float softmax_scale,
+                      int window_size_left,
+                      int window_size_right,
+                      bool seqlenq_ngroups_swapped=false,
+                      bool unpadded_lse=false) {
+
+    // Reset the parameters
+    params = {};
+
+    params.is_bf16 = q.dtype() == torch::kBFloat16;
+    params.is_e4m3 = q.dtype() == torch::kFloat8_e4m3fn;
+
+    // Set the pointers and strides.
+    params.q_ptr = q.data_ptr();
+    params.k_ptr = k.data_ptr();
+    params.v_ptr = v.data_ptr();
+    // All strides are in elements, not bytes.
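+    // (e.g. for a contiguous (batch, seqlen, nheads, headdim) tensor, stride(-3) == nheads * headdim,
+    //  stride(-2) == headdim and stride(-1) == 1, which is why the callers below require the last
+    //  dimension to be contiguous.)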
+ params.q_row_stride = q.stride(-3); + params.k_row_stride = k.stride(-3); + params.v_row_stride = v.stride(-3); + params.q_head_stride = q.stride(-2); + params.k_head_stride = k.stride(-2); + params.v_head_stride = v.stride(-2); + params.o_ptr = out.data_ptr(); + params.o_row_stride = out.stride(-3); + params.o_head_stride = out.stride(-2); + + if (cu_seqlens_q_d == nullptr) { + params.q_batch_stride = q.stride(0); + params.k_batch_stride = k.stride(0); + params.v_batch_stride = v.stride(0); + params.o_batch_stride = out.stride(0); + if (seqlenq_ngroups_swapped) { + params.q_batch_stride *= seqlen_q; + params.o_batch_stride *= seqlen_q; + } + } + + params.cu_seqlens_q = static_cast(cu_seqlens_q_d); + params.cu_seqlens_k = static_cast(cu_seqlens_k_d); + params.seqused_k = static_cast(seqused_k); + + TORCH_CHECK( + bool(params.cu_seqlens_q) == bool(params.cu_seqlens_k), + "cu_seqlens_q and cu_seqlens_k must be both null or non-null" + ); + + // P = softmax(QK^T) + params.p_ptr = p_d; + + // Softmax sum + params.softmax_lse_ptr = softmax_lse_d; + + // Set the dimensions. + params.b = b; + params.h = h; + params.h_k = h_k; + params.h_h_k_ratio = h / h_k; + params.seqlen_q = seqlen_q; + params.seqlen_k = seqlen_k; + params.seqlen_q_rounded = seqlen_q_rounded; + params.seqlen_k_rounded = seqlen_k_rounded; + params.d = d; + params.d_rounded = d_rounded; + + // Set the different scale values. + params.scale_softmax = softmax_scale; + params.scale_softmax_log2 = softmax_scale * M_LOG2E; + __half scale_softmax_log2_half = __float2half(params.scale_softmax_log2); + __half2 scale_softmax_log2_half2 = __half2(scale_softmax_log2_half, scale_softmax_log2_half); + params.scale_softmax_log2_half2 = reinterpret_cast(scale_softmax_log2_half2); + + // Set this to probability of keeping an element to simplify things. + params.p_dropout = 1.f - p_dropout; + // Convert p from float to int so we don't have to convert the random uint to float to compare. + // [Minor] We want to round down since when we do the comparison we use <= instead of < + // params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0)); + // params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0)); + params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0)); + params.rp_dropout = 1.f / params.p_dropout; + params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax; + TORCH_CHECK(p_dropout < 1.f); + #ifdef FLASHATTENTION_DISABLE_DROPOUT + TORCH_CHECK(p_dropout == 0.0f, "This flash attention build does not support dropout."); + #endif + + // Causal is the special case where window_size_right == 0 and window_size_left < 0. + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
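+    // (Illustrative: (-1, -1) is full attention, (-1, 0) is causal, and (256, 0) is a causal sliding
+    //  window where each query attends to itself and at most the 256 previous keys; a negative value
+    //  means "unlimited", and when only one side is negative it is replaced by seqlen_k just below.)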
+ params.is_causal = window_size_left < 0 && window_size_right == 0; + + if (window_size_left < 0 && window_size_right >= 0) { window_size_left = seqlen_k; } + if (window_size_left >= 0 && window_size_right < 0) { window_size_right = seqlen_k; } + params.window_size_left = window_size_left; + params.window_size_right = window_size_right; + + #ifdef FLASHATTENTION_DISABLE_LOCAL + TORCH_CHECK(params.is_causal || (window_size_left < 0 && window_size_right < 0), + "This flash attention build does not support local attention."); + #endif + + params.is_seqlens_k_cumulative = true; + + #ifdef FLASHATTENTION_DISABLE_UNEVEN_K + TORCH_CHECK(d == d_rounded, "This flash attention build does not support headdim not being a multiple of 32."); + #endif + + params.unpadded_lse = unpadded_lse; +} + +void set_params_dgrad(Flash_bwd_params ¶ms, + // sizes + const size_t b, + const size_t seqlen_q, + const size_t seqlen_k, + const size_t seqlen_q_rounded, + const size_t seqlen_k_rounded, + const size_t h, + const size_t h_k, + const size_t d, + const size_t d_rounded, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + const at::Tensor out, + const at::Tensor dout, + at::Tensor dq, + at::Tensor dk, + at::Tensor dv, + void *cu_seqlens_q_d, + void *cu_seqlens_k_d, + void *dq_accum_d, + void *dk_accum_d, + void *dv_accum_d, + void *softmax_lse_d, + void *dsoftmax_sum_d, + float p_dropout, + float softmax_scale, + int window_size_left, + int window_size_right, + bool deterministic) { + + set_params_fprop(params, + b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded, + q, k, v, out, + cu_seqlens_q_d, + cu_seqlens_k_d, + nullptr, + nullptr, + softmax_lse_d, + p_dropout, + softmax_scale, + window_size_left, + window_size_right); + + // Set the pointers and strides. 
+ params.do_ptr = dout.data_ptr(); + params.do_row_stride = dout.stride(-3); + params.do_head_stride = dout.stride(-2); + params.dq_ptr = dq.data_ptr(); + params.dk_ptr = dk.data_ptr(); + params.dv_ptr = dv.data_ptr(); + params.dq_row_stride = dq.stride(-3); + params.dk_row_stride = dk.stride(-3); + params.dv_row_stride = dv.stride(-3); + params.dq_head_stride = dq.stride(-2); + params.dk_head_stride = dk.stride(-2); + params.dv_head_stride = dv.stride(-2); + + if (cu_seqlens_q_d == nullptr) { + params.do_batch_stride = dout.stride(0); + params.dq_batch_stride = dq.stride(0); + params.dk_batch_stride = dk.stride(0); + params.dv_batch_stride = dv.stride(0); + } + + params.dq_accum_ptr = dq_accum_d; + params.dk_accum_ptr = dk_accum_d; + params.dv_accum_ptr = dv_accum_d; + + // Softmax sum + params.dsoftmax_sum = dsoftmax_sum_d; + + params.deterministic = deterministic; +} + +void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split_kernel=false) { + // HEADDIM_SWITCH(params.d, [&] { + // run_mha_fwd_(params, stream); + // }); + if (!params.is_e4m3) { + if (params.is_bf16) { + if (params.d == 64) { + run_mha_fwd_(params, stream); + } else if (params.d == 128) { + run_mha_fwd_(params, stream); + } else { + run_mha_fwd_(params, stream); + } + } else { + if (params.d == 64) { + run_mha_fwd_(params, stream); + } else if (params.d == 128) { + run_mha_fwd_(params, stream); + } else { + run_mha_fwd_(params, stream); + } + } + } else { + if (params.d == 64) { + run_mha_fwd_(params, stream); + } else if (params.d == 128) { + run_mha_fwd_(params, stream); + } else if (params.d == 256) { + run_mha_fwd_(params, stream); + } + } +} + +std::vector +mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + const float softmax_scale, + bool is_causal) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90, "FlashAttention only supports Hopper GPUs or newer."); + + auto q_dtype = q.dtype(); + // TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + // "FlashAttention only support fp16 and bf16 data type for now"); + // TODO: will add e4m3 later + // TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kFloat8_e4m3fn, + // "FlashAttention only support fp16 and bf16 data type"); + // "FlashAttention only support fp16 and fp8 (e4m3) data type for now"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + int seqlen_q = sizes[1]; + int num_heads = sizes[2]; + const int head_size_og = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 
0, "Number of heads in key/value must divide number of heads in query"); + + TORCH_CHECK(head_size_og == 64 || head_size_og == 128 || head_size_og == 256, "Only support head size 64, 128, and 256 for now"); + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_og); + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + // TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + TORCH_CHECK(q_dtype == at::ScalarType::Float8_e4m3fn + ? (out.dtype() == at::kHalf) + : (out.dtype() == q_dtype), + "Output must have the same dtype as input dtype if dtype is " + "not fp8, or fp16 for fp8 input."); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og); + if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + } else { + if (q_dtype == at::ScalarType::Float8_e4m3fn) + out = torch::empty_like(q_padded, at::kHalf); + else + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = round_multiple(head_size, 32); + const int seqlen_q_rounded = round_multiple(seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + + auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); + at::Tensor p; + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q_padded, k_padded, v_padded, out, + /*cu_seqlens_q_d=*/nullptr, + /*cu_seqlens_k_d=*/nullptr, + /*seqused_k=*/nullptr, + nullptr, + softmax_lse.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + /*window_size_left=*/-1, + /*window_size_right=*/is_causal ? 0 : -1); + + auto tile_count_semaphore = is_causal ? torch::zeros({1}, opts.dtype(torch::kInt32)) : torch::empty({1}, opts.dtype(torch::kInt32)); + params.tile_count_semaphore = tile_count_semaphore.data_ptr(); + + if (seqlen_k > 0) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + run_mha_fwd(params, stream); + } else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. 
+ out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p}; +} + +std::vector +mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. + int max_seqlen_q, + const int max_seqlen_k, + const float softmax_scale, + bool is_causal) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90, "FlashAttention only supports Hopper GPUs or newer."); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(cu_seqlens_q); + CHECK_DEVICE(cu_seqlens_k); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int batch_size = cu_seqlens_q.numel() - 1; + int num_heads = sizes[1]; + const int head_size_og = sizes[2]; + const int num_heads_k = k.size(1); + + int window_size_left = -1; + int window_size_right = -1; + if (is_causal) { window_size_right = 0; } + + void *cu_seqlens_q_d = cu_seqlens_q.data_ptr(); + + const int total_q = q.sizes()[0]; + + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (window_size_left >= max_seqlen_k) { window_size_left = -1; } + if (window_size_right >= max_seqlen_k) { window_size_right = -1; } + + CHECK_SHAPE(q, total_q, num_heads, head_size_og); + const int total_k = k.size(0); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); + + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + if (seqused_k.has_value()){ + auto seqused_k_ = seqused_k.value(); + TORCH_CHECK(seqused_k_.dtype() == torch::kInt32, 
"seqused_k must have dtype int32"); + TORCH_CHECK(seqused_k_.is_cuda(), "seqused_k must be on CUDA device"); + TORCH_CHECK(seqused_k_.is_contiguous(), "seqused_k must be contiguous"); + CHECK_SHAPE(seqused_k_, batch_size); + } + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, sizes[0], sizes[1], head_size_og); + if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + } else { + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = round_multiple(head_size, 32); + const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat)); + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, + max_seqlen_q, max_seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q_padded, k_padded, v_padded, out, + cu_seqlens_q_d, + cu_seqlens_k.data_ptr(), + seqused_k.has_value() ? seqused_k.value().data_ptr() : nullptr, + /*p_d=*/nullptr, + softmax_lse.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + window_size_left, + window_size_right, + /*seqlenq_ngroups_swapped=*/false, + /*unpadded_lse=*/true); + params.total_q = total_q; + params.total_k = total_k; + + if (max_seqlen_k > 0) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + run_mha_fwd(params, stream); + } else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. 
+ out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse}; +} + +void run_mha_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { + // FP16_SWITCH(!params.is_bf16, [&] { + // HEADDIM_SWITCH(params.d, [&] { + // run_mha_bwd_(params, stream); + // }); + // }); + if (!params.is_bf16) { + if (params.d <= 64) { + run_mha_bwd_(params, stream); + } else if (params.d <= 96) { + run_mha_bwd_(params, stream); + } else { + run_mha_bwd_(params, stream); + } + } else { + if (params.d <= 64) { + run_mha_bwd_(params, stream); + } else if (params.d <= 96) { + run_mha_bwd_(params, stream); + } else { + run_mha_bwd_(params, stream); + } + } +} + +std::vector +mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + const float softmax_scale, + const bool is_causal, + const bool deterministic) { + + #ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); + #endif + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm9x = dprops->major == 9 && dprops->minor >= 0; + TORCH_CHECK(is_sm9x, "FlashAttentionHopper only supports Hopper GPUs or newer."); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + const int seqlen_q = sizes[1]; + const int num_heads = sizes[2]; + const int head_size_og = dout.size(3); + const int head_size = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size 
<= 128, "FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_rounded = head_size <= 64 ? 64 : round_multiple(head_size, 32); + // This should match the kernel configs + const int kBlockM = head_size <= 64 ? 128 : (head_size < 256 ? 64 : 32); + const int seqlen_q_rounded = round_multiple(seqlen_q, kBlockM); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8"); + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size); + CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, batch_size, seqlen_q, num_heads, head_size); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, batch_size, seqlen_k, num_heads_k, head_size); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + // Need softmax_d to have seqlen_q_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64 + auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat)); + auto softmax_lse_log2 = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat)); + at::Tensor dq_accum; + at::Tensor dk_accum, dv_accum; + dq_accum = torch::empty({batch_size, num_heads, seqlen_q_rounded, head_size_rounded}, opts.dtype(at::kFloat)); + // dk_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + // dv_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + Flash_bwd_params params; + + 
set_params_dgrad(params, + batch_size, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q, k, v, out, + dout_padded, dq, dk_expanded, dv_expanded, + nullptr, + nullptr, + dq_accum.data_ptr(), + // loop ? dk_accum.data_ptr() : nullptr, + // loop ? dv_accum.data_ptr() : nullptr, + nullptr, + nullptr, + softmax_lse.data_ptr(), + softmax_d.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + /*window_size_left=*/-1, + /*window_size_right=*/is_causal ? 0 : -1, + deterministic); + params.softmax_lse_log2_ptr = softmax_lse_log2.data_ptr(); + + // Will be zero'ed out in the backward preprocess kernel + at::Tensor dq_semaphore = torch::empty({(seqlen_q + kBlockM - 1) / kBlockM, batch_size, num_heads}, opts.dtype(torch::kInt32)); + params.dq_semaphore = dq_semaphore.data_ptr(); + // printf("dq_semaphore: %p, [%d, %d, %d]\n", params.dq_semaphore, (seqlen_q + 64 - 1) / 64, batch_size, num_heads); + + if (seqlen_q > 0) { + run_mha_bwd(params, stream); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); + } + + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d, dq_accum}; +} + +std::vector +mha_varlen_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float softmax_scale, + const bool is_causal, + const bool deterministic) { + + #ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); + #endif + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm9x = dprops->major == 9 && dprops->minor >= 0; + TORCH_CHECK(is_sm9x, "FlashAttentionHopper only supports Hopper GPUs or newer."); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() 
== q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int total_q = sizes[0]; + const int batch_size = cu_seqlens_q.numel() - 1; + const int num_heads = sizes[1]; + const int head_size_og = dout.size(2); + const int head_size = sizes[2]; + const int total_k = k.size(0); + const int num_heads_k = k.size(1); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size <= 128, "FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_rounded = head_size <= 64 ? 64 : round_multiple(head_size, 32); + // This should match the kernel configs + const int kBlockM = head_size <= 64 ? 128 : (head_size < 256 ? 
64 : 32); + const int seqlen_q_rounded = round_multiple(max_seqlen_q, kBlockM); + const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128); + int const total_q_padded_rounded = round_multiple(total_q + batch_size * 128, 128); + + TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8"); + + CHECK_SHAPE(q, total_q, num_heads, head_size_og); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(out, total_q, num_heads, head_size); + CHECK_SHAPE(dout, total_q, num_heads, head_size_og); + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, total_q, num_heads, head_size); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, total_k, num_heads_k, head_size); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, total_k, num_heads_k, head_size); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + // Need softmax_d to have total_q_padded_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64 + auto softmax_d = torch::empty({num_heads, total_q_padded_rounded}, opts.dtype(at::kFloat)); + auto softmax_lse_log2 = torch::empty({num_heads, total_q_padded_rounded}, opts.dtype(at::kFloat)); + at::Tensor dq_accum; + at::Tensor dk_accum, dv_accum; + dq_accum = torch::empty({num_heads, total_q_padded_rounded, head_size_rounded}, opts.dtype(at::kFloat)); + // dk_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + // dv_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({total_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({total_k, num_heads, head_size}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + Flash_bwd_params params; + + set_params_dgrad(params, + batch_size, + max_seqlen_q, max_seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q, k, v, out, + dout_padded, dq, dk_expanded, dv_expanded, + cu_seqlens_q.data_ptr(), + cu_seqlens_k.data_ptr(), + dq_accum.data_ptr(), + // loop ? dk_accum.data_ptr() : nullptr, + // loop ? 
dv_accum.data_ptr() : nullptr, + nullptr, + nullptr, + softmax_lse.data_ptr(), + softmax_d.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + /*window_size_left=*/-1, + /*window_size_right=*/is_causal ? 0 : -1, + deterministic); + params.total_q = total_q; + params.total_k = total_k; + params.softmax_lse_log2_ptr = softmax_lse_log2.data_ptr(); + + // Will be zero'ed out in the backward preprocess kernel + at::Tensor dq_semaphore = torch::empty({(max_seqlen_q + kBlockM - 1) / kBlockM, batch_size, num_heads}, opts.dtype(torch::kInt32)); + params.dq_semaphore = dq_semaphore.data_ptr(); + + if (max_seqlen_q > 0) { + run_mha_bwd(params, stream); + } else { + // If max_seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); + } + + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d, dq_accum, softmax_lse_log2 }; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "FlashAttention"; + m.def("fwd", &mha_fwd, "Forward pass"); + m.def("bwd", &mha_bwd, "Backward pass"); + m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)"); + m.def("varlen_bwd", &mha_varlen_bwd, "Varlen backward pass"); +} diff --git a/flash_attn_interface.py b/flash_attn_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..13ddff4bbe54e4caf2780276c9b909c20044e97f --- /dev/null +++ b/flash_attn_interface.py @@ -0,0 +1,383 @@ +# Copyright (c) 2023, Tri Dao. 
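+# Example usage of the interface defined below (illustrative sketch only; it assumes an SM90 GPU,
+# a successful build of flashattn_hopper_cuda, and a head size supported by this build):
+#
+#   q = torch.randn(2, 1024, 16, 128, dtype=torch.bfloat16, device="cuda")
+#   k = torch.randn(2, 1024, 4, 128, dtype=torch.bfloat16, device="cuda")  # GQA: 16 % 4 == 0
+#   v = torch.randn_like(k)
+#   out, softmax_lse = flash_attn_func(q, k, v, causal=True)
+#   assert out.shape == q.shape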
+ +from typing import Optional, Union + +import torch +import torch.nn as nn + +# isort: off +# We need to import the CUDA kernels after importing torch +import flashattn_hopper_cuda + +# isort: on + +def maybe_contiguous(x): + return x.contiguous() if x is not None and x.stride(-1) != 1 else x + +def _flash_attn_forward(q, k, v, softmax_scale, causal): + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, q, k, v, out_padded, softmax_lse, S_dmask = flashattn_hopper_cuda.fwd( + q, + k, + v, + None, + softmax_scale, + causal, + ) + return out, q, k, v, out_padded, softmax_lse, S_dmask + + +def _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + softmax_scale, + causal, + deterministic=False +): + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + dq, dk, dv, softmax_d, *rest = flashattn_hopper_cuda.bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + softmax_scale, + causal, + deterministic, + ) + return dq, dk, dv, softmax_d + +def _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, +): + maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, q, k, v, out_padded, softmax_lse = flashattn_hopper_cuda.varlen_fwd( + q, + k, + v, + None, + cu_seqlens_q, + cu_seqlens_k, + None, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + ) + # if out.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + return out, q, k, v, out_padded, softmax_lse + + +def _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic=False, +): + maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + ( + dq, + dk, + dv, + softmax_d, + *rest, + ) = flashattn_hopper_cuda.varlen_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic, + ) + # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dq, dk, dv, softmax_d + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + softmax_scale, + causal, + deterministic=False, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask = _flash_attn_forward( + q, + k, + v, + softmax_scale, + causal + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse) + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.deterministic = deterministic + return out, softmax_lse + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse = ctx.saved_tensors + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + ctx.softmax_scale, + ctx.causal, + ctx.deterministic, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dk = dk[..., : dout.shape[-1]] + dv = dv[..., : dout.shape[-1]] + return dq, dk, dv, None, None, None + + 
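+# (Illustrative note on the "varlen" layout used below: sequences are packed along the first dimension,
+#  e.g. three sequences of lengths 3, 5 and 2 are stored as q of shape (10, nheads, headdim) with
+#  cu_seqlens_q = torch.tensor([0, 3, 8, 10], dtype=torch.int32) and max_seqlen_q = 5; k and v follow
+#  the same convention with cu_seqlens_k.)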
+class FlashAttnVarlenFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic=False, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse = _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal=causal, + ) + ctx.save_for_backward( + q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k + ) + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.deterministic = deterministic + return out, softmax_lse + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.softmax_scale, + ctx.causal, + ctx.deterministic, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dk = dk[..., : dout.shape[-1]] + dv = dv[..., : dout.shape[-1]] + return dq, dk, dv, None, None, None, None, None, None, None + + +def flash_attn_func( + q, + k, + v, + softmax_scale=None, + causal=False, + deterministic=False +): + """dropout_p should be set to 0.0 during evaluation + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (batch_size, seqlen, nheads, headdim) + k: (batch_size, seqlen, nheads_k, headdim) + v: (batch_size, seqlen, nheads_k, headdim) + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). 
+ Return: + out: (batch_size, seqlen, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnFunc.apply( + q, + k, + v, + softmax_scale, + causal, + deterministic, + ) + + +def flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale=None, + causal=False, + deterministic=False, +): + """ + Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. + v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + """ + return FlashAttnVarlenFunc.apply( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic, + ) diff --git a/flash_attn_triton.py b/flash_attn_triton.py new file mode 100644 index 0000000000000000000000000000000000000000..30420c057adf1916e16403d3f0d02d0e26c8b7a3 --- /dev/null +++ b/flash_attn_triton.py @@ -0,0 +1,1160 @@ +""" +*Experimental* implementation of FlashAttention in Triton. +Tested with triton==2.0.0.dev20221202. +Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions +other than 64: +https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207 +We'll update this implementation with the new Triton backend once this is fixed. + +We use the FlashAttention implementation from Phil Tillet a starting point. 
+https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py + +Changes: +- Implement both causal and non-causal attention. +- Implement both self-attention and cross-attention. +- Support arbitrary seqlens (not just multiples of 128), for both forward and backward. +- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward. +- Support attention bias. +- Speed up the forward pass a bit, and only store the LSE instead of m and l. +- Make the backward for d=128 much faster by reducing register spilling. +- Optionally parallelize the backward pass across seqlen_k, to deal with the case of +small batch size * nheads. + +Caution: +- This is an *experimental* implementation. The forward pass should be quite robust but +I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler). +- This implementation has only been tested on A100. +- If you plan to use headdim other than 64 and 128, you should test for race conditions +(due to the Triton compiler), as done in tests/test_flash_attn.py +"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions +for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident +that there are none left for other head dimensions. + +Differences between this Triton version and the CUDA version: +- Triton version doesn't support dropout. +- Triton forward is generally faster than CUDA forward, while Triton backward is +generally slower than CUDA backward. Overall Triton forward + backward is slightly slower +than CUDA forward + backward. +- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor). +- Triton version supports attention bias, while CUDA version doesn't. +""" + +import math + +import torch +import triton +import triton.language as tl + + +# Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128 +# @triton.autotune( +# configs=[ +# triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1), +# # This config has a race condition when EVEN_M == False, disabling it for now. 
+# # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1), +# ], +# key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'] +# ) +@triton.heuristics( + { + "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, + "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, + "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], + } +) +@triton.jit +def _fwd_kernel( + Q, + K, + V, + Bias, + Out, + Lse, + TMP, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug + softmax_scale, + stride_qb, + stride_qh, + stride_qm, + stride_kb, + stride_kh, + stride_kn, + stride_vb, + stride_vh, + stride_vn, + stride_bb, + stride_bh, + stride_bm, + stride_ob, + stride_oh, + stride_om, + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + headdim, + CACHE_KEY_SEQLEN_Q, + CACHE_KEY_SEQLEN_K, + BIAS_TYPE: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + start_m = tl.program_id(0) + off_hb = tl.program_id(1) + off_b = off_hb // nheads + off_h = off_hb % nheads + # off_b = tl.program_id(1) + # off_h = tl.program_id(2) + # off_hb = off_b * nheads + off_h + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_HEADDIM) + # Initialize pointers to Q, K, V + # Adding parenthesis around indexing might use int32 math instead of int64 math? + # https://github.com/openai/triton/issues/741 + # I'm seeing a tiny bit of difference (5-7us) + q_ptrs = ( + Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :]) + ) + k_ptrs = ( + K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :]) + ) + v_ptrs = ( + V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :]) + ) + if BIAS_TYPE == "vector": + b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n + elif BIAS_TYPE == "matrix": + b_ptrs = ( + Bias + + off_b * stride_bb + + off_h * stride_bh + + (offs_m[:, None] * stride_bm + offs_n[None, :]) + ) + # initialize pointer to m and l + t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m + lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32) + # load q: it will stay in SRAM throughout + # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call + # tl.load(q_ptrs), we get the wrong output! 
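+    # The load below branches on the EVEN_* flags so that fully in-bounds tiles take the
+    # unmasked fast path, while ragged tiles mask out-of-range query rows (offs_m >= seqlen_q)
+    # and head-dim columns (offs_d >= headdim) with 0.0, so the padding contributes nothing
+    # to the dot products downstream.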
+ if EVEN_M & EVEN_N: + if EVEN_HEADDIM: + q = tl.load(q_ptrs) + else: + q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0) + else: + q = tl.load( + q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0 + ) + # loop over k, v and update accumulator + end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k) + for start_n in range(0, end_n, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition + if EVEN_HEADDIM: + k = tl.load(k_ptrs + start_n * stride_kn) + else: + k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + k = tl.load( + k_ptrs + start_n * stride_kn, + mask=(start_n + offs_n)[:, None] < seqlen_k, + other=0.0, + ) + else: + k = tl.load( + k_ptrs + start_n * stride_kn, + mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), + other=0.0, + ) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k, trans_b=True) + # Trying to combine the two masks seem to make the result wrong + if not EVEN_N: # Need to mask out otherwise the softmax is wrong + qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf")) + if IS_CAUSAL: + qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf")) + if BIAS_TYPE != "none": + if BIAS_TYPE == "vector": + if EVEN_N: + bias = tl.load(b_ptrs + start_n).to(tl.float32) + else: + bias = tl.load( + b_ptrs + start_n, mask=(start_n + offs_n) < seqlen_k, other=0.0 + ).to(tl.float32) + bias = bias[None, :] + elif BIAS_TYPE == "matrix": + if EVEN_M & EVEN_N: + bias = tl.load(b_ptrs + start_n).to(tl.float32) + else: + bias = tl.load( + b_ptrs + start_n, + mask=(offs_m[:, None] < seqlen_q) + & ((start_n + offs_n)[None, :] < seqlen_k), + other=0.0, + ).to(tl.float32) + # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler + # can then fuse the mult and add into an fma instruction. But if we have bias we need to + # to multiply with softmax_scale here. 
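+            # Both the bias and no-bias branches below apply the same streaming-softmax update
+            # for this K/V block:
+            #   m_ij  = max(rowmax(scaled scores), lse_i)        (new running max)
+            #   p     = exp(scaled scores - m_ij)                (unnormalized probabilities)
+            #   acc_o = acc_o * exp(m_i - m_ij) + p @ V          (rescaled partial output)
+            #   lse_i = m_ij + log(exp(lse_i - m_ij) + sum(p))   (running logsumexp)
+            # The final normalization by exp(m_i - lse_i) is applied once after the loop.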
+ qk = qk * softmax_scale + bias + m_ij = tl.maximum(tl.max(qk, 1), lse_i) + p = tl.exp(qk - m_ij[:, None]) + else: + m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i) + p = tl.exp(qk * softmax_scale - m_ij[:, None]) + l_ij = tl.sum(p, 1) + + # scale acc_o + acc_o_scale = tl.exp(m_i - m_ij) + + # # -- update output accumulator -- + # BUG: have to store and immediately load + tl.store(t_ptrs, acc_o_scale) + acc_o_scale = tl.load(t_ptrs) + acc_o = acc_o * acc_o_scale[:, None] + # update acc_o + if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition + if EVEN_HEADDIM: + v = tl.load(v_ptrs + start_n * stride_vn) + else: + v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + v = tl.load( + v_ptrs + start_n * stride_vn, + mask=(start_n + offs_n)[:, None] < seqlen_k, + other=0.0, + ) + else: + v = tl.load( + v_ptrs + start_n * stride_vn, + mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), + other=0.0, + ) + p = p.to(v.dtype) + acc_o += tl.dot(p, v) + + # -- update statistics + m_i = m_ij + l_i_new = tl.exp(lse_i - m_ij) + l_ij + lse_i = m_ij + tl.log(l_i_new) + + o_scale = tl.exp(m_i - lse_i) + # BUG: have to store and immediately load + tl.store(t_ptrs, o_scale) + o_scale = tl.load(t_ptrs) + acc_o = acc_o * o_scale[:, None] + # rematerialize offsets to save registers + start_m = tl.program_id(0) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + # write back l and m + lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m + tl.store(lse_ptrs, lse_i) + # initialize pointers to output + offs_d = tl.arange(0, BLOCK_HEADDIM) + out_ptrs = ( + Out + + off_b * stride_ob + + off_h * stride_oh + + (offs_m[:, None] * stride_om + offs_d[None, :]) + ) + if EVEN_M: + if EVEN_HEADDIM: + tl.store(out_ptrs, acc_o) + else: + tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim) + else: + if EVEN_HEADDIM: + tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q) + else: + tl.store( + out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim) + ) + + +@triton.jit +def _bwd_preprocess_do_o_dot( + Out, + DO, + Delta, + stride_ob, + stride_oh, + stride_om, + stride_dob, + stride_doh, + stride_dom, + nheads, + seqlen_q, + seqlen_q_rounded, + headdim, + BLOCK_M: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, +): + start_m = tl.program_id(0) + off_hb = tl.program_id(1) + off_b = off_hb // nheads + off_h = off_hb % nheads + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, BLOCK_HEADDIM) + # load + o = tl.load( + Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], + mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ).to(tl.float32) + do = tl.load( + DO + + off_b * stride_dob + + off_h * stride_doh + + offs_m[:, None] * stride_dom + + offs_d[None, :], + mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ).to(tl.float32) + delta = tl.sum(o * do, axis=1) + # write-back + tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta) + + +@triton.jit +def _bwd_store_dk_dv( + dk_ptrs, + dv_ptrs, + dk, + dv, + offs_n, + offs_d, + seqlen_k, + headdim, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, +): + # [2022-11-01] TD: Same bug. 
In the case of EVEN_N=True and EVEN_M=False, + # if we just call tl.store(dv_ptrs), there's a race condition + if EVEN_N & EVEN_M: + if EVEN_HEADDIM: + tl.store(dv_ptrs, dv) + tl.store(dk_ptrs, dk) + else: + tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim) + tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim) + else: + if EVEN_HEADDIM: + tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k) + tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k) + else: + tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) + tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) + + +@triton.jit +def _bwd_kernel_one_col_block( + start_n, + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qm, + stride_kn, + stride_vn, + stride_bm, + stride_dom, + stride_dqm, + stride_dkn, + stride_dvn, + seqlen_q, + seqlen_k, + headdim, + ATOMIC_ADD: tl.constexpr, + BIAS_TYPE: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + # We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N) + begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M + # initialize row/col offsets + offs_qm = begin_m + tl.arange(0, BLOCK_M) + offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N) + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, BLOCK_HEADDIM) + # initialize pointers to value-like data + q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :]) + k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :]) + v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :]) + do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :]) + dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :]) + if BIAS_TYPE == "vector": + b_ptrs = Bias + offs_n + elif BIAS_TYPE == "matrix": + b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :]) + # initialize dv and dk + dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) + dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) + # There seems to be some problem with Triton pipelining that makes results wrong for + # headdim=64, seqlen=(113, 255), bias_type='matrix'. In this case the for loop + # may have zero step, and pipelining with the bias matrix could screw it up. + # So we just exit early. + if begin_m >= seqlen_q: + dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) + dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) + _bwd_store_dk_dv( + dk_ptrs, + dv_ptrs, + dk, + dv, + offs_n, + offs_d, + seqlen_k, + headdim, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + ) + return + # k and v stay in SRAM throughout + # [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False, + # if we just call tl.load(k_ptrs), we get the wrong output! 
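+    # This routine handles one column block of K/V: K, V and their dK, dV accumulators stay
+    # on-chip while we sweep over the query row blocks, recomputing p = exp(scores - lse) from
+    # the saved logsumexp, then accumulating dV += p^T @ dO and dK += ds^T @ Q, and updating dQ
+    # either via read-modify-write (ATOMIC_ADD=False) or tl.atomic_add (ATOMIC_ADD=True, the
+    # sequence-parallel path).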
+ if EVEN_N & EVEN_M: + if EVEN_HEADDIM: + k = tl.load(k_ptrs) + v = tl.load(v_ptrs) + else: + k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) + v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) + else: + k = tl.load( + k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0 + ) + v = tl.load( + v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0 + ) + # loop over rows + num_block_m = tl.cdiv(seqlen_q, BLOCK_M) + for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M): + start_m = tl.multiple_of(start_m, BLOCK_M) + offs_m_curr = start_m + offs_m + # load q, k, v, do on-chip + # Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117) + if EVEN_M & EVEN_HEADDIM: + q = tl.load(q_ptrs) + else: + if EVEN_HEADDIM: + q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0) + else: + q = tl.load( + q_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ) + # recompute p = softmax(qk, dim=-1).T + qk = tl.dot(q, k, trans_b=True) + # Trying to combine the two masks seem to make the result wrong + if not EVEN_N: # Need to mask out otherwise the softmax is wrong + qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf")) + if IS_CAUSAL: + qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) + if BIAS_TYPE != "none": + tl.debug_barrier() # Race condition otherwise + if BIAS_TYPE == "vector": + if EVEN_N: + bias = tl.load(b_ptrs).to(tl.float32) + else: + bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32) + bias = bias[None, :] + elif BIAS_TYPE == "matrix": + if EVEN_M & EVEN_N: + bias = tl.load(b_ptrs).to(tl.float32) + else: + bias = tl.load( + b_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), + other=0.0, + ).to(tl.float32) + qk = qk * softmax_scale + bias + # There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong. + # Also wrong for headdim=64. + if not (EVEN_M & EVEN_HEADDIM): + tl.debug_barrier() + lse_i = tl.load(LSE + offs_m_curr) + if BIAS_TYPE == "none": + p = tl.exp(qk * softmax_scale - lse_i[:, None]) + else: + p = tl.exp(qk - lse_i[:, None]) + # compute dv + # [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call + # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs + # in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512, + # the output is correct. + if EVEN_M & EVEN_HEADDIM: + do = tl.load(do_ptrs) + else: + # [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask. + do = tl.load( + do_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ) + # if EVEN_M: + # if EVEN_HEADDIM: + # do = tl.load(do_ptrs) + # else: + # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + # else: + # if EVEN_HEADDIM: + # do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0) + # else: + # do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) + # & (offs_d[None, :] < headdim), other=0.0) + dv += tl.dot(p.to(do.dtype), do, trans_a=True) + # compute dp = dot(v, do) + # There seems to be a race condition when headdim=48/96, and dq, dk are wrong. 
+ # Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True + # Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False + if not (EVEN_M & EVEN_HEADDIM): + tl.debug_barrier() + dp = tl.dot(do, v, trans_b=True) + # There's a race condition for headdim=48 + if not EVEN_HEADDIM: + tl.debug_barrier() + # compute ds = p * (dp - delta[:, None]) + # Putting the subtraction after the dp matmul (instead of before) is slightly faster + Di = tl.load(D + offs_m_curr) + # Converting ds to q.dtype here reduces register pressure and makes it much faster + # for BLOCK_HEADDIM=128 + ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype) + # compute dk = dot(ds.T, q) + dk += tl.dot(ds, q, trans_a=True) + # compute dq + if not ( + EVEN_M & EVEN_HEADDIM + ): # Otherewise there's a race condition when BIAS_TYPE='matrix' + tl.debug_barrier() + if not ATOMIC_ADD: + if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M + dq = tl.load(dq_ptrs, eviction_policy="evict_last") + dq += tl.dot(ds, k) + tl.store(dq_ptrs, dq, eviction_policy="evict_last") + else: + if EVEN_HEADDIM: + dq = tl.load( + dq_ptrs, + mask=offs_m_curr[:, None] < seqlen_q, + other=0.0, + eviction_policy="evict_last", + ) + dq += tl.dot(ds, k) + tl.store( + dq_ptrs, + dq, + mask=offs_m_curr[:, None] < seqlen_q, + eviction_policy="evict_last", + ) + else: + dq = tl.load( + dq_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + eviction_policy="evict_last", + ) + dq += tl.dot(ds, k) + tl.store( + dq_ptrs, + dq, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + eviction_policy="evict_last", + ) + else: # If we're parallelizing across the seqlen_k dimension + dq = tl.dot(ds, k) + if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M + tl.atomic_add(dq_ptrs, dq) + else: + if EVEN_HEADDIM: + tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q) + else: + tl.atomic_add( + dq_ptrs, + dq, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + ) + # increment pointers + dq_ptrs += BLOCK_M * stride_dqm + q_ptrs += BLOCK_M * stride_qm + do_ptrs += BLOCK_M * stride_dom + if BIAS_TYPE == "matrix": + b_ptrs += BLOCK_M * stride_bm + # write-back + dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) + dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) + _bwd_store_dk_dv( + dk_ptrs, + dv_ptrs, + dk, + dv, + offs_n, + offs_d, + seqlen_k, + headdim, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + ) + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False}, + num_warps=8, + num_stages=1, + pre_hook=init_to_zero("DQ"), + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True}, + num_warps=8, + num_stages=1, + pre_hook=init_to_zero("DQ"), + ), + # Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now + # # Kernel is buggy (give wrong result) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4* + # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), + # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), + # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')), + # 
triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')), + ], + key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"], +) +@triton.heuristics( + { + "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, + "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, + "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], + } +) +@triton.jit +def _bwd_kernel( + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qb, + stride_qh, + stride_qm, + stride_kb, + stride_kh, + stride_kn, + stride_vb, + stride_vh, + stride_vn, + stride_bb, + stride_bh, + stride_bm, + stride_dob, + stride_doh, + stride_dom, + stride_dqb, + stride_dqh, + stride_dqm, + stride_dkb, + stride_dkh, + stride_dkn, + stride_dvb, + stride_dvh, + stride_dvn, + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + headdim, + CACHE_KEY_SEQLEN_Q, + CACHE_KEY_SEQLEN_K, + BIAS_TYPE: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, + SEQUENCE_PARALLEL: tl.constexpr, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + off_hb = tl.program_id(1) + off_b = off_hb // nheads + off_h = off_hb % nheads + # offset pointers for batch/head + Q += off_b * stride_qb + off_h * stride_qh + K += off_b * stride_kb + off_h * stride_kh + V += off_b * stride_vb + off_h * stride_vh + DO += off_b * stride_dob + off_h * stride_doh + DQ += off_b * stride_dqb + off_h * stride_dqh + DK += off_b * stride_dkb + off_h * stride_dkh + DV += off_b * stride_dvb + off_h * stride_dvh + if BIAS_TYPE != "none": + Bias += off_b * stride_bb + off_h * stride_bh + # pointer to row-wise quantities in value-like data + D += off_hb * seqlen_q_rounded + LSE += off_hb * seqlen_q_rounded + if not SEQUENCE_PARALLEL: + num_block_n = tl.cdiv(seqlen_k, BLOCK_N) + for start_n in range(0, num_block_n): + _bwd_kernel_one_col_block( + start_n, + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qm, + stride_kn, + stride_vn, + stride_bm, + stride_dom, + stride_dqm, + stride_dkn, + stride_dvn, + seqlen_q, + seqlen_k, + headdim, + ATOMIC_ADD=False, + BIAS_TYPE=BIAS_TYPE, + IS_CAUSAL=IS_CAUSAL, + BLOCK_HEADDIM=BLOCK_HEADDIM, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + ) + else: + start_n = tl.program_id(0) + _bwd_kernel_one_col_block( + start_n, + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qm, + stride_kn, + stride_vn, + stride_bm, + stride_dom, + stride_dqm, + stride_dkn, + stride_dvn, + seqlen_q, + seqlen_k, + headdim, + ATOMIC_ADD=True, + BIAS_TYPE=BIAS_TYPE, + IS_CAUSAL=IS_CAUSAL, + BLOCK_HEADDIM=BLOCK_HEADDIM, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + ) + + +def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None): + # shape constraints + batch, seqlen_q, nheads, d = q.shape + _, seqlen_k, _, _ = k.shape + assert k.shape == (batch, seqlen_k, nheads, d) + assert v.shape == (batch, seqlen_k, nheads, d) + assert d <= 128, "FlashAttention only support head dimensions up to 128" + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type" + assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16" + assert q.is_cuda and k.is_cuda and v.is_cuda + softmax_scale = 
softmax_scale or 1.0 / math.sqrt(d) + + has_bias = bias is not None + bias_type = "none" + if has_bias: + assert bias.dtype in [q.dtype, torch.float] + assert bias.is_cuda + assert bias.dim() == 4 + if bias.stride(-1) != 1: + bias = bias.contiguous() + if bias.shape[2:] == (1, seqlen_k): + bias_type = "vector" + elif bias.shape[2:] == (seqlen_q, seqlen_k): + bias_type = "matrix" + else: + raise RuntimeError( + "Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)" + ) + bias = bias.expand(batch, nheads, seqlen_q, seqlen_k) + bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0) + + seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128 + lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) + tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) + o = torch.empty_like(q) + + BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16) + BLOCK = 128 + num_warps = 4 if d <= 64 else 8 + grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads) + _fwd_kernel[grid]( + q, + k, + v, + bias, + o, + lse, + tmp, + softmax_scale, + q.stride(0), + q.stride(2), + q.stride(1), + k.stride(0), + k.stride(2), + k.stride(1), + v.stride(0), + v.stride(2), + v.stride(1), + *bias_strides, + o.stride(0), + o.stride(2), + o.stride(1), + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + d, + seqlen_q // 32, + seqlen_k // 32, # key for triton cache (limit number of compilations) + # Can't use kwargs here because triton autotune expects key to be args, not kwargs + # IS_CAUSAL=causal, BLOCK_HEADDIM=d, + bias_type, + causal, + BLOCK_HEADDIM, + BLOCK_M=BLOCK, + BLOCK_N=BLOCK, + num_warps=num_warps, + num_stages=1, + ) + return o, lse, softmax_scale # softmax_scale could have been updated + + +def _flash_attn_backward( + do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None +): + # Make sure that the last dimension is contiguous + if do.stride(-1) != 1: + do = do.contiguous() + batch, seqlen_q, nheads, d = q.shape + _, seqlen_k, _, _ = k.shape + # assert d in {16, 32, 64, 128} + assert d <= 128 + seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128 + assert lse.shape == (batch, nheads, seqlen_q_rounded) + assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1 + assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1 + softmax_scale = softmax_scale or 1.0 / math.sqrt(d) + # dq_accum = torch.zeros_like(q, dtype=torch.float32) + dq_accum = torch.empty_like(q, dtype=torch.float32) + delta = torch.empty_like(lse) + # delta = torch.zeros_like(lse) + + BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16) + grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads) + _bwd_preprocess_do_o_dot[grid]( + o, + do, + delta, + o.stride(0), + o.stride(2), + o.stride(1), + do.stride(0), + do.stride(2), + do.stride(1), + nheads, + seqlen_q, + seqlen_q_rounded, + d, + BLOCK_M=128, + BLOCK_HEADDIM=BLOCK_HEADDIM, + ) + + has_bias = bias is not None + bias_type = "none" + if has_bias: + assert bias.dtype in [q.dtype, torch.float] + assert bias.is_cuda + assert bias.dim() == 4 + assert bias.stride(-1) == 1 + if bias.shape[2:] == (1, seqlen_k): + bias_type = "vector" + elif bias.shape[2:] == (seqlen_q, seqlen_k): + bias_type = "matrix" + else: + raise RuntimeError( + "Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)" + ) + bias = bias.expand(batch, nheads, seqlen_q, seqlen_k) + bias_strides = (bias.stride(0), 
bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0) + + # BLOCK_M = 128 + # BLOCK_N = 64 + # num_warps = 4 + grid = lambda META: ( + triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1, + batch * nheads, + ) + _bwd_kernel[grid]( + q, + k, + v, + bias, + do, + dq_accum, + dk, + dv, + lse, + delta, + softmax_scale, + q.stride(0), + q.stride(2), + q.stride(1), + k.stride(0), + k.stride(2), + k.stride(1), + v.stride(0), + v.stride(2), + v.stride(1), + *bias_strides, + do.stride(0), + do.stride(2), + do.stride(1), + dq_accum.stride(0), + dq_accum.stride(2), + dq_accum.stride(1), + dk.stride(0), + dk.stride(2), + dk.stride(1), + dv.stride(0), + dv.stride(2), + dv.stride(1), + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + d, + seqlen_q // 32, + seqlen_k // 32, # key for triton cache (limit number of compilations) + # Can't use kwargs here because triton autotune expects key to be args, not kwargs + # IS_CAUSAL=causal, BLOCK_HEADDIM=d, + bias_type, + causal, + BLOCK_HEADDIM, + # SEQUENCE_PARALLEL=False, + # BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, + # num_warps=num_warps, + # num_stages=1, + ) + dq.copy_(dq_accum) + + +class FlashAttnQKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None): + """ + qkv: (batch, seqlen, 3, nheads, headdim) + bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen). + For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen). + ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen) + """ + # Make sure that the last dimension is contiguous + if qkv.stride(-1) != 1: + qkv = qkv.contiguous() + o, lse, ctx.softmax_scale = _flash_attn_forward( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + bias=bias, + causal=causal, + softmax_scale=softmax_scale, + ) + ctx.save_for_backward(qkv, o, lse, bias) + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + qkv, o, lse, bias = ctx.saved_tensors + assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet" + # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd + # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. + with torch.inference_mode(): + dqkv = torch.empty_like(qkv) + _flash_attn_backward( + do, + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + o, + lse, + dqkv[:, :, 0], + dqkv[:, :, 1], + dqkv[:, :, 2], + bias=bias, + causal=ctx.causal, + softmax_scale=ctx.softmax_scale, + ) + return dqkv, None, None, None + + +flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply + + +class FlashAttnKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None): + """ + q: (batch, seqlen_q, nheads, headdim) + kv: (batch, seqlen_k, 2, nheads, headdim) + bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k). + For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k). 
+ ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k) + """ + # Make sure that the last dimension is contiguous + q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]] + o, lse, ctx.softmax_scale = _flash_attn_forward( + q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale + ) + ctx.save_for_backward(q, kv, o, lse, bias) + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + q, kv, o, lse, bias = ctx.saved_tensors + if len(ctx.needs_input_grad) >= 3: + assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet" + # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd + # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. + with torch.inference_mode(): + dq = torch.empty_like(q) + dkv = torch.empty_like(kv) + _flash_attn_backward( + do, + q, + kv[:, :, 0], + kv[:, :, 1], + o, + lse, + dq, + dkv[:, :, 0], + dkv[:, :, 1], + bias=bias, + causal=ctx.causal, + softmax_scale=ctx.softmax_scale, + ) + return dq, dkv, None, None, None + + +flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None): + """ + q: (batch_size, seqlen_q, nheads, headdim) + k, v: (batch_size, seqlen_k, nheads, headdim) + bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k). + For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k). + ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k) + """ + # Make sure that the last dimension is contiguous + q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]] + o, lse, ctx.softmax_scale = _flash_attn_forward( + q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale + ) + ctx.save_for_backward(q, k, v, o, lse, bias) + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + q, k, v, o, lse, bias = ctx.saved_tensors + assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet" + # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd + # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. + with torch.inference_mode(): + dq = torch.empty_like(q) + dk = torch.empty_like(k) + dv = torch.empty_like(v) + _flash_attn_backward( + do, + q, + k, + v, + o, + lse, + dq, + dk, + dv, + bias=bias, + causal=ctx.causal, + softmax_scale=ctx.softmax_scale, + ) + return dq, dk, dv, None, None, None + + +flash_attn_func = FlashAttnFunc.apply diff --git a/flash_attn_triton_og.py b/flash_attn_triton_og.py new file mode 100644 index 0000000000000000000000000000000000000000..f2ddb99487b4f162745e2f6dd3d1744946dc3fb2 --- /dev/null +++ b/flash_attn_triton_og.py @@ -0,0 +1,365 @@ +# [2022-10-23] Downloaded from https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py +# for benchmarking. 
+# We fixed a few dtype cast to make it work for bf16 + +""" +Fused Attention +=============== +This is a Triton implementation of the Flash Attention algorithm +(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) +""" + +import pytest +import torch +import triton +import triton.language as tl + + +@triton.jit +def _fwd_kernel( + Q, + K, + V, + sm_scale, + TMP, + L, + M, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug + Out, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + stride_oz, + stride_oh, + stride_om, + stride_on, + Z, + H, + N_CTX, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + start_m = tl.program_id(0) + off_hz = tl.program_id(1) + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk + off_k = off_hz * stride_qh + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk + off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk + # Initialize pointers to Q, K, V + q_ptrs = Q + off_q + k_ptrs = K + off_k + v_ptrs = V + off_v + # initialize pointer to m and l + t_ptrs = TMP + off_hz * N_CTX + offs_m + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # load q: it will stay in SRAM throughout + q = tl.load(q_ptrs) + # loop over k, v and update accumulator + for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + start_n * stride_kn) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k, trans_b=True) + qk *= sm_scale + qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf")) + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + tl.store(t_ptrs, acc_scale) + acc_scale = tl.load(t_ptrs) # BUG: have to store and immediately load + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(v_ptrs + start_n * stride_vk) + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + # rematerialize offsets to save registers + start_m = tl.program_id(0) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + # write back l and m + l_ptrs = L + off_hz * N_CTX + offs_m + m_ptrs = M + off_hz * N_CTX + offs_m + tl.store(l_ptrs, l_i) + tl.store(m_ptrs, m_i) + # initialize pointers to output + offs_n = tl.arange(0, BLOCK_DMODEL) + off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on + out_ptrs = Out + off_o + tl.store(out_ptrs, acc) + + +@triton.jit +def _bwd_preprocess( + Out, + DO, + L, + NewDO, + Delta, + BLOCK_M: tl.constexpr, + D_HEAD: tl.constexpr, +): + off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + off_n = tl.arange(0, D_HEAD) + # load + o = tl.load(Out + 
off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) + do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) + denom = tl.load(L + off_m).to(tl.float32) + # compute + do = do / denom[:, None] + delta = tl.sum(o * do, axis=1) + # write-back + tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) + tl.store(Delta + off_m, delta) + + +@triton.jit +def _bwd_kernel( + Q, + K, + V, + sm_scale, + Out, + DO, + DQ, + DK, + DV, + L, + M, + D, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + Z, + H, + N_CTX, + num_block, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + off_hz = tl.program_id(0) + off_z = off_hz // H + off_h = off_hz % H + # offset pointers for batch/head + Q += off_z * stride_qz + off_h * stride_qh + K += off_z * stride_qz + off_h * stride_qh + V += off_z * stride_qz + off_h * stride_qh + DO += off_z * stride_qz + off_h * stride_qh + DQ += off_z * stride_qz + off_h * stride_qh + DK += off_z * stride_qz + off_h * stride_qh + DV += off_z * stride_qz + off_h * stride_qh + for start_n in range(0, num_block): + lo = start_n * BLOCK_M + # initialize row/col offsets + offs_qm = lo + tl.arange(0, BLOCK_M) + offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) + offs_m = tl.arange(0, BLOCK_N) + offs_k = tl.arange(0, BLOCK_DMODEL) + # initialize pointers to value-like data + q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) + v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) + do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + # pointer to row-wise quantities in value-like data + D_ptrs = D + off_hz * N_CTX + m_ptrs = M + off_hz * N_CTX + # initialize dv amd dk + dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # k and v stay in SRAM throughout + k = tl.load(k_ptrs) + v = tl.load(v_ptrs) + # loop over rows + for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): + offs_m_curr = start_m + offs_m + # load q, k, v, do on-chip + q = tl.load(q_ptrs) + # recompute p = softmax(qk, dim=-1).T + # NOTE: `do` is pre-divided by `l`; no normalization here + qk = tl.dot(q, k, trans_b=True) + qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) + m = tl.load(m_ptrs + offs_m_curr) + p = tl.exp(qk * sm_scale - m[:, None]) + # compute dv + do = tl.load(do_ptrs) + dv += tl.dot(p.to(do.dtype), do, trans_a=True) + # compute dp = dot(v, do) + Di = tl.load(D_ptrs + offs_m_curr) + dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] + dp += tl.dot(do, v, trans_b=True) + # compute ds = p * (dp - delta[:, None]) + ds = p * dp * sm_scale + # compute dk = dot(ds.T, q) + dk += tl.dot(ds.to(q.dtype), q, trans_a=True) + # # compute dq + dq = tl.load(dq_ptrs, eviction_policy="evict_last") + dq += tl.dot(ds.to(k.dtype), k) + tl.store(dq_ptrs, dq, eviction_policy="evict_last") + # # increment pointers + dq_ptrs += BLOCK_M * stride_qm + q_ptrs += BLOCK_M * stride_qm + do_ptrs += BLOCK_M * stride_qm + # write-back + dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) + dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) + tl.store(dv_ptrs, dv) + tl.store(dk_ptrs, dk) + + +class 
_attention(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, sm_scale): + BLOCK = 128 + # shape constraints + Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + assert Lq == Lk and Lk == Lv + assert Lk in {16, 32, 64, 128} + o = torch.empty_like(q) + grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1]) + tmp = torch.empty( + (q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32 + ) + L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) + m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) + num_warps = 4 if Lk <= 64 else 8 + + _fwd_kernel[grid]( + q, + k, + v, + sm_scale, + tmp, + L, + m, + o, + q.stride(0), + q.stride(1), + q.stride(2), + q.stride(3), + k.stride(0), + k.stride(1), + k.stride(2), + k.stride(3), + v.stride(0), + v.stride(1), + v.stride(2), + v.stride(3), + o.stride(0), + o.stride(1), + o.stride(2), + o.stride(3), + q.shape[0], + q.shape[1], + q.shape[2], + BLOCK_M=BLOCK, + BLOCK_N=BLOCK, + BLOCK_DMODEL=Lk, + num_warps=num_warps, + num_stages=1, + ) + ctx.save_for_backward(q, k, v, o, L, m) + ctx.BLOCK = BLOCK + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = Lk + return o + + @staticmethod + def backward(ctx, do): + q, k, v, o, l, m = ctx.saved_tensors + do = do.contiguous() + dq = torch.zeros_like(q, dtype=torch.float32) + dk = torch.empty_like(k) + dv = torch.empty_like(v) + do_scaled = torch.empty_like(do) + delta = torch.empty_like(l) + _bwd_preprocess[(ctx.grid[0] * ctx.grid[1],)]( + o, + do, + l, + do_scaled, + delta, + BLOCK_M=ctx.BLOCK, + D_HEAD=ctx.BLOCK_DMODEL, + ) + + # NOTE: kernel currently buggy for other values of `num_warps` + num_warps = 8 + _bwd_kernel[(ctx.grid[1],)]( + q, + k, + v, + ctx.sm_scale, + o, + do_scaled, + dq, + dk, + dv, + l, + m, + delta, + q.stride(0), + q.stride(1), + q.stride(2), + q.stride(3), + k.stride(0), + k.stride(1), + k.stride(2), + k.stride(3), + v.stride(0), + v.stride(1), + v.stride(2), + v.stride(3), + q.shape[0], + q.shape[1], + q.shape[2], + ctx.grid[0], + BLOCK_M=ctx.BLOCK, + BLOCK_N=ctx.BLOCK, + BLOCK_DMODEL=ctx.BLOCK_DMODEL, + num_warps=num_warps, + num_stages=1, + ) + return dq.to(q.dtype), dk, dv, None + + +attention = _attention.apply diff --git a/flash_blocksparse_attention.py b/flash_blocksparse_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..03798d16ffbb3cbf1806296d5b33f81360717315 --- /dev/null +++ b/flash_blocksparse_attention.py @@ -0,0 +1,197 @@ +import math + +import hydra +import torch +import torch.nn as nn +from einops import rearrange + +from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input +from flash_attn.flash_blocksparse_attn_interface import ( + convert_blockmask, + flash_blocksparse_attn_func, +) + + +class FlashBlocksparseAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_temp: The temperature to use for the softmax attention. 
+ (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.1) + """ + + def __init__( + self, + sparsity_config, + softmax_temp=None, + attention_dropout=0.0, + max_seq_length=2048, + device=None, + dtype=None, + ): + super().__init__() + self.sparsity_config = hydra.utils.instantiate(sparsity_config) + self.softmax_temp = softmax_temp + self.dropout_p = attention_dropout + + # initialize sparse layout and register as buffer + max_seq_length = ((max_seq_length + 256 - 1) // 256) * 256 + layout = self.sparsity_config.make_layout(max_seq_length) + self.register_buffer("layout", layout) + blockmask_converted = convert_blockmask(self.layout, causal=False) + self.register_buffer("blockmask_converted", blockmask_converted) + # logger.info(f'Attention class {self.__class__}: saving={self.layout.float().mean()}') + + def forward( + self, + qkv, + attn_mask=None, + key_padding_mask=None, + causal=False, + cu_seqlens=None, + max_s=None, + need_weights=False, + convert_mask=True, + ): + """Implements the multihead softmax attention. + Arguments + --------- + qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None + attn_mask: An implementation of BaseMask that encodes where each + query can attend to + key_padding_mask: An implementation of BaseMask that encodes how + many query each sequence in the batch consists of + """ + assert not need_weights + assert attn_mask is None + assert qkv.dtype == torch.float16 + assert qkv.is_cuda + + if cu_seqlens is None: + batch_size = qkv.shape[0] + seqlen = qkv.shape[1] + # Convert mask to take a subset + seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256 + assert seqlen_rounded // 16 <= self.layout.shape[0], ( + seqlen_rounded // 256 <= self.layout.shape[1] + ) + blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256] + if key_padding_mask is None: + qkv = rearrange(qkv, "b s ... -> (b s) ...") + max_s = seqlen + cu_seqlens = torch.arange( + 0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device + ) + output = flash_blocksparse_attn_func( + qkv, + cu_seqlens, + blockmask, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + ) + output = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) + else: + key_padding_mask_bool = key_padding_mask.bool_matrix + nheads = qkv.shape[-2] + x = rearrange(qkv, "b s three h d -> b s (three h d)") + x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask_bool) + x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads) + output_unpad = flash_blocksparse_attn_func( + x_unpad, + cu_seqlens, + blockmask, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + ) + output = rearrange( + pad_input( + rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen + ), + "b s (h d) -> b s h d", + h=nheads, + ) + else: + assert max_s is not None + seqlen = max_s + # Convert mask to take a subset + seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256 + assert seqlen_rounded // 16 <= self.layout.shape[0], ( + seqlen_rounded // 256 <= self.layout.shape[1] + ) + blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256] + if convert_mask: + output = flash_blocksparse_attn_func( + qkv, + cu_seqlens, + blockmask, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + ) + else: + output = flash_blocksparse_attn_func( + qkv, + cu_seqlens, + self.blockmask_converted, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + convert_mask=False, + ) + + return output, None + + +class FlashBlocksparseMHA(nn.Module): + def __init__( + self, + embed_dim, + num_heads, + sparsity_config, + bias=True, + batch_first=True, + attention_dropout=0.0, + causal=False, + max_seq_length=2048, + device=None, + dtype=None, + **kwargs, + ) -> None: + assert batch_first + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.causal = causal + + self.num_heads = num_heads + assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" + self.head_dim = self.embed_dim // num_heads + assert self.head_dim in [16, 32, 64], "Only support head_dim == 16, 32, or 64" + + self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs) + self.inner_attn = FlashBlocksparseAttention( + sparsity_config, + attention_dropout=attention_dropout, + max_seq_length=max_seq_length, + **factory_kwargs, + ) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs) + + def forward( + self, x, x_ignored_, x_ignored_1_, attn_mask=None, key_padding_mask=None, need_weights=False + ): + qkv = self.Wqkv(x) + qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads) + context, attn_weights = self.inner_attn( + qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=self.causal + ) + return self.out_proj(rearrange(context, "b s h d -> b s (h d)")), attn_weights diff --git a/flash_blocksparse_attn_interface.py b/flash_blocksparse_attn_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..9ce3fe8c1344dd33165c43e4cc1ef0f70feb5d04 --- /dev/null +++ b/flash_blocksparse_attn_interface.py @@ -0,0 +1,200 @@ +# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/fmha.py +import flash_attn_cuda +import torch +import torch.nn as nn + + +def convert_blockmask(blockmask, causal): + """Convert from the 0-1 format to the format used by the CUDA code. + 0 means the block is skipped. + nonzero means the block is not skipped. 
+ Argument: + blockmask: (row, col): a 0-1 tensor + Return: + blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row + indices of the nonzero blocks, padded with -1 to reach length @row. + The indices are multiplied by 4, with the smallest bit used to encode whether + it is the first nonzero in its row, and the 2nd smallest bit to encode whether it is + the last nonzero in its row.. + """ + assert not causal + # TD [2022-05-13]: The indexing and sorting is very tricky + nrow, ncol = blockmask.shape + # Sort does not support bool on CUDA + blockmask = blockmask.to(dtype=torch.uint8) + nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True) + nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0) + last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1] + last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[ + torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row + ] + first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0] + first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[ + torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row + ] + nonzero_idx = nonzero_sorted_rowidx * 4 + nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2 + nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1 + nonzero_idx[nonzero_val == 0] = -1 + return nonzero_idx.T.contiguous().to(dtype=torch.int32) + + +def _flash_blocksparse_attn_forward( + qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax +): + context, softmax_lse, *rest = flash_attn_cuda.fwd_block( + qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None + ) + # if context.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + S_dmask = rest[0] if return_softmax else None + return context, softmax_lse, S_dmask + + +def _flash_blocksparse_attn_backward( + dout, + qkv, + out, + S_dmask, + softmax_lse, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale, + causal, +): + dqkv, dp, softmax_d = flash_attn_cuda.bwd_block( + dout, + qkv, + out, + S_dmask, + softmax_lse, + cu_seqlens, + blockmask, + dropout_p, + softmax_scale, + max_s, + causal, + None, + ) + # if dqkv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dqkv + + +class FlashBlocksparseAttnFun(torch.autograd.Function): + @staticmethod + def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward( + qkv, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale, + causal=causal, + return_softmax=False, + ) + ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state) + ctx.dropout_p = dropout_p + ctx.max_s = max_s + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return context + + @staticmethod + def backward(ctx, dout): + qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + # S_dmask is None, temporarily use another tensor just to get it running + dqkv = 
_flash_blocksparse_attn_backward( + dout, + qkv, + context, + context, + softmax_lse, + cu_seqlens, + blockmask, + ctx.dropout_p, + ctx.max_s, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dqkv, None, None, None, None, None, None, None + + +# We duplicate code to return both the output and the softmax for testing +# Returning both makes backward a bit slower, so we want to keep using the other version for speed. +class FlashBlocksparseAttnFunWithS(torch.autograd.Function): + @staticmethod + def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal): + # Save rng_state because the backward pass is gonna regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward( + qkv, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale, + causal=causal, + return_softmax=True, + ) + ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state) + ctx.dropout_p = dropout_p + ctx.max_s = max_s + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return context, S_dmask, softmax_lse + + @staticmethod + def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored): + qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dqkv = _flash_blocksparse_attn_backward( + dout, + qkv, + context, + S_dmask, + softmax_lse, + cu_seqlens, + blockmask, + ctx.dropout_p, + ctx.max_s, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dqkv, None, None, None, None, None, None + + +def flash_blocksparse_attn_func( + qkv, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale=None, + causal=False, + return_attn_probs=False, + convert_mask=True, +): + """dropout_p should be set to 0.0 during evaluation""" + func = FlashBlocksparseAttnFun if not return_attn_probs else FlashBlocksparseAttnFunWithS + if convert_mask: + blockmask = convert_blockmask(blockmask, causal=causal) + return func.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal) diff --git a/flash_bwd_hdim128_bf16_causal_sm80.cu b/flash_bwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..13132e86da365c8c2e9b41fe7b35bb7f73be3cea --- /dev/null +++ b/flash_bwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_bf16_sm80.cu b/flash_bwd_hdim128_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..85a5dc88e164862ce420c643999513eeb310049b --- /dev/null +++ b/flash_bwd_hdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_bf16_sm90.cu b/flash_bwd_hdim128_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..372d0ed52fec3b6967207ed503c1fb769bd55711 --- /dev/null +++ b/flash_bwd_hdim128_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_fp16_causal_sm80.cu b/flash_bwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5d27cd97b2b85b8512c3c7c48dd178f10875331b --- /dev/null +++ b/flash_bwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_fp16_sm80.cu b/flash_bwd_hdim128_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..2d7ddf46b5739473fcf8db6a350144fc2632cd96 --- /dev/null +++ b/flash_bwd_hdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_fp16_sm90.cu b/flash_bwd_hdim128_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..01a1d469fc10eff9757e473a475177868c8d0241 --- /dev/null +++ b/flash_bwd_hdim128_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim160_bf16_causal_sm80.cu b/flash_bwd_hdim160_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c18a78c764ce2c32fbd9998fe8a083b5aeb165da --- /dev/null +++ b/flash_bwd_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim160_bf16_sm80.cu b/flash_bwd_hdim160_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b6173725e918cace475713da0bd505708ef00f7 --- /dev/null +++ b/flash_bwd_hdim160_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim160_fp16_causal_sm80.cu b/flash_bwd_hdim160_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a511162dc0bc7d536ca235c23982b3c39f4101b4 --- /dev/null +++ b/flash_bwd_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim160_fp16_sm80.cu b/flash_bwd_hdim160_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c9ce19acbbe5403589dfa25027bade9762567fe5 --- /dev/null +++ b/flash_bwd_hdim160_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim192_bf16_causal_sm80.cu b/flash_bwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f492a717156550ddec40bda6bfe4b52d13109011 --- /dev/null +++ b/flash_bwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim192_bf16_sm80.cu b/flash_bwd_hdim192_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..2df58daa2a73c9f87225b68f288cb44a1f3a1b0c --- /dev/null +++ b/flash_bwd_hdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim192_fp16_causal_sm80.cu b/flash_bwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..69cad5ae4538942012a13c4777cd83e172eb7c6a --- /dev/null +++ b/flash_bwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim192_fp16_sm80.cu b/flash_bwd_hdim192_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..3d4cab58bc840135485632fd4970b6da2baae84d --- /dev/null +++ b/flash_bwd_hdim192_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim224_bf16_sm80.cu b/flash_bwd_hdim224_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2b58e2abc99a8921f4ce53dbea84a2ff078229c --- /dev/null +++ b/flash_bwd_hdim224_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim224(params, stream); +} diff --git a/flash_bwd_hdim224_fp16_sm80.cu b/flash_bwd_hdim224_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e65cdaedeae1586ef2309c542948970ea3b5b59d --- /dev/null +++ b/flash_bwd_hdim224_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim224(params, stream); +} diff --git a/flash_bwd_hdim256_bf16_causal_sm80.cu b/flash_bwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..6927445974973efc835fc68357d994e13d55a468 --- /dev/null +++ b/flash_bwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_bf16_sm80.cu b/flash_bwd_hdim256_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d718ec88be5a7dd0d9af069b8ddd8f2b24ad3089 --- /dev/null +++ b/flash_bwd_hdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_fp16_causal_sm80.cu b/flash_bwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..551c695e0559809daedecc94da62114dcbe69835 --- /dev/null +++ b/flash_bwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_fp16_sm80.cu b/flash_bwd_hdim256_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5877002631b497344106ff76506b45aeb8292cf --- /dev/null +++ b/flash_bwd_hdim256_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_fp16_sm90.cu b/flash_bwd_hdim256_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee139bd2e5431aaa8314a0b61bd6c803f12842dd --- /dev/null +++ b/flash_bwd_hdim256_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim32_bf16_causal_sm80.cu b/flash_bwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1282939a059c1e5e3f1f9a6297413e289f8456cc --- /dev/null +++ b/flash_bwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim32_bf16_sm80.cu b/flash_bwd_hdim32_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d6d4036383f50ffbb3c3cc0aaea44be479686b0f --- /dev/null +++ b/flash_bwd_hdim32_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim32_fp16_causal_sm80.cu b/flash_bwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..60aa2d60b3b713931af611f6922467aab5333b1f --- /dev/null +++ b/flash_bwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim32_fp16_sm80.cu b/flash_bwd_hdim32_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..b06d50eaa8d6f35277c88ad1c32e41dad2669be1 --- /dev/null +++ b/flash_bwd_hdim32_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim64_bf16_causal_sm80.cu b/flash_bwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..52b93be9d4f4aa95123f155b217fd65d56875b20 --- /dev/null +++ b/flash_bwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_bf16_sm80.cu b/flash_bwd_hdim64_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..09d9e2b75cefb6a1cbb5a0150aa5b47168d91548 --- /dev/null +++ b/flash_bwd_hdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_bf16_sm90.cu b/flash_bwd_hdim64_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..587e6b944252643d7e7033cf5fd582949525551b --- /dev/null +++ b/flash_bwd_hdim64_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_fp16_causal_sm80.cu b/flash_bwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a4ea5f465e04105c7e9d55e968aee735a5727fd --- /dev/null +++ b/flash_bwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_fp16_sm80.cu b/flash_bwd_hdim64_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb115ff767908d601ecab113c139ef095a48792c --- /dev/null +++ b/flash_bwd_hdim64_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_fp16_sm90.cu b/flash_bwd_hdim64_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..f486e870f3d1b4434d753fe01ce9800ffcd1d951 --- /dev/null +++ b/flash_bwd_hdim64_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim96_bf16_causal_sm80.cu b/flash_bwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5f4c26a476c785edbe88f9824ff970494b60530e --- /dev/null +++ b/flash_bwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_bf16_sm80.cu b/flash_bwd_hdim96_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..224213d79f796ad9095ff6d83147cd231ecf3594 --- /dev/null +++ b/flash_bwd_hdim96_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_bf16_sm90.cu b/flash_bwd_hdim96_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..19f5582b2755f0ba5d2f34bbbec9a5e724d6b8aa --- /dev/null +++ b/flash_bwd_hdim96_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_fp16_causal_sm80.cu b/flash_bwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0349014f92075428286a7e2ac4371e13a96a358 --- /dev/null +++ b/flash_bwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_fp16_sm80.cu b/flash_bwd_hdim96_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..663fc85924e5cd4ce2639964f9dd9e25e2e2d4ce --- /dev/null +++ b/flash_bwd_hdim96_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_fp16_sm90.cu b/flash_bwd_hdim96_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..0952ee97144a11de98faea6926834ba978017c6e --- /dev/null +++ b/flash_bwd_hdim96_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
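// A note on the per-head-dimension translation units added by this patch: each file holds a
// single explicit specialization of run_mha_bwd_ so nvcc can build the head dimensions in
// parallel. The template argument lists (and the '&' of '&params', which shows up here as
// '¶ms') have been mangled in this rendering of the diff; for this file the intended content
// is presumably along these lines:
//
//   template<>
//   void run_mha_bwd_<cutlass::half_t, 96>(Flash_bwd_params &params, cudaStream_t stream) {
//       run_mha_bwd_hdim96<cutlass::half_t>(params, stream);
//   }
//
// with the dtype / head-dim pair varying per file, and the *_causal_sm80.cu variants appearing
// to carry an extra boolean template argument for the causal case.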
+ +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_kernel.h b/flash_bwd_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f4ba0ff47207a22189961f16710d4532e2cbb436 --- /dev/null +++ b/flash_bwd_kernel.h @@ -0,0 +1,308 @@ + +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "utils.h" +#include "tile_scheduler_bwd.hpp" +#include "mainloop_bwd_sm90_tma_gmma_ws.hpp" +#include "epilogue_bwd_sm90_tma.hpp" + +namespace flash { + +using namespace cute; + +template +class FlashAttnBwd { + +public: + + // Type Aliases + static constexpr bool Is_causal = CollectiveMainloop_::Is_causal; + static_assert(CollectiveMainloop_::Varlen == CollectiveEpilogue_::Varlen); + static constexpr bool Varlen = CollectiveMainloop_::Varlen; + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK; + using TiledMmaSdP = typename CollectiveMainloop::TiledMmaSdP; + using TiledMmadKV = typename CollectiveMainloop::TiledMmadKV; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ClusterShape = typename CollectiveMainloop::ClusterShape; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + static constexpr bool dKV_swapAB = CollectiveMainloop::dKV_swapAB; + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + + static_assert(ArchTag::kMinComputeCapability >= 90); + + using TileScheduler = TileScheduler_; + using TileSchedulerArguments = typename TileScheduler::Arguments; + using TileSchedulerParams = typename TileScheduler::Params; + + static constexpr uint32_t NumLoadWarpGroups = 1; + static constexpr uint32_t NumMmaWarpGroups = CUTE_STATIC_V(size(TiledMmaSdP{})) / cutlass::NumThreadsPerWarpGroup; + static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMmaSdP{})) + (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + static_assert(NumMmaWarpGroups == 2); + + /// Register requirement for Load and Math WGs + static constexpr uint32_t LoadRegisterRequirement = 24; + static constexpr uint32_t MmaRegisterRequirement = 240; + // If you want to print from the producer warp, you'd need to increase the number of registers + // Otherwise you'll get CUDA error. 
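// These register targets drive Hopper's warp-group register reallocation: the single producer
// (TMA load) warp group hands back most of its registers and the two MMA warp groups grow
// theirs. The template arguments on the CUTLASS helpers are lost in this rendering; in
// operator() below the calls are presumably of the form
//
//   cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();   // producer warp group
//   cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();      // each MMA warp group
//
// With 128 producer threads at 24 registers and 256 consumer threads at 240 registers, the CTA
// uses 128*24 + 256*240 = 64512 of the 65536 registers available per SM, which is why printing
// from the producer warp (which needs more registers) requires bumping these numbers.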
+ // static constexpr uint32_t LoadRegisterRequirement = 56; + // static constexpr uint32_t MmaRegisterRequirement = 224; + + // Kernel level shared memory storage + struct SharedStorage { + struct { + union { + typename CollectiveMainloop::TensorStorage mainloop; + typename CollectiveEpilogue::TensorStorage epilogue; + }; + }; + + struct { + alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_KV; + alignas(16) cutlass::arch::ClusterBarrier barrier_dKV; + alignas(16) typename CollectiveMainloop::MainloopPipeline::SharedStorage pipeline_q; + alignas(16) typename CollectiveMainloop::MainloopPipeline::SharedStorage pipeline_do; + alignas(16) typename TileScheduler::SharedStorage smem_scheduler; + }; + + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + // Device side arguments + struct Arguments { + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + cutlass::KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel entry point API + struct Params { + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + cutlass::KernelHardwareInfo hw_info{}; + TileSchedulerParams scheduler{}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args) { + CUTLASS_TRACE_HOST("to_underlying_arguments():"); + + // Get SM count if needed, otherwise use user supplied SM count + int sm_count = args.hw_info.sm_count; + if (sm_count <= 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id); + } + + CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count); + + cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count}; + return { + CollectiveMainloop::to_underlying_arguments(args.mainloop), + CollectiveEpilogue::to_underlying_arguments(args.epilogue), + hw_info, + TileScheduler::to_underlying_arguments(args.scheduler) + }; + } + + // Computes the kernel launch grid shape based on runtime parameters + static dim3 + get_grid_shape(Params const& params) { + return TileScheduler::get_grid_shape(params.scheduler, params.hw_info.sm_count); + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + + static constexpr int NumMmaThreads = NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup; + static constexpr int NumCopyThreads = NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup; + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + + using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(params.mainloop); + CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue); + } + + // Obtain warp index + int const warp_group_thread_idx = threadIdx.x % 
cutlass::NumThreadsPerWarpGroup; + + PipelineParams pipeline_params; + pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytesQ + CollectiveMainloop::TmaTransactionBytesLSE; + int warp_group_idx = cutlass::canonical_warp_group_idx(); + pipeline_params.role = warp_group_idx == 0 + ? MainloopPipeline::ThreadCategory::Producer + : MainloopPipeline::ThreadCategory::Consumer; + pipeline_params.is_leader = warp_group_thread_idx == 0; + pipeline_params.num_consumers = NumMmaThreads; + + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_KV.init(1 /*numThreads*/); + // shared_storage.barrier_dKV.init(size(ClusterShape{}) /*numThreads*/); + } + // We're counting on pipeline_q to call cutlass::arch::fence_barrier_init(); + MainloopPipeline pipeline_q(shared_storage.pipeline_q, pipeline_params, ClusterShape{}); + MainloopPipeline pipeline_do(shared_storage.pipeline_do, pipeline_params, ClusterShape{}); + + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue; + + // We need this to guarantee that the Pipeline init is visible to all producers and consumer blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + cute::cluster_wait(); + } else { + __syncthreads(); + } + + if (warp_group_idx == 0) { // Producer + cutlass::arch::warpgroup_reg_dealloc(); + + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (warp_idx_in_warpgroup == 0) { // Load K, V, and do TMA on Q and dO + PipelineState smem_pipe_write = cutlass::make_producer_start_state(); + + int work_idx = 0; + + TileScheduler scheduler(reinterpret_cast(&shared_storage.smem_scheduler)); + for (auto work_tile_info = scheduler.template get_initial_work(params.scheduler); + work_tile_info.is_valid(params.scheduler); + work_tile_info = scheduler.template get_next_work(params.scheduler, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(params.scheduler); + auto [n_block, bidh, bidb] = block_coord; + if constexpr (Varlen) { + if (n_block * kBlockM >= collective_mainloop.get_seqlen_k(params.mainloop, bidb)) { + scheduler.prefetch_next_work(params.scheduler, work_tile_info); + continue; + } + } + if constexpr (Is_causal) { + int const m_block_min = collective_mainloop.get_m_block_min(params.mainloop, n_block, bidb); + int const m_block_max = cute::ceil_div(collective_mainloop.get_seqlen_q(params.mainloop, bidb), kBlockM); + if (m_block_min >= m_block_max) { + scheduler.prefetch_next_work(params.scheduler, work_tile_info); + continue; + } + } + auto scheduler_prefetch = [&scheduler, ¶ms, &work_tile_info]() { + scheduler.prefetch_next_work(params.scheduler, work_tile_info); + }; + collective_mainloop.load(params.mainloop, pipeline_q, pipeline_do, smem_pipe_write, + shared_storage, scheduler_prefetch, block_coord, work_idx); + ++work_idx; + } + collective_mainloop.load_tail(pipeline_q, pipeline_do, smem_pipe_write); + } else if (warp_idx_in_warpgroup == 1) { + TileScheduler scheduler(reinterpret_cast(&shared_storage.smem_scheduler)); + for (auto work_tile_info = scheduler.template get_initial_work(params.scheduler); + work_tile_info.is_valid(params.scheduler); + work_tile_info = scheduler.template get_next_work(params.scheduler, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(params.scheduler); + auto [n_block, bidh, bidb] = block_coord; + if constexpr (Varlen) { + if (n_block * kBlockM >= collective_mainloop.get_seqlen_k(params.mainloop, bidb)) { continue; } + } + if constexpr 
(Is_causal) { + int const m_block_min = collective_mainloop.get_m_block_min(params.mainloop, n_block, bidb); + int const m_block_max = cute::ceil_div(collective_mainloop.get_seqlen_q(params.mainloop, bidb), kBlockM); + if (m_block_min >= m_block_max) { continue; } + } + collective_mainloop.store_dq(params.mainloop, shared_storage, block_coord); + } + } + } else { // Consumer + cutlass::arch::warpgroup_reg_alloc(); + + TileScheduler scheduler(reinterpret_cast(&shared_storage.smem_scheduler)); + // Initialize matmul objects. + TiledMmadKV tiled_mma_dKV; + + PipelineState smem_pipe_read; + + collective_mainloop.mma_init(); + scheduler.init_consumer(); + + int work_idx = 0; + CUTLASS_PRAGMA_NO_UNROLL + for (auto work_tile_info = scheduler.template get_initial_work(params.scheduler); + work_tile_info.is_valid(params.scheduler); + work_tile_info = scheduler.template get_next_work(params.scheduler, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(params.scheduler); + auto [n_block, bidh, bidb] = block_coord; + if constexpr (Varlen) { + if (n_block * kBlockM >= collective_mainloop.get_seqlen_k(params.mainloop, bidb)) { continue; } + } + if constexpr (Is_causal) { + int const m_block_min = collective_mainloop.get_m_block_min(params.mainloop, n_block, bidb); + int const m_block_max = cute::ceil_div(collective_mainloop.get_seqlen_q(params.mainloop, bidb), kBlockM); + if (m_block_min >= m_block_max) { // We exit early and write 0 to dK and dV + collective_epilogue.store_zero(params.epilogue, threadIdx.x - NumCopyThreads, block_coord); + continue; + } + } + + // dK and dV output accumulator. + Tensor tdKrdK = partition_fragment_C(tiled_mma_dKV, select(TileShape_MNK{})); + Tensor tdVrdV = partition_fragment_C(tiled_mma_dKV, select(TileShape_MNK{})); + collective_mainloop.mma(params.mainloop, pipeline_q, pipeline_do, smem_pipe_read, + tdKrdK, tdVrdV, threadIdx.x - NumCopyThreads, work_idx, block_coord, shared_storage); + collective_epilogue.store(params.epilogue, tdKrdK, tdVrdV, shared_storage, tiled_mma_dKV, + threadIdx.x - NumCopyThreads, block_coord); + + ++work_idx; + } + collective_epilogue.store_tail(); + } + + } + +}; + +} // namespace flash diff --git a/flash_bwd_launch_template.h b/flash_bwd_launch_template.h new file mode 100644 index 0000000000000000000000000000000000000000..268d835cea82fe8ff0746d2f5ff97e2b102e7cd7 --- /dev/null +++ b/flash_bwd_launch_template.h @@ -0,0 +1,200 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include "cutlass/cluster_launch.hpp" +#include "cutlass/device_kernel.h" // For device_kernel + +#include "static_switch.h" +#include "flash.h" +#include "flash_bwd_preprocess_kernel.h" +#include "flash_bwd_postprocess_kernel.h" +#include "tile_scheduler_bwd.hpp" +#include "mainloop_bwd_sm90_tma_gmma_ws.hpp" +#include "epilogue_bwd_sm90_tma.hpp" +#include "flash_bwd_kernel.h" + +using namespace cute; + +template +void run_flash_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { + using TileShape_MK = cute::Shape, Int>; + using ElementAccum = float; + using PreprocessKernel = flash::FlashAttnBwdPreprocess; + int const total_q_padded_rounded = cute::round_up(params.total_q + params.b * 128, 128); + typename PreprocessKernel::Arguments preprocess_args { + static_cast(params.o_ptr), + {!Varlen ? params.seqlen_q : params.total_q, params.d, params.h, !Varlen ? params.b : 1}, // shape_O + {params.o_row_stride, _1{}, params.o_head_stride, !Varlen ? params.o_batch_stride : 0}, // stride_O + static_cast(params.do_ptr), + {params.do_row_stride, _1{}, params.do_head_stride, !Varlen ? params.do_batch_stride : 0}, // stride_dO + static_cast(params.dsoftmax_sum), + {!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, params.h, !Varlen ? params.b : 1}, // shape_dPsum + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_dPsum + static_cast(params.softmax_lse_ptr), + {_1{}, !Varlen ? params.seqlen_q : params.total_q, !Varlen ? params.h * params.seqlen_q : 0}, // stride_LSE + static_cast(params.softmax_lse_log2_ptr), + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_LSE_log2 + static_cast(params.dq_accum_ptr), + {!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, params.d_rounded, params.h, !Varlen ? params.b : 1}, // shape_dQaccum + {params.d_rounded, _1{}, params.d_rounded * (!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded), !Varlen ? params.d_rounded * params.seqlen_q_rounded * params.h : 0}, // stride_dQ + params.b, + params.dq_semaphore, + params.cu_seqlens_q + }; + typename PreprocessKernel::Params preprocess_params = PreprocessKernel::to_underlying_arguments(preprocess_args); + int num_m_block = cute::ceil_div(params.seqlen_q, kBlockM); + dim3 grid_m(num_m_block, params.h, params.b); + cutlass::device_kernel<<>>(preprocess_params); + + using TileShape_MNK = cute::Shape, Int, Int>; + using ClusterShape = cute::Shape<_1, Int<1>, _1>; + static constexpr int Stages = 2; + using CollectiveMainloop = flash::CollectiveMainloopBwd; + using CollectiveEpilogue = flash::CollectiveEpilogueBwd; + using Scheduler = flash::SingleTileSchedulerBwd; + using AttnKernel = flash::FlashAttnBwd; + + typename CollectiveMainloop::Arguments mainloop_args { + static_cast(params.q_ptr), + {!Varlen ? params.seqlen_q : params.total_q, params.d, params.h, !Varlen ? params.b : 1}, // shape_Q + {params.q_row_stride, _1{}, params.q_head_stride, !Varlen ? params.q_batch_stride : 0}, // stride_Q + static_cast(params.k_ptr), + {!Varlen ? params.seqlen_k : params.total_k, params.d, params.h_k, !Varlen ? params.b : 1}, // shape_K + {params.k_row_stride, _1{}, params.k_head_stride, !Varlen ? params.k_batch_stride : 0}, // stride_K + static_cast(params.v_ptr), + {params.v_row_stride, _1{}, params.v_head_stride, !Varlen ? 
params.v_batch_stride : 0}, // stride_V + static_cast(params.do_ptr), + {params.do_row_stride, _1{}, params.do_head_stride, !Varlen ? params.do_batch_stride : 0}, // stride_dO + static_cast(params.dq_accum_ptr), + // {params.seqlen_q_rounded, params.d_rounded, params.h, params.b}, // shape_dQaccum + // {params.d_rounded, _1{}, params.d_rounded * params.seqlen_q_rounded, params.d_rounded * params.seqlen_q_rounded * params.h}, // stride_dQaccum + {(!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded) * (params.d_rounded / 32), 32, params.h, !Varlen ? params.b : 1}, // shape_dQaccum + {32, _1{}, params.d_rounded * (!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded), !Varlen ? params.d_rounded * params.seqlen_q_rounded * params.h : 0}, // stride_dQaccum + static_cast(params.softmax_lse_log2_ptr), + {!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, params.h, !Varlen ? params.b : 1}, // shape_LSE + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_LSE_log2 + static_cast(params.dsoftmax_sum), + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_dPsum + params.scale_softmax, + params.b, + params.dq_semaphore, + params.cu_seqlens_q, params.cu_seqlens_k, + }; + typename CollectiveEpilogue::Arguments epilogue_args { + static_cast(params.dk_ptr), + {!Varlen ? params.seqlen_k : params.total_k, params.d, params.h, !Varlen ? params.b : 1}, // shape_dK + {params.dk_row_stride, _1{}, params.dk_head_stride, !Varlen ? params.dk_batch_stride : 0}, // stride_dK + static_cast(params.dv_ptr), + {params.dv_row_stride, _1{}, params.dv_head_stride, !Varlen ? params.dv_batch_stride : 0}, + params.cu_seqlens_k + }; + + int num_blocks_n = cutlass::ceil_div(params.seqlen_k, get<1>(TileShape_MNK{})); + num_blocks_n = cutlass::round_up(num_blocks_n, size<1>(ClusterShape{})); + typename Scheduler::Arguments scheduler_args { + num_blocks_n, params.h, params.b, params.tile_count_semaphore, params.cu_seqlens_k + }; + + int device; + cudaGetDevice(&device); + typename AttnKernel::Params kernel_params = AttnKernel::to_underlying_arguments({ + mainloop_args, epilogue_args, {device}, scheduler_args + }); + + // Get the ptr to kernel function. 
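// At this point run_flash_bwd has already launched PreprocessKernel (dP_sum = rowsum(dO * O),
// LSE converted to log2 scale, optional clearing of dQaccum). The block below launches the main
// warp-specialized backward kernel, which accumulates dQ in fp32 (dQaccum) and writes dK/dV
// directly; the PostprocessKernel launched afterwards converts and rescales dQaccum into the
// final fp16/bf16 dQ. Because AttnKernel::SharedStorageSize can exceed the 48 KB default, the
// kernel has to opt in to a larger dynamic shared-memory limit before the cluster launch.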
+ void const* kernel = (void const*) cutlass::device_kernel; + int smem_size = AttnKernel::SharedStorageSize; + // int smem_size_q = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_q)); + // int smem_size_do = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_do)); + // int smem_size_ds = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_ds)); + // int smem_size_dqacc = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_dqacc)); + // int smem_size_k = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_k)); + // int smem_size_v = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_v)); + // printf("smem_size = %d, q = %d, k = %d, v = %d, do = %d, ds = %d, dqacc = %d\n", smem_size, smem_size_q, smem_size_k, smem_size_v, smem_size_do, smem_size_ds, smem_size_dqacc); + if (smem_size >= 48 * 1024) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + + dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params); + dim3 block_dims = AttnKernel::get_block_shape(); + dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{})); + cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream}; + cutlass::launch_kernel_on_cluster(launch_params, kernel, kernel_params); + CHECK_CUDA_KERNEL_LAUNCH(); + + using PostprocessKernel = flash::FlashAttnBwdPostprocessConvertdQ; + typename PostprocessKernel::Arguments postprocess_args { + static_cast(params.dq_accum_ptr), + // {params.seqlen_q_rounded, params.d_rounded, params.h, params.b}, // shape_dQaccum + // {params.d_rounded, _1{}, params.d_rounded * params.seqlen_q_rounded, params.d_rounded * params.seqlen_q_rounded * params.h}, // stride_dQaccum + {(!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded) * (params.d_rounded / 32), 32, params.h, !Varlen ? params.b : 1}, // shape_dQaccum + {32, _1{}, params.d_rounded * (!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded), !Varlen ? params.d_rounded * params.seqlen_q_rounded * params.h : 0}, // stride_dQaccum + static_cast(params.dq_ptr), + {!Varlen ? params.seqlen_q : params.total_q, params.d, params.h, !Varlen ? params.b : 1}, // shape_dQ + {params.dq_row_stride, _1{}, params.dq_head_stride, params.dq_batch_stride}, // stride_dQ + params.scale_softmax, + params.cu_seqlens_q + }; + typename PostprocessKernel::Params postprocess_params = PostprocessKernel::to_underlying_arguments(postprocess_args); + int num_m_block_postprocess = cute::ceil_div(params.seqlen_q, get<0>(TileShape_MK{})); + dim3 grid_m_postprocess(num_m_block_postprocess, params.h, params.b); + // Get the ptr to kernel function. 
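// The same shared-memory opt-in pattern is used for the dQ postprocess kernel below, with one
// thing worth double-checking: the cudaFuncSetAttribute call is guarded by
// smem_size_postprocess but passes smem_size (the main backward kernel's shared-memory size) as
// the new limit. That is probably benign whenever smem_size >= smem_size_postprocess, but the
// intended argument is presumably smem_size_postprocess.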
+ auto postprocess_kernel = cutlass::device_kernel; + int smem_size_postprocess = PostprocessKernel::SharedStorageSize; + if (smem_size_postprocess >= 48 * 1024) { + CHECK_CUDA(cudaFuncSetAttribute(postprocess_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + postprocess_kernel<<>>(postprocess_params); + CHECK_CUDA_KERNEL_LAUNCH(); + +} + + +template +void run_mha_bwd_hdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.cu_seqlens_q != nullptr || params.cu_seqlens_k != nullptr, Varlen, [&] { + BOOL_SWITCH(params.deterministic, Deterministic, [&] { + run_flash_bwd(params, stream); + }); + }); + }); +} + +template +void run_mha_bwd_hdim96(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 96; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.cu_seqlens_q != nullptr || params.cu_seqlens_k != nullptr, Varlen, [&] { + BOOL_SWITCH(params.deterministic, Deterministic, [&] { + run_flash_bwd(params, stream); + }); + }); + }); +} + +template +void run_mha_bwd_hdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.cu_seqlens_q != nullptr || params.cu_seqlens_k != nullptr, Varlen, [&] { + BOOL_SWITCH(params.deterministic, Deterministic, [&] { + run_flash_bwd(params, stream); + }); + }); + }); +} diff --git a/flash_bwd_postprocess_kernel.h b/flash_bwd_postprocess_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c3bfc5f5576804ed5c2fc14c9ef2ad748c5f4bc4 --- /dev/null +++ b/flash_bwd_postprocess_kernel.h @@ -0,0 +1,248 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include +#include "cutlass/arch/barrier.h" + +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +class FlashAttnBwdPostprocessConvertdQ { + +public: + + // Type Aliases + using TileShape_MK = TileShape_MK_; + + static constexpr uint32_t MaxThreadsPerBlock = kNThreads; + static constexpr uint32_t MinBlocksPerMultiprocessor = 2; + + static constexpr int kHeadDim = get<1>(TileShape_MK{}); + using R2SLayoutAtomdQaccum = Layout>, Stride<_1>>; + using R2STiledCopydQaccum = decltype(make_tiled_copy(Copy_Atom{}, R2SLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per read + static constexpr int SmemdQaccumSize = size(TileShape_MK{}); + static_assert(size(TileShape_MK{}) == size(SmemLayoutdQaccumTMA{}), "TileShape_MK and SmemLayoutdQaccumTMA must have the same size"); + using SmemLayoutdQaccum = Layout>, Stride<_1>>; + + // We can't just use kHeadDim here. E.g. if MMA shape is 64 x 96 but split across 2 WGs, + // then setting kBlockKSmem to 32 will cause "Static shape_div failure". + // We want to treat it as 64 x 48, so kBlockKSmem should be 16. + static constexpr int MmaShapeN = get<1>(typename TiledMma::AtomShape_MNK{}); + static constexpr int kBlockKSmem = MmaShapeN % 64 == 0 ? 64 : (MmaShapeN % 32 == 0 ? 32 : 16); + static constexpr int kSwizzle = kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 
2 : 1); + using SmemLayoutAtomdQ = + decltype(composition(Swizzle{}, + Layout, Int>, + Stride, _1>>{})); + using SmemLayoutdQ = decltype(tile_to_shape(SmemLayoutAtomdQ{}, TileShape_MK{})); + using SmemLayoutdQt = + decltype(cute::composition(SmemLayoutdQ{}, + make_layout(make_shape(get<1>(TileShape_MK{}), get<0>(TileShape_MK{})), + make_stride(Int(TileShape_MK{})>{}, _1{})))); + + using SmemCopyAtomdQ = Copy_Atom< + std::conditional_t, + Element>; + + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(kHeadDim % kGmemElemsPerLoad == 0, "Headdim must be a multiple of kGmemElemsPerLoad"); + static constexpr int kGmemThreadsPerRow = cutlass::gcd(kHeadDim / kGmemElemsPerLoad, int(MaxThreadsPerBlock)); + static_assert(MaxThreadsPerBlock % kGmemThreadsPerRow == 0, "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>>{})); // Val layout, 8 or 16 vals per load + + using GmemTiledCopydQaccum = cute::SM90_TMA_LOAD; + + struct SharedStorage : cute::aligned_struct<128> { + cute::array_aligned, 1024> smem_dqacc; + cute::array_aligned> smem_dq; + alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_dQaccum; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + using ShapedQ = cute::Shape; // (seqlen_q, d, head, batch) + using StridedQ = cute::Stride; + + using TMA_dQaccum = decltype(make_tma_copy( + GmemTiledCopydQaccum{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapedQ{}, StridedQ{}), + SmemLayoutdQaccumTMA{}, + SmemLayoutdQaccumTMA{}.shape(), + _1{})); // no mcast for dQ + + // Device side arguments + struct Arguments { + ElementAccum const* ptr_dQaccum; + ShapedQ const shape_dQaccum; + StridedQ const stride_dQaccum; + Element* ptr_dQ; + ShapedQ const shape_dQ; + StridedQ const stride_dQ; + float const softmax_scale; + int const* cu_seqlens = nullptr; + }; + + // Kernel entry point API + struct Params { + TMA_dQaccum tma_load_dQaccum; + ShapedQ const shape_dQaccum; + Element* ptr_dQ; + ShapedQ const shape_dQ; + StridedQ const stride_dQ; + float const softmax_scale; + int const* cu_seqlens = nullptr; + }; + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. 
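// Besides copying the plain arguments, the conversion below also builds the TMA descriptor used
// to load dQaccum tiles into shared memory (no multicast, since dQ is not replicated across a
// cluster). dQaccum is the fp32 accumulator written by the backward mainloop; this kernel loads
// it, multiplies by softmax_scale, converts to Element (fp16/bf16), and writes dQ out coalesced.
// For varlen batches, dQaccum stores each sequence with an extra 128 rows of padding so that
// block-granular writes for one sequence never spill into the next; e.g. with
// cu_seqlens = {0, 70, 198}, sequence 1 starts at padded row (70 + 1*128) / 128 * 128 = 128
// while sequence 0 occupies rows 0..69 (this example layout is illustrative, not from the patch).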
+ static + Params + to_underlying_arguments(Arguments const& args) { + Tensor mdQaccum = make_tensor(make_gmem_ptr(args.ptr_dQaccum), args.shape_dQaccum, args.stride_dQaccum); + TMA_dQaccum tma_load_dQaccum = make_tma_copy( + GmemTiledCopydQaccum{}, + mdQaccum, + SmemLayoutdQaccumTMA{}, + SmemLayoutdQaccumTMA{}.shape(), + _1{}); // no mcast for dQaccum + return { + tma_load_dQaccum, + args.shape_dQaccum, + args.ptr_dQ, + args.shape_dQ, + args.stride_dQ, + args.softmax_scale, + args.cu_seqlens + }; + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + + static constexpr int kBlockM = get<0>(TileShape_MK{}); + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + Tensor sdQaccumTMA = make_tensor(make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccumTMA{}); + // Tensor sdQaccumTMAnoswizzle = make_tensor(make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccumTMANoSwizzle{}); + Tensor sdQaccum = make_tensor(make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccum{}); + Tensor sdQ = make_tensor(make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQ{}); + Tensor sdQt = make_tensor(make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQt{}); + + int const thread_idx = threadIdx.x; + int const m_block = blockIdx.x; + int const bidh = blockIdx.y; + int const bidb = blockIdx.z; + + bool const is_varlen = params.cu_seqlens != nullptr; + int const seqlen = !is_varlen ? get<0>(params.shape_dQ) : params.cu_seqlens[bidb + 1] - params.cu_seqlens[bidb]; + if (is_varlen && m_block * kBlockM >= seqlen) { return; } + + int lane_predicate = cute::elect_one_sync(); + int warp_idx = cutlass::canonical_warp_idx_sync(); + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + cute::prefetch_tma_descriptor(params.tma_load_dQaccum.get_tma_descriptor()); + shared_storage.barrier_dQaccum.init(1 /*numThreads*/); + } + __syncthreads(); + + // Step 1: TMA to load dQaccum from gmem to smem + // We reshaped dQaccum to have last dimension 32, so the offset needs to be multiplied by kHeadDim / 32 + int const offset_padded = !is_varlen ? 0 : ((params.cu_seqlens[bidb] + bidb * 128) / 128 * 128) * (kHeadDim / get<1>(SmemLayoutdQaccumTMA{}.shape())); + Tensor mdQaccum = params.tma_load_dQaccum.get_tma_tensor(params.shape_dQaccum)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdQaccum = local_tile(domain_offset(make_coord(offset_padded, _0{}), mdQaccum), SmemLayoutdQaccumTMA{}.shape(), make_coord(m_block, _0{})); // (M, K) + auto block_tma_dQ = params.tma_load_dQaccum.get_slice(_0{}); + Tensor tdQgdQaccumTMA = block_tma_dQ.partition_D(gdQaccum); // (TMA, TMA_M, TMA_K) + Tensor tdQsdQaccumTMA = block_tma_dQ.partition_S(sdQaccumTMA); // (TMA, TMA_M, TMA_K) + static constexpr uint32_t TmaTransactionBytesdQaccum = static_cast(size(SmemLayoutdQaccumTMA{}) * cute::sizeof_bits_v / 8); + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_dQaccum.arrive_and_expect_tx(TmaTransactionBytesdQaccum); + copy(params.tma_load_dQaccum.with(reinterpret_cast(shared_storage.barrier_dQaccum), 0 /*mcast_mask*/), tdQgdQaccumTMA, tdQsdQaccumTMA); + } + shared_storage.barrier_dQaccum.wait(0); + + // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccumTMA); } + // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccumTMAnoswizzle); } + // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccum); } + + // Step 2: Load dQaccum from smem to register, then convert fp32 -> fp16/bf16 + R2STiledCopydQaccum s2r_tiled_copy_dQaccum; + auto s2r_thr_copy_dQaccum = s2r_tiled_copy_dQaccum.get_thread_slice(thread_idx); + Tensor tdQsdQaccum = s2r_thr_copy_dQaccum.partition_S(sdQaccum); + TiledMma tiled_mma_dQ; + Tensor taccdQrdQaccum = partition_fragment_C(tiled_mma_dQ, select(TileShape_MK{})); + // if (cute::thread0()) { print(tiled_mma_dQ); printf("\n"); } + // if (cute::thread0()) { print(tdQsdQaccum); } + // if (cute::thread0()) { print(taccdQrdQaccum); } + CUTE_STATIC_ASSERT_V(size(taccdQrdQaccum) == size(tdQsdQaccum)); + Tensor tdQrdQaccum = s2r_thr_copy_dQaccum.retile_D(taccdQrdQaccum); + cute::copy(s2r_tiled_copy_dQaccum, tdQsdQaccum, tdQrdQaccum); + #pragma unroll + for (int i = 0; i < size(taccdQrdQaccum); ++i) { taccdQrdQaccum(i) *= params.softmax_scale; } + // Convert tdQrdQ from fp32 to fp16 + Tensor rdQ = flash::convert_type(taccdQrdQaccum); + + // Step 3: Copy dQ from register to smem + auto smem_tiled_copy_dQ = make_tiled_copy_C(SmemCopyAtomdQ{}, tiled_mma_dQ); + auto smem_thr_copy_dQ = smem_tiled_copy_dQ.get_thread_slice(thread_idx); + Tensor taccdQrdQ = smem_thr_copy_dQ.retile_S(rdQ); // ((Atom,AtomNum), MMA_N, MMA_N) + // if (cute::thread0()) { print(smem_tiled_copy_dQ); } + // if (cute::thread0()) { print(smem_thr_copy_dQ); } + // if (cute::thread0()) { print(sdQ); } + if constexpr (!dQ_swapAB) { + Tensor taccdQsdQ = smem_thr_copy_dQ.partition_D(sdQ); // ((Atom,AtomNum),PIPE_M,PIPE_N) + cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQ); + } else { + Tensor taccdQsdQt = smem_thr_copy_dQ.partition_D(sdQt); // ((Atom,AtomNum),PIPE_M,PIPE_N) + cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQt); + } + __syncthreads(); + + // Step 4: Copy dQ from smem to register to prepare for coalesced write to gmem + int const offset = !is_varlen ? 0 : params.cu_seqlens[bidb]; + Tensor mdQ = make_tensor(make_gmem_ptr(params.ptr_dQ), params.shape_dQ, params.stride_dQ)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdQ = local_tile(domain_offset(make_coord(offset, _0{}), mdQ), TileShape_MK{}, make_coord(m_block, _0{})); // (M, K) + GmemTiledCopy gmem_tiled_copy_dQ; + auto gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_thread_slice(thread_idx); + Tensor tdQsdQ = gmem_thr_copy_dQ.partition_S(sdQ); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ); + + Tensor tdQrdQ = make_fragment_like(tdQsdQ); + cute::copy(gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ); + + // Step 5: Copy dQ from register to gmem + // Construct identity layout for gdQ + Tensor cdQ = cute::make_identity_tensor(TileShape_MK{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(cdQ); + Tensor tdQpdQ = make_tensor(make_shape(size<2>(tdQgdQ))); + #pragma unroll + for (int k = 0; k < size(tdQpdQ); ++k) { tdQpdQ(k) = get<1>(tdQcdQ(_0{}, _0{}, k)) < get<1>(params.shape_dQ); } + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_dQ, tdQrdQ, tdQgdQ, tdQcdQ, tdQpdQ, seqlen - m_block * kBlockM + ); + } + +}; + +} // namespace flash diff --git a/flash_bwd_preprocess_kernel.h b/flash_bwd_preprocess_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d0c185f72f6bf322a91d107671583d6963d4c159 --- /dev/null +++ b/flash_bwd_preprocess_kernel.h @@ -0,0 +1,246 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include + +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +class FlashAttnBwdPreprocess { + +public: + + // Type Aliases + using TileShape_MK = TileShape_MK_; + + static constexpr uint32_t MaxThreadsPerBlock = 256; + static constexpr uint32_t MinBlocksPerMultiprocessor = 2; + static constexpr int SharedStorageSize = 0; + + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(get<1>(TileShape_MK{}) % kGmemElemsPerLoad == 0, "Headdim must be a multiple of kGmemElemsPerLoad"); + static constexpr int kHeadDim = get<1>(TileShape_MK{}); + // We want kBlockKGmem to be a power of 2 so that when we do the summing, + // it's just between threads in the same warp + static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 
64 : 32); + static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad; + static_assert(MaxThreadsPerBlock % kGmemThreadsPerRow == 0, "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>>{})); // Val layout, 8 or 16 vals per load + + static constexpr int kGmemElemsPerLoadAccum = sizeof(cute::uint128_t) / sizeof(ElementAccum); + static_assert(get<1>(TileShape_MK{}) % kGmemElemsPerLoadAccum == 0, "Headdim must be a multiple of kGmemElemsPerLoadAccum"); + static constexpr int kGmemThreadsPerRowAccum = kBlockKGmem / kGmemElemsPerLoadAccum; + static_assert(MaxThreadsPerBlock % kGmemThreadsPerRowAccum == 0, "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRowAccum"); + using GmemLayoutAtomAccum = Layout, Int>, + Stride, _1>>; + using GmemTiledCopyAccum = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomAccum{}, + Layout>>{})); // Val layout, 4 vals per store + + using ShapeO = cute::Shape; // (seqlen_q, d, head, batch) + using StrideO = cute::Stride; + using ShapedPsum = cute::Shape; // (seqlen_q, head, batch) + using StridedPsum = cute::Stride<_1, int64_t, int64_t>; + + // Device side arguments + struct Arguments { + Element const* ptr_O; + ShapeO const shape_O; + StrideO const stride_O; + Element const* ptr_dO; + StrideO const stride_dO; + float* ptr_dPsum; + ShapedPsum const shape_dPsum; + StridedPsum const stride_dPsum; + float const* ptr_LSE; + StridedPsum const stride_LSE; + float *ptr_LSE_log2; + StridedPsum const stride_LSE_log2; + ElementAccum* ptr_dQaccum; + ShapeO const shape_dQaccum; + StrideO const stride_dQaccum; + int num_batch; // We need this to know the size of dq_semaphore in case of varlen + int* dq_semaphore; + int const* cu_seqlens = nullptr; + }; + + // Kernel entry point API + struct Params { + Element const* ptr_O; + ShapeO const shape_O; + StrideO const stride_O; + Element const* ptr_dO; + StrideO const stride_dO; + float* ptr_dPsum; + ShapedPsum const shape_dPsum; + StridedPsum const stride_dPsum; + float const* ptr_LSE; + StridedPsum const stride_LSE; + float* ptr_LSE_log2; + StridedPsum const stride_LSE_log2; + ElementAccum* ptr_dQaccum; + ShapeO const shape_dQaccum; + StrideO const stride_dQaccum; + int num_batch; + int* dq_semaphore; + int const* cu_seqlens = nullptr; + }; + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args) { + return { + args.ptr_O, + args.shape_O, + args.stride_O, + args.ptr_dO, + args.stride_dO, + args.ptr_dPsum, + args.shape_dPsum, + args.stride_dPsum, + args.ptr_LSE, + args.stride_LSE, + args.ptr_LSE_log2, + args.stride_LSE_log2, + args.ptr_dQaccum, + args.shape_dQaccum, + args.stride_dQaccum, + args.num_batch, + args.dq_semaphore, + args.cu_seqlens + }; + } + + CUTLASS_DEVICE + void + operator()(Params const& params, [[maybe_unused]] char* smem_buf) { + + static constexpr int kBlockM = get<0>(TileShape_MK{}); + + int const thread_idx = threadIdx.x; + int const m_block = blockIdx.x; + int const bidh = blockIdx.y; + int const bidb = blockIdx.z; + + bool const is_varlen = Varlen && params.cu_seqlens != nullptr; + int const offset_o = !is_varlen ? 0 : params.cu_seqlens[bidb]; + int const seqlen_o = !is_varlen ? 
get<0>(params.shape_O) : params.cu_seqlens[bidb + 1] - offset_o; + if (is_varlen && m_block * kBlockM >= seqlen_o) { return; } + + Tensor mO = make_tensor(make_gmem_ptr(params.ptr_O), params.shape_O, params.stride_O)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gO = local_tile(cute::domain_offset(make_coord(offset_o, _0{}), mO), TileShape_MK{}, make_coord(m_block, _0{})); // (M, K) + Tensor mdO = make_tensor(make_gmem_ptr(params.ptr_dO), params.shape_O, params.stride_dO)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdO = local_tile(cute::domain_offset(make_coord(offset_o, _0{}), mdO), TileShape_MK{}, make_coord(m_block, _0{})); // (M, K) + + auto shape_LSE = select<0, 2, 3>(params.shape_O); + Tensor mLSE = make_tensor(make_gmem_ptr(params.ptr_LSE), shape_LSE, params.stride_LSE)(_, bidh, !is_varlen ? bidb : 0); + Tensor gLSE = local_tile(cute::domain_offset(make_coord(offset_o), mLSE), Shape>{}, make_coord(m_block)); + static_assert(kBlockM <= MaxThreadsPerBlock); + float lse = thread_idx < seqlen_o - m_block * kBlockM && thread_idx < kBlockM ? gLSE(thread_idx) : INFINITY; + + GmemTiledCopy gmem_tiled_copy_O; + auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor tOgO = gmem_thr_copy_O.partition_S(gO); + Tensor tOgdO = gmem_thr_copy_O.partition_S(gdO); + // Construct identity layout for gO + Tensor cO = cute::make_identity_tensor(TileShape_MK{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tOcO = gmem_thr_copy_O.partition_D(cO); + Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); + #pragma unroll + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O); } + + // (8, kBlockM / 32, kHeadDim / 64) or (8, kBlockM / 16, kHeadDim / 128) + Tensor tOrO = make_fragment_like(tOgO); + Tensor tOrdO = make_fragment_like(tOgdO); + flash::copy( + gmem_tiled_copy_O, tOgO, tOrO, tOcO, tOpO, seqlen_o - m_block * kBlockM + ); + flash::copy( + gmem_tiled_copy_O, tOgdO, tOrdO, tOcO, tOpO, seqlen_o - m_block * kBlockM + ); + + // Reshape from e.g. (8, kBlockM / 32, kHeadDim / 64) to (kBlockM / 32, (8, kHeadDim / 64)) + Layout l = make_layout(get<1>(tOrO.layout()), make_layout(get<0>(tOrO.layout()), get<2>(tOrO.layout()))); + Tensor o_fp32 = flash::convert_type(make_tensor(tOrO.data(), l)); + Tensor do_fp32 = flash::convert_type(make_tensor(tOrdO.data(), l)); + // Sum across the last dimension + Tensor dP_sum = make_tensor(make_shape(size<0>(o_fp32))); + #pragma unroll + for (int mi = 0; mi < size<0>(o_fp32); ++mi) { + float dP_sum_cur = do_fp32(mi, 0) * o_fp32(mi, 0); + #pragma unroll + for (int ni = 1; ni < size<1>(o_fp32); ni++) { + dP_sum_cur += do_fp32(mi, ni) * o_fp32(mi, ni); + } + flash::SumOp sum_op; + dP_sum(mi) = flash::Allreduce::run(dP_sum_cur, sum_op); + } + + // If varlen, the layout for dPSum, LSE_log2, and dQaccum is that we pad each sequence in the batch + // by an extra 128, so that the write for each sequence doesn't touch the next sequence. + // Sequence i starts at params.cu_seqlens[i] + i * 128 and ends at params.cu_seqlens[i + 1] + i * 128 + int const offset_padded = !is_varlen ? 0 : (params.cu_seqlens[bidb] + bidb * 128) / 128 * 128; + Tensor mdPsum = make_tensor(make_gmem_ptr(params.ptr_dPsum), params.shape_dPsum, params.stride_dPsum)(_, bidh, !is_varlen ? 
bidb : 0); + Tensor gdPsum = local_tile(cute::domain_offset(make_coord(offset_padded), mdPsum), Shape>{}, make_coord(m_block)); + if (thread_idx % kGmemThreadsPerRow == 0) { + #pragma unroll + for (int mi = 0; mi < size(dP_sum); ++mi) { + int row = thread_idx / kGmemThreadsPerRow + mi * MaxThreadsPerBlock / kGmemThreadsPerRow; + gdPsum(row) = row < seqlen_o - m_block * kBlockM ? dP_sum(mi) : 0; + } + } + + int const seqlen_rounded = cute::round_up(seqlen_o, kBlockM); + Tensor mLSElog2 = make_tensor(make_gmem_ptr(params.ptr_LSE_log2), params.shape_dPsum, params.stride_LSE_log2)(_, bidh, !is_varlen ? bidb : 0); + Tensor gLSElog2 = local_tile(cute::domain_offset(make_coord(offset_padded), mLSElog2), Shape>{}, make_coord(m_block)); + if (thread_idx < seqlen_rounded - m_block * kBlockM && thread_idx < kBlockM) { + gLSElog2(thread_idx) = lse == -INFINITY ? 0.f : lse * float(M_LOG2E); + } + + if constexpr (Clear_dQaccum) { + Tensor mdQaccum = make_tensor(make_gmem_ptr(params.ptr_dQaccum), params.shape_dQaccum, params.stride_dQaccum)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdQaccum = local_tile(cute::domain_offset(make_coord(offset_padded, _0{}), mdQaccum), TileShape_MK{}, make_coord(m_block, _0{})); + GmemTiledCopyAccum gmem_tiled_copy_dQaccum; + auto gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_thread_slice(thread_idx); + Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_D(gdQaccum); + Tensor zero = make_fragment_like(tdQgdQaccum); + clear(zero); + // cute::copy(zero, tdQgdQaccum); // Somehow this doesn't vectorize the write + #pragma unroll + for (int m = 0; m < size<1>(zero); ++m) { + cute::copy(zero(_, m, _), tdQgdQaccum(_, m, _)); + } + } + + if (params.dq_semaphore != nullptr && thread_idx == 0) { + int const num_batch = params.num_batch; + int const num_head = get<2>(params.shape_dQaccum); + params.dq_semaphore[bidh + bidb * num_head + m_block * num_head * num_batch] = 0; + } + + } + +}; + +} // namespace flash diff --git a/flash_common.hpp b/flash_common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cc601f9705c9bf53f015cbca195e5e1cf91be527 --- /dev/null +++ b/flash_common.hpp @@ -0,0 +1,38 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#pragma once + +// Include these 2 headers instead of torch/extension.h since we don't need all of the torch headers. +#include +#include +#include +#include + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + + +#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + +namespace flash { +// Copy from PyTorch +// https://github.com/pytorch/pytorch/blob/8b61daaf7349e9102117e1aeefaa51666d887547/aten/src/ATen/cuda/detail/UnpackRaw.cuh#L17 +static std::tuple unpack(at::PhiloxCudaState arg) { + if (arg.captured_) { + // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long". + // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel. + // For most threads' reads it will hit in cache, so it shouldn't hurt performance. 
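+ // The tuple returned is (seed, offset). When the RNG state was captured inside a CUDA graph,
+ // both values live behind device pointers and the per-launch offset_intragraph_ is added;
+ // otherwise the host-side values stored directly in the PhiloxCudaState are returned.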
+ return std::make_tuple(static_cast(*arg.seed_.ptr), static_cast(*(arg.offset_.ptr) + arg.offset_intragraph_)); + } else { + return std::make_tuple(arg.seed_.val, arg.offset_.val); + } +} + +} // namespace flash diff --git a/flash_fwd_hdim128_bf16_causal_sm80.cu b/flash_fwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..9383c1024927e1af6ce7d14ed358d6386913f5ba --- /dev/null +++ b/flash_fwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_bf16_sm80.cu b/flash_fwd_hdim128_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f03abda4869b516ce63d367551157f1d92995a32 --- /dev/null +++ b/flash_fwd_hdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_bf16_sm90.cu b/flash_fwd_hdim128_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..11bb9ddecccf85f346555f585d391d93f324a732 --- /dev/null +++ b/flash_fwd_hdim128_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_e4m3_sm90.cu b/flash_fwd_hdim128_e4m3_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..04b431f10b8aae6a5b3c80d79799fc6c8740f6a5 --- /dev/null +++ b/flash_fwd_hdim128_e4m3_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_fp8(params, stream); +} diff --git a/flash_fwd_hdim128_fp16_causal_sm80.cu b/flash_fwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c616628c871d1773f9cf1c28da43ede7f684c230 --- /dev/null +++ b/flash_fwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_fp16_sm80.cu b/flash_fwd_hdim128_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..4ff6b9fbfbbef50456a4e707638de23e6612eafe --- /dev/null +++ b/flash_fwd_hdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_fp16_sm90.cu b/flash_fwd_hdim128_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..176c38eddcdc25aa4ba6ef9eccb2d9fd53ee2c03 --- /dev/null +++ b/flash_fwd_hdim128_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim160_bf16_causal_sm80.cu b/flash_fwd_hdim160_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d6d4371bfbdddbdde9ead33cf5f6b054df7fd526 --- /dev/null +++ b/flash_fwd_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim160_bf16_sm80.cu b/flash_fwd_hdim160_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5af68ac38fb2ea3032026dae4fc82d27d1ee0437 --- /dev/null +++ b/flash_fwd_hdim160_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim160_fp16_causal_sm80.cu b/flash_fwd_hdim160_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ef511a6b749c21f5b9c8dcec111cf71a517a7cf --- /dev/null +++ b/flash_fwd_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim160_fp16_sm80.cu b/flash_fwd_hdim160_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..96abfbd8a1808ebff6da52798d6086efcfb3ad18 --- /dev/null +++ b/flash_fwd_hdim160_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim192_bf16_causal_sm80.cu b/flash_fwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..077d25d091ed512b778039d88f3019a96a24c279 --- /dev/null +++ b/flash_fwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim192_bf16_sm80.cu b/flash_fwd_hdim192_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..ea5f265fe33e70461eb4c20090aced3ae9526df2 --- /dev/null +++ b/flash_fwd_hdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim192_fp16_causal_sm80.cu b/flash_fwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a4a7bc2422829913afe6b97bb399c8ccf096ad18 --- /dev/null +++ b/flash_fwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim192_fp16_sm80.cu b/flash_fwd_hdim192_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c30c4a14fe7b4712335041f1014338c2fa68754f --- /dev/null +++ b/flash_fwd_hdim192_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim224_bf16_causal_sm80.cu b/flash_fwd_hdim224_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a12a5f4ad7adaa632aced8f0ed390f6451dd787b --- /dev/null +++ b/flash_fwd_hdim224_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim224_bf16_sm80.cu b/flash_fwd_hdim224_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8690bdb1a40992bbda9680e1669bc669d9e9bc0f --- /dev/null +++ b/flash_fwd_hdim224_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim224_fp16_causal_sm80.cu b/flash_fwd_hdim224_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f01dad09cff91953f4e12119c8d77b2e83748aa0 --- /dev/null +++ b/flash_fwd_hdim224_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim224_fp16_sm80.cu b/flash_fwd_hdim224_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ec1e16b7fee69cb735d6ba0e92d944f253b310a --- /dev/null +++ b/flash_fwd_hdim224_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim256_bf16_causal_sm80.cu b/flash_fwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f84e978c911efabf1234530300cac47e2384c263 --- /dev/null +++ b/flash_fwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_bf16_sm80.cu b/flash_fwd_hdim256_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c52f0417b9c41396b52f258f949180a65484f7d4 --- /dev/null +++ b/flash_fwd_hdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_bf16_sm90.cu b/flash_fwd_hdim256_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..06d0df617bdcc9dd8fd7d6fff25961512b18b6ff --- /dev/null +++ b/flash_fwd_hdim256_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_e4m3_sm90.cu b/flash_fwd_hdim256_e4m3_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..78884313ecf2ad54eea58d8153eab7306b2717cb --- /dev/null +++ b/flash_fwd_hdim256_e4m3_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_fp8(params, stream); +} diff --git a/flash_fwd_hdim256_fp16_causal_sm80.cu b/flash_fwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f96f7edc672bf802bca02368a648b869f281d1a2 --- /dev/null +++ b/flash_fwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_fp16_sm80.cu b/flash_fwd_hdim256_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..9c7c6b93d8a3c587eb4004d38743ce723aae5011 --- /dev/null +++ b/flash_fwd_hdim256_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_fp16_sm90.cu b/flash_fwd_hdim256_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..0cc26c79104d2b2a77d3b1acbef1459b06a07433 --- /dev/null +++ b/flash_fwd_hdim256_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim32_bf16_causal_sm80.cu b/flash_fwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e21d0408ca5d3156e8148d14d6c57d01e737bfbb --- /dev/null +++ b/flash_fwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim32_bf16_sm80.cu b/flash_fwd_hdim32_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f377a5b8fa8d20578ab840f6e0345d080f2ad72f --- /dev/null +++ b/flash_fwd_hdim32_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim32_fp16_causal_sm80.cu b/flash_fwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..74e4d66ae97d190f2a99553df34410949643882d --- /dev/null +++ b/flash_fwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim32_fp16_sm80.cu b/flash_fwd_hdim32_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e85db18e39b1e24b102e4b4de15d539434812ad0 --- /dev/null +++ b/flash_fwd_hdim32_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim64_bf16_causal_sm80.cu b/flash_fwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..9297e8bb68d7e4f40c6525b81c6b530fd7cd0971 --- /dev/null +++ b/flash_fwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_bf16_sm80.cu b/flash_fwd_hdim64_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8364b1e7ee90ef75d9edfb1074a05e473ced4b1a --- /dev/null +++ b/flash_fwd_hdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_bf16_sm90.cu b/flash_fwd_hdim64_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..d3839898f25d5c1c4163024fa951af4b26b30073 --- /dev/null +++ b/flash_fwd_hdim64_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_e4m3_sm90.cu b/flash_fwd_hdim64_e4m3_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..471a5037a1dd0028037c38f195b1bd583b0466c1 --- /dev/null +++ b/flash_fwd_hdim64_e4m3_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_fp8(params, stream); +} diff --git a/flash_fwd_hdim64_fp16_causal_sm80.cu b/flash_fwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1c6ed7ef02396ccc7769a2cc8073192ef2d13a2e --- /dev/null +++ b/flash_fwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_fp16_sm80.cu b/flash_fwd_hdim64_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..3c87573ba264b741b9c501d32dbe4473aeee4b81 --- /dev/null +++ b/flash_fwd_hdim64_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_fp16_sm90.cu b/flash_fwd_hdim64_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..c6eac53520404a79253f0c23e84de95312bff4c1 --- /dev/null +++ b/flash_fwd_hdim64_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim96_bf16_causal_sm80.cu b/flash_fwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..49fae856a58661871b7bf1924029f5ef62443d96 --- /dev/null +++ b/flash_fwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_hdim96_bf16_sm80.cu b/flash_fwd_hdim96_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c5af1cf63451e79c43a39835cda9fbab5682a8fb --- /dev/null +++ b/flash_fwd_hdim96_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_hdim96_fp16_causal_sm80.cu b/flash_fwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0d6c9928ecb62be1b4269ef42bf74bb4aee3ad5 --- /dev/null +++ b/flash_fwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_hdim96_fp16_sm80.cu b/flash_fwd_hdim96_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c97aa33f8bcb86110c28ccf4582389431924d8e7 --- /dev/null +++ b/flash_fwd_hdim96_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_kernel.h b/flash_fwd_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6b55021cf023491eb412bcca5d34ad21c447da5e --- /dev/null +++ b/flash_fwd_kernel.h @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "flash.h" +#include "utils.h" +#include "softmax.h" +#include "tile_scheduler.hpp" +#include "mainloop_fwd_sm90_tma_gmma_ws.hpp" +#include "epilogue_fwd_sm90_tma.hpp" + +namespace flash { + +using namespace cute; + +template +__global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1) + compute_attn_ws(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, + CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, + CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params, + Seqlen_traits seqlen_traits_q, Seqlen_traits seqlen_traits_k + ) { + + using Element = typename Ktraits::Element; + using ElementAccum = typename Ktraits::ElementAccum; + using SoftType = ElementAccum; + using TileShape_MNK = typename Ktraits::TileShape_MNK; + using ClusterShape = typename Ktraits::ClusterShape_MNK; + + static_assert(Ktraits::Is_WS); + static constexpr bool Is_WS = Ktraits::Is_WS; + + static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); + static constexpr int NumCopyThreads = !Is_WS ? 0 : cutlass::NumThreadsPerWarpGroup; + static constexpr int kBlockM = Ktraits::kBlockM; + // static constexpr int kBlockN = Ktraits::kBlockN; + // constexpr int kHeadDim = Ktraits::kHeadDim; + + using CollectiveMainloop = CollectiveMainloopFwd; + using CollectiveEpilogue = CollectiveEpilogueFwd; + + using MainloopPipeline = typename Ktraits::MainloopPipeline; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + extern __shared__ char shared_memory[]; + auto &shared_storage = *reinterpret_cast(shared_memory); + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(mainloop_params); + CollectiveEpilogue::prefetch_tma_descriptors(epilogue_params); + } + + // Obtain warp index + int const warp_group_thread_idx = threadIdx.x % cutlass::NumThreadsPerWarpGroup; + + PipelineParams pipeline_params; + pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytesK; + int warp_group_idx = cutlass::canonical_warp_group_idx(); + pipeline_params.role = warp_group_idx == 0 + ? 
MainloopPipeline::ThreadCategory::Producer + : MainloopPipeline::ThreadCategory::Consumer; + pipeline_params.is_leader = warp_group_thread_idx == 0; + pipeline_params.num_consumers = NumMmaThreads; + + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_Q.init(1 /*numThreads*/); + shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); + } + // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init(); + MainloopPipeline pipeline_k(shared_storage.pipeline_k, pipeline_params, ClusterShape{}); + MainloopPipeline pipeline_v(shared_storage.pipeline_v, pipeline_params, ClusterShape{}); + + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue; + + // We need this to guarantee that the Pipeline init is visible to all producers and consumer blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + cute::cluster_wait(); + } else { + __syncthreads(); + } + + static_assert(Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); + if (warp_group_idx == 0) { // Producer + cutlass::arch::warpgroup_reg_dealloc(); + // cutlass::arch::warpgroup_reg_dealloc<56>(); + + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (warp_idx_in_warpgroup == 0) { // Load Q, K, V + PipelineState smem_pipe_write_k = cutlass::make_producer_start_state(); + PipelineState smem_pipe_write_v = cutlass::make_producer_start_state(); + + int work_idx = 0; + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if (Is_causal && n_block_max <= 0) { + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + continue; + } + collective_mainloop.load(mainloop_params, pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v, + shared_storage, scheduler, scheduler_params, work_tile_info, block_coord, work_idx, + seqlen_traits_q, seqlen_traits_k); + ++work_idx; + } + collective_mainloop.load_tail(pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v); + } + } else { // Consumer + cutlass::arch::warpgroup_reg_alloc(); + // cutlass::arch::warpgroup_reg_alloc(); + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + // Initialize matmul objects. + typename Ktraits::TiledMma1 tiled_mma1; + + PipelineState smem_pipe_read_k, smem_pipe_read_v; + // We don't need separate variables smem_pipe_release_k and smem_pipe_release_v + // (like in Cutlass's gemm) because the read and release pipeline states are always the same. + + collective_mainloop.mma_init(); + scheduler.init_consumer(); + + int work_idx = 0; + CUTLASS_PRAGMA_NO_UNROLL + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + // Attention output (GEMM-II) accumulator. 
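+ // tOrO is the per-thread fp32 register accumulator for the second GEMM (P @ V); the Softmax
+ // object carries the running row-max and row-sum used to rescale partial outputs as each
+ // K/V block is consumed (online softmax), and its row_sum is later passed to the epilogue.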
+ Tensor tOrO = partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{})); + flash::Softmax<2 * (2 * kBlockM / NumMmaThreads)> softmax; + + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if (Is_causal && n_block_max <= 0) { // We exit early and write 0 to gO and -inf to gLSE. + collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + continue; + } + + collective_mainloop.mma(mainloop_params, pipeline_k, pipeline_v, smem_pipe_read_k, smem_pipe_read_v, + tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads, work_idx, m_block, shared_storage, + seqlen_traits_q, seqlen_traits_k); + // tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads + (work_idx >> 30), work_idx, shared_storage); + collective_epilogue.store(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + + ++work_idx; + } + collective_epilogue.store_tail(); + } + +} + +template +__global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1) + compute_attn_ws_fp8(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, + CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, + CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params, + Seqlen_traits seqlen_traits_q, Seqlen_traits seqlen_traits_k + ) { + + using Element = typename Ktraits::Element; + static_assert(cutlass::sizeof_bits_v == 8); + using ElementAccum = typename Ktraits::ElementAccum; + using SoftType = ElementAccum; + using TileShape_MNK = typename Ktraits::TileShape_MNK; + using ClusterShape = typename Ktraits::ClusterShape_MNK; + + static_assert(Ktraits::Is_WS); + static constexpr bool Is_WS = Ktraits::Is_WS; + static constexpr bool kUseVarSeqLen = Seqlen_traits::kUseVarSeqLen; + + static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); + static constexpr int NumCopyThreads = !Is_WS ? 
0 : cutlass::NumThreadsPerWarpGroup; + static constexpr int kBlockM = Ktraits::kBlockM; + // static constexpr int kBlockN = Ktraits::kBlockN; + // static constexpr int kHeadDim = Ktraits::kHeadDim; + static constexpr bool Delay_V_release = Is_causal && Ktraits::kHeadDim == 128; + // for now, disable for hdim 128 causal to avoid perf regression with register spilling + static constexpr bool Use_max_offset = !(Is_causal && Ktraits::kHeadDim == 128); + + using CollectiveMainloop = CollectiveMainloopFwd; + using CollectiveEpilogue = CollectiveEpilogueFwd; + + using MainloopPipeline = typename Ktraits::MainloopPipeline; + using MainloopPipelineVt = typename Ktraits::MainloopPipelineNoTMA; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineParamsVt = typename MainloopPipelineVt::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + extern __shared__ char shared_memory[]; + auto &shared_storage = *reinterpret_cast(shared_memory); + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(mainloop_params); + CollectiveEpilogue::prefetch_tma_descriptors(epilogue_params); + } + + // Obtain warp index + int const warp_group_thread_idx = threadIdx.x % cutlass::NumThreadsPerWarpGroup; + + // additional pipeline to synchronize out-of-place smem transpose of V + PipelineParamsVt pipeline_params_vt; + pipeline_params_vt.producer_arv_count = NumCopyThreads; + pipeline_params_vt.consumer_arv_count = NumMmaThreads; + MainloopPipelineVt pipeline_vt(shared_storage.pipeline_vt, pipeline_params_vt); + + PipelineParams pipeline_params; + pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytesK; + int warp_group_idx = cutlass::canonical_warp_group_idx(); + pipeline_params.role = warp_group_idx == 0 + ? 
MainloopPipeline::ThreadCategory::Producer + : MainloopPipeline::ThreadCategory::Consumer; + pipeline_params.is_leader = warp_group_thread_idx == 0; + pipeline_params.num_consumers = NumMmaThreads; + + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_Q.init(1 /*numThreads*/); + shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); + } + // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init(); + MainloopPipeline pipeline_k(shared_storage.pipeline_k, pipeline_params, ClusterShape{}); + // pipeline_v has producer warpgroup for its consumer in fp8 kernel + pipeline_params.num_consumers = NumCopyThreads; + pipeline_params.role = MainloopPipeline::ThreadCategory::ProducerConsumer; + MainloopPipeline pipeline_v(shared_storage.pipeline_v, pipeline_params, ClusterShape{}); + + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue; + + // We need this to guarantee that the Pipeline init is visible to all producers and consumer blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + cute::cluster_wait(); + } else { + __syncthreads(); + } + + static_assert(Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); + if (warp_group_idx == 0) { // Producer + cutlass::arch::warpgroup_reg_dealloc(); + + PipelineState smem_pipe_write = cutlass::make_producer_start_state(); + PipelineState smem_pipe_read, smem_pipe_release; + + int work_idx = 0; + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + if constexpr(kUseVarSeqLen) { + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if constexpr(Is_causal) { + if(n_block_max <= 0) { + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + // need to sync producer warpgroup + cutlass::arch::NamedBarrier::sync(NumCopyThreads, static_cast(FwdNamedBarriers::ProducerWG) /*id*/); + continue; + } + } + collective_mainloop.load_fp8( + mainloop_params, pipeline_k, pipeline_v, pipeline_vt, + smem_pipe_write, smem_pipe_read, shared_storage, + scheduler, scheduler_params, work_tile_info, block_coord, work_idx, + seqlen_traits_q, seqlen_traits_k); + ++work_idx; + // don't need to sync producer warpgroup here + // if constexpr (Is_causal) { + // cutlass::arch::NamedBarrier::sync(NumCopyThreads, static_cast(FwdNamedBarriers::ProducerWG) /*id*/); } + } + collective_mainloop.load_tail_one_write(pipeline_k, pipeline_v, smem_pipe_write); + } else { // Consumer + cutlass::arch::warpgroup_reg_alloc(); + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + // Initialize matmul objects. 
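+ // Consumer warpgroups only issue the tensor-core MMAs; smem_pipe_read tracks the pipeline
+ // stage currently being consumed from shared memory and smem_pipe_release marks when that
+ // stage can be handed back to the producer warpgroup.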
+ typename Ktraits::TiledMma1 tiled_mma1; + PipelineState smem_pipe_read; + PipelineState smem_pipe_release; + + collective_mainloop.mma_init(); + scheduler.init_consumer(); + + int work_idx = 0; + CUTLASS_PRAGMA_NO_UNROLL + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + // Attention output (GEMM-II) accumulator. + Tensor tOrO = partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{})); + flash::Softmax<2 * (2 * kBlockM / NumMmaThreads), Use_max_offset> softmax; + + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + if constexpr(kUseVarSeqLen) { + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if constexpr(Is_causal) { + if(n_block_max <= 0) { // We exit early and write 0 to gO and -inf to gLSE. + collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + continue; + } + } + + collective_mainloop.mma_fp8( + mainloop_params, pipeline_k, pipeline_vt, smem_pipe_read, smem_pipe_release, + tOrO, softmax, n_block_max, + threadIdx.x - NumCopyThreads, work_idx, m_block, + shared_storage, seqlen_traits_q, seqlen_traits_k); + + #ifndef NO_FP8_COLUMN_PERMUTE + collective_epilogue.store_fp8(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + #else + collective_epilogue.store(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + #endif + ++work_idx; + } + collective_epilogue.store_tail(); + } + +} + +} // namespace flash diff --git a/flash_fwd_launch_template.h b/flash_fwd_launch_template.h new file mode 100644 index 0000000000000000000000000000000000000000..df128c8316b57c6b99569f52ae00fa13158812d0 --- /dev/null +++ b/flash_fwd_launch_template.h @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/cluster_launch.hpp" + +#include "static_switch.h" +#include "flash.h" +#include "tile_scheduler.hpp" +#include "flash_fwd_kernel.h" +#include "kernel_traits.h" +#include "seq_len.h" +#include "utils.h" + + +template +void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { + using Element = typename Kernel_traits::Element; + using OutputType = typename Kernel_traits::OutputType; + using TileShape_MNK = typename Kernel_traits::TileShape_MNK; + using ClusterShape = typename Kernel_traits::ClusterShape_MNK; + + // print(typename Kernel_traits::SmemLayoutVt{}); printf("\n"); print(typename Kernel_traits::SmemLayoutVt_tmp{}); + using CollectiveMainloop = flash::CollectiveMainloopFwd; + using CollectiveEpilogue = flash::CollectiveEpilogueFwd; + using Scheduler = std::conditional_t< + Seqlen_traits::kUseVarSeqLen, + flash::SingleTileScheduler, + std::conditional_t + >>; + // using Scheduler = flash::SingleTileScheduler; + Seqlen_traits seqlen_traits_q( + params.total_q, params.seqlen_q, params.cu_seqlens_q); + Seqlen_traits seqlen_traits_k( + params.total_k, params.seqlen_k, params.cu_seqlens_k, params.seqused_k); + typename CollectiveMainloop::Params mainloop_params = + CollectiveMainloop::to_underlying_arguments({ + static_cast(params.q_ptr), + seqlen_traits_q.get_gmem_layout( + params.seqlen_q, params.d, params.h, params.b, + params.q_row_stride, params.q_head_stride, params.q_batch_stride + ), // layout_Q + static_cast(params.k_ptr), + seqlen_traits_k.get_gmem_layout( + params.seqlen_k, params.d, params.h_k, params.b, + params.k_row_stride, params.k_head_stride, params.k_batch_stride + ), // layout_K + static_cast(params.v_ptr), + seqlen_traits_k.get_gmem_layout( + params.seqlen_k, params.d, params.h_k, params.b, + params.v_row_stride, params.v_head_stride, params.v_batch_stride + ), // layout_V + params.scale_softmax_log2 + }); + typename CollectiveEpilogue::Params epilogue_params = + CollectiveEpilogue::to_underlying_arguments({ + static_cast(params.o_ptr), + seqlen_traits_q.get_gmem_layout( + params.seqlen_q, params.d, params.h, params.b, + params.o_row_stride, params.o_head_stride, params.o_batch_stride + ), // layout_O + static_cast(params.softmax_lse_ptr), + seqlen_traits_q.get_lse_gmem_layout( + params.seqlen_q, params.h, params.b + ) // layout_LSE + }); + + int num_blocks_m = cutlass::ceil_div(params.seqlen_q, Kernel_traits::kBlockM); + num_blocks_m = cutlass::ceil_div(num_blocks_m, size<0>(ClusterShape{})) * size<0>(ClusterShape{}); + typename Scheduler::Arguments scheduler_args = {num_blocks_m, params.h, params.b, params.tile_count_semaphore}; + typename Scheduler::Params scheduler_params = Scheduler::to_underlying_arguments(scheduler_args); + + // Get the ptr to kernel function. 
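+ // The kernel symbol is picked at compile time: the fp8 warp-specialized kernel when the
+ // element type is 8 bits wide, the fp16/bf16 one otherwise. Requesting more than the default
+ // 48 KB of dynamic shared memory requires the explicit cudaFuncSetAttribute opt-in below.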
+ void *kernel; + if constexpr(cutlass::sizeof_bits_v == 8) + kernel = (void *)flash::compute_attn_ws_fp8; + else + kernel = (void *)flash::compute_attn_ws; + int smem_size = sizeof(typename Kernel_traits::SharedStorage); + // int smem_size_q = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_q)); + // int smem_size_k = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_k)); + // int smem_size_v = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_v)); + // int smem_size_o = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_o)); + // printf("smem_size = %d, q = %d, k = %d, v = %d, o = %d.\n", smem_size, smem_size_q, smem_size_k, smem_size_v, smem_size_o); + if (smem_size >= 48 * 1024) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + + int device; + cudaGetDevice(&device); + int multiprocessor_count; + CHECK_CUDA(cudaDeviceGetAttribute(&multiprocessor_count, cudaDevAttrMultiProcessorCount, device)); + dim3 grid_dims = Scheduler::get_grid_dim(scheduler_args, multiprocessor_count); + static constexpr int ctaSize = Kernel_traits::kNWarps * 32; + dim3 block_dims(ctaSize); + dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{})); + cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream}; + cutlass::launch_kernel_on_cluster( + launch_params, kernel, mainloop_params, epilogue_params, + scheduler_params, seqlen_traits_q, seqlen_traits_k); + CHECK_CUDA_KERNEL_LAUNCH(); +} + +template +void run_mha_fwd_hdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, Seqlen_traits + >(params, stream); + }); + }); +} + +template +void run_mha_fwd_hdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even and not Is_causal + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, Seqlen_traits + >(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 256; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, Seqlen_traits + >(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim64_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + constexpr static int kBlockM = 192; + constexpr static int kBlockN = 128; + constexpr static int kNWarps = 4 + kBlockM/16; + constexpr static int kStages = 4; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && + 
!Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim128_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + constexpr static int kBlockM = 128; + constexpr static int kBlockN = 256; + constexpr static int kNWarps = 4 + kBlockM/16; + constexpr static int kStages = 2; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && + !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim256_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 256; + constexpr static int kBlockM = 128; + constexpr static int kBlockN = 128; + constexpr static int kNWarps = 4 + kBlockM/16; + constexpr static int kStages = 2; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && + !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); + }); + }); + }); +} diff --git a/flash_fwd_split_hdim128_bf16_causal_sm80.cu b/flash_fwd_split_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a959c9ceb143bee1f46602c3af6a1e7d5564738a --- /dev/null +++ b/flash_fwd_split_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim128_bf16_sm80.cu b/flash_fwd_split_hdim128_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e608e308ef75acb59696266439073babac5529fc --- /dev/null +++ b/flash_fwd_split_hdim128_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim128_fp16_causal_sm80.cu b/flash_fwd_split_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..3dd74e273d5a4b4b7f114799883deace0ab8c4c7 --- /dev/null +++ b/flash_fwd_split_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim128_fp16_sm80.cu b/flash_fwd_split_hdim128_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..addacedf49c60ed3469a9c72ecb5f774860f79f0 --- /dev/null +++ b/flash_fwd_split_hdim128_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_bf16_causal_sm80.cu b/flash_fwd_split_hdim160_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ace7bda96877448195bc53500bc87b986ee602a --- /dev/null +++ b/flash_fwd_split_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_bf16_sm80.cu b/flash_fwd_split_hdim160_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e133ec1af696fdac66785bac5148c06cb37e245 --- /dev/null +++ b/flash_fwd_split_hdim160_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_fp16_causal_sm80.cu b/flash_fwd_split_hdim160_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1723c69e08198f7e5898b5877fc387cc7c14bd21 --- /dev/null +++ b/flash_fwd_split_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_fp16_sm80.cu b/flash_fwd_split_hdim160_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..892d2352aa581c8a4bc32401256365b71c2e8492 --- /dev/null +++ b/flash_fwd_split_hdim160_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim192_bf16_causal_sm80.cu b/flash_fwd_split_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d07ee0af2f18062323a3035897db97de10f033f9 --- /dev/null +++ b/flash_fwd_split_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
diff --git a/flash_fwd_split_hdim192_bf16_sm80.cu b/flash_fwd_split_hdim192_bf16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..23cfa59d5a03861cab3e0c50ac3ff857cb623aa0
--- /dev/null
+++ b/flash_fwd_split_hdim192_bf16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim192_fp16_causal_sm80.cu b/flash_fwd_split_hdim192_fp16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..273a28442790c8b063677fc0cbc43d662e51b871
--- /dev/null
+++ b/flash_fwd_split_hdim192_fp16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim192_fp16_sm80.cu b/flash_fwd_split_hdim192_fp16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0f588d1f4482008d8df314a31d75cd932e25f91c
--- /dev/null
+++ b/flash_fwd_split_hdim192_fp16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_bf16_causal_sm80.cu b/flash_fwd_split_hdim224_bf16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ea024d9abf92199b436a598e2e4b1a36351b6146
--- /dev/null
+++ b/flash_fwd_split_hdim224_bf16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_bf16_sm80.cu b/flash_fwd_split_hdim224_bf16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b06ae5ace1fa516425b852776e470166f034430c
--- /dev/null
+++ b/flash_fwd_split_hdim224_bf16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_fp16_causal_sm80.cu b/flash_fwd_split_hdim224_fp16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b217f37891014db4e40548cb630d85a1252a6793
--- /dev/null
+++ b/flash_fwd_split_hdim224_fp16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_fp16_sm80.cu b/flash_fwd_split_hdim224_fp16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8cf2eabed632d46a0f69578e56be6da5c2248f6d
--- /dev/null
+++ b/flash_fwd_split_hdim224_fp16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_bf16_causal_sm80.cu b/flash_fwd_split_hdim256_bf16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..370fe9ca3eea4dea5ea26afd5bc9ab4ff267b690
--- /dev/null
+++ b/flash_fwd_split_hdim256_bf16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_bf16_sm80.cu b/flash_fwd_split_hdim256_bf16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..508f07f7d3d6eee0494b1ec96326c63487e74c3b
--- /dev/null
+++ b/flash_fwd_split_hdim256_bf16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_fp16_causal_sm80.cu b/flash_fwd_split_hdim256_fp16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..019ded67f9d00fc38be09e975b7f8e52a0c48572
--- /dev/null
+++ b/flash_fwd_split_hdim256_fp16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_fp16_sm80.cu b/flash_fwd_split_hdim256_fp16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..708f5542acb78266a7e22323a5e50b333437b661
--- /dev/null
+++ b/flash_fwd_split_hdim256_fp16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_bf16_causal_sm80.cu b/flash_fwd_split_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a205b7e793a21258ca9052bcf4a0dc4a94b35df --- /dev/null +++ b/flash_fwd_split_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_bf16_sm80.cu b/flash_fwd_split_hdim32_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..2c576f118debc6476f509037c227a2d4d12acdc1 --- /dev/null +++ b/flash_fwd_split_hdim32_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_fp16_causal_sm80.cu b/flash_fwd_split_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..484a15e9368fbfa37dd3cbf1553ba207aa109a87 --- /dev/null +++ b/flash_fwd_split_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_fp16_sm80.cu b/flash_fwd_split_hdim32_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5474ae89d92d9e3c2f96f67e334ccf0f19ee3762 --- /dev/null +++ b/flash_fwd_split_hdim32_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_bf16_causal_sm80.cu b/flash_fwd_split_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c7da41ddc854caa7afbdb6e9170a231b33fcc04 --- /dev/null +++ b/flash_fwd_split_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_bf16_sm80.cu b/flash_fwd_split_hdim64_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..93f29dea8afaf6dbbe9389d933a7ba47ec0262b0 --- /dev/null +++ b/flash_fwd_split_hdim64_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_fp16_causal_sm80.cu b/flash_fwd_split_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e2e12b8cb32f60bc50d64fa8123f3fd1ff06cf6 --- /dev/null +++ b/flash_fwd_split_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_fp16_sm80.cu b/flash_fwd_split_hdim64_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..16c34ed3f578e202eb452fa79ed9f1c3a5fb7792 --- /dev/null +++ b/flash_fwd_split_hdim64_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_bf16_causal_sm80.cu b/flash_fwd_split_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..50080c47e57f600bded3f5d250e54426512d553a --- /dev/null +++ b/flash_fwd_split_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_bf16_sm80.cu b/flash_fwd_split_hdim96_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..ae56ddd4cae718bebb1039861f83b49e77cc7a02 --- /dev/null +++ b/flash_fwd_split_hdim96_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_fp16_causal_sm80.cu b/flash_fwd_split_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed305767e1b47334705ffcc8fef9c9f4998adbd7 --- /dev/null +++ b/flash_fwd_split_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_fp16_sm80.cu b/flash_fwd_split_hdim96_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..02206465616bb84092a7d3640c07f44d7fbdecc5 --- /dev/null +++ b/flash_fwd_split_hdim96_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flashattention_logo.png b/flashattention_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..b8bd5d849754b640f7d815ee04de6098b2a7ba5b --- /dev/null +++ b/flashattention_logo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61969fc112a38be106744ce2c416a2bca8026a173ef3cbb883826c998732958c +size 2738980 diff --git a/flashattn_banner.jpg b/flashattn_banner.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2dcee8582111d52b5bdba9bc7c8dbaf863b884ac Binary files /dev/null and b/flashattn_banner.jpg differ diff --git a/flashattn_banner.pdf b/flashattn_banner.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c4ad4270ae0106f1e9b45f8f3ca4b053e861dcde --- /dev/null +++ b/flashattn_banner.pdf @@ -0,0 +1,3356 @@ +%PDF-1.5 %âãÏÓ +1 0 obj <>/OCGs[7 0 R 8 0 R 9 0 R 10 0 R 45 0 R 46 0 R 47 0 R 48 0 R 82 0 R 83 0 R 84 0 R 85 0 R 119 0 R 120 0 R 121 0 R 122 0 R 156 0 R 157 0 R 158 0 R 159 0 R 193 0 R 194 0 R 195 0 R 196 0 R 230 0 R 231 0 R 232 0 R 233 0 R 267 0 R 268 0 R 269 0 R 270 0 R 304 0 R 305 0 R 306 0 R 307 0 R 341 0 R 342 0 R 343 0 R 344 0 R 377 0 R 378 0 R 379 0 R 380 0 R 413 0 R 414 0 R 415 0 R 416 0 R 449 0 R 450 0 R 451 0 R 452 0 R 485 0 R 486 0 R 487 0 R 488 0 R 521 0 R 522 0 R 523 0 R 524 0 R 557 0 R 558 0 R 559 0 R 560 0 R 593 0 R 594 0 R 595 0 R 596 0 R 629 0 R 630 0 R 631 0 R 632 0 R 665 0 R 666 0 R 667 0 R 668 0 R 701 0 R 702 0 R 703 0 R 704 0 R 737 0 R 738 0 R 739 0 R 740 0 R 773 0 R 774 0 R 775 0 R 776 0 R 809 0 R 810 0 R 811 0 R 812 0 R 845 0 R 846 0 R 847 0 R 848 0 R 881 0 R 882 0 R 883 0 R 884 0 R 919 0 R 920 0 R 921 0 R 922 0 R 961 0 R 962 0 R 963 0 R 964 0 R 1003 0 R 1004 0 R 1005 0 R 1006 0 R 1045 0 R 1046 0 R 1047 0 R 1048 0 R]>>/Pages 3 0 R/Type/Catalog>> endobj 2 0 obj <>stream + + + + + application/pdf + + + Print + + + 2022-05-23T07:35:16-07:00 + 2022-05-23T07:35:16-07:00 + 2022-05-16T17:20:26-07:00 + Adobe Illustrator 26.2 (Macintosh) + + + + 256 + 176 + JPEG + /9j/4AAQSkZJRgABAgEASABIAAD/7QAsUGhvdG9zaG9wIDMuMAA4QklNA+0AAAAAABAASAAAAAEA AQBIAAAAAQAB/+4ADkFkb2JlAGTAAAAAAf/bAIQABgQEBAUEBgUFBgkGBQYJCwgGBggLDAoKCwoK DBAMDAwMDAwQDA4PEA8ODBMTFBQTExwbGxscHx8fHx8fHx8fHwEHBwcNDA0YEBAYGhURFRofHx8f Hx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8f/8AAEQgAsAEAAwER AAIRAQMRAf/EAaIAAAAHAQEBAQEAAAAAAAAAAAQFAwIGAQAHCAkKCwEAAgIDAQEBAQEAAAAAAAAA AQACAwQFBgcICQoLEAACAQMDAgQCBgcDBAIGAnMBAgMRBAAFIRIxQVEGE2EicYEUMpGhBxWxQiPB UtHhMxZi8CRygvElQzRTkqKyY3PCNUQnk6OzNhdUZHTD0uIIJoMJChgZhJRFRqS0VtNVKBry4/PE 1OT0ZXWFlaW1xdXl9WZ2hpamtsbW5vY3R1dnd4eXp7fH1+f3OEhYaHiImKi4yNjo+Ck5SVlpeYmZ qbnJ2en5KjpKWmp6ipqqusra6voRAAICAQIDBQUEBQYECAMDbQEAAhEDBCESMUEFURNhIgZxgZEy obHwFMHR4SNCFVJicvEzJDRDghaSUyWiY7LCB3PSNeJEgxdUkwgJChgZJjZFGidkdFU38qOzwygp 0+PzhJSktMTU5PRldYWVpbXF1eX1RlZmdoaWprbG1ub2R1dnd4eXp7fH1+f3OEhYaHiImKi4yNjo +DlJWWl5iZmpucnZ6fkqOkpaanqKmqq6ytrq+v/aAAwDAQACEQMRAD8A9U4q7FXYq7FXYq7FXYq0 XUMEJHIgkLXcgUqae1RireKtI6uodCGVgCrA1BB6EHFW8VdirsVdirsVdirsVdirsVdirsVdirsV dirEtc/M3yzpOtx6NJcJJqCysl9bhgjwRLZvd+uQ1AyAKiu1QqBuTEKCcVXSfml5DS6Fu2s20bqr S3HrsYBHCjem0jGRVFFmIiapFHDKfiRgFUefO/lX9FTauNQRtMhngtXvFV2iMt00SRBGCkOrPcIv JaqGqCQVaiqMttXF/Z6df6XEbuwv6SGZuUDJbtC8iSiOVVZuTBF4mn2q9sVYPN55846pNfXWiaab LS9PsFvCl/wE90p+uKxtGgN1C3J7SERSFuIV2kKSK0dVU31zWb/SWspZ766s0tbiKzJvVspIdUUq 
¤g™€•<¤ò«fe—u/ESJn^tr§¹aoû˜IGÇjÅ$_¬)jA#q›·™‰bÍ/ fHxõfëÍÌNÏP xÝyy<ÉPý7Óây­3ûo.¥Sz]JÅl<{}ôšL83Œ…CšIñ 4m™N]"˜±ÏÕ׊XOõf&ͧ禙9`ý*ìdØË% §ÕÕCi‡ÞÙÙ=Žî_vÏt®‡q¼4Ì(±¶awÛ@K2Ðd„:€ôˆ8+–ZMàŒ{î[¦j–v¾¾‹ ÍìHSøLñ3˜Ú³S¨k÷Ó‹o×™gŠø«š5š +gÒ[›6H^HesÍW¹¹ebSdg®˜þ•ùìÿ²ŽòŽÖ/ÝLu_/çhݨÅjvÐ+mÌ={ê`r¢3)$CÌÃàÂe ËE慄?W¹|1ifV%–4æÌyº+P ¯ëk* «Ù:•ˆól&jfeR;¢/<çk?èƒsæí KᮈˆH%Í5£ÚïítfÏ$¡Ž•OgŠ¿Q ¢3}Û×ape9ü”ÄÓ‡=®TûD¾œe5{,DÄpXµòŽ'tÌ$cÎw>•ïS +M8S»5^3…÷ ³ÆæÕ5LjúHŒ§Y¡>¢”J§>2‡ØåúŸ5º-ô¦] mêc¡ž¶*Ôëã3ÔºLY±¢êék§HÄ"ŒôPL~Žûh—vcÖcùD8Z~$Š"=lÊvq~¤BÅ`ª– K8Êw[-CfGn[ ÃçÒ£) o Óé ïŠTp^ç±Æ'àô!/Ög„“¢¦E®ú›Ø G8u] Â9n’¹j £ùDÎÏ#óØê*œ_>@E³­:©œžŒsš]Äeœo¿CKÎ8Ñet|BœíTiT€2§žv¢?Bœƒªâ´¡ª¤m`{7•Ó‰¨ú@x߇µÝÙjvÞöb¦nNÓÙ† Ž½°Â:H—³9ê]ZÚ㬺¼×بI§äl6ÔÙ +’òÒ¶¸œíc¯D‚‹s? ' +­rã׉Æltm;ü°ãc¶pÒ ³![ËNLÙ¬ø‹±B­²ý§ÈÈL4e“wœ=5eCz02—óâB2ÛL×U6‰Úºpf¶-1É›ëlÖƒ:CÛ툶[qh#ã¡ŽBÚn_Kh·ãŒê €ç2;`êtáI ßf¤£ƒ¥mΔ'ZR˜ÞIQ|§OÞ¯6'ˆ1ä|¾ÌCøƒd“ÎBP̪íz›©”q!nÀGIêÓ‚NåºüJÊKãzý|Å=¼\$!G¦ÐQC0-ÅÕ%Ó^5aR4,䢑VA +/…~M+k—šÛ(!Ö3Q×AH9=$ªP© ^^¤uS«U‰ü¯{ë›)Nð ­þOD¶ßIüªjàZ“î¸Èý™Žðm Af½Š8 ;™ÊX`!®aŒŠ#r0à%?!Üþ¢þ|@¤êZP‚|úÍ[9.›]ÿ‰÷³lÎcÅ‚T5å¸=„a™q3u'¶Œyù`³u}²Qˆ¹¨)±Ø¯ÎŸú¯õ,½Ñ åf@]éêƒ.û¼ž8›ƒ4”7a!´D·bo¼–d¢D:ÐœKé êrÍ À!^0YZÓ ´þ,×Ö÷Ë›©º×¦›¨ÒÓm¥­â]P’½”’/­m)öƒÙÙt&;Ã×’GX²L~€E•9k~†Â‡Òºn©BŸR$±ð5.Å)z¶¦<Ìp`o¡m/L&’ظv#ÞÒ•ÐÆY)œ;!8¢‡7}­§ë‡’ÅnZÖXíé]‹µ ³‹ÃO«P?¥+tù©Âáþp.›ŒlÜÆAÍâ8*á1šc·…›¥ -N¶Ì€’àëIŒFæ'[DÏBúÀjDkï¦ ó»V0‰Ù¸àÉ;òHàæ|T¥ï8·E½¾JÐœU Ê “i˜!/ˆw”U½·u´ ùþÁUŒ ©(}O&xb&®´²åEÐw1¼€ôŒÿÏÔÌM_.õäþ*–¸pÅ‹žJÛ€ñ¶Ì“è1¤ƒÿ>•†þf²¯QÙœoÖŸRNØÂߘqѳí,ß~4+/ªñé,"=ØÄý@ŠBÇi0˜|'·´3÷þÍÁ­ëÜÛ€w"kK% 1n.qË.ܨ³X¶¸O(VkÈ÷ÒÌ™dº&Aì¢v駓¹Â’‚ñ¾èT< nTX‹˜þäÈcÓ°‰Û²Ð +ÔK¤H+`¬-öÿUD‹ñŠ{2¦k/uÂ(OuSè9ÉVCƽC²¯n|Rrxq‚pGP4©”š÷…îÈGA3„FYrEo×¢TïjJ7¿”…7 +Ñå› +Ï‘¹~ḆL}—SMØX ÆžÊj"ê Ñ“¯ï×`°¦Ù©p‹JÐ.úÕ8覕0~Ô‚xgÎøC­ +^Ó}ŸŒå¾Dцw¢& véêï»J´uU€ts/ÄËï\O<Þ.ÎÛƒ´[ïÅý­žïvëZ®±fz ÃnÝØ^祵óaÀn¬”r¯¤ÎÈ#^#sK2 f ²í¯8Ò\zëÆý( YE Z¼€ý•öÒ›‘«“è?rˆÓ,úü—‹–%ŸMÆ^7æ9è4–zfÞW”K¶X‰$¥ÜP_:á7dÂД‹Àö÷²¾1Ù'ò"Î= Ò1i^Ÿà2ò¨ÆÆKìD2tÚNöXå §T®ˆâ òX̱­,¤Jñ5¡ Å4jEÐ$Q4$éÕš½RÑ”©¡À3L÷p¢°Båd‚/‰¨!š®rFu¶deyƒ²®‡2lW2 I `…/!Œ@ÕtŽ˜‘’ºaTÝ'T‘“d žBÓ}Ï0Þ–”‡¬$ÁIÃêˆHÊG Zѹö'p(r½kþ"ÒÎ]Åâv{ð¸7àQoΈŒŽ&‹'"Nuù–ê¾Ú ñX­í£¨#Ã5¥è¥õ‘,©å·Ä·[‰ õcŠJïv÷ICpôL&~Ú§Õƒ"yÛ×# Tˆn^}-¦>AÓ‡”%)nG£1ÄÅù øb´†FC™(‹‚Ðëi¼ÖDð!fjBXm%ý“ …þ=̪OäjK +ÒoR1(Ó×äGP‚Ðp¨A P¼¾ìõ/“- ç[¼´¿À%ÇØ3ÿF+¾' Êa®7´Hê¹ÙSr…bp«™›ÛɪÅdW¢©ªå/£ÒNãg¯B³5 'NöèFéØû¢ûaˆÀ$ÇÞQE/X1ÔËøŠÄÖnÞ$?ɶ‹Ê¬¢©¡Ðó&Ìh¨¯'_;e,yVcY±þ”S3^ìV‹2+±–¹&3õc~‹…—‡Ä‹‚ò“ü@|8=²²i- áL0aÀSÔ’¦†p¾E«‘ÌäaÛâ’’’©N­sIÄã‚ccÆ —C’‘XµR+”W{fÙÌZ[±õþË,kµw[ª‚× +ÏÕÕ®%¼4a/›ú'é®ç[>ÅúÃÇÆ$ñŽÉi’Óæ¡Q^—eÖ4䃤÷¨Jj¾S]ê#æÙؽ7·f™´)jq¸õwÿ¯–ƒ×Ió²Û¬J,A˜ÿ·Üf!(ÁéLJ‰¡ i”u7Ôaž\ne<÷/ñ°}Iˆ$d_yÜá¡&X(ð® ÇÁÕ½–¿ ÉZò­.ƒ¼ÿ—Ôd sïüïÃÞ;gnèÄÉ䟯'"\[mð¿Ƥ› +¤Došy bÙã`@‰Uv3¥Äræ’œgOÔ]HÁtîÉ#™Fü™6øÄä&¨sóž û¢¤¾Ã£§ðu7¾  ø•©æ}jt ïXM)ÒõK0øu4T"m4¦†‰‰¯ I_4öÅß}ƒ!è+(¶î3@q€ý’Àñ+É›÷¤ø 2Ñ[óªò~®Jë Ö0#=ÍÙf">x°Iå—\TKê {Ÿ‡ F|õ>``‚¬EÀúÆHíH.ã¦=‹é÷ŒÙס~*Jt׳„µ ΩLjÿÍÔ)Ï6¡†$¹ˆÑ–òûl!!'jt×Ã#A2°ì¹uaÐxnd¼;°ï¤e«¦mhäÕ]ú‚Ø›dê +ìAÄ+µö-‚JÔ pë…þŸïçTOy¤ä#ï‘ï =ôÉ)ã~|«¨vR’f©ø´ošC=èo9H[0=–S‚d˜+«~8ÍöxÁnYs¸¸¯¹4‚Mˆa]™ñ‡DÞá8|¤/± ·í3ì™Ó»ÃcävœEÉ,Y—1>‹Ia‡"6+Ùݺ¦[ùY•W€tèÎÛ¢xQ:Õ vù°÷¯Ë*{P Íj .ßɦå:‡Ä‹Å "—) %—2lš¢†kÑ„ï˜S÷Î?§#Ú¸‰P¶«´ãg æ…ˆb&oí×þÑXMÒNÖ žË€Ü“9øç£dÉ6 L8ãtqy(³}Œ`E„áo²÷ª¤²ös¼¸RÄħ¸2´àAK ¼m#V}žk#KxyÉÐÖ1G&®zºÉ° ±S«^Gò ÔéÇÛ\ÄV +ds6àîÂ^ rZ ¼a­ é  ‚„xdúõÛ®Ë3Qo° ˜ŠÄ™•Ô\vwœ;¬%îÏbüuF¤6 8í¦ ÷éh|ØÈ /¢_LË Äayà°¡ÈØÐPR0èMŸ ‹bTgÒfõü[¦¹yÙ©p(|Í°ßß”8hÚŒ«o›–!“jW|¬Ìï/ÅÀi\ Q†úk­£¿’`Ó•fÝt&Vßß:$f Ç»üVH¾‰|dtªüø—ÌgT»ëD/~7¿•ëIÇ @~Îœÿ¶cÕs;hÌÔºAä§ÊÔ/Ï}T¯iý<\²N—+Á*ؘáÂËáç©;¡ªw4¸ƒ¸/oÒ|ŽÓ]\/©îsPzˆ ÀSGa7¼.½2Hÿãx÷//äϯ‰¤sëYs—GC½ßæœÙPÿ[t_ShÅ6Rû?Ýš¢ú`Ûnºrv3A%DV²?’% õ½¯À–xÚ—«Î”$¡+’×Tà0ä¯ +f]N?ºkŸÖ+“³lÂJý;¨?uëÇN96ú>O…Æhs„o4ŽÞ¶:?AºÜFãÅHõÑŽq®BôCyp0MYP–±ëcÑØ•ë¾FJÑøP3ªOˆ^ÃØÚã qWqûø>Y{T#!ª‡þÞô¤|<ĽCé|X»›nºÜÑ6©Â…ëòÔmf»Z‚D Ù,i‘&惭ÁÇÁ•îô ‚RûGzœ±ôƒCñÇj»¸7@¯bý…ööR±¶·ì»Àúµ +xÄ)OBX_¼²Û¯6$Y=² Šìä²-c6žT©šÌÅ?'F£Cíw)iXù箜 
ÅR8ô–ïç"+LL9ÀMøO¥NÓ¥•‰5Y݇N+GèpgF¼ëµÕø¬Â99›ï‘¶ˆâÖÐ{¨^ YŒ0ûŸ–/,‚œK½"½™2oÚ¦{lÑ#„k¶&½,lúóH#W—À#ïd¶§¬´lÍÚ[)MŸy{5éz<ñ“ÞÛžÖ}—Xξqõš\qOÁ‰NÌ€=‚”©%0ëÛ½¢VÏ]‘My¢”ºB¥o}çì®í`pŸ?QTØ8ó™ÔóæÃÐ7š´ñìÆC“4^ºéÐÄ]h’ëƒ1‰Á¹/1° T=éKïöE À-r‹¶šm”{XÃõF£¥²­j•¡³^ŠùGu£ Õòýßᾘ¸IeS°üŒôœ*=Ãر \ö×¾±¤i" êØÛŽ³…Ñ kÇ*2_.4>A.áÊ€ÿ`Æ0¥EÞ–>…HQc>¤¯k•Öàêe«Ê·—rNä^âTbøÑ,AÖï+Qæ(1jJ±Ó€çô(»ïzYize]Sk)´øJL¼$&üŽ˜§;^‹hÓP'C·gŽ,gRuK<‰”Ô*%¥ÖÜðô¼ß^X• oOeÁTé—cV9JWÙÉ=VZÉžb>=ÑÍ‹ú‰ù­<µ¹òY¤:á•«¾ùfù·dØ,!ªYâû ·£¢³d–K¿7ð|±´˜òîa*B73jS­¡ub–ý( ocT³.÷wJõ~ú6>ì§àÀŽR½QoOùI4L©{*ÿbN\ÏwôÊ ï…žZç”ðažÔÖç³­Õ}tÖäÛ™õÖŠÊÍ€C 8·c¢…¼[5žkF™€S4ôß2ΗÉÁ9)ˆ3ªûÕt´LeZÚÄ2çªA|ÜÛ95/oª›ª—›š8§‹á➜à "Pâ]R˜'«/Œ]¿ÔÕ¦vJñ½¶ÂJ,x>½FIëX–Oé3‰ØYmQú[£Fõë0Ð@î™Ô(z¨¿¥Blü )QOĵlç#ZbÜ0œÄÅ£’Çn¡,†~tnõ²§4"›³“Ô„‹ÞùÄøv/ + ê¾’GV€pc„àVÅë\ +íµS)à0—/Äù˜ ¨T¤gùÿdÍÏ6Á»`DyfwXR?8ƒÔÁ X¥¤'í„ðÔ Ëj¨>2€…Ÿ¼"U‰*ˆÀØ(J`."bK#Iªº×ö+12% ' AØõv¹RÄ `P$dúÐèI{Ѽj؉‚ž/ø#f³lôß÷ª|§V xœq â$@Úó© ÅY"Û‚)([ ®Î@¾ì±òUÒm/öãÔáØ:ø$Ck:óï~&jzn(ÂìO Ò)\KM êOöçQn…x1ÐjŠ~žˆ¹ì’0lyËê$ÈpÁ!'`ÚÀ¢$=9ÓìÈÍœå::­8›•?1#é#yOýIJÕ-»‰]9ÉFîߣzûx?´!Ž­U2|óA2涘c1\á¬MgŸ™Y˜d*ƒ²±@IÏO¼ +b!! ™ÜA7º0è&ô7a~ÏœÈUo ðM#ìä4÷“Ù(sûqÊ\h^ÍtÐÔq?÷á¶ÆBÌ¿S ûæâ©×Îнx+(tÒ¤„H¬TÂ%(¤PÆ 4L„›ør0TÌ_Çak"Ç€%p° Î'ònî£ÊI¨¹~u€YðýØáÔ‚ÞW~J¯§${üÀ¡g¼¿ü3KƒÒXñÐi’©Þqyèö¨¶àñã9ǃóS³¤ruþæÑqºÄ½SJf!ôÅüàŒÁOB‡-Æä)Þ+îЂÏ;ÉÑ’“ŒôE.{>ý¬?5ëÓ9Ÿt +£Eê·^?BuWÓ°u;pV-L>0u9Ÿä~1O*Èð1â<ømÏf`© \²Ïo8JzÞ&á—z°i«[?0@ Ãÿ5míS¤xM0x=”@°ß¶dhVƒùè]Ñ“¹xœß¦Äk«åúCÞù˺C4)lóP“D¶‰½ì°ân†°ì‚yz/'ugcS¸3|é—I"£–giôÑÿS¼ÐfGÍ4äfLdÿ]£~ ®´Cÿ¹ÊˆA¬¾1øoÌƱÁàX +¼Ôf8f5m['€eÀ¥Çãä¸7¨wüMšÀ);8¾)J(‡ÄkqbkœðÿŽˆ·®.]: ä0eûh‡ŽÛ!ã\ +“›îmzvµìØu æIâ#•!1¥mÃ7Êú¹ã—#=lœZ”jëæ]]e”Ôl25Z™&öë´m{ëWœÞ8vi¯ð!»f!‹düJ mzQ?·ãqe°òxzr%P›©_ãÉQÈ£¾xðP¶iZBÀBz÷ž™ðd#ÝÝä²*é}ÄìS̸¡EkÉ×&ö× +Ïžf^Ó‡Ž’–è×WÕŠV\c"@ÀýÚûMó/ÍB—þ„¥µö¯~OÒ‡™föŸÎ=¢Ç÷C´žØ|94!l5býSfbb®« +æ}Ûš^f t­VQí!Ið#?1ËËödc°ùÉ`à¥ÅùüÓàÚÌÖ&wPdîºúüjÖ9XSÑ´¦…£ìV·wè}ÄXÞ@ù|ˆ èE$4n +ü_C´g)¨µÁoÛŠKE³q¨ýš¼¦•òä’-E $†;5FÜ"›Î›ÛòSª:Ȥ(þwÞTpë‹ÑNÅöRa€·E*dpþ_ d¥ëÙÈ ˜3Ü«Í#këaZî¨z<Ú퀣ÑkW{\y¾&„ÆèT[;¬ÚÐ7(‘/J‡UW©…Õp‘³‘‡hŒÆkÈo]9<°ê¼èQ†´ûì(§#xFo‚Ñÿž ‹è:Ç +cKêðbòEɧ¸Í\WíýÝ~O ÅÌCfY‘™Ù«·€„ ™qÔfvÌ¥m̪MÌ6{,@Ìšl˜ã÷ˆr˜G˜sü—ŽÇ¢’ âq0>òðÃ,-ÆCÙå‘ zЫÇ•²²{|ZŒ\¡eÉW–åÎjLõ°œ ŠÌØÿV–K݉ÈéÆÆŽü©%œ>’2Ýé›u˜ÑùVÀ D(ÔÃìKº€§tØ@Æ™[ˆY#O-sÒKWÙàF^{¢]ˆ¤âø"~rZËãóýŽßøŽ-z‡—%ÝQ>ëÁJ“6/žg– xR|åƒ.ÄŽü=Z¡¢úà÷¿Ý5H`ÓV9ËÏFê9Mb&L…ÐÁ ¦»X|€Ö ºÂ¾[1& ­Y´ÕÊÉ)›÷äJ¤N`PrREA¢Sá1“\)šè—OE…\YGB6"*’ÛC†TX)™ +š§‚’MIÃÃá ˆ`®¯w¬ai»&k×Á׫-›œk™¥µÐ¬åÕ2f·„“¯SrÐód)U\å8–Jð 0?ŸŒ€&ÐÀØ'U‚©Î7sÔõ!ˆ ó]å!È…†àŠ…`ˆ„€!ø”øJ  Hu È üÿd;= ÒŒú€>@°h—½ÕNZ +”NHÇ,pC¥ ̸è‹P÷¼K-,K^v>p’ÃNÁ†àÎLQ±í$Ù‰Äí"salhº¡kì󧙞 • çÆK4})¼¹ÎX[Cb„+0&ît˜LþKZH9?a铈µtcÔA 940x7A‰ªê«¢6„„Fö—@p¿a§Ÿ#P‘zD™;ï€ÔÃÇ©n)þóï1ãxµèÄO…iùÀ ðJHîe²n"‡H¸6¦X!™åv@s¸ŒI"H’·ß½`1‚™ÅÃpŽ›QdxÈ6߉ó;ìùŽ{ê'ßN"ËYêFhâ ´Ëu€Á%âÉS5‹3ÌæézGÏ“kççá:{ò™#Ä“šÇÍBÊÌý€ à¨ò¤cò¸‘GUÿãØQüÖ–_´š;çwj»çë qPŠøÜ‘&)ØËnƒ%DEò?N•FVÑ„øB+µ0›ï²Ñ'#là2øcV1–³; Œ4Àœ¹Çbá[·çBËÖµÈÏ"¥VšõžW”¬-‹µ‚uo¹AæoSVó`"€QUËÐs!\<õk,Q#¥š„@"éùÆ8ÌÊäSwT¸v8yÄP”•&²ÜÄÒQ#²»GR)'K9+;Ëæ#˥㵷.£ ×£É߯Ê@£nÝÚ’Ø$aÝ¡iô@/æ[ï ”c,Šöc|T7{®pŽ¢•ØjÁ•2”ú^±hábnmÇ I ¨k¾Ÿ¬YqA¶º…4:ÙŠ JõDV"%ìˆFp0èÓQœ‡n,@YÕK]|üA:?ŒÏÿý–tàóëë<’·µ ¥·Åø-”å‚ç]hÛ ´¹êafó eöèÿPóÃ5ļp„ÐÑ" të¬Ûº¹ßu§^×Üÿ¯clžØ°“±Å®t|Æ°> +4ùxñ/¼Ã°WKR|1%XP¬ýí@èYÒ^µR–` Á”¸ ûz;†l8;`!¡öøÿÃ]·3¯z`¸º + ð_°î‰ªÐD +a9‘¨ ‘Ä.é…æ9a Ä©É¢ª7€ îG1˜8TŠ8 NÃo[ÒçÐàyRŠ­oÈøÍo&Ö7éÁnîÍⱄÀód«¶Àj{z†êšéÄ|Ë'ƒ‡Í³JÍ)Cxc™â`ñnJwŽ·1`û»i‘. 
ÍgpƒëÙב½u®ŽÙÔôtÀ–ŽÙѽ“¨Š†O +Ää Šm2Eèæslµ–¥¶Ùi—J»l´B+á³YÊbFÍfÅ,vÿ¯¬áÉ'Î@ȨíËÆÂqšnÅŽEb¸Ëa‡ƒÅº`›kY ×í뜯£{•«×Øç5^CÝ »ÔÕXèÚ7à Üò²%˜™ÃU.Z°Â ÖO îÀœ:¸~Ê«f¹a®ò13a°ÒÙ8=ÊÂÞ²Ô»§iChÙíg‡í˜³ñËT96‰VÐfµ½½0vèdÒÇ>ˆBoÌöD¹d¡¹2[·J·t˜]‚Îô»™åwšPc +ÃB%4 ˆ!$±)TA#ê3‚‚CãÑÎhÇ’’³¡K8’Ž—¤RÐ,ˆXȹ Ü•Óvg´ƒ A4Ò”HJ H9Z–½Ê‹´Øž'¸¥¡¡t]b¸%g jáI+[´‚< Ó¬̲LY]Èšc¬™b­öQrÈûë÷÷œXwÄV,`´bØuÌrÚ€·Lºä’¶ÿD5U]Q¯'ÅÆuè$1;X–*CÆ°#GCGøê)Êæº CÜL™B¾Î_ÙAËŠ”¿¤ìBêU†^†«AŽ%Äq¦(œ¤‚­V(³0`oxGúz¦ó}Öçž|kÑ稞€ó¤o"Ì­pe€NΙÏÁ»ü¼£–áV&$Š)Iì&Ä*çuÝX +NUƒ,9í×µ/æ|9j½ŸñÂð.ÿ5Ý$¹\O\xßænËU¶ð=xÍi?ù~¹+ëf,"bÙŸþÒW(^yŠ1 Ãi jXI̪Ÿí±¢Ã´¤&U€ÜJNójÊ`Ü”ÂHÊóŽ>ô¬cJÔ(ûm² _k°G!ºøy¨XD3÷üv 6†#+¹6Ä(–i¡«l¸d¤áq/ø‰fZҀޔ͉a‚î`¢ÖvŠ¯cº…šÃoÀ:nlj蟹Vtü0›‘&Iåþˆù€«Jö5ÃŒœõ.ïúþͼ°%›êàÎt©á´aÛåû¯© H¢O—F’v¥)ê¾Ök¾¡àž±(bã*L•¸ý 8Y= + ´ÇV×¼KuÎD(  ôèêêª^&h`¶ËÂú¬33=ÛRÇüqmÝY-«q`š,,̯šÚUÆW¢–0‚»¤.#ì9^;’¨ënDn¶J–ÍLãv¶sKw)¾§à)¯¸T ¬‹|Dò÷QÜ:—\y%È4)ó¡S¢²BŸ¬ë8ò 1ÇÉ~-ŸÕ8³:Û6í‹ú}låÈúÃÃbMÕú-ð!æ[Î.þ^^fƒC(Gž0qÌ¢ßxÚ¯†¹ú +âÚÑ”ªa‡Öê¢0¹G‹4ˆ/Ÿ ñòf’]ò |d…ÎãÂØ‘‹NÂ^ ;âtœ~Y eÐ1E[æws›žs̘køgBB´E]þ‹i§”n]¦íµ´Q…ûnÝbïÝ vËÅÄ[sõ4úžþPËF—sL(ê"Ƕ­îEÌ„²k-Gɱ”É«Ù f/x7ÑÖ „ÅqÐL ,¥e"nÉÄôPë ÉÅ#Ž ¼ÉïÖÃÐùï¦Îàj¾-$”¬T-¤0 +8`*8ÃdMvûžX±ŒÉ4K5/žÜéO_E^úã¸fâfZâ?9\-pDÒJ©Žå7W'¨FeÊù”‰Iéì5¦Kw]Ðh1˜^hi–_Øa‰¯Vu ‚*g+Êî:Ъy2C ²‰¬Ö–gª‰ÑzšsI84Më²XÏ­˜Û·•‹JÄΧŽûèdÉâðDI£?/X ææÈ—ÚfÿÔ‹û‡á$“RiÔôGÍœbÄ4‰¸¦¸+qÆB5ƱTì,: 2EfœÓ…4GÃ/ß…J¡0 c7s¢¶’…Ó‹~Ò:ÈBdŽ !°~^åVlÓV`ðÉ6ÛêqÖ P¼Ž×&Ñzxo[B9ÂœC‚Å hï¹Ù4òèPi<4jÌ}`²Ð\œñ.EHÓÇø»RKÿoa5°©¼£­ƒa&üšŒ׌!Ù™xM³2Œ%h\ £Ý…¡yûŸŒÌQj°+ à%È7¢šÞg ¼2#å;¸ÿºÖÓ¿!Àë˜é`º( +O& û¦Øˆ+o•Ü`ö“B2Øë:UŒÈÖl DFáoÓx0ÿ ]VÞT‡äF–2Oy +Ò»qƒdhê°DôzÞ˜üKü§ŠalˆÀ4ðü ŽÔÚf5nbkü4µÞÓ bÓv“^ÚÙ‹åùýgòÿOÚIÿüúŽÕÄÏ"y¡ÒV*½sƸPÑ/(ñ +)I)SJÊ.UU­( Ï»TÆé lÍB).B™—¨, +Ž¼ãÂÓ¼3ÿЦtÜážJ'd>j "GÝA¯»B0+#rZŒYºL  (Ø  H #PÀÁ€àÈÝK"¡>lPlpà ¸€Øà(8ÀJ‚ *‚d€A.À@T@Ê*Ø`Ì6(Á.AœÀ@Àd@‚ ,¸@ƒ p‚`À` 4È 1<àl V@‚ ,¸l@'È  +À€+¨À#Ø ƒdp H€ÁN0à &!y;šÊ’È,XºÛ£‚mýR:^­ƒ»X.´¥J8.¨SãÄ¿¨EÍWÌ8î5¨p 4aÁ¸&ü@Ȳ“Âl!1²†â<˜-0+éÉ# âœád;a]Œyvlždj5߃‰ù‹S†§,žE™Ê„ÐÉâ8i‚v•Èã¤VÅ•EÄ:…‰Xã‰æµó(hÃÕÄy‰å +Åë1ìTßÀâDBÃwæƃ>à îæ&ªE¥‘ûf‚󉢊¯QÂË`Ð\âfcu RQi’¡D+¶eRìL•Ê¬0âà`fÂ]jCW…‹^Ѫð±Q…(ù|¨$‘"ªq…æÀÂ!3P#¤)ªðbO¨º +¥<ˆ-j.3³,WÑËA]kuEàLj/² +õÃ-Pz)‘@B}UæÔXœ-QáXJj¾ò‚U}É—ÎýQKÖZ£s±´6/‘_¤úG­·þY @5šQÕÁ'FU€ƒ™²J7D¨ìG¬ TèOdý‚=ìJ…C¡)âg•pÆî +Ò1YþÅÇLŸ_];å8Èâàžå?hBÆöòÎØÙãC$ò8qF÷”²ÐÐI´ªøP¼¢C'ŽmßT•¢„µ‰2¥ŒßãЦQų†oQ§­â&„gòBÃjP½œŠŽ‰3åþDŒ"'‹²t†]µVO) šT’ÙÁ'¢qÙ^^d8cM=A%sWÊSi%tÌ:G8@ŸæQ´î5$EžFG,à…?ƒSëý~u‰* äq[wE ï!nDbj25x·}µhž9$Á9I®7¡ƒê¹°¡¶N¼bÓz0Û-~£ÙD¤PêpD;ÈuÆE‹Ù®¦S¬î@däeVN8n³àÀ‹s2ÉÐâUy/Ç!áÖ8F›†QGŽiûêtJn˜ÕŠ…qiîìN‰l©‰ÓÄëÛ=`EEJ’Ó«N ›s¤HëÆ‘K"o™vòrîÊ©8TKØ(«¯*F%Rc?p\GêˆjQ ‘#§pÊSÏÈKúµ9RrªE’[1é™PXΑšÅé„‘<ÒIrŠ´}¡Kf<¸\Èßp¡cáÂÉ^l„!ºX(áÔmÙƒ5ò´± +F“‰U¸SñÚì.T{‹ äJívØß«~jÑNRw‰Ìî­>óG +OˆàÁ…Š]E¾¤Sœ`hxÞéÙ‹(Äõ°êSTšžp%—=X¥Z¹GãPÁù,ˆÅ(ñXãbÆq_ðèì%Á0?h}ü½F +.ÊÐÞjpC26ÂMð€Ecí +¬~£—Aò…”JêàüðÎTç,aT6}'Vê3¥¸_eÏgmG5r¶âS¢GýQNï’©Ž’p·IYˆ1q{¤òÑKÂh^{¦­n1uMÕóÙ94ö¦Q£uejLÍc\RÊU +ò¼æ»0 ç5A¢ù«ˆÜ—M}{ž‚JB¤A/b¢ù÷;„'Ý™Ü=yÏ¿ éfdÍ;VWæ»ÄL'æ1ŸGyµŒ±¡Fœä´,{h +¾ü Qâž„UǦµÿ&¢:§K-:›ˆIU1Ö÷ÔD—:cNTŠà«f1.4S´ô´Î^jØyÐr¸ù–è\¨¬ »íÁQù:•E'iê@^ÙÏbOƒ1o®RSÖ­ Þë…P‰tìlÃÝÔ”Hßñ¼Z!…û’O˜ØnZ4Ä%Š&l¤³r„$U(žG‡¥Z4CD›mjÐ"5Q›ÈžN§ñ^B>CQ&YdÜ–èºÚ˜D§ìh щ´×~Òy-5ק$sy4ȹÛC©>(·°ešz!E»†>}»ƒÉd“]Ðý°ä¢Æ&2/Ç­Í71ÅÔ'êV¸ˆ=F„â 3ÒV‰…ç$â #CŠ‘ÂT¨ô¡0JFìE.Š½Û¥LC’ØI«¬Cc +ý‹e¢s±Ô»„\¦¬„·”Ɇ]f\ä¼,åtþ’RÍcèP!À{Ì*œ¨èkµ*L •âffîSµ@EBÑ°Tyˆ±«‹%&|XS|¢ø¿ñYÔ¶­¸˜ˆ(1Ž8‰z8q:â‰Çç°b9(îbc/š`bÉJ;Œ8+7X +;¾{B,!w±ÃME£‚&50§•iyo-M«îFo¿¤*åÐÛ­Ÿþ¥aш…Um§æŸó{ ‘ØŽ–‹%­yÖPy6ÚLœ_™cÞ‡Ð=¥)¡ß2aõ¶|r”Š–ˆÇM +»)’KļAs?#³0 ñŒ›”KœXÈ÷š ­îpÔe¨dÿE,qžrÿñ_ü(aü>+êQeÄÿ“BÑÁ*JÁ©pÓªLŠZ™¿ieŸVvì3aø¬ÌŸÅ¤5N|¤‰Ï¢L™ƒNGŒVó J£ì#* ’B! 
yPâô&³UÁ=˜ˆÈ¶¢ƒ@ã‡DÊWHÄí8üù¬XVH'Ö>p³_æA$ÑœävBRPë>Š¨U#"òÎ1‰ƒ„È'•BäoL¥‘ÉDšŽ012Iò:"†ÊéDÉæ´/ц¤¶jдâ¨áhFøGÅŒìÎl^~…¡9èÔgªí±gèj%HêyÓ’ÅL‰d&¨§òHæýyÁdd:Á•(§^Í+.Œ\*dw<2—Ö­Ùûšuñ2¡ùô½HØäXĸfº’´®c¯@³±ËÈïMãÈíPI¼y¥D¢ÎÌËTaÏ+·« WÇáŽ+f\ž ™ y†hJDbHŽËVCtxúGЫD~êÕldXe”û-pZUЋ!ó2âQ3QÅÚ\Ee¡(ÛEU±FQ¢<Ù<âOEYEmÑzªxî§6™jgÐYˆH¨3¬^èñJAòqg¦fJ35+Çȃ:s@º39Ìù¾.«‰rÔ&!õ)ªPá¥ÇxNo!â>efZÎGˆdT¥6ůÂ0W•—†H4×,ö‰>eÎ.ä ]C$¾<ä×MÎuS¬™!…Þ ŠáX¢É '>dßdô`ª’ ×p†ŠH‚j\¦¨¡ò#"ASâˆVy^qoB!ÄwDÿœÂZuÿkš¶Û‡¨S%!oÇR]"Pj„>߃.Q3òÅÈÁT$änmI•ô¶Tjwç’¸Tz™¾¬9IŒi¤Jhºª™óE*UPÈÔd2<óš–=íLQË"BÃUI-{f,kËÔ·²)#Õ–¥·‹¥Cna=¥™ªºÜ®ÖÈ·RÙm9'Þi •ŒŽº’˜'X¢‡¢4¦ + Í„Më:[4QECs!!ËxhìÔLhì yÆÞ´•}*|2´ºCJV¤z}0·ë{1X–qÂOT¶ŒCHjI…¼ðLí%É3—?&ÓÄ¢CŠEWkb5ˆH¬‰ƒÈ3SR4EŠXognPÑAýþ ÕC­KøõªBª5Bª=NÔöD/¬Pêê-i-D~ §Òd¥>–;ì#64¢>Bvrû ’Éop¤NEÈ1$SÓ"w$¿c[l‘çMÕ·¡zøI"î¼]Q‡bkøŽÏ`i„û òƒ¢ÓÒ…·Ïº2kã>KïÙÇçŸ:Zô’¦)„#ᔼ¥ÔêD‰à—Hµž¦óAt-ÄiÈû ¨ïq×Aƒ(Æ0Õð ©Ú ÿ—ðmŠTPL y +ÞtrÉ7t0BQr{-Lj§Ù‰#§0ÂÐEmä~ˆÛ¢*šü ¥“×  +–q[´(;ÊŸM¢ƒÅÐäAUCÜjÅZœ`T”¸LA sÆÕ>(Ib¦NŒšWLD†¥*Æ$¢ŠwåÙ9"EÁUòpÑ…†TJ9ðx\䟛¬¾IÅ&|µ£vÂÖV‘Û‰N( +%ùAä€C9Ùíȵé`%"%gj<¨WÉ/^— W§çð@Ìw†äÀ¦©fCæWd³æó°"&º 3¢¨â)»&á1ä4Eœƒ’ƒXó"oÕQKÝfA¤‘3 “¡Ú<¾P²Ps2kÓ/M¾L 9 ÉËäæ—„oIˆå)JÌÊDs°eòåás¢`ò4„B\µ\Är„Y£O³PÌEJ>pB”=2;2Òa2¹¤%Û¢ŠGT<ùAËC4©ÑP¬$eÇÔi¤U5õ SIä`¸jÈ4‰À™˜¡jMD)ÃÎE!%ÓXÈb™u¢;rL(¹"Ȱà«3á3a4!3.ጠUBVBQyRRG<Ò¼b!W„Ë%äNð”W\¨NðºsÄŠ=a†‹'ä€z@Õ™M•Œq‘™wQ¡he±é )±‹æ`ìá9A†>›Ô÷À,•6 9°Þðs#‘ÆvDeÌ ÑÈÂÇÈ·Ø=bˆuì†F"¸óHˆݵ1ïÍHˆÕ„n + ‰)˜ê¦ÖDSM˜sPY•tDÓSïô€eд0ó"LV Óƒ™™N…£æ;Z=Ø; +añ‚D+ +Œ"šT½ä¦ m¼Ð©Eˆñì›æ‘Kß}«›bA8¾e®ÍAY"†˜“E|΃GÇOàÁÔæ*9øM'þK‰&°¹¡pªÀƒ•D) ²?–}n +!# eIó©ñÂfµ%®µyè¤MŒ^æ¸ yÇfÑ¡Ú WôU„"215üH‘fþ‰V¼©ñPˆV2Õ¼IÿRŒ?ŽŸ—Ñ¥3 +"CZª†IʨU‘Ðå;¢äˆ²ŒênÔLós1 ѸÒZ¹Y%Å`AÒòƒ¸ tÃ! Ç"+Xö$´V'¿&­TL°b×*FR'"žJ†3¹_YɈLT†¯ê¢ÉPÙ¹h¯¢oÕ**rÏ*"cªÆÍjŠ$„ªf”J¹1#ª© )‡ƒ(8*¨ŠÆª=¨ª¦ˆ:E)–á")™b|eÆÐ7+L"ù”ئNf¢6úXIIýt‹÷Õ¤µs>.O(!s¸U/W•ˆd÷2ÓÍ=beZCRduæ%¬Î_ŸÍ«ñ!;¯­^;…: GØ”é„k$n’óNybQæÜÔWÄwŠÑQåÈT­¦¿¦ž’9îæzB©æ¿>)ã’°Tt´>?ŠÈ43%t8ÿøq"[ä=ÊÔqNa÷ôý«GHS¡‡Ô¡¾|ÚHg¿‘êUjÚ;”Ä낤eÖŒ"ÅŽšt6ړζ/!“Îú_µאּR[nÄÒò–ÝDGpIÑ­LùÑóá|¦ÅxÅê–bÄ{9dî’Ú~!Ò’¿$÷†JÁŸÔÜ©¹ìþ9²—[ºxñš£¯ò™fsÎ}3Å2H&dQ³T”ç`;0?Cv³ºÓš¶Ö¡êPM¨šNѪhŠ¤ã* Õ©šÀv¦¢SµÙ¸–Ð÷à"®«Ž`ØÐhêð˜T„¦1Q5!8Ñ(’o†¢Rãpñ±?§ÏÌ,ÌÔfõM©:-7Ôš¡±„fµ9(: ÍVöIÞ¦¸QYÊ•+Ê0 C´iÌ01tÌ¢R¡Oh#¤ÁY‹ò @AKÞÃs~<=*–z(Ðã–ªÐbv˽ëÕnª¯^d f™âå š°~Ëï‰B逛š»C4¹ëo£¾ºö`‘›AD¤k Õʪšªt §h4Z(î{†¤"#ý"S%‡bñÇÝ¡ g¤4]HŠ#Ç´Â94)/¡”Z#×B™âXf!R™‘1md$Ô¦»øô”=)¹hC-Kd MEÏAKhˆ}›\¤§å‘9;4Ž i¬¡e4gmh„©FÉk¬“D¨^¥´ÉG†hô#úÜâÉñŠPþXV­`Qˆc¨$bTEvÄÁ QÄ –ÌZ$#ñ`q¤2qxLÕ¶¿±Ô•¯cúÀ4j]"a²?ã r¼‹S‹Þ­^!·S|™^ÞÔ«×TU¢Ä-¢%JìZ©žÔ”_òù5Õyµð¨®š©ðËR×DÙ¨ðD&Q–ÖËT!ED5WTyÜKÝRTž‰*ª\ñ äÒfj܉Üõ +*‡å•V+¢ÜL鬞)¥ôQÂfñUžZˆ„Ý”…Ý:,©¸HŒ5– Ë‹X‡UaYvñH„µˆK‰]U–ºˆƒµ_¢.¦‰g'™Hè·ôŸ:ôS…F䌨ŠDôöé42ôüT‹û¡2C +U¬B.',y)¤ÓÔûJ¨l™FIØ­ù¤å<Ä i—5—FŸ§™>Ó˜d¦Ó?íO£èôGÊDûŸ6úyM5B‹içŽ^1]Έ¢¢\팢H +Ô*ŠÆ¡Ã+bÓ>™‰""G‰â"ZùB¶ÑF]X®›X¢‘‹—mTi>y°Œ—ÔZd¡…cÜhøHë„8V»T¤ˆ­¢JWQHmxà"Ÿâ®ž­ª(IÅ%8’ü†S”w!Å +I𘪾÷z#Η„#H'ˆ2áÍ'j܃šèE’éÈt”§æí`BÓ:ÔÔŸïòâÁƒúî–î÷Iñ>ჿ߰ò·É÷àwMħä6¸È0KE©3,^|ш¿q4^’!Ë•Ni¼xpj³ñ?Œ¿ZÆM‹=¤g¹yp%$5!å’¤.´”ŸÜ-}"“àIŽÔ¤“;بr]¾Ñ¹ÒMßä /‰ý“Ù,¨t›hËPheÄ"‹·c +*“ŒN#(HTÐÉG·©šwVT/¢ó7lHѹXÊ›hF:”©Î)Jø`¸Ã²s‡öÍØ%Œ›¦qw‡¦‘N[•ßÄê„ñv±S•*oûìÌÆQ¥ÆS±|o)=þÓ¹ˆ÷Œý³ßAÚŸÞ.—³ªf‰ÛŒ8O9ãaFµ£ªø~‰¼Óh[a«³Ó ™òµŠVnRoå&««‰0ˆ.ºhhãzo­³‰q¯`ƒ(²ÏH"û„CšÉ.B¡˜¼F4·×)å+¥Šd͈ˆ\†”šF»X•“SŠG±ˆå)Š15ÅpâößÎ|*ºe’¸‹4õêÜ$QŇ_øü ÝŠsXCUñ)„Êq!z*CñRŸÃê7šwqŠ-’$¦hv¤› h‡â'N|Šæˆ9²Žt­Ó^âÚ3‘ÇÞ’H)ÝI1\’„êäLB+çÍë›åMò4bÝá‚lÌ]»¥F¨¼ƒ»ƒ,‡—ýA Ç[¾aýÂa<>Rp/áû~kLk¦dKNãÈIJÅ9£!Ùiƒ?d¹¥|åØ%™¼J9‘2Qe§&D” ÉGÒ{¦¸ t_ÅL÷úª.ï“…ÐömB%DBû¾‰“7ò=¼Ñx¸Ê%Ɔ)³¢ŸkõÚ‹vÒ4Ñm›|UÞÅ[í1<Í\ŠÌ¶Jì$­ž#š›Š<8cù iá4&e†Ý½ÂXˆû‰¹ž…ÄØ"ܘc®73¦9U¨hHf"kßMÚ¹ÒÊF“öSr·V6%Ylµöü¹ò¦±VÞ4â¥JÏæ€ü»Kí/_H *žf2{k³xw%S]IE¨¨&®)žèª ×Èñ1+ÇiHÆð5[UÕ°+¤ù<Õ"kZi۶Ϊ“ª<Ëd¿·–ÙË£¶Ù'\GËù\µRŽún2›Ð^4ÓŽdß|*#o%£v2Ö¨“5FRåB6Qê'“šó†§YyÒH"¦¨äI[ ‹XIyš±”ˆ ò£‘ÈÝt´òÂqdJÆ=d¤:±ž1¼Ùœª¨ ±+Òí'­’L:®3‘MÆ}ÜäRÇcÇZ®ÐmX\y‘«òet¨^ßl’«,©M¼bްʼn·Øâ+,¥Š"m_¯<ȵã†Ä߶ã#º„Ò8 
EHdʼnXë(.Åø‹3Ê>µŸ,,$.'™;MÆÅ<µ(,G™âbŠ¹¸MNÔ“xœ7·.N¦©QÆgÎ:Æħ8\¼f_+׬Z¼¦‡q1+g‘Lä2”³¬Î,5ù”«·^5ù”ž|JÖ“Oiª(£ˆ´éƤ2£åêŸN‰‰¨QÐüuŽP‹X!†É)¥ò“QœœÊÍ¥Ð8oe)"îA~¯Úð3‹è ê¼9êùs®íç‹«èË5vZe½¿çž“êDÖÉG%ñÅ%WvAM?*žºâç‘?.+i#ÁS†Dïôæ»kwm¥úŒû|ÞiFõ¥ á£[][xþ^µX÷Ò¾oŒÛ&W…„~Õ>^ÕD„û'ªF×ÿ‘•„VHƒŸšçÒ=;Vûüfç7üMZŽƒãÑ.eõÊZm"üj¬§’ªxc¼ÓïóîÍ Þ\ð›MŠð«‰ð«­:¯&üj£oZlÄ“F“ž¼Ð&JRÄs¶èÔðÉDái#Èœ¸'.úSCLÙÌJôÄm.žº©˜ O³b‹7y¡U+1çH71¤0:Ú?!( +Ý‹"OÁpÇ1²½j#î N'Æ©'èщíìØ£<Êù†15‘1Ÿb(¸Æ‹SÊYÓd\&E!Ô8 GÆ=1Å0¹ºGP¤û2;}” ÍN³Ù¿V¦f]¦ÈV¾h5› +—‘SU +Kh7 +jéÒ[F¼Ð&ZÜD_Úe8O£®RÛf‹žÒ +WÙ§;×—¯´­¯ŠZ¤¤éƒc‘ôÁ IÏázrÁâg¾W×îM¶Oq;)Qô)Hs-/‘ò7’ò‘%Aò‘”­‘D’¢TrKØM,ïH†üÂ_H¶¯ÕÙË[ìÐrŽTï…gt5gâ¾(qý¬ÈÍû_èb©ú†,s¡Ž„e:óZJ Ú”efb9§ãÔ"•Ù%¦.¥:Dæq^†^“™ð²Š™Ë.¶Ð„3ËæRB ¢êc›8¶XgÔ¡+T3‰×ÜÆæ66Y¿S‡ØK÷(¹FÑÄ|uUÃG/¼½Xß ?´ +ò¥}tã£;Ý9 `ƒ0À\°h ˜À Ut†Št +2nrhÑÝ¿‰BZæ‡E¥Fê{·­:˜{Æ'íN)Ö¦™1S3ëN8Yö$]gp]wŠo'¹°µ;åô0"¤BêNŸédJûx;u§ºÊiÃŒh»SGÊðèÖ;eÍ §2³šœÉeáÿ›õÇép悯ÕÇpjôNÝ®øÀQÅâõöpcZx誰FòNí2¿ïÎñ´wÊ7È l„KØN•t€  Ú©Ï ÀoÇÀÇ¿C)ï*g?Ž¬9;U;lO9‚”y?‰N<õœ˳xªÚˆK ì§OQ~Ýy +<GÇAÍî\×0 +ŠµKðTÝ©iè‹XMÓ‚§¶<ï>:tìG8ÚQÖ‘jˆPþý=ä1SB€‚XÌÔ8‰»÷‡n+ñÑ?y§|Zññ‡à³pâ)Eã¿H +L‡´„ Š§vððVU ¾â”W“ýœ1Üó¿¦í܇“ΫzÄSƒXßéU;IfO…© ã…á)iÌ­¶°Ë!Í–ÅSýYØËaˆ®ÊîoQ<$è$ä4åp¸9ž[‡C'Œ§rCñÙª0â)?ŒÛŠ-cø<ÉÔxªy]ûë°ì¨dõ"]T€Ét§ÂU„kÒf;ž%€¤¸ÉwÇSŒôÆ‹[i}•mjã)—Øà%7\´Uþx*dv?3³ hÍñÀ‡SÜHsxp¿èÜÕÝ + …¥ÇS»½­+b„b•, 2ò”ÙÓ­gµE©Pò”¬A­oXùO—ý@§¼\I5Í"Œógg"‚X’v5ž"L„·†ñÔob:ˆGã©4g4%qò‰£ë0|!ñTNJÂàiŽg!í¼Ìîò‚áx*Ê*Y3¼:žZúÉJ¢é³§„îŽv"¸ŒrÐ5žº Ð7CÝGok®×ã©ç™h¯Gxf‚R6.E O%œ-4ϨÇSº¼E@ˆw>žÚÅvüÃñ”xH­ú` +QbEÔêIç ´ñTÏ +ܳ"O•ù†Êêã)L xì)²ó´§{¶`°Û󶧀VñM¯ÉÚµ§ŽShtãÛž²ÙŠqC©˜¦9œÚžR´Í½.÷”÷bgÎBü H‘{ªC L§.Ó=ExQ‡ùj÷Ô\HÃÜSL“dQãžÚB,í) ²Ÿa>¥àS´½sd ¾Ê9qd¹›t¶”zDO›vOu×u¹§B·ñ¾ØQP¸PûcOhÑ1^¾§f[éåhst$ø{€<$:•N½ ¢¾§ž­GÙÞ=|ݳ°Õ±{ +,_âÐåÜS\YÝì›öÔuÐ󣶧¼Ÿxä(„³Ší©¬úðœ5ÜîöÖ#Ð4&b{ôR:Ž›Ì*ÝF{Ji øcãZβÁàŒÂbgïì© q×OùlÉ–Ó¼ÙX¶ì)yȬ)ìàØr…=õYà9JutX³§.Ó>ýŽ:ŠêU¸cš,ì©ËÀ{é¥,ØSôzëo1ÄöÔV`›£‘Î6¸m·§­ÚP¶§ÐÉ:iø¨<6åvÁ¾ÖDQÓžbì#Fí©Ü +šÏëwö-ZEÙˆJšX°î) +Lf“TIgƒ*Âé0¨5™Á³Ô¸4‹ªÌî_ &X®'sÑôð)øi rJøÔ®†Å%| +{ðÒž!AàSØ )=|yãG_†^ZÊç áSßÏ}uܳ] Ÿ’Ípœ=| +¤ì‰§0·4ÃÑù° Ÿjo *ÿ\ž$t¬¥6ºÈ5‰Ól >ÅVJ&çÐ`j¢’糌»+ˆýj\çHÍ9pª pº$¦‰B8ÿð)©ßÅeçâûl²Èv¢œ)– Q߶§èuÈÙ¥e~@R/–ë=S=øÔ¡Pཥ¼Õ>•Ïqw~œ×ý³%²ˆ„Íð)@Tï½_Ì5Ÿ¢.‘£A±$‹ÔÍØj}Ñ©¦‚OÁˆÀ‰¹·ù³7Ìî$5a'äøT«ÁmµBøħö©iÈñ©„«özП"ÅÛ&aëY>qè»—;ZÀ‚O!’Y $œ‚•ßßSÉyÿuÝçôž +]ÑàPˆyj›nïžÒU–ñp+Xi£(¾“¯æžz˜|ÌBùÒ°ÕBJFîžz£=©ý>3Fò®önÜØÜ=EwžG`ð)R6îú`á²JöôƒO1LòæªçקF¯ç°€ƒO uü67Þdæ¢Û2|ê²\1#Õ#àS¤±ó |ç˜3Ž•‚OŸó€ÛàS÷¸")ÆBãSŒÉf +Ÿâ…Å)ð]©Õi¸ÌŸÂÚùKi›C>J;éiÚJçŸz> ò)I–› +|®|ŠxTŠO ò>‚~ˆOñ³½=H•OÑJª|…”‹Ôg-{w˜ò)¾Z†SaÐÐLd¬ùÕVÜåS ´•öàϾ»{)åS¸ßÈÍCû—OE!ñ0ùÔ”çªÒ@*£ØãO>•ó³³´®4Aù_ùY/p2 >%Ó¤S–˜)>•Š²Šêù(þþ+Ÿ Ÿ"&ì4=&&ð)—¨(˜ž*"à„°²TVw£h—< 𤋮ð,T4Z@BÙZÙ¢ñQð)c¿Yå‘羶q°%|ÊòÿV‰ÀYjoñ/>5ãÔćYàŸzŠzŠ ³PѼ¡•’ëy,³øjçA¶·‘V‚øT¡5BA¦ñÓ/T|Š·ª\/ãS}ë4ÞK¢ßøTx:·’`Õôó ÚøT´@Pǧ²ÖEîp?f¼§˜‹iM®þž³ãYô/˜jOÉÓÚ‰7!ãöÐB*(Ä”-ZpI³nkO•¦#!}µ§´°Í"ʼn}J˘ªAˆì½½RjâbŸ +†“šŒödq~ÕÙ§¤ïY’}Šp{¡8bÕ§ +ÍÙW;5´¨’}êPöz´VÂJVìS”¤ªéÎRT(õ©O‰\ÿa §‹¬f“‡WÌÚµ»G«9Ì¡¦-ÖD¾ïó¿Rïaê­`›Õá>¥íBíœì>•ÇÐȾ%‘…„ñS+ü,:†ÉOYÖb¯ï`Å?å›iü³O7~Ê¡š°liÂOq[>ÎÝ +LÚ9QĽýI÷)­KJÚbÿtŸŠ\J¡ÅC«LÛˆ…v·uŸ2ˆJÛGñ¿Tíæ¸OdÇ{¬b‚P¥Û^\¢Ô·¾Á¢UªKŒÞK”rqª ?Ïýº=F,³ÞÕè}•Ž ìñwÅ•7ÆTš#×S»+¨H`^»&q'ô¯j¢AV*šôT•`§PuÍ30º‡ê‘·3ö§þ©Ù Ì\\”ÕVÿÔ@üoä†MíŽÊQ{”ñ¬ÉwdRŒÿTÖqyÆAP'Ž³tpùëµk€ÊÚ’C^h€j¥QV®OBG mê|µ•Î°yÒÛ^€ªÉRÁ'¸*¨œï +ö_T‚ŠdŠ$›P‚*ç&,‚7L— J)æÿÆJPݘŽ•³•»»ä€¡ì|ŠÑÁž ’3t9 ›Äèäæ½yTò$-Õ¢¡G ¡*ÁçE ¯Ø!‚ꪠ"Šç ªrµ,µ¸àª™~õS[:®’ úOsŠ€éô¶c‚ŠtÝu#Ça„%±R‚ÊffWQ·£²°=–Úݶ•WÐðq—‚jÍ*üiÈôVP•ZŸ~žˆ„˜Zî*¨þ…ïxýûA’:f.Ò)¨ZÏjö‘_/lþv22]!’»¬<õÐ \O^·‘Lú1õ⦠ªÕtá|~t +ª„)8Z¶rj2•F­»Ï•\˜À#@ž&D’Ò®þ,°¡3ε“l‰•©i'Xü´Ûàç¨ÞôÝ%¨ +!‡Äîm•ø-ºè´„ƒ*R&øh×m‰aTËšh˓ҭؑؠ2ŠbÖ‰36Ú¹T„±^)·tP‘®dä&Gû¿2¡8¨Ø^NKÙI<Ùª’÷fxC#Pê ÑA…__)ÿ“¨&°XÏAU÷’•IÍk}–½·k N¾Â<üSf¾&ÂSãpë09øu¥œû8¨JÃÕNN0WPµ*¢æŽÄƒ C¯m@ÇAeñ–±`ýEB7Ÿ ­ ð9^}zª*'²ËÐriŽ ÎAÕ3BûAå$iƒP=6'^ý^å~„Š/•¶]&ïªB¨D^sZ©å<¨ªÄp]]"þAÕ`a³ùöàA%›×”¯"¶˜?¨­ƒåKoðr„¤ƒÊªÎUµ@2â 
ê*¼ÜâwÞ—dLÓÌÔÅæîtP-¸’Yi+7îÚ„Ыåë÷¾åÐAEB Ñ %æÀAµD]õyx㲧ÃÝî(SÔ-Žæt`ªÏA%_w}èõAå#X‡ð[‹=¨¸ŸLìÀ(TÎg9n¡³}P!|™¶FX©}P},rÈ‚êV0Go»ÅEªçº }£48¦™Šñ°\ÐpGMŒ’ „J2Ü ÿn¿2 TzZ_Éö ÅŠcj„êø«¶œ1” çµ(êfºˆ-Þ3©hOÅ;ÌÙGp—S»;}Á’P…CœO¨ªr +ƒ-20£(^A¤ +¼ºˆU²…  àOØI~ z(Tì¯Úe˜ ‰)TIôW`WQ & +USaJ‡ú¾d£P%ÔFŒv§ê*RÓrF ÑðS¡ò“òû]™B%±èÔ‰ƒXɱ™P™†–b!ªBM ÀB¹d¤½˜OÛ=¨,;¡¢ÞëÔ®´Ø —ÖÀû¿ñ¸€ˆ·3€Ý„ª _†Ä‘Vš¿Ý‰Iuìž¾ïT=µ Õ*ÞSh\ñÔ¶øøx0\¦\¦õ ³×ó¸Q +U­_ƒÉ0û³ÛñnÑ Põ²¦~_ÿQ Z [&æ8…J¬œ» Äñ#]¤PY¨Ñ«ÆƒÒ›BÅä/9eÃߦP'—4ª5qíRÈÍÝXÙU…ª$‘Æ׶2D ³%ËÐ +U;){¡ZhÅãc¹Mˆ­·“9š U'Ÿÿ31…á /T¨l0o§ÁsOÆ]¨ÖqZ¨’Çe¹B’ƒž !kj]Êk(Té\Ç1Ëßp*®»•V¦A¡š.©ÅD¡/I#1P)TJ¦ôèú(T£6ÿ‹Ä’B*îCÓkG€¨KüKq ,*µUO¨¶í"„Ôn,§b»:½h9…*«"Ikiª«)Å(X+‚‰„{̪ÝÌJHÙ*„êŠó^•§PõÁ]Åxú7ãVªÝ V§¥ÿ*½¾‘MÛñµõhöŽvó|°ú+í¢ÚòîéOYv ŽbÕS|K*ÆÚ+nÀü›CpS¨Lp^Û¼*]¯û¥u +@½áŒ¬OžÁzŸP¹’ ÂÖÎ*4«®TÕǸN“‰ò*òƒ1“BE°æÛŠtæC¡Ê¤ïW€å,c7ÀŸi| ¿ñ`>²O)þŠ7E~Œƒ;Da#ô€Ë÷‘*²o¯ò<€B•aÝFþ@F¡êúÑó¿/208|<§P‰æGƒbqŽ+T¬êEÛËÉð÷ô_*¾£ë×r^¡òÕpùE§|°Bo®"áÆÿfþóP­_:ý§¬sûòÖË'Ø?¸F˜=u‚»¦ž+´*;_Ë3T¦ÅG5!_’e¨ì+/wD×l^ð¯ÈPí‘X\W‡bÖj¨H]h€§,+T¿QÁºÍŠgÃS*ùfî&³ÀIàZ†ªL9®ž¡: +Yü¥}Šª DaA†ê‹yïR»b]Ë0Höì‡|ú|7*T¤0°üçsn¢’?N¢ÿ¾†¡šÂž•#ê8ÐÂÄô@Û–CƒWUF¾·Hª¨ñ7øP’[‡cƒÒÙkY=jDtÿ!ƒypÆú0Åþ‰|D+ÜÚ¦ÊÜ7Í+TüiÁ¶U¨ÓN«‰,T[Ü8VŽWoè4Š^ o Õ¤áèö·è£˜20™Š¡d>&CE5b·bûÂBPP*ÄÔ4˜÷.T” m)4Žþ UªÊyˆOBTIqCÐMbÿ—Ñõ0T´çÍXà4èL¨7œÉ`†JÒ)C:f +:kKfQ«‰a–9ÇSÔ}™i„,ñ5xš¡Ú´C›õ41CõÇKö Œ®›¡¢+Hß±1§.2TN +ÏZ«D8C4¡2T‡™&õ“ˆ«Nn *ûB/TR— Á‡URR¼c-ATÙÇåô½;­ªu¢o².ôBzýW“±•ôËÄ4ÿ©ÓsÜËŒÔ%írÀˆ'å~†q{¿xC ~Q87kš_[*ý^‡Ò–ÄP9ž2éw\ÿŠ¡²››ÀÜ). u"Ñ÷2TœÂ‹ˆª:í»é£Š5L*Q2»–S0¡¡‡+Sé]¸%Û»´Ê0×O`¾oTÊ1Èø¥?õøŽÙhôø.|t) ÈÍk¥À–*/%ñÁÊÊI"˜¹‹ÎP•¢8*kTø^ªG‘Xh¨„d‡n,X«nÔ%êk¨N(c,1€4TN À!ÕP}šŸ×Î8Tc +æ4É¡ZmÞ"õŒÙ”²Ä¿ÃQE´ Ugˆ”AuñÚÂZB£ îé·ÉÁP©Æ¼Ö° QuÌPUØ,v=½@†êœ!=f†ªÅç@>€×ð¥ ȸùô`?†ª£Ñô-+­L,C6!Ä”²ÎPe¬H¦:†j¦î?¦&3#MD—¡T¾_¨¸Žá +¸-²X ¡:M | ”C¨Ø½ ôÝ«@A T;E¥Ym¡‡TÐ?ÊN‰*zx:h9ßJþ9ªìß.s&Éþ\fEžÖ"„ÛðžVŽ=£ƒO#¬´ÂÁ/® œšá§Ã$8,3* ¥=ãò*QÞÉ›¸÷v×àèÕâ“Œkö)2F‡8âþ&CçIùìvXŒz‘9Ëã#I–-œ +tÎ ÉCMUžDOë*@FˆOAGè„ F·i~! ºù9ÏÞ :6&˜ávü— Yù°=wÄ +»S‰+gÐ¨ß J8Ô•Ñ +*8Ì/I5b–ùØ$5¤ŠtT¡u3üÅ>s¦Rsa¼ˆômlÂ%6Tùí'0=a­X +V-Žm ¤G”pò’XuTˆçíýdªä¡Ë¬“S^Þn«…ÐúœŽâ^EîD‡R ¯²FÐø œ~_9í%žsV†1Œ¿ fÏGØZ’E…LLd8öêýMŠAþJ}©[p1:ÐQNg„]ôŠ¥¬1ãTö½NbßAX”âƒeø©Yñ¡j2‰Áeªû4öG·)dhWç”me®˜7,%á*ÝCÔÀ_t¢þ7ñ¨¯TþHX{®|eŸ"GftDñ¦ã»}xÚòι Ž© +ª«ýUŸÏÝD7ð;)žk{Oe@Öbâ!ì. sø¶ |!ÄæA‰]ð‡diQâ6Ü S¡EÛä»g׈?ˆ0)ªYÓ%ÈrÀÞ¹DPibÄ>ÖS“P ­·™¬Ê`[S=;•1-q¹kÚó:Îf✠ Ö´'„îOœÂ: QýŠOžcD ´²- Š©Éðt0»Æ/MG8Óí s†Ç„ðÎK ÖÖ³–Ú™ÊH*$³‚6ŸcfD2ª8q™Ã.¥Ÿ }þ'˽"™ Àìü”RôØäþ8T#¡ÄƒšEc˜96ÞÉ ÃQ<²R×á:–AO¸¨0¿4wQPœ¨uñ@‚(âRwgÞ»X™Õ [.áSXpüY¶»!#uÝâG(›ŠÔ.ƒ¿ò¤ÿü¸¬§€eïªäk“f˸m,B4T“½½°ç}çòDgž§ê‡‚î_™77¨â:Ö,~é_ƒ[±=B©Ž(ˆ¬ðˆPfÐP*5¯ë…'œæ ã@ 8ØÔ›W7´£–¢÷Y±6`†&Ž8@êÖ@b¬®yeÖQ¬#öööI½¤˜²NöüPª‚µê#<Ì_gÈZ`ü¹•p"x‰ø©’ŽŒT_äâx*­}Å¡A§œpŒT1 „ÍDÄ„$‰â&8-wQM.¨"%¼ n¿äðé-âL,Mç•‹jqRQÅè!y£¶O!DAeýØ3ßáÆzõì"]†C³«!Ž#³‰A˜¹¾ÁGEáèáÿrŸ1újÖø¶™“ð +*<ë(ÒÚ¨Ð?>hYˆ¤\Ä>}GK i2 Ë0êëé"€×žÞî à6‹ +ßä@Z +¹S¸ÖL xrua›%2=¼v—?3ZÎã™@+úiYž®.›ãbæh|·1»·Ò#GþÀvÙgGÇüô|×ó +ã-ªÚB +ç\OùÙýKkx¦~±ì™¥#©|Jãö ×ðgUJ~öívF7ó ´%5rÈÀ“ì¯~F˜©¶S‹ O0‚Qšø›­í‡ y¡LYõà×gM– t3Ùë ‹ PvY>¤Áà¢å sÿ+BX´±Ax ®Œ{ N eÒƒmÙwí)“$õ=¸ôÕZäç𠔸-p1Žm;Õ™žøµ.¬ù)œl5•ÁÊlâ¼| •°÷p,€¼ÿšFŠê–=]ÑÂ݈3\äç1ò%nÞùv]mŠLÂÔ{›ú8ÆÛ_l¸" ”) ŵ½[æåF¤º¿·‘BOÛóíìÉÿˆ&øÛ_¯´iS +Žõ0ãz‚Y…€›Ýúw!à‡šöŸ÷›7*É`£çeMDØ=íŠÇ¹ôEI•êuJ¯Š×Î抆¸³V~D«$Ú·Ì é“ÖŠ4 +ëÆ@ |÷ã >Ñ¢*Š—T™€ÃôØ„‹¥Êö&Vøå5igqsZ&ZØeÁö ²UD•Ï¤EjGÖ°}Ë„'Ç0a-´Om1}ªT¼ºó¼4ý‘µòÉÝò“®ªp—’Nè© yà(Ô5U# 2£„ÍÁȵlwô£Mâ#憎ÛÁöèfj½ËÔË Œ}‹IíKn„õG:u-ȉåubÔûfúu”äEÙdïÑŒ/ÁZmñ”)ñŽˆKÐõ|™˜^sb(a¾ x¯J9Á×Hó©ÏåpSÒpæÀ6ßJNU±9‡ Â…G. 
•­Q‹ Q‹71>ÔçtuŽfÀ¡`Pµäz¬ +ýŸâ×o „ âªùY¦Ixx‘^éÑ(ºV¸C(ë"ú ¡‡BZŠä¿ƒÉ?~‹qÐ×ñlΘ_òUÈÆ”À=H¢Ø¨1u&zq½P‹Riø”i 3±jÜÔ/ÇDqþdj¥W¦ן/ áü§2àÿ€m|ÿ’¼dš2ÆôÙ Õ-S·2åû>|¤3UÊ$¤´ü Ïsèj¨gô#ò#¿CWäÕ)˜>ZU­Þ@ÛÖA€Z\P$kLžæá­´z,‡k"b=St¥ðòÌÞµŠ°–ákë„ÅŸöÙò&`†Ñ…Ûë~p*Ü•7‹š‡ëY⃳²(#@bõ¬l#I{Ú¯Y…£„1çAãϼ<ü4釂ùï­´á¢ä±t¸Bü[ˆ±§ijx:æxñ­Ëª‚Ý^aùµ»µlêQ‰áߥ¼N@-æ´T]6YH£ã—ÓÝ4¡QG@Gi]ô”ÞÁÁpb—†ÜQŸTÂã5—o_ʼn“À‡p¤Ÿ±BWy–§Õ/1qô§5Ÿöbq#e*÷¶xhŽèªÁvÍ!üÒâ7ò4qøó¸³È†Ð_yv|›ð…22øÕY5—œã[îJ"éülÊ“Å•éÛ&Aد90ãÐ^mïíƒB»ù\'¯\V+X”*j@íê"„ÆÀ +-ƽº4,~áÂt÷7‚“´lŸÏÌ*nJÑ‚Æ +2è= ×”¼0ÔÞHa'.`6“jÚLÆ•+B›PÝ(ÜÃNׯ:#§a?Õ¹È_ÈzL·½má)ÙMèÃF¤›Äæ&‡¬ÇƒêéLYÕ±ëx@.×f5KiÁ»<õYiT14u.é–‹7WÂåø7£„{1K{”Bñïê1'uÛ™âtC°¤³ï}MyµS«"Í–_—_#iéÕ!y°[ž“!¼æÙ +\!²¿ÖEåÖj¿$ÍQ^Â%5©lY{‘Ï$úÄ ©hbr*å.¦‡pO¥¢s2ÎÝeÛ´t¶LwÑŒ"`S P×f0ÉJŸuÉ쥃Wxý¯v‘•€÷ðÏÖI<`Mc='IÙ¬Íǘ>]a ›&"Bj‡ÃR³L¡›<ï 1z/¡5Æe%õ1ÅÝV¹c0ÊhX„¤žðz(Æ^À© §|¾Î3æ üWgh“ zŸ.DrU$g/U –¹èQ ÏB‘2<·….LZÜ°¶UNü(Ç“ÍuÊ'ÖþS}s5—<’®¸£²'“¥Y€L­‰“÷«6æ§Xf¶l"e£JR­±ý°eSß û dÝhöÂææÌg•Òà2b§µ H…í±]°Ë8†Ð[qb·n!žX,äl»íë ™4¹ȱjoV t¨&øÕ˜AoCeLm‚u¨Š¿¦ß–;$¼ñÍSÅ\Bw}nÅ%úUë­%†)Û*WÆ`ghŽ‹¿sÞfYŽÕà ß}AÑkÿŠ&_} +¸-eáÉ\ 'Mªy²µ×u[w[Hþb–AIËÐTI3ɹ3&šL#òy»Áé5àÖsʱµ_E.ç.:¨ŸùàPÁy«I+¾ E›{ÍuŠø´”áèŽ*‹;@bk+r +ìôoc1Þï<Úû2û…»ñIv`Üb!j=¹hÊóS¢$ ,òI¯r$þ­z†í\åÊ6F/ea4Ë-F‚{@–©ìSdµÌ”˜ÀÊîÆ +û¢¥Á7;s(¬nT®õŒIŽ^"o Ò’©¡v«ýw!†ÖêÎ…+V å”¦ Åq“ŽsU&øÕìξza9„쪌^¤ÿƒcÎÅæ h¨A¼N)؆ÿ¨DÔVÕï™>ty 0js,:cu®€Ç¡ÆAª Â\b4Xè&ö´H÷Û4¶êÇCÃpJ¨ê¢ƒK2÷…{-SnÉ—¼V£SZÅ\E»®‡¶œn¨Yr=1¿TóäÙ¦¯Æ‹|´-r!vçîLÂipDkOÜF£’•Â½å÷«‰ÐŒˆ¤BQQ٠7¬’R´üQùÕ&WmÓˆù¼ˆ¸¡®Á¦äÐkæ) ^_ +ò_«EÍ ÝÍÚ÷´&º!+›˜ÏQÞoz-±B0øyßü3)]Ùóâ´còH[(Ö(°3Jwa‡Ç9~Üó +¿qŠÆnP›43’R’ü fS¢Êã(Ô—úIYªÃr].R,Ó Q×G¢jÚŠÅq­.Ù§f˜ƒµa\÷ ˜[B1Ëv×Ú ,XZÎÎœ +øx`Áá90ÅÝO}¿‘¨ÆÊõávÞ[à¹8Ynñ–©XgøÂÚG”C$mŒ³vºÔüsóŽXîWFè·¤×^°å@+„ˆÍ¶°7™Ãˆ¸û³Û‰ÄòB8!¯Ÿ¡P®õ½MZ1 ò.yôI w–ÓC±3…uAd^,+ õÂÝ n=xdUùˆw™¡Oa$ ˜÷ ¤CKð8i†ÿñeSs üstÝØ"E"3“( ? +í +‘cs|ä劥K¿é¾¨PÇÙÉðs¤v¡<ßöPÔ|¦ ¿Cã™z‘‹ÉKGMÌèmwn"j´±±´èÚâV2¨û«ð)$ÄÎb²&AP§S^îåv>å„NQga%îj‹ˆƒ—TvúÿZàãHù«ñ—G¬¸8(Þ …°µË:–KÃ+4õ»ÃÐOEU(–u’á£Í,QmÐ邤ÄRœ”Q‹Ø’´A轄DÕHÔN¶_æé‹€@@´ß²nŠû8:øhòeú\u½Þ´‡•ž9âŽRœk‰FPá±™ÏWºãñ„ó˜²U^…“Š„~c½jé¶ôŸuj?\ãïIôÛ?!R·¸ÇÛLô°al1¤8²—âD–ù jé[Ò.qR册)I¾>&S5Qe^MðÂõ´ŸyúS¦qOãM“g“Ú"cnš8î +‹ð¸æ@‹_n߃äÕÁƒ“(DPÊl™Ÿ¬I 1zÎÉcÂC&ú ^Iò +¬Wø]b1¹mµõ%Eã£{Ï7‘WÐ3;‘ÉŸº)Û X&÷HÖE|Ë—,Õ¤™}AxÅï©AN¨Ë|¾ñÖÖðŽ»”÷Î/S *w"|žÚÇ™"·6Ê9í ØMÙÁôò»Û‹·öóÕ?ÄÚù„– u+m# †Ý¼‘õ67¸+ÄïõhÀ £tgW Š –I1’á‘]“J‡:s€¯×ÁAYxŸ‰ÝIß±Sž€‘A´º”~½ƒg’Ò +ùƒÎLîàÏQt6_tú +Š KÙ´8ROÇp;lìåâA|ƒ7U“/ª7l¶ï­†j€Pw²\©Õ'ŸCC‹%'yW¦?Q!ï~¹Ï&ºE²Æ}ÛŽ£ÒeLUJº€mÖR2:·A>k+OƒR¡pA³çÍÝ®jÔA:pS;Xï'm^ºÇÕ³^z ×8¿…·S¢—¡1Í<„eåVq^ì¢úú¾*KYkñ¾m¾™rs'`Zض äV§èÄÔó¯þŒF/„žêGaœþvÍcN=–h¦Ü rGÂØ•M 9 "‘¤ç‚ò±Û•¿»¡:¼«&¿¸F +ÙpûÞÉã°”Šþ®œ…Z¤²Ô0›‹´Ü>ª˜é’ƒàPŒmUÄû#RëU“"ü ØÉ#_lŸS™ãë]õAÉ=Äë3+Æ뇈«W ѵ˜sÂã·ET„Ÿ<-ŸSž¼C©E)¿}†;—½¥‚ +ãûÅ°ÏÆü± »˜ÈSèø3MeŽè¼ãmÛMiÿKû7‹‹ÝT¹ØFêº@1…ÉíÝ„G»Œ#p ŸEs—˜1—Óì¥Ô£Ò,ºmà,›ךÃ>iN$UM=¨¼-Þ¾ÑCŒJÍ, ñªRb¶Êÿ© -¡VÐN«Œ~• ®ÅÕwû$S:$aBŸf«55 Œûú°Î¿£¼p=·Þ6¸Ï oœÙ‚LUP¼>»Ž¸øHjugó‘éÅ‘È© +¨À V£â*ˆc7ÞMüOºàçÕp -¡Ž>5—„¶Î•3 +ÂØm`åç‹øi=q!ïî—yv!ÙåÀŒ#Ê5_Øä± ¨:t1f¸¤1וëõswy’ÔØ]‚¢è2»vÝ)Qt§åZ¶°Ô "½˜² S}~T\Q˜BÚÈÄ·ê\©Ç|h%î|tAoFžqnÇF}M¬GÜqçIš§79m<'¾ÝgpFø‘ŠÔfcâTÿ›=0,jXtãB§_âŸßzs·®[øŽsí&* ÅV¢£p:×C—8‹¯Ò •ðR³uÝ›vÁÆrf¿JdâN«6tš[ÿxq´}<žþ‚vÌêΦ2›'DŠLÆ1ÕwäJì ·)×ÏP„šÕ•æl+­½“쿧$# ÏJÚbçͱ&k®¼tyÕw7¥0sW™°å®Lþ”T +ÐÁժʡKHrÎBàÀátþ6¼*ÈÍD/峧N•ãé°—:ÔÝæo¸é9úüfX`ºêÅÛ¹•HX“«+ÔÂ*±—'…ÆdnÙ›Q®üžJÚ;AoÃÍÙˆ°PÛ®â:7‘Bý9¥èð™>ƒÅÑúN¹p6/ò&Ü¢¿ ¤Ötllûÿ«•£Â-:ÁŽgφ˜œIž9^‘„åœWµ0³K´7ÙõËÁ“Lñôˆÿ|A«wF{JÐ iE’Ó˜› xmF’HÉ)‚MhÀ¥gîí“Ã5XI½£®éG*ZZ+¾ÜH5µö£b +Œª + ²/z½Í,„Å7¼ÿo7È]0ºó0Z. 
Á)`š2»¨•þÍ`È m÷»kjø–ývVÿT䬷ˆÃâKcð‚¦æƼa6Vyvõ"Ú_d&ÈygÊùÑKéÿψ[^àÂƦ[+)ŒÃ='=å=剰 †Æ’~…)t\(K*‘K'5AŸBYHàçsíJ›òB>Äs +8Ç5m'iã©jÇrŸ«òÛ™.dœ(½è~ÀmbQÏ”*"ªs{etW¯•FaÈ(˜³O÷†G?EŠÔƒ…Šlj1¬²ónR‡ÔƦ¹¼W5‹ÙHD× »dZE +£ŧ´&iöÏÈùÄcǨáEQiö +C‚<{\ÀÜÙŒÛ>8äOTdˆî•Ò ‚ªz˜’‘£åÎÈö©×#ùbQ*.”£Å!(b·\´†÷Ü[z Ä}õò†:¼§¨Îƒ% +´æk,H´ñ—Ä%=¡T~P!|ËC©R3&óà5=ðY誉Ãázbæeó£5þ³é³™k@÷üelZÓO)¥ÀªP–èõ /xöy°Är3§¾GgÍäù÷5íX®ÄA~‘n¼#×9V^|ŽtƒnEHQ§*#²©ŸJt=†ð{ÄF£Z«£sŒ7‰YXIƒhÄ‹Ž¬ÂÇÍA'»Ì†S !SÜ%­ö1("ÏT²PXÙÜ'é5/ÛG÷U%8Ë7ÈÓ˜@‰ AÁßFÚØÎPâFUl–e÷ÍØu5oÅݽ˜:åˆsk‡½ß`¨Ê`Íç¼B3åò„ë—ZÛgó²dJ5ךO‚¬ò™÷(Ô`­›rË6Q©o)H?ÆÙàЛ) ÉFÂ"óL´B™n¤<0æ$w- +¡$öž‰|B«KgŠ)ˆ +@dp¯–hõÀžŒBË^'ØôÀö>y@<þX؈8J`E¥€hR¯RË6ˆþP߸.Q¹œƒl—–…"3Q?Z×sŒ ږ÷ÆöÚcù…ŒS.£ê‚¦8†ƒ—÷n_Åè°Ùkµ"¼—$¹Â –H|ê„<³©‡QK$ÄYöÔ¼¨É´ãÙ… s†Í3UÚ¤RNzàOBL:;ÄSÍQúQæT¹mÚL² Ü„rØ«´0Ï´j•Üæ–’Ú.ËM6l=)¬¾XcJªk3æˆ`:Orí4$ÂjåÌ£C¦^ Jõ´G û¤ÿk""C9®Ñ)Åáù€Ô$ +7À=Mîðõ¶ˆNm¦ =þÀ¢ž•[¬ˆ{Bö¹R‡ÈN â³_ÌÒãOóBä¾³ëS¤ô¨d’"Ï ©$»:Ë¢šÚ¾é©2^Š)Ä–&>†0§«›7•RƧ0VˆD¿Œ™¼=S’‹&»>Ÿ2Iê.\³ð†ÈûlAø€v[A$æÄ÷®ræÓþ@µp-›%æ"® 7ö3@0‰ž+X)ýÕ¨˜çGßmMò¾ä€bø|'Ù0i×á<És–땺ÆðÊUó@"C†è÷ûƒî Nd•³nJ5‡Çm5db†®þ¹UÊ“Bn£é´+tľ@–+¯4mbmlAÂ…bœm°‰þJ[3„Qzý²é9…H²ŽL–MH¦/múî8§/‹_?e3T1€Ü¤q“ŠFPŽ8Qy84¥+Y+Á+hç¢ç*t&â½a +xhΕ6 ½ºM?ˆ<¦ƒ‘ü‡T€¥B¤”é0p3`±—D3mÐùåÚ%ªh)袆'îL†t6!oú¸³l‰¼—‹,ì"Š +ÇÉéºIg‰£Î#¨€“ÁQ8yhC6_MàýŽ~U?3\g§`ä!ºzR#63ÙfЅ׿¸X^Ý “BƒL- –H‡ª÷¼p£fj¢Ó¢…ŸôX-á/ÅÖW¶~C’ Ì“ºSà \Ð|Á¹_Œ~,†§&X×àÔí¹‘5Y´7‰9wX‚´ +•Tx+ÒûÂìÃÀ¨æY'Nˆõ¸ÛQÊoYã,En™7ŠBn68¥gd×VFùób$ª e$‚0˜E ãi§›û;¦Ó xà½^Ô;>ËìNÐœ·ãBŠ7s?¤ÆÓ9Ðfép#L¹<ònÝ¿sñ¯¦p ÐÝ¡ª=WÞ §OÄmòx<‹û|…L’RÄSúuø㘬1¾£ƒÚ4wJÜr0Ip óa>¬æô¾ <ÙÁ1Y,”;ž¯`œ4§c×v6údVZí„Gâ-z¯µò4Ô²ª!Œª>²xGàÃ6@ÖLí{Q Úá„…vùŸÃ%ù'’ƒ‚®’þÔf’+$Joöê;–°óW³@^"ÐRüͬÁV‚©æÐåî(‡ÃaÀÕéĘÊÝH/´´ +C$¦ê;xÝdÝm"9Ô€;DÀûpcò3ÞG¸.¶ݳH‘²1Ò fÕ8ïŸfË5âÜð)=Í5/á8U6ëò/e°²ýÙtt£Â›cá¤E×![„¿Ä³¸‚þ^ããŒZ¸î‘eè† óÝĤ/‡LÑ”€} +²”WJ5Ù¡]~«¬F#6°6ûæ“®µpJå0Á¢XM<ê"©‰À²g>ûªQ” x)µeê»v˜§+.Jb eÈÌÈÌâþO¢.Ÿíj/v¹`›x”—MµÍBLnò!8Ê€[¦WµegJÅD4^3ÏË•Bl"„‡t$k#´¤Ræ_n‹‘p¶dß(lA§æ\½q9!˜ïI;irò[N¬ŸTç«%9âÖˆ‹Q/Jäl}·ròU2sŠŸÄí_ƒ,Ó` \þàõ³S€‹m_¹¨ˆ²¸ +†Ø#Åà(îÇ„7o‹~¨ávïæÍY[îe&}·J ‰ŠŒÍ{×óï’“O®³ Xmà‘¡ ìjK`PÈn…º™Wèàq2lÈX ÑPÕ|ã«Ö  jeËç¡$‹\ñí úuU&ƒê/ËǦ`cy©(t’Ä”„¥_z*¡6¿”MÇGвûÕÁ & ¨öì‰åv ª0çF1 #(éïJžþ¯@ªÞÁâþm)C± ²¹3däö‡Ò&RºK†t¥Eo Ý>ñ¶†¦Ô·²BËŸ“‰gFƒHIÖë{¥.¥!%ïh3”.€{l‚ï×øA\!V…lŠ +áž`žˆ°°ÆëÀè Ðâp NøAí‚ÑœèÝ;ªh¨-Ót}v%—V앯Z¯®’/¿Q&ý5dº€C±$= |­¥›jD>I?‘ÖbÐÃêSTÑ3®0yÌÉÁ¹$$ÒZé‡$ìÉ‹äÏøÃkײ0+D¼Þ»¸ðnɃ箖§†5ŸF…ÉcóÐOuå‚p_Ènã݆С¸•™³™?-ôþû»ËM»?j Pˆqƒ/o†,̦ TzbÑ2éØŸB¼çH—ìçjã–jz0Ø(Ò~5EÑ´³`ÿ/­^@—îÖM.ô,/hÄ0NŸÔºä~çIúR\âÊ(¦†³ éÛé}YSInÙjµ°›XxTPŠ îÇÂäÓ™UÇ‚}ÿgœÔ¹AÞuh'o%€¯Û]a2ÃóW.Y­Úµ + T€g€ TÚ)“•ž ™U!ŒA³ÄûšD€Û,\@–Hw¾2ªdWÇ0QòB;_M”I¸ÀˆÆ÷@ç`EóWfLHÆý žsPor•?,)p­³¦áµÎ¢¸Tâú·" Q Y[ØÓè›NÒèýZŸÜÍ@Z!iõ³²¾Øù}nåãÊÒcÔcÞ˜\C7B6ë0Y=#à“q9ZTÁE÷¢(š± ¸Ä-^)/å‹M6@µ)$–œ³¢Äš%ŽœÏŸM‹ït‡ëì&w!AÐx4›Õ+ïãÞÞ+£â† äŽÞ!žKV·Eì\üûÊ‚LënI¿¾+i§«]à‘ä0Mk¢ôA'«·?1ÆJ8qS½'!Fˆ…£g-””ÇðZünBÉ=1Ý’¹bø…¯Ó ªûtbðâH×LAŒ-9yì}12ôœÛíѤ9·:‘*ô<ÏšøÕ¹ñÌÖQ ]ç5_Æú!šZ¡€GÖ78kÀÀHWT¹ŠÐë†ÎTáhžñ‚Œ=l š ëfÞBɈB_‰htŽ£øÌ…¨²)zâã!«î邚õm˜4KŒP¿ÃmôRô¬4¡ÅJôÜ1<©®è¡1öö83:ò³Ò•@Ë–QÑu[Ö]f`%òéÝe#±àÚ +Îk«®¦µ´EX„A4ƒËÔKôŽ*‡ŒøÞæ¶/µðG§L­ ÈåÅ0Þ]º)‚ ÄñέÿÝa^ÞnΚ_‰¾éžÐÏ€˜­útcŒýÕëñmžªF‡-†P©4…¿¯bŸ­Ì0@ã9™E™ZÛ‡âÑçEÚª Ââ®i\ÛºÈ$¹ý¹Ì`šÏxlÏ™IÌ1^¾k[ÄC‹È Ü§°˜-9m¹3–’ŽÐ"¾: +['`÷úýÂOruT00 +Ç ÖFr)0Î6PÈ]2|‹ÿƒ™ÂþQÑ“Rü^™Ÿú +X‰UÉ<eË—«á‡NÆ0H`¼×˜ÐL¦$ìp¦Þ'ä-ËAʃ#'=ƒY“µ ' ²ñJjGâ‹ÂÖC»ã¢TØ€Ý`*‡ëŸ‰ø¡"Jýz}è‡>BÉ¥ð aöYeeFê_®.ÔZ54€›.è„4™‰@‡nvË6Ì [Á·×óuª¯òúØPf +? 
¤3Þη…<]Tˆ÷Q|Mj|qÍñu”ÂØCI{~wŒÖfc¢Ð÷¢+X’Æžõ0Å5úR@Žd*XkôÂ!­×µÚ‹ç°1^öá¦Ìz.dä;„ƒ"è죓mí‹Á5š»•Ð»ûð²¿`Õ–€§ÄiìØœÛ,ÀJ–|øKTG¢7î_;wžéÝ!ð[Áj–LÓrg„» # e9SñœWÃ2×HOSIªlPÀ†ÓnQ +¤IötUªÎS„ýW«, £\ÎZ•ÎtQvM¨§XÚ£{þ÷yi|–À1ŸÍ˜ˆ ãQŠå¿‹ai,ꬪNV1ÉM“ÉüÈÝ<Â%<膿.`Mý`èP†®epP@‰B¹,¥;*™Ú@¢;%ˆšhðÍ4XMÉÕÏ‘·fÒPá*‰õ°îg”5Fc†Ñq¢ˆEgã øV‎<åM˜c8¼`7)T€½tñGÓf gd0ö—?Ìá`Ï »·¬þX}dÂVFÙË¢æߊƒ*‚ÇD´ËÛy[]z+*ò~/í%¾åZÛȧâÆu=7úëš`ƒ(”$]o9" Ž Z°Z³L‚Ÿ'ü%°‰AÉ÷ií¡  Ú?ä_L#[4–ë¿änàŒE T}®§5¶\«,Ça +Å•1Aj ßÅþßPŠ³Áw#°=âÌó‚ƾ¢Õ÷÷K–½¿}Ç)Ù¬Í+°y®¡(ÞÑpƦÙ¨ÅOÉW1OTíµ#èŸô[†4Ù;8j´Î0¸ßÊ +亪å¦äìùy­P(S@ ÛeÕ›§¯DŸÐr1ãòƒŽ¢ô›ÏXñ"F JТŽIåçP_Èýh3â!4ÂÈ€ÏÆ+Óÿ‚Ä‘#8yo81G|‰—ünÚh$ã÷ öÙkãèµ{D¯žæÙ“[©kjŸ]=ÝaúÄ-øÄbÎv ‹ì5ÊfõÏÚ9#Þ]û°a~¼0¯ôÎã«4ótÃòÐ…Ü '34r~‘à)d `sÊ[¥ràJ“öd‘ÊV ¤Ã¾JïKÛ¢hÞf¯ñFVKΊÄÐwäŒÃ{+”D‚ÑÇlÄï²Ó "üðy Ñ5þäܨ™,ßçŠàE¼ïç×tQvŒqÅ‘¤¢?‡»B¡Õ´1÷¤ÈóÕ{'æ®7_ +‰2; 1e/¨Ö)þŸ‚;Ùµ/Çö0ó–hðL”UýZI/ßú›å… bªÏ¶GWšÙ€|Ë­é•hüë'Û?º…æ7M×8üC ®µÐÅFÑÖ(ÒªÎÆ—þç}"/—þzÜ#HwK¿N;® ƒm1V±üŸ:ËÃ7Šœ8PYä6ÒOLŸâªÁ¾ +•î4èó:?›õ( d€#¯Ù w xà›ïƒ6»lÃs:÷%Gž Dð}BÿæTþg²˜ ŠÞ.¾‚& ê+“ä•ú¯” ûWF¢Y&¢iÓ.1«óϤ¸»9%öƒ™4£DÛO;æ¸4l‡„-³! +)¤jh9ì¸è8^[Ÿ¸—5vÜ y²wJdKx¾ÓØ{ÿ›ÍíÉúmßm¾\>øâå×z·- .ò†oÞò[5Œ¿0Y÷- Ý1S2ï†Ö@ot´,~ &„ýV¸½e¹úþoªœÏß¡!6²·Ë+T£9ú²×ñ ª´ÖÈʲü0FÁé~mú¬R™y¹þÒÜGÞ 5ÃbqòcÛÞ?,’²Ñg“xl¼‹;0‰6뼸1+5+ŒÔŠb‰§ÌO™*K1‰˜¬?ÆÍ g‘•×ðJ¸‘Šf±cfw «…0›~ Ù‘¢)62ìG ðNAÆÒZÏ´‡ê“Ï2‰ÐÍÈŠœ0~ÔìXuö7êø•=­Rwb+X`Y"|ˆ“Q¶ùžÕ•±o¸¤_9óŸÂ‚’dDw=T€ÌŠÝoæ' ‡Û`ý‹‡WOX&µ«á¬-j4PdôzzÎ_A–q¬U<àgpA•2T2ìIoÂ%RCJLº,:o¬,“ òŽùžœOW¢§äì ‹¦2ütˆ«pÚíò¤ß.c{& Àp2EPÔy"“ÕQ¹…GjÃű¬$8FùóÕpmËXíãˆcï¿žGÅðµû"ùUˆ`îòÓ#ü®dI¢•½×õåJx+L“.Sˆ®åz/¿/¾…í——öÕ»þ)ð:ƒÑè.ça+IeŒALûæqâ´ ¾LM„*oÛ/'në±Ãñ? +Âç1,ˆT…lø +âtl8¹Y£¯FÄKKWø |HÁR¡ +Ö?ëð­ÑþƒW‹ ò~.³±¬e…)ëPŒ½c®ž+éGy˜9?üÊFN2|ì^ÒÖ-sŸ¸ƒ“üÖ@z,n1L\}”ƒÖÄŠ‹+À ®ý‹{ÅŒõ« +uÆ.²ùIžª¿Ùúâû°¯EÓù¹µy˜&I‰­Ïòa’ÐdH2¡fEÚðiL¹»)æ"òñ‹é Ö-ŸÂ‚ß±c¶u ° +œþd­'ñ·+y¹Rê ‰~°bzÛ‚“pÉ Cý}h'xÅ:Fnw¨a5¸nÛ¬^Ë>lÉPŸ`ÑÎ%ì³5AÄXnZðˆ¢xë +Æ¿Ú‰:MÝBÚ$­89QJûGókÄ*¤>æJk&$Nµ‹ô ².I‰÷K¨õù3£¼m¼.Îð÷k7SÇò:¹d > ÍŸ¹ÍŽ¹Ÿ º‚´ò?“¨²ƒ`ˆÕ[GxÌåÖZë$¶P•av¸VypêÈŸóãAi# œói^ù0 s²ªyDÏ£Øõ½! ‡…ŸÓ¤=6î³!ÎW¼ï'Ÿû÷Òrªð©I…êFdàca\ˆÛ†+µÞäcðoÝ­§²M;ùço™L‚¦P•ì»˜ ‚| …ËïOÈò¢ÝÃʤQ‘!C ÂË}ŸÃ†íð§0—‘wÕ±6¨NwH«I0#«ê A, ò×<°yEáñÈ7(üSÜPàÊ»S+…ð€¤³J‹®Y@$4Èün¨ùðórlŸÜ•!,‹”—\UÕ›Àj´jé2úI§ƒ|÷€:«7ÆŽ•™|©Èk7ö¸ÊyHòS\ͪÿI„lÍye@åöNdááŠ1¦`¢Bc鞸2ÌŸ§ØYÃhÓL|JÓ´'e),\0ðK2xÇ1¥"›¨ì3MpFô9–rˆJXGÖø{ê@ØÇf4ýû2ªi¨u]š%DÇx+0¼2ᆅ¬¦¢`é‹a%ãXµ ɾ#bô/3‡ÀìYô©,*ت¢œ‡Ž$-ù\¤?Èw&“‡ÉiÖ§YøJŽxÐ ¼CI›„„<ì¼`¡Í ä*1Â|â“ÄÝr¤‡G­‚Ž‰ãM²Ü“eð˜»>¨2™ÁR€¢L¸ô¨þ8Á£„cs=n*É`+Ï¿ÿÀìëê=™Nщȣv ÚjôÊ;0ãí»¦¯ÞTòZ}#/¢ùÆ&§Äfk$Ã…ü£‰2Ÿnù(t‰»»‰áOž’# +ázü¤"h°æ«%µ¶d¢³•K™5=¦ ¿yi<Ìþì#cùuzì8&DJ%›fuéÜN£t‘¾1غÙéÎ ½¹¥zŠ±ôýei`|ôªiç$Êá„ÕjW»ì…EWJ‰Ð nëI¦gÙ‚#yW¨¥ÓH9/wF¸6üΦ™Ý§Ç#Y|îv}ä_!Ð’0LQÿì =®€\þËR…¤ Ë +Ç—zhR€­"š¨g!gîò #Äýö. ÿ›€Ò—ŽXQP€Ìnø–gØ '‹¨æø_—_’E<r ƆђÄìÁ'®m&_ +ÝÚEIôØHa2 ;Ú¤ïõK*oÂ9ü·rôÅVâ™sÝò¤JííëÏûIà7w8‰`9?÷ +?1fÖ”Všz~9­j_›‚xç åóm[eØYÄ4ùÂÉjí¢¿JSq®„>#Å”T}Ë‘©¡¨jP3"K¦YÐå¤ñ_»A¬ñÓù–ïh¯O¤¦ÀÉà镪Òe`„>U•+zŠØãm†¬r! 
Ë–=åjݪðMoÒæÞ0X},‘ûéCÃ;bì¨Ò0üO¿ƒ‘4w2år-ë¶cZˆOª–ž_zÜCöü>?¤z«î>c$é”vÊEOÑh;÷nÇ¿ppAìoä¬Ôh¬< g©s!r¡Þ8d’bt\Eeø;øýD_à;¨‡?S¸4¸mO ²ÄMÞ8V…†'æ("2›dZ2ݦS_‚â¬Ð"~U±&³Úaêí7 ܬßùøâVàJ.1pÇ?ךÀ 0s… ãzÈlÆ7³ê`¼#×vÅ¡Ò¹n¦DZøîÆç¾à^ÍŠºcM{Xz®ÃÂá”_К=äP0vnÐ`$Tê9‰ ‡½î*…X·D!&‹Y|<õ{¡C¿mÝdP>É{«,Í„³@ÈÖ2ÎQg@ª<(K’ +¾§³óâ&9ñB) cHPÌÑÈÖIˆo3ÑYk¹Íúr•B;’Vº+_ +îâ‰;ÓÉÁ†‰ri܆Õ”(8}T©Ÿ!Dƒ3®­ŽßíØ~gÐÍí +¦¥(< Xíµ<‡ãy'5¨Óˆ&‚ŒÅ¬ý‚ 9´'7½5Š$êZ1¦8ý_\âC•´`˜qg“-DA¥僙³FÔK€$ ‡ƒn•)Ú_Àù‘?—mHêH¬;ØŸ·nĺÉͱ/"±à&.C (øïu(=ÓÍÔ<€t^ Kqs„d¥Ã(—þ%æšÐÀ‚¡`U@»§Ë1L› +–|V%ûýV䃨àH±7ؾõ¬Lç1œÅõDÜêhEæðº&¢ðilÁ›Öšv?…™D<3xFxOñ‚¸< ³ø±ëUµŽ<4“ÃéÔ×Ê5Ð3½ÔmVá|™ ½§iš_*ü÷*ñc‰ù•éK@س&nµ8h\szÍ`Ý3WºÆÙõüéÑ …›Py¿(=?aàÖX)kØ+)ÊY§Nd÷ƒ¨A‹ +?dˆGÈŒto8—Ó±„/‚:|³9ήÂ˼~P¶Q‹Á7˜U™5p5(Fhh|~¨—“§Ú±©s,ëÞG3Œ¾9££M0`«y»»ªT .OXÑX0ò§o-º¸‡,¤" ·.¹¼È3ÃÇýŒO¼ÓRg´úñ7ÍPÝÃZ` ÿ”ñ\Ö,Bêœ"t +2ë•ÝCU+56€˜ÓÌV,˜·Y°€Zèš}2 ±PÎœ´2MCJUŸ¨cf¿l›7™¶®2R0ë… @/î¬ÝKCž“Fà|”íÊV/•ÈU ¾÷ä¬ Á ©iÀ³Í † 픫Sæéž„Þ¦ æYœÖŽ 6D±x’VÄPŸ¹9dýã¨ziœaJPtm²äa}– $ < E®Ë%V ¨…ážFmwt©,Ñ`-<¹vEþT(l³ oµÂ:ã¹&L‹EY…êó™U‹eÝ}wና4 r*n²rúl)ñ~Š¿”\ÅaÞ‡Q ó³“|¢6ÌPm99ñçÐ/½£Pâ$8O9ï·gBÒÏ甋aÈÀˈú<Æ؆ôO$lÓ{ñ÷pïQ  ?†PÏ‹­{êó"nÞŒX±,óH„·Ú9_Lñ×IÑàôˆõU¸T%®gœxpGÑB%P5í·ª²rËÏìΕÞâ´&˹¤J*æ”àd6Ô|CÚ¿+Ú>’›M~y_ŒZ4Òsnù>ÆŠ†¥EdRÀêÒ4oÆèð3J¤= #a™¿ÎÖ¼½.ÆYhÎhm¶Ú`~ž|Àëjd¦˜­2—) R¨•aŒŸ¡l^ÛL b2RÝ=Àì)Ú¡Ñí¶âŸÜMoC1ñÖ)Z8·„UÅ6e[›³-J4ú-%²×”Þ©A®Œ3*`¢â‹QwGÅv=¾o‘)ðJà®Ø*‘ãÞë`nß׆Õb[V¸Ö¡7‚…+¶M¼·÷«f±E†ÛÄs­W3*À[lƒ)¼‡à%Ù>R/ý'¾Î_<†¢âïK]‘”L¶ìSÁ/¶¼+8¶`F +Hçdh[D&ˆ•6‚l߃€¨¼dl ©WB¶ù(slM‚É+†l1—ÉÖ`Ód©°Ìd»ÅP¨ƒØ“ðce»EÛÛc,ÛÃÎY¶kã +°l;Ó­ϲe¢PëÄ¥Ï,´l?t‘øu¿e¤¤ÿ³Hö‰O»&þ6Æ.Vï-qo‹|Ê“|*€ªž3ùB¡ªc·Á^%çÓe;* 6ªªXKŸ„ªNA8¿Å UÕ‹‘iÓ³ª¸æÈ«: ²]oR¦l%}Â"8pW5¼N›§¥4ØÓ´>ÁèÓ„¬ê&kѧWUU“ÂYU>™O•)‘¢O¨ªªYX N9‰¥f,àVû{å’ßDWÑŠÈ–ë+å^@C씿ì"Ênþ\JËJcÿ=¬1®W¨]@å›Ã‚.ˆº4´péVÀ§þ˜Ît­Q k5‹šÉb¡„B k­Ugî«SŽIlêÕr â°Ô%Qù³r Ư› x_½°µÞwe;¶%¾›«„KÍ…Øx™‹qÉOÏÌu¤R/­yK'}gŸÐ!ècLþ9‘“’:7sYiòAÓù–5_Çxð©>ñpLNÃ5sy…ªJb®#ÀзË@È_â¢ðêQ †ö1÷ïŸT$Ü™0©¿gKYÙˆëµø¨ÒÜÕ¢,A¨Eˆ ×îh-´ÆùY«MbºlV˜*©Ìe­¢Ê‚Êô{4מ_xÑb}e¸‡5ÝO_7à8ü0¬óÿ£ø*à=gxHpŸ#í«[OØ›³Áúšo.`C=¨Ð¾þÑß7øùBp®Um ¬J_Ûé†séäÿê\™ùa¯ ùðªžhK¬$·ŸÚCü¥­`6×–¢^™ÿÍ÷t',ňù‘*x‘ÄïŒ!Düøì Þ‹Æi å§ëG~Ç5îåa̘ªüH¼fö°Ã!FùEÂPa¼=·˜,*¹B~ Š—Ñ›»ÉÃb §ºh|7¹\Ä…7G…A ;†²ÞÓK xEX.ž—ZîÛâ1ù†ˆ8¼ù XWt†¹ä*SÛh¾7(8nVd]) ^µç%Y·…(n‹þ¿—ƒk>!ÜbçäëÀ•KnÆO~Èz£\©Ðz”¾l‚šÊ,fú¡R.fã:XžÀÍöÁ‹“ m•/–rmˆpåÚ÷î[˜l~0×cÞ}m «Ó%˜Ùb—ÀC¾µDI@ò €8¥äãåáX./ɺU·(é¨[¥è“´?ˆÅɧÆ.£NÄ@È–Y¡J™Z#ر†J#¦¢¾óõØÐÉÊfÇŠ_>®`:°óÝTR^ýªAaÆy¸¬öÒÄà"‚âe`8½¥I³Ði©!{\AAÄI_ +É@…É$Ö`äé/zCôCE ?x©1ë`à(tµWÑg2†ƒI\ÀGOš«õW¦¼»3bˆÊ.\+;}4°dP€a §€§ÍÖ ¬ŽcqÑ€JkŒk[Ø S<Ô…›²ù_X^ž¯ ¢><ÜVÿõ€<'^8wq³w¾ŽÑ Ýjz5äšêü½üý Ú1ÒvpýË5–’h•F"\üâ\ÀNÎ÷d9É@­¸\.x ƒ©ì-ô½>6º¯¬?âT“j‹mn¡NЩþ ü$ÆæggœTÊÄ/×ðoÀûê3®+÷ˆ8®¸O{ + Uìá¨~ëóä¸ü¢úì8®3š^Ç)ˆ>þË“§YWæƒà´Ê¯ºqQyºæå2.†˜~®Ü#¼ÁBÒé[!ºuȾO-%{S_çõ¢Ýí{h´3÷Â×Ûáq¡²vž ÌÙý9{mO~]|ã*ÌnôZáóÕ-ŠšP]xÆåbA= ‡LÓŸçB%ÝÂÚàŒî‡Æå´B'ž +²áç^!1ÖØyÇgÍxn~3±–š/á¥/gWqeå'Õä Ö’ÈL\íûqÚ„>±8Nñ;Œ+N ÅÇ endstream endobj 1077 0 obj <>stream +ûpe*®+´pe¾<Õi 8¹Qým½¾ùA{ƒ›Ùòœ‡îž +à×Õ¾ï.L‘xV*®Ãö…àZ µYsÊï}vo—ˆÌ¦‘&dŸöò°‹…è×ߟØu‘‰kÜuˆ+me'âÂœœûþt ¡š:ÚÕû0jvI?7>-lÙ?©éLf|K«ä/ú [âÀFVM톤^`4ói^§D ¸ÇÛýâê Õxþs¯ˆËvïùú¾(ÏÊLá5¢sï‹ç²ÅQ¡6›£þ4SÜáò™iÌ"z‰ù Þó2ñ÷cRý±êåi€Ê¥RÊ.ËcòÕõDr0&ˆ<ƒ‡ 0 sØ ØÉãØ®à.Žãêù²kÕ˜.¶¸µ×)Ö}¸êŠ$˜xUý)DÄŽŒ|äß 6°·áæ%_¸âÃeÂŒž·æJÿ0LË–E0¿† Š œÎÛ2irx€h‹¥»?‡ýÀº·ÖP*)ªVüäŸ2§ûí +i'ÕË_¡Gç¸åþé\hl2n㲨âH&ëÇ#lŽàŠNθ¾O_ç+Zc^ËÓÛQ(­`›ÿK +°ö§üŠüËÆz¾EÍÒV„Wͬaê*X9±ïó-ò%ؼ9n ˆ·H€Cyiª½4+Ò5¼±úe5UX…üIGêªÇ6ª­Š¨ ÔU=¾O/PUd½s©zÜáôâT€f9Jõ­`:å¨b -óªo‘Òy@þK&ÌSo†!Õ §"EÇÝhê]Çèb`*lŽbWêÓh¡“Šr`¤^{z‚¢¢m7êdù.*×]Ò_¢È »Ös(=?69*k7Üé +¨ýœ¼½ý´ó‰#®zò“ÐäîœÔé¥@¨ ÓHkʹm(³H8EI‹zå¦w[Ä°iAƒs‚AµÍ•r‹¦÷Y£1F­åë)S$©¥Æt½dµ"Â^"͵—B¨É-¾Ãi×-ñ(ÌÐR`Jya,…âBÊs´¨3CFKûHL•ÂE +uNJĤ€â—Læô¤…*½1 õÖ³ó³bÑòvø¬UângHÒsp…°Ž¹>é"­Å{:¤÷ˆ[ƒqqËúG«Á°Nâ£çŽ…<Š¼¡±¯£õ~&rôê!а¢&bg ´Ü. 
þÌ虿çTŒ(h8ò{‘óù[jÑ‘ïAcEüµ¤ˆËª÷¸6Ñ.]ì8‰¨r¾QØöªº ¢uŒ…D…èCpJÜלK¢á~¡À/¹Ba—¤Ä¡sk«ynïŠ$@PÅE `û*èÀq"ȵï¾ D±ûfïüWäb±ó÷ó]±€lýHK±Òs~Î&b+þ¸$–w¼O“ˆ4´¿&ˆep«O–‡õLôaÀaAóY3,”ù¡]XþA|.Vµ?dÑã¶vOÅ™çí¶òÄ™Ùã™x€ ×— <µzƬ¶8ýV,Ý2zÂ^;KóyÈ;ñ¯y.iPÐò…‹:BòÔ«tÊièØ¢t3J猬ðÜŠ9“Ð('å;\ÃWˆ†` Þ½¿8—>wvÇF{;Qœþ«}sc]³™½™™Aòûˆî¿]Gˆå&éÂd=:ïêZªãÃ]EÔá³+<Ó«6æ=:uMã†>†¬ñ~N—);Š®ŒÝÍo \Å™wÃóåR€4©¾r&ä +ewòŒøŒiÄzsdYa_ð¢î‹W[áF~µ[í/¾äV UxxÛêÏ ®lâ§K[Y¬ßÄÉVoÉ7Ø*0¥7òÖ +QàaÁØùuNÂsÇ›Z¥Ê¸IV î¦×‹é¸m¾E«>¯=i“tº\{g#Y¹D岡.Ð~욺Å_6eeK~=rjÑ­]38Z>Ú­!V³$½¬é,pìꆉÅ4®jþ–œzõxÅÜ¢æ¬\ùàžFµZI›¦qÀXA”·Êõ0iŠ¥Š;M¸RÉ´E샇v·N9þ9Ì”_ý3±+ÅX,ì›k`êI”ÑpÞ$ÀMÐì/Té=@²Zõ ?Ía ­à¼•-ƒ 6ɼƒ <áªÌÕs`†¸Ç€FE P«rïåõ• "9k|+)0¿nl–Mš×%\%êÔàª,]¸U^«–ŸI´«µ +Ñ♵…•«U[ ²^%ÀÛ&¨ÅZ•U'çÃå­B>Y½D'ˆ¬Uqã6žÕª°¦nÙøa 0Ѫ½'>õPjí5­°þèAìñ½µ­ß>Æ-]º€fåÂØqSOÀôþ•Oúµ2–18qNÃp +%\0VÂéçÑ2€ðgº…–·,Š‚šcÏŽÿûö F ’–d º`?ï²'+’Hibû¹”æ î¥n" oZL¯þš"¾’|6˜Ù¨»¢8;!æÑSßÃR]©âŽæ`¿ã|ãAw¡bQqŒl^‰5Þ‡ÆétTm¼#ÂýUÖà¯#…@ÅÔÎâÈv•Ÿ6733WÁÚ®P ÉÅOý0' ï¨,jÜ œö´`뺩r1 +ÑÏL…ì–’}ìÑIH™'4ŸÆ4ëÎõ¦º(ð¨}ä>ªšx[CVÎ>Ö ’"ØxtåPékFê&åü 9@ÊÛg›|°OÖ…n«¾t˜¯øàaã¯ßÁ5”ˆ&~Òòæ^¥C Ý „ž“?àɯ0‹ª$¤þJ$…Ó6²ÕSÌžd$5ˆÒ¼^Và5ð^êÌò¨ õ ™ýØ»±±#³£•ï%fÍÕÈi$í3rÁ,™ÄÛF,Ï[5â¹Ê_°d$¾lVÕˆzÿ +'˜!I¨Ë–X"ßÁ2rƒo`¸bû®åËtjD)Na-­bV|ÈËFö~oïX@„,™üG`lT‹ìeÉì˪xØ_Ö…þdŸ‘Q#æË–Q# !ùõ˜ÞteJcvAlÄ+]äÓlGÊ"Æ¥Ó{wíƯxÞâÑÞØçÀùâ–¢ƒãƒoüûc»[¹pì)߸ ¿¯,*.xe3¶¯\­h}3)o?‹,L‡uIŽöµ™òÓpp¬‡q:‚´¯iq¬úê ŽÏI©,îm¬ø¬]×8‹B¥qÉ‘äu‹"…3Öò6ÏŠ€ï¼3¾ÁÆ4f/_' UEÃk_©FÎX%¿“q‚õôg̼ñÞ×PHzUu9b%Ø»«„%¹qs\Ï»¨ñX%j0ÖX‰HeÏ"õlÄÞ¾(UÏzAg¿2„éÑœf4»ÀF{Q.™ž9£Q©‹f§ÄÙK³Åæ öõ› Í’ +š£/;V5IÈÙ¢¤$Ythö@' ]e(4»l5›pÔBhvLå´ÍŒfIÙ$0ÄÂòeÓÓ©I ÙÐG™E Ù¢”i|¿ø™íUÎÄqfûŠ“¤MvfŸœgd“Dg¶ë £Ä—tfàëÅô–èÄ3yº>³ñ9 €dM8 iÝÞRJ÷#Q÷4r|ÞóÓ[÷t»D4×1n5ŸÒ8 $]º€X8錦/ßÔ.î î)Nà:-@ANZ߃S¸ÿ6¶WÜüð&ØÁ;«#¸—»—tÏ?uoá4â[Ãî™.œJQ{“ëˆeZ÷¯Pì*Nå°Ðƒ¶½Á¾WõsÕá÷‹^{ïᎬà \Pá°ƒŒž2êzoiø¯äXCØC‡´ÚOûö,´QÞB›po€|P:‰ý„[93ž®n YYÎ`Vc¸1YàzõÐty)È>qŸ± Å0Î@2çÖA…™ê%¿~BÒÑž”sÓߦp†Ú*oœ…ˆ‡A˜¸ê“°,©t¿¹³c›À@´GUYK@®äáÞKÙ4yOÑä‚Ï»ÌÓ9¢z 5ÔsæovqUQ¦«óŒ8\(';Rëy™¯q3 óè›ó²*ä)J ý* VÂÉ'åYIP +çTÊË}Ù™ïæ‹LU”Àú @Üž6ÓZ»¼vàÚœ¿÷ë"j´¤ù ñ»e‡hâÈ’ãa±Wö£¥QŒúá„XÁµ!×?¼ +gŽN<[ve¨%ºnR™ DTQ‡¢éî¾ ~ž-‹‰¤Ž©«²A€¿F1¡hC>æ©Û%ê®ÓŽO¸•Q$YIýY¤}g¡ŽizG‹Ý5†Š«ÈSí4Þ„‹)¨ð{@ŒÌcî‚$4öi¶'p¢‘J‘LSáJa„S$¬òø+²š»cXýil½ÕóoL lÌVöK†ÔR_È .$L{Ì™î*¬0ß’zð6º5º{ ÁQ¡LW6à䀯ë XIJ:", Š læ Ap 3©ìïÀÏœb@}eŒÎhbäX·RïKY Ì¥oiKÀ+÷®Ðu @¼Ô„—›ÝO½Òi²çM{òÔ¿ü˜r´qó O³¦} ·$¬eóUû¥‡›ðå‰Íaüµ0ÔÕ€]£É-07¶¡Éð¨‚]öäihm‹s­i.¦$†*Æaå'ëR˜T¢3˜åY&ü1Î@ÎÈiFaE–Ì6Dä,’†Lo™³dúTÕy +(û£¢ŸH˜1óäÍŠ¨žh˜‘xÂZÃe§àÃ$èDý¹mš>r¨¿g¹æ𤊈Ç%tî®Zä´1Ì”`³nqHb +Æfx[ŸD¨Y¼ŠÙR-«­ +Zñ²ÐÔ˜vç +Ö~¼FU¨tˆE¦ö{~úÅë8«#´õ›©IïS™ÂY¢K\ã(‰þ@*7%q=%^LĆ:z¡šâNëBM;ì%T>:Áâu¶“!gq¸,WÚ0ãF¨LšúPÍd ¿‡`Xvï0àBwæ·x%|»z +œëÊêOjÛSûKˆ#à~D ä³Ý\T+zpãÅJ;*3'ù¶Aª€ïåM¯„kþË ùi•ìîk£cæž7I8¬¼¦`{3 §ÓË›¢êÚ„Æ·Ü8rñµÿê&—8¤6ð½Á‚>¿4ÚCUðqÕqT=ïé㳎 KjP›HhÈ€³fë =T8”AÜ œ÷™ÕäÚ ìIc)'Å°yiߟÞ?âl¸Ç@ˆÝ7( m(”¹$R-Ò (s¥Mƒ^: ’ä ðGÖ,:‘IÐû÷¡A ‘ÔS$g0/#@ÎN@r¯îÓsÒ7$WZU¯Ì+3-x¼$êK]iì[/*iOº:»Lƒ$) »ýa{¿è‡´*]„ÈÆðZÌ-®2´`f“°·ñÝ–& +UçÀSÝû‰¿S%VWqÔ§Öå•Ž!±é€‰ÀöTh\ü>+a~Èͤ¤®SM1 îäV–é§û¹,P7GéîϤ¤ìø )•l¾ÌcçLÝj{æÞ*Ÿ5v´ÞèìážJ‡HöNû–ý™R’ŒÑ#xâ¤}1+ ¬h÷ä‡ ®àíÏ,ŸQ ŽÚè¯ç."›&ÉÈ/)öÕX·t†+5aS·šBt”„Åx‚ù•Æ9•: d(Tàˆè[7ÆZ©Q[Ðá.Ê{—.Ú¶QŒÅ¤aíÉ{o=ˆþ¥ëPD°NAu× u½ÏaïÇò‰÷Á“Õl½àÿlñÃ,`‹½7#:NÇ5áá. 
ƒ±øò¡‡ŽÞ ){‹,’Ô6”½D8¥ +â‘ú 6ù¶ÎŠèû+h½gºôßDwÅ=Õ",ž¼§ï=˜l%ŠŠÍŠc“YÈPÄ‚:#@ðH°ž ‰N쎠D›wD‚’ +Ø—ã•cÛ”­5Vü³ýÚAÂPeýœ +ÄýyÇ!¦4ëâU˜8s¥šb¤8Ý_Z:M©­óÈK®\ÂÌV¨ÐàHäT{˜YÄÈ Ò.sØ,›‘w¤²0Ug4 +p91¢ôx¤–•4Mi©×SHÙ“TíËÏô¡Â.ùré€pÓßGÚªlæwèß ŽöðÍd¬aŸÊŸ¨ó0lô¥¸»  U©B[“¼º·ü%cC@Ò:<€é£‹µ¢á@!\#€¡¹‚SAdç»çh姄¯«4l‰.ÕsnÖÝ$¥ +W¿“l nˆ¶ غân g¼A@}l’oy²òsºôÁ<…·ÂV9Å®U1Ýž™”Ã]¼¸¥qœð½!öù%˜–nË÷*K“ŠŒlc4U1㑺þJ°€6©û‰öÈ\…yÐÇ Á2 íÀ9@6”&¹2|Ùº¯Ï¨Î¦ZðôÄdH+qOqŽ0•V¼º@k•µ’Àc&q×ïJµðÞ¨ðt¬gh°áü9J‹,Òô•§üA¶†fqªÇ”+®5´.ºyùy‡€‡4Q‹‰r¯×Å@Q;H.~M™âÆ ÔÅ/ ƒnÄç'ù2e_Ð ð†,ÖaúÆH€¢ÆÈÊ\\>‹ˆ³G»J!:·Yr·!zº°Ôn&¶5­ ¨Nu†ã8 ç–‡H Øàëç’‰5XN¨âÞ@¾[VŠ’\€ñ–‘Ÿ‘cÍà;&ä$ 'À£ºì‹°¼oø•æŽ5À/ŸáœðB è\½½dƒFgê³#EÒ«4ðWÜÌRÓXƲ±é¤ˆ(¡mýþ}pøJ—Ÿ[ cβGÚÛÚñL]&Œ·(„½Å·+‚½ɬðð$Ô«µ´ùÐeûÁ Þm›þæ'åx¶“îãa¡Ïeg ^ÂAùe +ó†{Ëf !CšV2qÅ’ð$ô8Ž¾Oc)·xUXݲ¬Ð~Ó°«j%-Ž¤L£G!q@ÀÖmûW¡.L!ÚÏí6Ëïd®½»xÛ^U£‘UNrÞÀsN±RÉjG•—íT‰h'…ù’ÿ†¶‘ݶv°:ÿ›'üjÊÅKÇZÀÞÓÌ-ÞɱIæ·ë†H\SØõ'ÃBE¬£Û2Ûqþ‚©ñqzT(œƒ7o0\aÙ‡/ÚôÝ¡à•UÑÑ%Kx?‚›‡üxìËøÙÁ@¿.yåµÖ2¥K;’¬z¢·zÂU€—=Ñ&‹YR/†z†çÚßs¸¨¢§.¦Ç<꣒£$Ó 4N°RÈdz:/¸…yN§kŠþ¸ÛŠÓ¶kRüºñ†:·î®ýˆùŒì–:?ú+Þ•aã/1êÎÚFbz¿R˜fk¬™¸ì •°…!×ì¢]VÈw¼ GRxƒIl„8ðÝr;Ò + . ]¬¬ø6î²A¨¸O“TøÝ9*CßÿZåP‰+Y$=ܵä8†ŠÚíŽG,tr>«eªµæ[Šà&*ÀÅy,ù`“.Ó–Ëo^Sª} (m"—¯É‘Jòð ¡¥8G$ÐD( ¼ +›(ˆŒìž,«Úϲ»og¼ƒÊÚ¨Ðh6]àÌ`âerÙFæ·)ËŒ§¢/vkšàŸ\ýrØW>:e“xS@2Ö¦6dWÚ´¯FH`KÛhÙˆ ‚VÅ7e `¹]‚¹:î>™äÔ¾çDÒC•àÉ‚×ðá“#ä¯JÆ{,Îß=¬ï¼™‚RV¡—e™ŠéÍpšÃþÉT¨ª= +ßÆÓéÐFq!Z5nTäY¿{ÎÖoN÷ä¬-Îà< Ð⬯ä„Ö¡ÁÙN Æ çd=ÌXï{yôʤ$êwöBÅNB_ÝÕQºþÆõÉ»ÕA”# qò&æŒjã;?´‹­pw—t/Ö™t;XÇò¸ÊÀ)àÊÛùC¨Æ.ÝFæ|¦KåÉ7/ ‰Iç¨Þ‡þTãñ;®K'ÙÈj¿‡Àè˜tuÖôïO6Ïk²ºðéR‰b9ðñÌXGþŒ‰ä#·“~Yc«š.ñ1£n(”ÄÒMbG£ÜüŠ@ØyOõïýªÞÚ©…¾Î!<&Ⱦä'õ¥ ¨>ß‘+·]àþbcé@¤Ta'ª¬òn¢ˆ”ÇÑ;€öÕ  A‡#äý}E8éÕL–çqã|B9%çÖ_O¹*åç°&qjäa½H†– _ìõû»©$G¤G Ab8}ÁÈ)ã~¯‘[ »×¹]”Ð`û‘#q^s&¹‰”î×€j¹*§ÚÜú±äHÞLiàÈ€‘%ç†ÄÕå¸n{yúRì$*ÇQÊKù{‰„> rÜÀ°åI€©•8xç8ʉþZþ- Žc•ŒJ?ù÷|‡ã\ìLdí€{‡ èÏû_Š!ªrãLP‰×$nù|–8’½"¯î¾%îFkª@¢Äa*Ãß„ÄJQÌšÝï+õ¤]mÏR¤ˆÓ0™Æ=Iš9iø¨$g¯­‰O“sŒ±ñ= “«¤í…Xrâ¾üºxtÍP`…%·Œ‰žpbɪ'JŽßx„‚j¿ˆ‚ä\yã=pxïŠë=TÞg±eçµðËÎ¥p›Bu±d:*Œóü"‹4ÓT‚Ùºët*‡ÐÛ›uA?·À`ÄŸMšž}BCçLÿÞêžãq˜îçèm*Æ8œ–0UŒ·¦¿œÅM~I^]!©P¼¡o1@Bñüy (Ðy((žª+Bx‘ų±§ßdÈ/JQ\¹ŠÛ@½7æÅÿè嘾FŒœªNÜM…ô«cqâPkfYèÄa÷I²Ožh/Ÿ–Ç ©:q“£1¬ìÄë>PK®ò‰f‹K,'Œ Õ̃0ο¾yÉñÉM-°¨œã3¡OSÉq-ž†9 nù«.^•(Œ«øn{ª0ËÇ“Ê}‹hSÀÇ£Jž¿,ó°yǃáK#âxK<Ë«Öl»q<Œìý¹ãÉ$égO|†ÈL$ + ²I°¤Ð¬`‚©VT’RÊ”’¼5¯ó™¥•‹M"ôú M>5E’ŽÂgNûþ‡tAî=§ƒÜeˆØ3A€àÜ­6H§´L˜®Ã˜'óêèá8/ùp¿h#¥‘–Zô«ÔÚ—F™š‡¸Jˆ_…h\‰" +aÑuÏ8!!ÆŸfˆÄ‚Œâ*#‚WårhÐîÐ?Á“_—ò+Áª`Eù!Áé6œ7³{9É7³‘‡Òa«“,ˆ"/ífI(¢xêSù´>×X RûWù¤Fî""b"_+wÍ×#3ENš„G^3H–GÍÿ×Ê'ÈÇ”¯Èîƒ9M9˜F£úÌV4¥ùERa‚S¤±¿±‘ˆ ™Vá3&Dªb_¢øâ™@ãŒÕ¿^ 2 +E$j熠ü|˜ug”ZijŸª Y&&»Ò' é™LEWÝ£iæLhWœy´£`”‹' é òÎèIàíЇ!ICM¨¹?j<jd 4´ /=dN´¢@C¥ˆ°9:èÏtÂLƒEH£Ó|Ø6a; ’‘Öƒ+3¼#Õáð®cÔD& 1«z•IÄæÌ x.md¶J°bxùp´¥d(‹©™çQ–ËMo%iø°¬ÀáÂ7†F¨ŒÞ뛌ˆ`,”@€``XÃJÖa† Ã. 
ÃpaŠB ¤CyBUu5<ô ÂH˜!†ˆ•è½#ÔCã9 %y‚,2F¹¨¹¤õe’*ãºÝWŒ¸¨bû8-ú!Öœ1uMM¸Êt3Å=]‘_Š©é:ÑbÐ&>*]fFZ1ÿtdHL1Iœs’G [#ÄÇ/¨ÇIZäÙH·gºôB#³Nífé%'œš×ÚÛVC<ó1"2SÆJÓ·Ó‚Ö’hÝiýº9c³Ò»ó‹+}•Q)Z¿Œuå…ef|MÇ¢®ÁÏúwo-“~@ÖЈ†¼S[çV$Ö‘ø,Eã{°œÿµKÚ§*Œ¢gÑšºÑž\ÅQ;ûZZY¼;WÉÄVö¹µUHWÛï[J)§2Ó™R|¦/Åg:M×ÚîÐtŸKmu"’§®_ÕœEÝš§˜F;Ä)’ö4-jnÂîÄ/3È®QŠîO 9T“œ¯j’b!Í™ˆ5£š¥¼«KDf~”]˜ãU]¬õ$Œââ4ZsñÓS袩o&¾pTnUæÊL?!‰$3žyx*ì/,ŸÑÞ$­âDDú W"ÿØ#ôœÔ—IÞâŠúœòFúÕœz1‹¼ ?“Dbišjh¢{Ž9ÏHvúÁ’ÔæH]ÒÓXo?È–oT¢±‘J¥<£Å3ºOjü£µÕÙ¹B2»D?DR=ŠÓt$^êÄ;£2áŒÉMg:BBãRˆ°úyœ¤Ÿ´â5íp»Œ>ôH½Ô9T¾.3ñOcDÓÕ+ï(§ÌÓIŸLŸSî +F?Äõ«²¶dˆ§$[]2Ù¼àúªFoây"Ò»ýµ¬ÑPQDª¨»¢‹AaÅ; –ƒÁšžHQs/~•+8Þªt‰æŒÆ¤–™H¯©/á×çn3}ÂóêØÁ±=÷„ââ,QJM˜HHéc ©I\ÏöŽ†)©Òì%è ×ì,Ž‰¨’ö ilb^,ñª¼/ÈYTk“2òAÆÆ/â˜ò×:“ìë0ÖÉT²pí  ã2¸Fs„D8d.í9ajˆö¢¶á˜í<8¶ïìÎÅ#ïEÚxä{È)4–í†kt[$æ²päÒ®ù§c_3Ƚb»Ä+³h¾Äƒä÷Á _‚” Îø%Ö‡ ¶¾o–~f.•Pùeðw°br-V šL¨o‹ò.2~7^CN<·DŒ‡'ý³>Kå×ìLC¥ãQ_Æ ’<ã‰qU.µš.cùHƒiQ¹CL‹˜ˆyIB«’®f&±ãe3U}Ç|#‹KàÌŽbõ5{sJGUâ¨zæÍ VÇu<2Caœý˜ö‘"gB™sBô}"bÏg¶’¬pY\JBjf’·f$5£Ðj©)D¢–»f¯Äzˆš½ÑJ6{ã¤lñ|žL‘fÖIâ"º jˆÎ¬9…[kná¸*¥5á¢Xwøæyt>¶œ=¯¨mVzLŒ(Æô˜]vü|9Ôtɨý'´Uå0N¨ +3E„ò¼)¡Z¥¾ã +ÓÕ%ɶt…ÊT*ËK ©Å–R˜ZÍ+ºMÁÅ8‡½”À¢mÄ@"C) ÉZ¬ªV”Ti!B…Uu Ô°¢ +§±#DH¨f¤µ +‡vÑYr4^tBdê+ ¤Ohf&x&xH žàa ëÆôxƒXüûPWª +s¹„ù/a~~ ¯½‚ü¦"Ix}þu5¡1->«E”&Fš}”G~Lˆ‘;¦‹ç±ÏtáJ™ ø4Š‘.œClÒ’6ÙæÉùÒ4lÂfÆS Då° +E¤ ã Ã)*™ 2AÆr)Vå5‚:U¢·É%H^ÛFdh–¢Eããë2_Í?/ëäx ’œ÷G–(¸&$Qœ°+œSýðÚ(äõÍ›&$ᡨ‰^O%ääx¯È+qÈ‹ÆC½\ôŒ’§ŽìÔ)G™‰¦Qì3Ö5”õœ ^Ö‘GÕ‘J³î”îF}ÔB_F 2®*HAíˆ!¾)êö+¶dìÜ4r^¤"_Š œ¿(V®\j–ˆÓĸ%Ísú…!!rMDº‘¯´"Q8"]‘t#£Qòkf3$—«ö˜}!ßÁa…Ð*:r¡Mkd–Ï'í³çÇ꘹p¤›»*|ðˆN‚š ê…OP˜™0ã0³À©ÄPcˆ(jÁâoIM½ŸsE‚¶n+=\Óšˆl«uã¸>¡§om07óGmÙË]jKÈyé³FÈÌ%f¡UëB‡ŠÑlòJ%]LÍ&j>Ã=j¾á,¯f"j/Ó5Á܃Ajá°uÏhcéS‹É'±I7ªiÏÕJ(*%“ÞŠ Ã-&H„K£KWzÔ\:%æÒ'±ÖôÁ¡½¦Ä{©11jÉAŽ+3!é’qµ*jZô‰šöeL•J}ЭhÌÒ]¤R1Ji•N¤%¥¸©}ŒõÕGIc±MŦžŽ‚¦3ñ) …LI$=Um¸}çt8q‰³W9W2¶´Öu–pABŸA#­V­¤‘4boÏhR v,–Ñœ–·ÐçÅ_âPF[Z¦˜©TjZ¹`#È‘Ùµ^­N;í4NØ¢ K´“Qf´;ƒœèGëÆ2ÿhå}T.Á}„6™– ·“N6ÝÜ®.ÏdÚ°çìV´Ó”+m$…ËòtO—ŒJ)e9wÙBºÁâW&F¬,íPV¯SR^[my YÙNô&ÇâIÄõã~Ä¡%&hµ¹ŽXdQ(g´B9£Â¡œ¬ƒ«PNFÔDÎ-—ÊEƒcˆQ¡·f››¬èÄJ"ª7í¡£$Ž‰d¢¢û:•Åy:³Ü̲n=?†¤‚hÆgiz˜/Jo™]xZåàȸQ)U«ñ–ÿ–¢R°CJÕHˆEÏ„H5O•Z”)ÎE™’ât8œVxD¯Ç _ + g ! $k̠˲ǩ¡£JR̃IšM]“)™KJý!‡SŽÒAB¢Ó¢x“þÖü#&ËÊOî‡N¥‰ÔaIÄ¥5¾¸ë¼üÖØW9“¨Šo½5È ¹|“(ZPœƒËÉ·’I±èqÈ¢c.UDšºìBc|3íLão¡£(D‡_U4†ø3õKZ Ÿ„>7ÕhÊ—2ĉ±ÐL’ô°¨âùjhD5CoÐæEÙL§ÆšÚK)¡+2ü$B>A¤×±{Œ•tßu«%èÕÿ›õålm-Ì•·^nܢ–ZÅšOdPV*}b$¼ 甚ªáœB +KÖe£´ÒÑGq2³³›¶él%‘ÔJ"Ç7ê÷Qö}˜Š¸ör)ŽV¯Ñ„-ÑhÂÒ4ÚÚ¥dÅšÇWJщÙÎQ5Yt•å¢‹^ffÑ—}«. ÉÙš‰…÷ðè[Jâ „K…Õ>¿PQ:Yh3¤©KüM Αë–sµ/‚QiÝŸ! õø‚dHk]TÕv~R ‘†fÒ“{"Üää'ïL†B[GV>5S‡N5'È»ÍbÌÅÔ4{êbõÌPe÷P"õüeŸÎ©ŸPO":ÿëød›—…uBL†ÂyŠb`õiéÛèóVNUðWÌ‚6£ÕÆýGdÍ&,´ª’UGuªhçXÈ{,yV›16Qo"Û§*SÇIY×F6ge㪲šT#‹V/ÙlÈœâ·)¹g‰T …dH¤bkVt©´anÜÚkõïojtÎPŒi3’XK#Ú‡äuÇw4ܬÛÇr†! 
+Aô1˜^J*B+L˜ÈAf:¡ÓqZÕGÜŠDŠ¡üš¼ì^ @0@\«å +-©ªàz¯@š˜ +ÿ!š +djX.á"á² +— M”†‘*) ¶'2Ó‘ŒÌf…GÄóÜÑ)†ÇÙiÕ)¡È*#2ªLø‘©%‘¿ 2*œ1k¿0Â÷ë›ÚÑÅ×Wú#')™cꪋ~V³’`±RA²ÇxŸKñ0aEIÒ4SyËŠž>ZŠ‘ëXž–Ñ[%«6o²§Ný…=’„æx#å­Yiˆ*5ÙfÊ Vo»è›ð'b1iF$’¬d—æ¥0¹‚C‹f¢%C +yµÙ–[YWÚ¦JßÃ^ +‡<ÅêU‹’Õ[cM"š«T“•éÑ¢¢¼."ìÈZ•öJxÔ!"š¢6„Å™L;2etÆ»‡ø™¿Bÿe¦RLÅùAZHŸê D_8†14£„ŸˆˆT|Ìmèh©8Å*^8ä¡OŸm]JŽ)Ã8·R…|I)QÇåÐ=Ašq’§ä„tB>ÂCyãhì­E¼†1‘^‚+—~ÎîD¸q4g®»¹Í1KÙU=hUwÆÙZi˜¯»£7#å1Ö(‘Š3ᥤ·È¶dœ,8¬“¬bà׬/3Æ:㌱ŽFÓH® )$ùe•Ù€¦n3"2ʯ z\$ãÙàÇòtûq[mDÔšÉÚÒdÒzSöxâ•0Äñj¤Ek¯Á’/’49œ0IsÛ|³ÖhR#ëGŽò¸6E6<UÕ‚ªÖ-؉Ò=ý¸œGg#ù.ÖbGÞ _ºé´‚&­hEpÚ7<«ñ&ãQ»’Éó¨í…Î+XåŠj¼ê—4+$CÆE¾MQ¦ŠRS%5E©âÔÈT+ô!k/hžkùˆ8ETk3S‹¸R¿,¥I#HD,‰b”,1«\®ØnlXÄez9X¡,:•QpH® xF’â’j^\®67œð®õC¯&T2Ù7Ú¬ÈP$„h>aÍÛec0nÉ +"‡S_,ŠÖzøYŸR0øVGHì:©ÁóëQ]#+¥Ú¾Ö|ÏWëh5³Kuö¹F6Hr¤‰œh°'W,­õÐýP*ÃÓÄ5ˆ´žÃµ5kƒVagøm5‰œ‹A¤œíØΥÊJE©­è4¤¢%V´Äèê×.üǹg S¹bDoaÉ- )ò4R&úêT'5‘ý„ì¼'µ=QŠ4B"F{éÞ|eŒP‚e£1|QFA'+8ÑÑYDÉ*N‰&Êù:‰:Ö¿ ã,åJk­›­õæñ:µÞ¥åÈ;WZY7(ëXäàôĪyåàùv¬‘æÏ©R*$’¸è,¢ÕxÔ´»s±†8$"%"#¿HqÆëGŽ”]#íÒ†ZUÖú*3žâi7r1’ýž¤'sª¤D…ïRduY¾Üœ&¸[<ÏD™!¢xÑü!:ÄC·š£d¹æÅá0e«J3„p(œîfP¤ 2ºDJ[–’Š<9´9!ãHéQÜ¢ÇÉZ¢i&ˆÒ †£45C3g³y|Ö©UËìïûÍ|E;²ÏOOtOZ$8“¥Ö?,5²QÚ°ãöQ<^+T[¹ßÊí'©0qœ7®¼8»#‹W?“̲NìÖ©Øt*jJ›~S‰à,‘Š4,z'*R{*.V ŸC‰ã© +Y«Ç¡Fm^-¤ç"}þW#Ó‘x¦&5G9aWC^m´žÚ¯í¨Tj½d&ó°À ƒáà˜´jß½^ :B1hh€J¬" È–b¦ñ >hÄË–LA⃤>áxɤpºæÉÏÙ„Ÿ/óµ0Òϸúç;bøàT¤d§âºöóa"‡ÐÿùÄïõ÷€™ŸÏ¨·µíXC(F$>—à¦-Ú=àüoÏç)×=^œåªEˆðùNÈ=_×ød÷°|’ëù8‰1Î×»6ô±¬Šš ú¼%½MQK裎 ú´$ï(÷ÿïÿÿßUïÁlÜñùþ߃>zÞƒ© OÛ6=RôAõ´ˆË„¾“úî³tîwï«ÚµPïÁyÓªqêâ½.ª÷ ‰{ ml?§õ}Ï£=àµ}=XJ«Þ¨éÁYBôÀ>Éó ú.g¼Œ©÷SGE –†‡÷`†Áª0Ã! ¢À ÄåÁq0ðï÷/xôõ ¬Iöƒ½#†B¢Ž/p“„±m».§£¥"¾ 4¾@÷di,Érž3åh¤ï\%}A%_DkÁKm|AYÒ7{ÖF´"}FéûùŒ|ÁnaѤCÆØËå ìôø‚j‚üèƒû©±P—Ž ù‚˜LIN•0¶µ,»HŸ•,éûp¤ïfù‚´WÒg40óò1é›*_p~|%}x~ÏWW5ä Np|úIŸMÛv—Òg •!'}ð‘/(Û‰·ªt|Aí;b€r„iþÈprô‚6éK#/ˆ{~ø$}¹íÿ׺@œMº`mn.ð"}TÉÆCâÉ\p†ôÞ-8ÜÌç²b¤o4ׂ]‰yU/-°KúâZ Hûœn³ ÆN(x1Ypþ |Ç‚­¶.§zÙY’>¨Áæ¾îç%ôÆ&øX”íè{‡Å‚kU'˜A¹ôC,}|ô™—>bZ,}B†:Ÿåy¿ ÜÃbúÐC,(´‡™¾-®é;£ÿ²]™>ØA"”Ï`*C’® O,€`ú,BÀ:¶)ªéÃœe¥)˜-Jei[&Aq‰С¾M:´iJvŸ>‹‰—l#¨OS²´$êû‹Q”EŠ³¨/JgÕ÷ð½Ä‚dbÜÏÖ8£X€âa ðËVûÓO¨ê;,`ýù +,ð +fS}©w>B-#—¤8U`äË|qyq6Þ¶ÝûƬóª&¢RQÏ‹+èÊ8Leè×µè¸È[ýßu^õ¹m¯d¡‡ÂG™ŽB–Rš´n¥ê3ý­ö½m7]}ôä +ö…Ö•‡ô«ï\®€RòÅöRÉ|Rä7ðT€4Ïü­ÐØ)œ¢³:]Á(zÞ‡5IÔˆ|º‚.8ºub^}K%X_Võ‘Ÿ¬OÇ›±>e€~½M"¬]$šû\Af‚®ÀIœ® ‰—°—Ä6E”:½ºíÞI~¼²¥¢ +À_}´­šN5 +’”¥XŸ<é^WàÅ›žsS¬qTÁ;Ö— W"lZ›ËúÎ4”W¢@›ˆ­ þ¬/) õmµÖ7>© ;5~¬ ^0¾ +.a­‚7ÉH¬ +ô\ëšµ>»U÷ØúJ·Ögãh}™Z„z꟱* õ;•›é‰êŽóéªÀl€Ÿl#[öê)qœ +zÍ1 èJQAšÒ8³Jë3"­Û¿ß +ösžs÷¯ ÷½—Z&#»¬2SÒÖW¨`Å¢€ +ÌÀß·ÑŸи¾õ)¼[Ÿ°Êgë“â(0‡2úµ¾p²§RðsK +¼ãB +$GÃ)£`‡Z]Š‚ýQ‚ÀPPy>(°]o[ßÚŸàŽ#‚­e?Á~È'­õ¡IO`»Ê©„3ÅíD x‚[¼ô:uqOõ§ôyq‚ ãM ùÙM0®õ‘² úç± z~`èþ×»ÍÓn[ë™Àz+¨Xë[œ‡Á(¦‘ù¤eIë;fð¥`  ÊUûà”àwY”ÀËÌ“à°Ö·5“``3M‚‹º¶5 r¸MÝOÖÇ/;UŽz%¬º |9@‚¿Ö—w ©OwzžÚTqÉ8§ÝFP[ß»É7¨ï™ïSïÁ%˜2AîäÁcÀ w"ˆ¿¾3%‚ÓÛÜ4͇Ô)L¢}³¼‚÷ë‹Ä!ˆ=4 +¸g__x!h‰ói\*fy& 4„@rOrîë Õ&«ÿÀq¢Ÿ È;{ ­ +‡:ˆjt¯¯A|`Žcž?ð‘g? ;(?†A§®y}ˬ~éČԧÍ{ñã<ÚI²z@8=™ä3äÙóá×7åwà¹ëàFÙÆص©uÀH>êÀc†²mDL:çȾ^ßo€%JÙä4B÷8`~,ì=ì®/2^_}N-Šå¸}¸]/oà÷Fb›–ËšJ7@'#AðcK%ŽðFŸ L¦ ˆH±Àk@Y]Ÿ­BÎj EŽ¾ëók`‘H†h@ñògà»í èö6÷t}c2ã²eàØ3™nI¼È@¿1àû}1°çúx&¾¿ÃN2îô*" pÿú‡ø’‰^àlëû.0vh]`~ë#7x ‹Änåv <òÙj¢Z Z@›ë³b€ÙÇ \ßì¯ÀÒpI^+çúÎÅ +üÆ>ýfŒ-Þ觀DbS F·°»ød=^ÙRn°—P`¥(àÏßøyw¾º>Y9H®„›À:®Ï›š@°L -L@ßà¿ G”\_˜@ðÔØ•@o^‘¯8XJÀu†×”“€Nj&Ç‚ÊK´Àöë‹áur#°P”0›ë#v°*~í—|u}£‡œ·Ç~®•B`ê@XŒ^ر|¬€@ÞáiÓÞõÝó0[}€Ý©áÖlö­ë“= Àø燆:)´ùÄ®ôHL½>íx€òy€B?ÿ¼¾æú¦õX×$Ç¡r}äÉ`çyÏÖª)ÒW$ÅëúŒ¼°¢©¡7 H²ðÒ­äu}Å]ŸuÐX’³œeŠI¨‡Mu}Cy4:à%å†}Bp}ýðû¦@=4šÆtUÔ÷û @¹ôŸEê|YÜÜ €³ô$•—›å2 ¨Bã 1€×é¨t|Íì`Q/Ѐö®ßŠrXfÀÄß_äQe0ë@†Qc '¨2zjlÛñº,Â'¨¬O©W†çZ¨ Q0‘%- &€n‘bǧ®Ù9½‘GfÙU„ðs|æcÀÂÀe° b™’Îè Œ +BÔì,z¤öX<@Åð†§6¾Ñ䤱ÿå @Ñ»Р@NRƒk¦y!¹ +Eš‚Ûjƒ¹ƒä °é!„óó>ï)·Ž¬Ü&'~'%E¦ŽéÕ¸“gr¹8,SxypGîO÷3ฮWIWöÊqr $L°lìSÍñfùdµÑ&E+a˜…÷˜{ÍWѵN§/Y­]:µcrØR°\žt>¦ÚaÝ%h+< ôsa(FT>‡ÿú+ˆ¡Ó. 
ÉÍÿ·ìGÛ{çÿ~bÉû ¼ ëµêJ©þ‡4h&S¦Ï2ÿ¿'š.fÿùÿ£¦E÷ß-Û­‘†|ŽýçšUzŠ,.ê¿Üôî (±`DŸÿSŒ«Ýûr¶'fùOaí[5¡9n3Æÿv°ƒÞ{/°@ˆŽú"yihÂ÷Ÿ.hÔÛ¡ßâ*@÷_ªT±¸³µª9~[FðhQ³^ì_Õ}éB#™¬ÿâ2–XIÉaBý#a\¨×å{sÌ磆õ`Õoî‹®’|þGçªÈ¶¨ÃšÙÌ­ŸF.™@]½¹úf?û©=ñopM ›õo#øÃjþ5¦¹\u£Uìø?&÷Nû¢T Çš­OÃÚ«þH¹èÔüg˜Z4…mL¦+B±¶K}æ8SÕüCVah¾u(ºGóò{@!ÂíWÍ¿©s£§‹Vó×üÛUHM8Ò¾\ŠÅiþ%´ƒùb%øM5çœÀÐü:íš.ÖÏ(r Í’î ÿ5Ç”›æ?AoŸÌPrJÏü›Np +Fj·¡ò!'n6‰{håèa»QƯË3ÿòDlºë<fþ!Þ·ÿ)©¼|¾ÕïÌgöÔÏÇì5ùŸ¨lŸXÝè˜ùßwçTÛŒUäy3ÿ¡¿ÙT²t’½¨Íü‹²·»œ ·)ºÖ3™n}ç&ÇÌÿ©èÀþ{;óOàî©ÊÌXFŸ¼ù[øAËÏÓçyv¨/ñü5‘#çùBj3\¨?uœrNäP}¡-æ^M¶=ó ±cÎœdöî`Èý.¯}Ž:×W3ÿÙ0¹Ú@qf ÅŸÆ.âÍüóåUñwãŒVfþùJ$Ö÷$3<;U$$‚’IæßbçV!ÓË…öËÍ‘oNæM¬o«ŸUæ…£öfšU1ÿcé@”ÃhôÎŽ#ó¿’h@Ó¨†ùw³oD­3Zza’ΧÊl7Ї-ã< ó_¿ÿsqõRúÂ@¿üßWŠk‹Ï&` +bÀÿwcÑ8ýˆY/ÿ²Ì + eef‘8$¤!c•ý—ÿ&P‚•oœúò/RBì—1Þ·ý·:qÊôÌZ4_þ}dÊ<ÏF`âË?8P, ¾ìÉFc6lìÄ +;ÂÂÝÿFpÒž)¿0ÿ.)\´ Âó¦© nŽpÍθæ¿\úðrÂ\,—nÔàu¿»¼ñª :˜Ÿs¸ß£<ï?â8ûs•ù7Š•u2x)Í_ëÌò@úŸ_1ó/¡þÁ0!73ÿºå¼phY“ë3-¸6 KMfþñ”9È ‘ìþ V Œgþ)Ž®úÃöi:®·‚¨3ÿÌ\"?»®ýµ•3ÿ)ë  v$=væ?(a4ó³ qi„ü{`·cHµOY~`6Š%×ÑÂø—¼¡‘<ŒÁ§«bü+½&Æ»‡ÿ-ÃÿÁ5¿Äj«á¿@¦æÂð¿ðâH‡ ÿŸØíey£Â†‘ìUø „¯û7›C¢8+6'µÒ ~ÌVÖð¨WªÆRìð/ ÈcÆm1ÃÍáŸâ>žL¹RgÕá?VÁÐ+C†ÿ[ÆÙ›'8‡ *˜§iˆöpøGÎûÎÚ«ëÐi‡ÿ>;XסÿR1üSËAè=‰Ä&¥¶å ÃUÞ£CrÚåœû—R-³Ì8ôðþXþ£zü †¬uŒZ&MûE™„Xá…·þ÷:bÖi ÿ³¨K†;ù¿ð’4-©‡€áß1ÀïnÀ™Ô|åˆ(ÊËðox{¸ ðIª]åÖŒFÈ—aáŸ;‚Â>ieêaR@ñU†!4 À ÿ* +Éwø¯Ô@ÿ†ñ0u¢b¥ …hQÃÿêý7ÛÇœåÎ%XÈÓ(B5‘ B׆ÿÓAïyê1F§Í—!šº¿ô 6iNâm>{Ãÿ~XN!Wú‡t<5Wçð¿èGȇ3Ö©-H5Ž]U‰Ï¤…ÿ#h¡Ì|gôšªµ#ümƉˆ(ÁÂ$ÙBZÖ²(+7üºù·»¿nWËÓ°âˆë@nI_¬š)"&iוŸ:ük­Òô^ÃèuèôF‡ÿ·ÙŒd÷¶W=SÉèTÚðOD.›XÒðÆq0´¹<§‚à†ÿiSÉ:¹':ôwë¬5ær9¥¶lRU0’>×eÐ>ÿ(ùQé:W1­á_JtÇ·løêÛ<û[™¼tì°¼T ;ÄxleðÇþ®_MA´µ,ýð——‘âÒ¡ìázfÿP‹ŒÈIº­ÂVÛ’•9Ä¿Rí.€;ˆ-!†U£T'Í€ ¾mJg—I;ˆZz‘C\lÿ²??áSš²SjŽ–˜ÿôC팾Š˜X$þƒ§´‚uæh1®í„{Ñ"þû|’"¦èÅV1ÒfD¼Œä‹ˆÿ’,£û¯—ìO¡nBñ ‘-ÿ­Ó!8?د™m÷~?Ã)­ ÿ… `-"3šþÕÿýÑ9݃¤ˆ ¤|Úû{Ú¼í3~‘!žÝîoO}vÿb| H‡±Ó“9 Ÿ:îOÈ"Äœf¾Èjû›BU7f…ö?AßÉBôð ŸõfÈþ¢¦@5w9HÏÕ_ã¹Ø³Í?) [¹Ø¼6këðÙêoÿåA ICÇ õŸÁ&ÜwÓ¿Œ¯oÓ¤"$ýKÿ=P«~‘$ú{§ª«ziº˜N:X|Üß¿þü%6¹Ä¶§mvþ50ÙßEmþÖð£ÏVºdþþv2h8»•-ÿN|7&2 wh&å?ÔAÏ,'ó¦¡Þ?4§~õ¸p6îc”ÿ2àƒ¤=8…™äoJ$ ÖÊ8½mD"šPáP€[—IþV0ýè9½¶¥@þó ÷Q±àâ_5€*#Ž>P%þmeš4)öÎáßJdd\´Oy”?âø…k|© ¡SðwtU*+Ø@§ÃÿïÖ$#ð÷uAÿÿ~;šýœ‰íò³ïŒà±MQUt{¿îª"vÙ}S!m|Òòþ¶%¿òþ’|«Ž÷«ô¶ÜÚx^É0Dz¦¤‹O?M‚dt`nx×Iå—§¨Të¶ýÑÅuXÕ[ûUHm=iÿú…MMŸònö—À¶«¬?_"˜ÞÊ^?²ÿ䬗$tYû †óÝ”¹Dý-Z_³±  +ooן.‰¹&Ñ£­Ÿ?–È—¬?3úÓ56ÕN[ýö85ÕêW#`“¢Ë}±>Qý¶Áe¨þõqåS¬f’ŠSÿ£†Ï¹ý+½ÃP±…áü_³I '­Ú~þ±y[àœ…Üå¦ç—"úr•"+ÀZþH„—*ikq~"Y ç×_³ÜQº »Å"©°ùѪ˜JŠð™?.)u‰ªE8bÆüE'9òµÕ_‘ìågY@ ­U!Z~;‰Ê")&u^åÏzv'¼.d¨êŒò󛚣ÕÂ%A“ŸH&ÿFݯêÔJà…äG*ÊrÊ…ÐCþ²²Î¢¡{˜ÛïVú×nÀÒ:þ÷P–³>ž‡oØø5²MîvG4QIiŒ¿‹ ”­Whx-~a‹`1;üy`Ø;ŠÿI^ ÐÍÑ> ügôb,KMCWøíLs:LM‘4OHü`þå[ÃÚLâOÄh鉊~û*²Ê6üÕ hÃAiáŸ8s e*$“ð“‡Ë‹ I&ü›Ì;Ö@‚ß5¬ ·“ ©M±m€À³ÛýÇ„F•eüïçJ®–ÏØT¿ßJ~Pj&Z,oú^_+aÍ·nÄŽlµ-/0vïoå#Ä%BÙúˆÔê}Fy$Íž¥ì4˜÷ÉG™<Ѧ½‚2ox_¥Ô €“ˆû«»w¾Gzî~ŹØýýÛ`ÛþV°¼~îé>»2úc£¾_QÏ}$€ó‚(¨4Vî[›Ýh/Ü¢÷ +p`ƒ,؃û7qðÝÿu7ÒÛß êÄ®a„¦©>%½}êmô U`ˆ[´}ä´?¤À”ótâ.˜8ÓOôà}í—TTk ,Þ M¢ù×¾ñ–ÿ`±À Œ+dDá#Ö ؾ¡áÅ pÞ¿Ú?sß©µP†3Uí{gÈÑA‚Ð/œøª}å*æD‘’½iÈ:±Ú?<Žf<©?Ckõ(gSÖbY^T¯Çç­àl¢×jŸ&’öh‡qåj_2(¶ìDaûiÿ÷/º³°–ŒÈhÁE‰¥(úp¶æP¬g_—«/¢­WHš}=ºxÇ»U 0Ñ Ui×e‹Ýq ä‘ßréo¶d’ZÙWÅ®¾àÄ–¼º·Ô†XF¨¶¢²Ÿ/1|o, T1h•}XŠïæ#Ù7™UÛl&€%€·Ê{v…䈨Ãkˆ}•Mˆ‚‡D6ÑÁ¾Ø¤{¡½Ê±Ìþú,šÇ€D@¥¬zý+äHÕPÏ¢e×°ò›p‰Ƹ¾¼w.ª<×?ë 9³ôEûmâúñÖŸ£VFÄõ½…{Ëx3®Ïám š¸W’Z\ÿYØÁi\¿nÀÑ#=¡µX™!¤šD„AiŽëg¹î@@§`e\ÿ‰®<¾f\ßb‘ªÐ„jÔ'Z¨”ëw ¸ÖŒ[ÀÃ?uš”ëã”R&U†ºÍõmå§åx"4  f®Ïè«·ÈSrXîæúäéÞ0Ö²Íõû?ªMZ.–á\Ÿ 6äNWhÁ["¦õ¬AµP¤BþȱUëúo N=Þ⨫õáxP Vü’0´ZDî–ÐZèø¢à†ø¸L?Óh»Z7û¼ çЪú` 7¼5É©Ö—•8@†ZÿPbB­Ÿ ¯Ž0q¨õMk½¬ÊX‚}ëè½Z$¬MUÚS\B«Vëã¸Ô¶ *Z²ZšlïMŒNË ¹àj}g"¨F¬Š#KV›ÇMÅÓXoèîÿYüìî÷ÚÁºj}D xä_ˆp›µ>¾i9å¦ut¬õë_·,GJ™1oªX)e +ta뫧mŠ–wõLqÎË„¸ßc×âØ[?¿¸Àظ%ð ÙÐi¬Ig§¸Y¡XáÊÖPã ŒOÃœlé “Øúµ…@Dãù‚Zlý 2ÆP@!~ ^9¶¾ãöeSá·²õ›G|³õe‘Š¶êÂN£DÃòbÀ³õÕW¡:YÑû‚˜¤±¶~0¦ +L´‘˜n6ÚúqúŒ±~jcq» mýÿŽÊ2 ýÿúYy ZèÌ·êñè¡ï¡™p³õëXIË©jfR_[ué+ïÂØ´õµaª‹1Plœà§’foæ)¥afy±øÆ£ú€MÚv™¤Ñ‚ÛÔGÚérC~þŸ¤þ´Ìãi#̈JÔ{#æP’)=Òõ­˜ä@©?ý2Ÿf 
+÷M4¿gŒÖ³‡4‰“­²Ût«ì&ž_ø7ǬDt¾)žCo¬SZÉUj­#êdæñ¤:k±j¥¿a7¨9Ï-RÓÕHÅöÃ,3c—š #T™Ëµ*®ÄJ”á— “€hŠža•ˆcTŒÄSëáÔº+¥å Âlú…ò‰l‡û7-"ËuÒ~—&˜eyƇð™’©n¾–;†çó4çq–c» +QÇb Û_ ±Út¤æ¸§ ³¹R8«ÒüJãzéuD_³¼ªVþï¡žã®`u\P‘e4VÇ$ä¶óAfù ak)É1› ›(¹ +¨ùÇK-åù7SÅ‚{¡nÝYñ>(h®—⻋à¹RkâU7`ò #!ŒJ“¡ŠÝAç|öȼ¯‘‚ÁÉ0År¾(oEHÖ˜Q’WPÙ’¤`ɬx×*€®Þ€®æ¿Ov¼ÿÁžùÑîûÿÔªý6I09B,»pûÅq +fÊõvD¿óÐÓß‘’åÀAó¼ß”='ªî!Æhm¢'jÌÅ°ëE)ƃƒævCCñºªØ=ÍåxUFĪùÕn».§‰§ˆ)mƒ‰ ÒœêUÏ• ³Ü:†[m!H°¸áÕMD=óQ’\pŬµiXܱ{¶;)µíTGc{¢Îoi¢ï+LóÅ)¾µê?r›þÛH³Ø:€Ns”j•–HMWÓãOH9’Kpa*[áó’ëúÚ¿(já‡Ûó¿È=ï{´c¼/[–CjÛûâ´½W1’ãNø¬Ð^üÂv`?Ã4÷µAp B|Œ”üp?˜Oq’aÓˆ,™‚•i˜ƒf0Vòžo;æŠuÆU*ßa¬Tlhšê4N5]°úÆ“iŠåJŠ]l(€¸ÞTåGŠg?ÒPýÆN“©L_¢ö‹Iä%×rd*kÑãUg¨±’'¬HÁ.Ð Ñ=Yoʯ¹hÊLƒ ”™ ++·“dX vë³aû”¥ÔÙ ßPZÃO{%ºe¶ ÛQšTk~Dq•©UÚÏöëžcíÒ³D³äb4£Ø@µ!XQ>Õd¿àN ]ö¦WœÌӬ籞õÈ­úÿ»Ž÷=×0½0[žë Z©©b)µî<‘›æÇD½ð'Ï*8M¨µŽ¦ÚŽ7ËUÚâ¯0^Ù\˜O!Ï+³©Ö Õë”-÷‰\t +nÿ†äv3Mõ¾#h5Vô¥Û yÉh„ˆašcˆ‘"ãp£Åægê,…°*É5Ç5½û2PŒEDŒÐ@Ãj\o½8mó;†Ygȯµ6Tä΃5Íë¤T/{ƒ;\5Z*föoQ’ï1Ks>†‰Ö—,½àJÊR¥Öd˜b¹o·ŒFÉòê,¥É…ÖñszØQšŸRµ¹@Åù"÷¼¿÷Ó ´þ;ô®KjÕujŽâˆXÅ„L¿$°|Ó.|²ÜNŽæ¿KRÜ9rÅÍPÇø5ÞGAšñ(|´ÜNü`½­è¡JCY†íÆ-zÿô¦ù2ˆVd>êY.™eûK’âû)]²†"v‰ž*5“$™Ó4óg–a¼‹r n…Щì†úuwõ7Õ¯·”§W[ˆ§² 4JõŽ!–5Lî-ó°lËdˆÔ:RóþFZÞŸøq³`SäÆ€³4O!¼rbÑü®&8Ê(¾7I†µ ±i=öû¦jÓÿ”gØÞ ÛU›è\ŽÚ-Ñ°û/;ÞŸ0½ì"Å-¸Ä«¶”¥Ø¿¢4ß™è‘z‹1”ZCYV¥é\Áà¨AiÿôÛö«Ñx>±7 *úOÌ®í~×4ÝËóKn%™Ga%+†pR!Œ’?¹†`œ¢;ÆÀr?` ³€#ô–¡z¹e‡Øý!—¼wqŽñÂê<ï%É¦Ó Su&¢Ç +M%IÞï@Ûw=PyÞ‘š®cÑ^;pB\z¡<ÏD©ÖðPZ) CIA%;–ÀêUá¥i¬$Ƶèÿ‡§¸I“kÍfZöÏ jŽè‘bó s$Ÿ +á…;.l¢óhÃømœ„§V&S‡qJC8efaÆËáƒZ{aIJ»8Vµ¥86Éw¼^n.Ö®6+ùŽÌÂën¢æþK’MWâ†ë-„Z á^0«-'ŠÆ Ñÿ9X1û‹!—›Š_œ´m—C%û}Ú0_9äõg! ªS òtbqŽÙPšbød—Ý*–ovÙuDCr<'£9Þ +08ô=·EÄfA6KVÆOÿ4ó{žs@ªÛxMÓ-¹ç}‹¦3aƒDp½bnÝüž©[Ÿ…û‡ÑypúžóeÉrÍ ²LÞ§$µì 4·@€ÎÍ­û$Ê¡d[öC…öqüŠûH‚Å­4Åz ÕMg¼k â¸Ó¼‚g½sÿˆQŒ— $ +VÁ…KV¤RC„2»¡Žï^¢6Môj Ä/IÞâüº—Åò-‚\m/‚_m!xªÐ€dÍÀhf½R‘ùžg½n¸Ã%7AŠ÷fžÊ>Ü8u`ñ–#”H¹@ð(Ũë~ ÏѹÇ|ÂÜR£Žý"ıŸÄV[V1 F¸ê0Id| ÖªàôÌ/Èìà#(”‚Ì¢ «éºB§ÊÍ\IQ¼Ÿ‰šý2Rñÿ¹=Ó¡˜©‚9øø½4³Î6ì4•]€9‚;ˆá¬’À³*k™ŒV¡G‰~òôzãðƒß‘†ïÏ­;ïÔZ2‚èb¨ÜbüÄú–åz®gú¾ƒråz6H+8FÄp?à©c§àÄtߌxËh´Þ$ù¶ðhvyà1rC·ä~0ÚÃØ4¯™Š÷1O4Êòü/Q’û,̲Ÿd†×²ýJBe7@|”ÌÀ~ƒV5iM×ýH½ä@’[÷dž¤Ù…”bCAlƒ™vÙ?ÜÉ'Ô¹UŸÔ&ä|án¼lH®4¡U—y†¹±b¥$½ÞŠXõœ  Ó¤è¯‘š÷+~Yx4Bó b¾n :Aò@Eò¦Ó˜ìe7I6©e°™ŠÐTÉÔhÓp„æ"~Ng:Õ09œiØžÅ0 ΨeÛ µã?ŒbTš„œ¡¶$9…›¡ø"ͯ^­L¤Un'Q©5 ?Dn˜çW‹SÚ C+X¦bàpÛh”Ê6ìD™ÍX^©ý¸_q?oXNôÆ£vå5Tžb~ ;BtJWj²ãH¥úÍvËí‹dî7·a¼Œ Ùɲœ_Ö“èÙº—ø±zƒ~ÉåTÇûA*²£ÖPõÌÏ`³D¿0³T/ñ“Õ6$ëi”èÿ ô+_â'ôöà :³$¿Ú,¥9>i°A‚ƒÃù“¥¿£¨…gA†í‘Uö\‰ 4È©!$(U@‡“(­yƒŽ™š":Ê2LŸ¼ÎõšÚ98Bî(S®•¥yÿ¢' NÒñ2°è)Ap)Bò+`7LS OÂã +@ `àDk æ| +ž)¶ /Kf\šÎ4äL¥•(Çz¨¹O´¾ó"Œ ¡8hðp … +0¤XU7acýÊ õ¢¦:fàr-KC_Èù¦Sœ_qZžT`¦T"D+µ8EíÄ(3A+¶@,¶H¬¶*V›+Ö³ð‘2ãp£4WYŠõ]šb¿ˆ2ËMd¹7ÂÕ&Á…«†ÀE*fBHô>yB¡-0"†‚FIÍ’dÿCŒä¥³ f¨c 9Z³Œ¤•Šâ™0{æ¯j; 3Qf2‚bûhü‚ ”.€“#XäHÅF#í² «Þ¨hÙ0Q°üŒá“Ú-Z„ÏÔ™Œc×FQëŽ8Åö±Ìr멆åNð|ÉF¤õ‡ž$ùŠ±<]7ƒXž“Aï;Œ^vš§¹oT·1£ä’£PBÝB±S¥öÀcjnÀÅ+ ÑÛƒOTšŸÔšœ¡9‡ž£yL´ ÿá¤Ø$ÁHpèÃxGáEËž€Åz¾p3ôÆ ³T—p“4¯€ãTÆ‚¨åV³ë8‘ƒu ˜…³?㦠àÓ$Ë.D–b´r’@%š¥²ÔR±ý"@B  +fªY|Es2Ùôˆ ’Y #U Õš‹ Øýêßp)AÐâ¥êˆ좨åVhÊ‚ ,Üv‡ž¥3 8Ct Œ\xàÕZÉ/ëƒNÒ !•ZT=GÂ*­Ðp¨³:Wq%|¸ä<ŒZq@Úð]E‰´†`âÃÅ“TêZ¡5(ñ¶ ‚;@2{$ÎÓ<Õt3Ðòÿ…ÞGx†)EÙ…©$ÂG N€Šª>H Xªc 7Eò ?FsH§:J´ N‚ +Òi‚J³©‚‹–| " T¤_r Ò2ä8•Å@jÁùlÅzç=ׄ2cñƒ*K¤ZãÐÃdÆÁÇɬ£¨×IJ;Pé¦!ÌtÑ-ŒLói–Û^ X„Ä'É3ŒÏàÅzVe¹å€ËóëÃoiL…‘ªìkæ×<É|=Zh/Kr¾ƒ¦ +ÎDËÕ’®œÉS,ƒ RýAŠPü¯ûÆ÷ÀåR@DK3ƒÌ·üâXÕö¢xG +-PÓ˜ dÕšÍÕËþã‚á+¸`Ïj¨Î\”o½e;Eyþƒ(¿ð#Ë-¸hX.D©·bh•6"‡‰ Ã,b ³Dü˜ÆZ³Þ\·âJ¥Ö\§ÖX¢]÷0Ì©,Ì' ([u&~St ¹"XÙª1ä8™™ÊzSÄz arÅ¡,¿îT¢Xd¶ZfªÔœ:"Ë, 5Bò8Bp 6Aj$\œ-Û3x$Ød»"Ôt׌±"€©–)äÕAø¬Ø` ÅùªW~#ø•#Äø€ŒWÅU›"P±J¦d ^ÀñP®iHžb d°ei˜¾€ /ˆ ”XâÐStñSJsà*f€$kN@ƶœÀŒmÙÅqÊ,¨-ÿY¸TÉTèȹÀãíb‘|: d:ãðK1z{8“ÈNÌRü¶Ör ê9/Cô.Bq 8Et4䦋‰†ã>–TdÓïøé"Ä€Ý#|¨Ò0à(™9¨Ï;øDµ‘ø]µ±8Åò;Öò?äxÕ¶¡ÆJV d(5!'¨Ý€F»f ³USÈQªŸb½¹@Åü(X>™mÛÙ@Íþ)Yr¤ú„˜ 7ƒ“®z…°KÎDYž+QŽù+Ͳ¿ÃÙ“ëm†’ + ‚Lõ«dV)I®Z c :Cô‘$XÞʼnžS4•ö9Å¡GiÌAK×Má†h.aç o0Ó]Ÿðy½© ÝtÍk™çû`ÏrÄmz_¢»Sv·vÝ7ì@¥©fÙc¹äd »ö-Nqlà¨0Z’VWètÕ‚—I9±áŽ”ÔqÕ¢r[ 5ÜÖò³ƒ 
ÁÜ€Õ6?©]ûOø‚Èn¦S9Ø®9õ NÏ׌á¶ç«na„«±ŠÅ ·iüo[†ëpB½w´`fZ!8Þ©ÿøx–'¬(ÕG€bqRk½Ž*¨MϨúÎG(ò“Àˆ41P*·ˆœ¥¹†™¥úˆQ¬¿ÃœßˆRÏn,}Û,ˆRÏú ">vPclAf¹ÁBu~<ï§(¿Ðl¢å8hß‚¸¥ÆÌÊép†UdF„Ib˜LÌ"ñÙ Ìm…o©ž‚ÇÈ ¡Å»f Òmc¸Y:³ “tÆóßCèT·V ‡Âh¾Yqšj› ¢×Z+3>Zi/K5yÏ yÍðzžb{ +“KMˆmë®f;JTj,Ej•ã8efBØD„¢ß<Ó÷9Ce+Ã"ͤ0Vì-wÝ`E‰5¦êå¢x…¦¡‡iìƒOT™‘:ǃ!ÜŠc!„¢ÓLÃgR¡uÜó{¶µã¸iØ-…é•æ·q»dxÐl{®ÅªUö4ņ šzËp±Ê\ªTf/’Ið4Û³•jDORÜåéeá9…Àb8§Ú(Ñ,´škØÍ&ëå泋ʒãVœb·¨ŸÌÆý<Ú²›–¬ö2 Ÿ5·q0«Æ½ä·âVoižãˆÕ·ßHˆ~gýÂû=[ñWìlÈE¿ÙXÅk*Ô,³+OV|ga„s¡tzË8JI`i^ER„¢à¦6:s©v¹ñlÃnGNä»ÏWk-…Òé ƒYjÓoÈ+|_-·¥0¿Î>’Zh"|–àÏmÚÎýÊ÷'ÌðÙu7jÙýàuýwjé4‚ÔõókÎ#¿e¼Šá”œGºÞ˽àBð(ÁY–â·ãUΗfÉòšÙz^Žt­²ªù!ͬ³ç·ü†¯ý`”È”P͵˜Œz@ƒM‹.ÍGžYfÏ 4?æiæ·,ÑzŸi›îÅ)æÓLÇ{.ÑšOÄt~y1¡Û°Ce6+×XNVL ºu÷±Û:3+¿#bÝu?,z.ŪŒÐ›¸í2™9¢¼¢0£sÀI¢¯V­ÉPÅðÄ®9—ñ"Rªú;~+vÍw–'9ÎG*¯ÓdÏÙ@Ñ{”圇1‹-ÉÕVBfº5BEù$„ªå>Ôl±ñ$åwP^¸ ÆEv ÌKgwe¦2¼)µ|÷€Â…XFÏ4H{†§Å¶]Ð,ËÙ0×úäÕí÷‚åxDBt½.8ždÔf€"¥é@ +™§ô­ï0òE+À¡fHÁ§6Ul$x²à>è\½ÍŠù7ë³*œh‰0ãÆÙâôB{`eÚõ¡§é­’,Ûm–í~5[†÷4”–5tV#FÌ—<Á ›ê +*4 Ó/DÄh1œ’°"¤&A^­)¿å=Rëö¿x¹†Ð0#¸â¦ûe'©Œ€1D Œ˜ùFàÁ¬)‚Áý¬ë:£ÕçÄÆõH“}À dµN‘SÔ.@„‘(Ýs¤1„øP ñÁÂL·Òy†pâçôÀ‹#U†pû„o©L‡zžZáu3¿&÷(L~4Õ-2›fÖ'Ú3/Ke/Š[o-Q°8ïW~wô­Óø02Œ: Ê2ä(ÕΙ‚Å…¸ ZO@f5´RdŽ +Á‰.–ªÖ;àJ`˜|Lª Ll„ˆQU€EÉ“'Pf°\ ±Gȱr™H¥êxŽT Ä(G4ÁWÔ'5ˆ,Ø•¨ `”™‰ S™ßÑ»É=÷=JrD#uHtj~B2ËÉ‚áu®â|Êòëžâ¤Bãàû¢GÜ5Õ°¸ŽãÔ˜r m²Kmç*Žc‘z½µ0µa¿äÅ"7—§–S >£JÛÜ¢@.-`’|ª€.Ÿ<½â^¤\l¾+¢¿ý¤n¹ª£1;­¤1»©#úµÛæ'»êüp‹¶K»ð|<ÿ«JÉMül‘¡$Ív˜¥ØEë „oj­KMGûWb…*Ã’¢ë€¯ÐD™æ ~žä*‚Wh'~[m&‚Vj.”Oõ+–Z‹dR›dêtƉ¦û~†Ų́²˜*Ø}% +f¿¡¢ù?*üË«QD¦÷ýÒéº]ù?gZ~{fåÿ(W^×ýÖïºÞù1z¶+«ñ9_5ß™žïÒlÎ)ˆF°êö·8Åø“¤_b Ç}­Ìl boÝ-š;Ÿ›±ÜJ³¡Ží4Õ/´jÚŠSlì‚Ë€bô>TÕíŠKjÇûßýYb¥ƒäÿ£•-÷Q{ Zá|«wÞZÕù#—›Ñ—n'ÃF&Á„ø½R£¥÷)Ñ®·ͪ´“(›É“Ë-¦ªµÄåb›‰íE0˜Ç«åÆ,³›@¯Ú:ŒYoD©µ -Õñ R/–åÞˆ=÷Y’\lpŽÚ/K±8êvLï'ß!«cy¿(½ŽH¥ßù<Éw¨”üé]ó?|Vl1n¤ÐTŒh¾ +^f*­‡U\ádK1nÙQ†Kñ—§ÒÛåjµv€‹ßÒoy$¨Õ†#óc$¥Ø:~’Æ@~Vg3E¬29P+k˜=OÓXƒì8ÄN•ÛL´Ü¿©†á™×õIeÏÁØA’k`¡r½4¿ÒJŠ`øP(±øÀCôÀÇÔëBL’<„O›Ÿ%2?Je 1À†K!¿ 0¨èáxð¡…£B +œ^¼`'Ô4½,àHÇ~ˆÜ+OfÄH‰pÁÄ£6,„S¨FyÀy*CñÓzás¥tŠéƒe cú˜„>¡Å"ÑGè€ÙV¬c J’T¬TÃHøœÆb›ÊˆðtEüŠÈ0’Su“æDøÌÀˆª€@Hx,³ˆtÊp£kIJ1€r“ ç)õ@†ÊUgH]âgiìäyµ¶ƒ³ï`Éölœ…$÷ ÅýÒ.œo3=ß™\÷^ï Õ錄ÆîN’\jF°µj›¯Ü¶ç\©Ø&Ô Í5ì4ÕM¢Zl%×(3 %)3¡¨3E#x“j~íR;!¤B‹`C‡Ã5oø ­I“Ú>Þ¬ú†œ&2?Mc=Ù08ß6 îÅêµÆ ’ó«Ý¹ÝöK׃*"ÓóB*Óƒél7®±ááTÅq1U°6áÖìV¬šßº[²:®"5¼¬!·¼'° cU¾ßvÇꂾô»éŸì²ý3Ð1œŒ"Ú +RŸy®õ)Ï07êY®ƒX5?ar‘M¹ñº—æxâGêìA¶Ì Å¹µç«^Ñgw-—ËýP-}.Y•ÓEµctOî›ÿCUë[ ­Ò`šb¸_ô|Ÿb×rCDï½³ÛÖçPÍù_¶¼ßÙ‚Å}$­Ð&8yjU€±‚¹HF©\«ÎŽE[mÖ%°·Ÿ­XÔ‹Þ ¹í€Ut¿Ã¸µ6òÌZ£¡†áp¤ao9R±7âV÷¡¶õ1Ï4¾º¥×uÑôšWùÜ/fÓyš+ØZtŸ+—å·jüK}gšåR|Âø9Vk"|Þ+~Û¯¥Ñ:ÄI…F†ÊÕÒ¬*KrÓüšhZÿqsóøa±}¹Þx¤è}“8äŽñ$Ç08`n7Ãø…Gi†Ý;ŒWi3Šaq0Ë5_Gʞ㡪ûØ ÷Ü궛Áã·g’üŠ|2s bG£‰%KNÌdEÔtÇ#xŠÞ°h¹2àÑC䀺5AÍT¬ƒpˆaFJvÀÒƒC-3P >5¡jð):kБ*cÁƒõV“û‘Vw^ÒJ¯a¬Îi “bþZäp88¡¢,怉3ì@QJ1 ·+¸xÛ$x˜Î|Rõ] Ÿ§5J¤4/ätËH”X +¤À Z ¤©u!†è b&‰Á%èmÂç„æ!lBƒ "EÉ1ADÇ +•ƒ«˜#&rŒ$8vµ(J™Ý\ÅìÜ!·þ;ÔæÏHÁÖz®âµ¥WšÿŸà³'+fƒyŠ¹¥8Åâh¨å¼ÉÑê,MÖK†²ªþÒ Ã¿fÁ:‘yø%Í9ìLÃ`‚Q¤«0ø ¡e4­ÎD•æz¼è r¶f8Ýt†Ñ:ô¥–§+ï4÷Vö¿eŠ…VÈÊÔý*k¬’‡ØÁ2côrѪù×-™Ý‘^˜ìo€8Y«/ø¯b«$eïs¨æ¸fÙ"$ßušìgûŸ²4ã±Û2½&µm‡¹ÄÀ ’ öKÍG +Ÿ;1šù#v”ê\ _ˆ§ Xiv} ·Ð¨”Ü~'»…pH ˆ+fÓÁ†ã\¤^m¾kœï:¾ÿX×û«w^‡ud–î®ù+‚Pr=Hi9Ch›hîúÿ‹‚Æð9—æךñ[vÂËvcT}į >Á9%A +(,dª|_3ö¨|R2ã‰]²½iU§jësÂéûCˆ¥–áÉÝBíjCnÛþ$p+!· +N’böŽäU™ŽUìíÉû}¤m> £š +SÇkíÃÍ’ü´ö'¹_o^šP"„Eh‘cÕZÇŠmƒ‹7þ€r‹…‘œÐhì“`”ŒdS\d‰Å–ia滆@BcÀ#¹¥€“kƒNRüç»—“‹«¡Šß–\·ÿ¨UûWœc¹å8ÿq »‹É}!þ$IÇâ ëAøœÎ<ŠYw¥y.ç9·A³«ð‚d–â'Ë®Â,Û¹L½ö¾-Ø?Ep mƒQ<†R +NE©u7Nµ¥ÈiBÀ#D×–S#C€AJÍh6™5€y†PzA°éº+èÍG˜Zn@-Y?«÷3O.µ 6]/d›ÈUÛ¯Þxú=ÇáT¿ÐH’anª»nêeÛ‰üŒàÄ0.‰èyRÇb­I0£©ÅqAEÉÀŒ)—*†Slp±*À¢‡(E‘é Î4Ì€(ÌP†DÐHÁ&É–Ô€Ï%Ϭ3‘¡˜Þ‚ŠÍA¬–q¬Z“‘‚¹ý¬hû Ò g|Öï’Õ1=›ç{Žš5Ã3ú¢ÙM±fu5Li+µ ÇU¤FãçÃC!®÷Äj›Õ’Ùe¥å¡™Ìð¬êwÐ-Û#ö¶²4Ë™ÃîJ®a¨ áB¢œú€sÔ~±Ef2äbû]×{'¸%V–À ¡¶L½ÖLU¥…ðM¡‰$¹àXœ^q<Ô5x}û¿Þzž!¶‡™-3‹Æ' ‘å‘Ïù 5Vi$xºÞTúÒW͔ڮ#äUÀCGLPK†óIÙsf–Î + ¡“dÀ›¯ϯ 3Grá´íG¯÷>^œbÐ,o¦bmX¾ë*ß5'2@®·–&º?´ÆéÀÞ³.X®Z´Èy­é8¡!x”뮃#U–Â$è|!„[ž EÍcÇÙÒœ0ZˆñPËz8Cñ)ʬ=_ôI³ 
+diµöqÔjÙšñXDiy4Í.¶>EjÀ¨ºO +Ÿë‰ÂéˆSº²J§Kbåÿ!–½1FÉS“Î8Ô3ÜÌÌÍ&JÆ—,Ãò$L­8 ¸ËÞ2}"{Ð"½âÉf ¿Yn3šOh~ÈxñsJ ÅÎ$M®?Jc:[¯8E#÷­ tKm$)&GÁ¦(þÀJ5Œ‹÷|BUFcýzã¥óÇîØ.c™uV5ç‹×÷Ks,Gƒ<ë]Œè9èú.e)¶§‘Šñ× 6]»ö·0Ù|/ü‰*6;]÷;Vl1‚Vk*L0|’먻µã=™#¸ƒŽkØ/úƒŒ‘ü‚Ì—ý€æjì²]…äĨñB€R\"Q\ª‹2™8¹àLœ^vœ*Yî*ÔöGÁé½/k¾ëTÓ÷(¢2¼®$²»¡oÝn&™þ§¸¹zQSäî@´6ñc¥6ã¨åa%ÚeÀ +Ÿ/¨T ©3 6Hh Z°m/P1‰!X™µ€‹š,„+W+t¾ Œ,«.ä ©o¤â{’¨˜€;J tü¦&ÌpÙ7Ðqþ%ßQ„Xc¨zŸãL÷EŠe}”NÅšÍ}»fuOBgxë¦3½ ‡…¦½ºÝ`ú¶ÕHúªÝ-Ñ÷]/Žyu÷Yˆã8=^l3Js^ƒˆ2ª!ÂføˆBŠRô›©øÍÇj–KYb±•ð)‚WˆÙ¦xÁ“á…^‰b¡ø©GšXfBîXŠõ6ó³ÃDÇw“%—›!wI2KMûÓ­ïÿ ®5¿®·¤yŸå’å}Ójp·d5\‚Td«YÎg9UF„g«‚'ˆË%iæ;ðbÉU@”ç%j¶ T²e :Jf ~¢Ø\œâ»’Wþ?ó¬r@Æ5ÅO©ÝÛ¹$åv-Hú] ³~×¼šé9³u¼;CeRŠEH®NˆN‡Ì¾ífÃr1„Uk-„Hó>Cn@QdDkþƒKéÊOè#å>!J™ÈbƒØ1‚ð=‰˜ÁŠU á–7øˆØ;Ú¯¶K«´B%¹ ¡ô  $xŒäÕšŒ N;ÄþW½ì¿N5,ŽêmÓI»ï¼¤V‡)²3^o/~ã¼ +s¼zÝÿ­P{Ž¨Uë‘×w>ÊmÏ¥0¿ðxžÆ pǼ¥Aˆâûúž» +±÷B-Zn%v{A~½á@Éûc—œ°»JòÊ9B8BøqÉ&ŠEsÁ¤9ÎU,oC%÷×õœü¦#šßi¿q;éVNçDT†çõŽÝñ4çu(pÎ{2^x1Fr2Cî’cž¥Ée¤È´ bŠŸª9Š ÕÛ!z"O1"J/L”]¸PËv„Þ!Mª53Y1À#xJ‹Ù]ÿYèH­Éð±r bÁÅ^½4¿î%J¬´Tô݃8†+”•ÖÁ‡i¾ˆªL¦YÎïXÉyN':ž òOuýÏ™žÿ-†Vkb„àPÉ0~ZoÛ-Y!!uŠu‚Ô:#â¦ée³cuÀjûþÌÖõ@iý¥×§õ;hµ~7Í’Ý•4ÃlK,4ê8ä¢ÿ)ά63S®2U.f®„‘ «æ™U&‚¼R“°âEÀy"›1»ç@ÙsÈë{NèK¯‹2¢ß ½é¼ÝºÞH±ŒgI®õHjÞ±Z–'œÖí’ÑúÝr:·3Iv¹u€—_ŠWf>ײ~…/ ·‚g|/ÑÃå–B(e6‚ü‚3I~Ù¨æÿTG6b¢V)€IcR’[Àðb²´rr%Gˆr>…<©Ò>Ü`½(Œ,“\œa6—#˜Æ‹mGº®kjåyMCh<±s3Ms?ºžëŠÅ­(ÍrÓ­Ø\»(1±4ÁÖLèX©‰¸©:3Zßv^0Ñ÷쥵^ÒZ·à‡Ê®»‰žóÑmü.eÏuü„ÙYy¹¥Ù}˜ä»ŽgŠöK‘¼ô6Ù¥²'Ä£ê¸ *´ %T¯3¢]°XbEX1bkÀ±:Ã`#e¶Â“K É#€á2… „ÆbX”Î`S ØÑ–[¡ÒL–av©9OÓÄJ“™j¡4µÊ¸àùÜÑPý®ôî%¥í\Rj;Ô{ÿƒš÷F¤Q=Ñ0´¥Ýg†éGðp½™ø¹bŒ:ñS¢Ï@rµ¥0Áìfô q<bá§;u¡Œn€âUgØqBTµÖÔŠC‚Ý“[t_Y?³í3†No¿"rO–¬¶ò Ÿµï>½NÜT÷¿ŸÊæÞQuÚ‰Œ.‡úµæs³5•ÕÑýCEo¼0‹~ã©¢ã–[wÊ}ïWsÜ‘üßDT¯‹‰Ží3†_v fÛoµ’Ña³cuN­¼Žy×ÑÉv +HšW5Ê.¶&xð¹CÓ™&–˜$ŠaQÚçj~»zÝ{ª×Õ+æöõôÖ#¹l9%·­—vá{£ ±SïoÁï>W¨ÍŸYj¡©a³$¢¸_'êåÒÄ* fÕv©8ÎF*Æ»@Çr@U5ÿ#XeæÀ„)…¢§¨¼Ü¾û4P´þÃø—ÂgµVóL瓆êtd&2:®#´¼ + Ç¥ 1×*¦ø 8¥ë­Ù4;ž'¼Ï#e÷¡Úy”ëæµé„Ú-»-rº:Og¦õM×ñË‚“pâ}W ‰Š'ˆ„š „é•¢ÈT¶ÃÝz 1±5¸$›L’Pð¦š/B<ï]øºÖV]iYp|¯44öïB2³+ZÙ{›çZÿƒ¶ùHçû•Ö킾cyØn¼®©Uïcšd<ÏT}2bËåT¯Æ\“Ô1’We xŠÒb¶b,‚ViD/å•æ`YÈrê$ùG"(Õ¶¨¨.ÈÎÐ fô@0ù ½ *¹™Ü±Çêס‡iî²$Ç1µp:f6^‡b<ÿUø€íA„ä笂­£³[He5l˜ê=Š17ç(=»ólÏ…±Øb¥Ð4ÜÁ-Сc¤`m(zœÒˆa=C!r£°™2 ‚Ž4Íz¦Y/ó,ï?|’ê ”p™2БSPÃ-àb)‘a‡¤ùKT|~×rNGòÿ/Jæg)B¯4ÃnG-´çøÍgUÛˆæü,$²:kþ?fÏöj›¿Ò”*;A\r³@Ãq!z–È´LŬ0·0ÄdÃnˆÒ VœZ¤—L¨QerŒâf˜æ'Ê0¾fJÖÏPÅø*Ð0¹#–íÿQÍrJm¯äªùEDo}ŽéUû™]öÜW(n‡$$ŸK +"³kfçsÂjœ.hmÏ••Ònt¿m9¨•üf«Uö¢ÿm˜ç| ž¬³–àØÍdxŽƒY¢åj h¹"¡:µëžf½Î–`ñ ” ·ljîâ—…6RÃÍ0×{¥TÝ #xÔÀ š*Œ`}FöƒåBÃáz™ÅXÁÐŽ[õÞHhž—D¯û±ù=Øó‘‘[ŽÉHÎOª×õTÕ÷kúîäÆ÷M.|OEtF7ŽóW”Zd<Î󛑗̨j¾$±Ì4è(½=†\j.Î2>xMó£’ÞuÕ®|Ž&º–ƒnçsծ܎ˆeû…X¶~x}ó“‚ÌòÄQnÿÉŒ7nÓ÷*·Þ?#µåIÉç¶Aí¹ŸµÝ'YŠ¹‘ø±"#ñ3E&Ó$ߧÛ8Ý ðØÁŠËcx…¢F©Þ d›vÀŸ±¼²ëhÁô`ºb I&çÕY +àÓœÄ…Ъ¬…æ¶TDŸ{:¢Ï…—È좈êvE౺¦¢ú\R ×±oýK³lÇYÆý>Î9ÝO*¯#bÙú)ø,7ãg„vrÉV1¿düŽ”¬¿à’£@e9Õq¤JSAŒ2³™vµ­4•È˜ø*¼,¥<øÀj(F¨/Í+$T= +Fç‘ŒÜ+˜ÍïÅçv¬f¹¥VN·¼ŽÑ(RÇ*(VÉ*Fáu+Fô„h–CIžå„Tö½z%Óëê‚{ð2”ÆÀb¤þŠÙMå|Êò¬jÕ|«8 Ë'µn»"œ‹"U™‡rkmCŽS™¤¸†œ¦3d Ƨ`ÓôZ@ÄW´Ádzl¥ÔˆAaqÁnÙMúuç«\2½kWž÷Ô²çd c¼¶[Çsrá~'¢±¹+#2»!ðüHmó{¦o?̳œw‘†ßdœ\o ´:¢fË€ˆœo.ŒOV´d +6^· æÓÚ‘h‹ÿ‚Ùd&èD?YŠå<èL©i¸q:“a³s½s;?Ke,Îðiˆ~–t†V2»ƒ‚Ó}`W¼Ï±†åSü¨ÌRíþÏó¬†ôšv•™å&2=¬w?y^Õ[ž^kBÞ3#?[m ´ˆz)€ë…â期)ÒíœU2<,$2¼d¶|7ñkJ·0ÃЄºj6`’d½ÈÜC‡‰-ã'ˬBJ“Å[5¾çJ~CrÕwi7Îÿ±¢ï8Ô1\OU¿rÇî>¸X³:àx¿^œb·b˜ý…ØÕ2¤J{á³Do9b¥É8Ïx *[ÏÔÂû2ŒUf.Ð1Q«Î?µó9/× ¯XMËÉDËvœé:ŸtVgDöoñsTn!dR×DÓø›({mïo¢i½´ëö'¯ë¾ˆ"´†/ÐulŸ1ÄR³`ÓTÇ ã4ö±Ì‚빞çn¤ê¹!vM—ZÛQ°ò¼ºÓ ë(n¥ùP×x"!º_‰¨>·…TfG5t–ÞšáQ·óñß?•ÕYå%yÝn,¥hz*G5߅Ы-ÄèÕö¡ÜR3b•½@¥èZ¦RbÀ_!hÀg ,Ù±ˆ'3À¥1—¥ÕZ¦ǃ,p˜¡óÁG¨,âÌjC^¹ÍL¿ì?T÷ÜK¿ úÒèwQîÛï ’ÏiƒâtÓnüΩçµÉùŸ9"LóÔ÷ÒPü´Š½™.±] ]i4Ñ2¾dùÕÖâßAŽb{ ôœŸ©ûCEï;ž¨YAŒ)Öƒh“È2kã4÷_½ô#pÊRÜp×9Y´+G­;ÝñŠ „ÏŒ:bÙöã•ÞNëê–n§ýÒóº`:^R]ÏÉuûÚ¹¿Xûq ê=‹mÇiÎëˆS2=¨vž÷1¬JS †³*Lи„iåÖ¢ˆe¡ã¤–! 
áK";!D*Óà;Š§N™ÍH~Ùc½ì-ÈòÞÆÙþS·tºžèXOõ²ûCBrº£¡¸ÿ+ôî;!½ûÚ!µ%ªÕC˜•vŒ–á-©kD¯eT¹léZNÉëvAÏVÜ"pÀ œÀén°Az“˜¹Z ñ÷ÊéÙ (:‡ Z˜J.€ æ X § 8¹²1ð,¡y ¹àZ7sûî{ƒâÿ˜g–Z!‘z¸5ÇÕTÃÚ˜]5ž Ž÷Ø4¾x]Ë!±ï=u[¿{`¢Œ"³õVÕz3Mô`£Ä`Ë\¸è7Qd4OtÞû¥×u»ñ¿m8dyEÖ3=ßUŽbq%H1»GQ,îGú¦›jëxXEi4¾[¸_ÄNÒY¤(nåu+Ë2Ðך ž*²‘¤ÖÚJS쮂Xu†òìj;Ib­Iãìg¨ì¡´BûHnÙ_œcþK³¼Ï¡ŽóÃ.9£Œ¢ÿ@µÔJ–]o-ÎòÛ¶;ÿ3¯k9hÖ­ßi¾ù4Ë·ßGï­r>–‘™k¶«o¿Œþ?áßgÑü!Š“Ë *TÖëH±Ô¤0™TøŠÆ6Öðà Q‚h—?¥$E¨L†TTŠRp˜V‚]sLõ«mØ-ïs¨âüpl÷øï6Î5?ºÓÑí¨èuß!øœ Ÿ#¢ã}½e6„S³/IùLsÝ¿‰’ãX¢]g,Ô¬1°Ñù~Ü’á>–Vs 5çyÊëv¢ïÔ»þ?½è~²‹M‚ ÕjƒÍÔ©eš%™†ÝKüPÁ5(™ž 8‘“T€Šž% +h®ç—é\NÕ O!Fj%²ÌzçHßw¨œÿ²|ë•×zøK¦Ç4TFL©L‰hnG$D¯ +’ÓyÁð¹'¶¬N(%Ë Fëy;ÑtÿèÅ6fè|r¬"C‚Å endstream endobj 1079 0 obj <>stream +üÆ*¶ 3KcŒtÓfšÌ*Ø,•¥R¡ÉLÃì'N¯8”f.XÓ©ó¹lw̼¶ÿ+L-5é2H,µëY/†¤vg…߇‡Ê#Æ«:ÓW톌­¶ +*àv†›¤2G¯;Û®;ªß}%™å‘‰F±°éjm·ô9&¯ŠÐ3?/¸5Mc j¶Ø<àd­yµØ¼_yô*¿Ã uÆbfÊLHkíˆ5ÿ×´Ë’)²`ÖÚñÚž‚ŠÁQ£G#˜u†"Ô²Kø F=Gˆ‚옃ÙÕY‰2,ÔžÓ‘ŽåT ­Ð€°æ»PPÏh…Û‘Íûqàq2{!çƒUøh5óY˜`oÒîzÏ3eÿ]œé½Ž”í§në4†¼h8·ñ»áUýoñ‹R“1L’O½hN4lß`ÅzF`ÄŒ7¿$ .Dí@`öê9åÆíP†Tg ˆNh<³ÎDø8MÐ2ô~@%{á#…F’ËÍï„Vv¿&jÆ3³ñ·[¦ç!ãWü¾âZüºÞPø¶ÞHŽbw£ï °—ŒÆ‰™¥vG0«ÞÔÂë”Ù8š'[†I-LôkƒÌ¶a&h c8eR¼J‹ÐBý‚ â¼Ñã]S@ƒE; Éž+ø|Ï2Õ®´˜h.ÅÐ +-Å vWzÉ¡ J­$»â\šf‰R¬ŸyŠù/Ì0F M§zÎÑóšŠê4J–[pb”ä.zÂì$DôÿFŠþ#½æüQ{Æ¿4ÁÞ^·Ö‚Às=`•[%Õ1&Y1¢7Òp3œ\$|˜êÉnXÃIrª¾¬ +ô*Â~?pá®'ÐÁ?Œá{±¿S:‚ßy=½í>|‚Ò#l”ÊÔ®|Ή¥ãÅÛt+Ey]HðÍ ¾ù/ɸÿÊÖ3¹í>vü†c¯íLÓø£8ßâÖ«ø}á[šæþË3l/CYõFÒô²±ê:A¬;ˆž+8føþÓ–ùQïûîÙ…ßyÁõ» :W0&×=×ÔÊé~ªí½;¯ûvÓluÙly×ô¼ŽÒ耠ð} !Pþ—)¶ùfžÊ*Ì@¥U˜‰jóŽý^.Z ¸]ØRÿýJ3A\bO¿hüUZ ôm7gI®b,ËÙ4ás-~Zk:Ͷ?zÓ‹Ñ„. C V«-Ëú–ÕðM×w'p Ô$(™Š-@B¬"à ˜“âÔÖ¼7rÑ|«˜½úmó³Ûù] +Ÿ®·=]q`®ØRìpÙ¥Y·]˜’™ åÚ‰ò+íë­Óm³nÈ$Ûs1K±ü‹\ ¬¶›hÚ/¤5³ DI³ë0Ý.Ò°?CsÀ*8>­¸$XÞE¹eGY†åx¤ê=’Y–íW Ÿ¦t1«ÖK¹q;%µ®ç¼j Qs$opfÿ¬æ¾÷kþƒè€Ó +ÀZ±#´~QV¹µ0¿ø,Ð0~ vß±‚ÝAŒUifô¥àÁ~y$§Ò4ðÑMôH¥y­Ühšä>ÏT=÷ä²ý:ÒvÊÛ5 ™ÕXfÛt(~Zm7Pó>™}ÛùPÛu¾ªš_T–gåÆïFÐ0Áe’æ8íwž—Û£Ðɺ7°b)Æa…j@ +!S™¥9Ðʾûà!Z/°B&ZÂ÷ý‚™jÙ)PI^5#çÛ@É/iƒ|î8^­}$½à‚Ù³ŸÄYåÆággz½ÕLÇy>)5 6Ngp¢Ì:ôDՃ׶ÿ‰¨ì†”Q½FMÕ Žå)‡ód׉»ä\ÐTE‘©ø¡*±óT7IŠÙ‹„ây+~Qg\Àê 5Hðc8Hj-ÄO“Ù +ôKî'ûµ‘óMw°á~ÉHÅÜH‰’!H±’1ÜÁ[œb}Ó«¦c‚Åzå7 rtšÏ ÃhLmû²‹KæÆû›ZÏh,«f4ŒU:žrÛ¦+rÏ{•)—Zqk†ãzåv$G1} ª4 +(^sœ¨4 0Fõ 0Bð“eWœ¯ú…×ÀÔñ“:S~Ùw²ay$WM·ä¶ï¼Aq»¤ÖkMhÈÝ÷]Ñ÷£Ö­jÉîvœï~ ³l¯q¦ñC«{ï3u㓾eÿ­¢´;aÔ½¿Qžù3KvKÒ}ç­ßNçxÒmœÎ˜eï[¢bm¾ëùîä¾ûÈk|N„ÍY#Ü´Èp,7¤Îï¼]:Þ‘Ð4K¿á¡*û²õzú¾Y ÃIÁ’`qÜB†ò‹ÁŠ–=&(¾8µĪÿ4UñÝGK†Ë +¹å†Ôö^ÅÏÜ‹_˜ˆp¬'Vé5¶_¹ÞÑÐX §.ÇêeæÅ4÷+«p: \s'€N¬½OY­¯oÿ—K¿ûàd@†9+F)ºŽ§Á†iÍjú*;M‘ÙŠŸ>ÓM½BŒ# í· ÿWµcy*|`qZŠê P†æ n²ØŠ†âuÌjÖÛ/Ò.À¨É©‚±½ «ÜLè,•Y°ù¶'äXÍ `šá3U2 $T ¶w‘ÃU_€¡‚%±h¿3ÛÎCYrÅÉ@fµÉHn¹5µr;,£3Þî›Î· bçHÝì²çŽØ4]HÐ+Î~S1¢÷%l®Ô$¼4«q»˜%ÔXª‡œ¦ú ž®; +£¸ $zMTÜ×~å7H„Oh ™HšMfhVœ„©86A-`²À8½Z¥Ð|Ùq^IIMç£õ’ûðCš³ N™©NíäÕHk-GIæ'©r¼!ôý_NÉðd˜î|̲-7¤Òí~¤î¹–dºáãé5¡n4¥v3-óÜuŸ‡fCA£ÍÚ°"ú‘¢å^à4É)°€Ñ"|¦Ì<€ã}‡MUˆO îäˆÖ×8Årh²b ¿ªy³Z–§³„ëqpI£àC¾ðR÷ÐCÕæÄjû±¢û+hŒÐb‚Æ,L1žgº¦£q¢éJè8™Y`éž3ÐÁ_³Øp¬bw 2Ø1 F'?RkdÎÿ]Õý 5ýGR×u(@UmjžÎj’n;$üîBJ݃ˆúDËÛäfÇ‘®ÿ7N³(zÖ§,Áð\²bÄéY¯Ä¶ï>äHµ¸Ér#ªý>è ,wL/ëë¯æ¹@Oe*„UcÎjZ5Êz4YwZ8»œdÇ"v¤Îdžå}ö;¿K^ßv+‚]o'J1>˜MûIøŠæZ¦b`¦_#€ˆÞ%J*³£uË ¨*†KñÃR‹A’ûG(<¯z=»àŠ5»‘´$w1cÝ 6ÅOœ^qÐí{ÎG-ï?–Zm§´J¨ã7Lõ"˜^eižKa’ýH0¼Œ#Ü ašË3ÌŽ3=÷=ð@q™:ËàòD6CÎËu/€¼ò'IqþkÍgUïuªç¾ +ô âWÕ&#·Áðq¡‰ ™šs¨±*®çfˆñDÚ³RAi4@¼ØÁ60¢ä×dU_1¾ëdð€Ý?°(É!”¥C€ÄìPëÚ °öí‚¡\„«IëÌK¥ã5¡s¼–âX®CI낈´ë„OÕÙŒ#X[JS÷#“cðýúÀ3ÄFa~Á©cz¿¦\†˜®K)®íV˜áü*˜ÇÏéí¢lÏ Ós=_x7Vj@`r&I2Ítü?I†ó¬ø=a.É8n¡©Z¥eÈY{Ðr4ßà…¶‚‡ËN¬¶é~Q±¼%ZªN¼pÑ^Ù4Ñ{9Oe`‚âfŠÆ&´m˜aBc1®ñ>”X¹è0ÂÂG¨ŒÂWÄÞ@#”¦b ‹àESË‚ ¼YîÛ4ÛwÀªz®…MwÌÀ$ju]÷KŠbþ‰*6}£©;ŒPg@­¹ßZÓÕDÁì)‚XjÚo\Ï×Û‘žù&|UhR®a >–U(tšêE,;Ï$×Ëy²éX½àN”æ»Ê2 ®ÂM7mAG{®`^Ï’ån mº*»NÅé¥`£-Ÿ<¯Èœàt~éE×1 ¹š|HÁ&ÈtÕ€Hd0P¨3(¸ìÿjűêŠë™¶ëL˜bz$~DÕ#yŠßÕZ eÖÚL–+íÞŠ[‘|"£¡‚¹!·l<A'3>Fl¾¦¸G±ËM’ .ZÞË8Íþ 6Ht7Kuå×¾FêÕI^ÝUüªÚDð Õ)Ðx×pˆØ>,:_rü²رJKÄÕ¦Aæ(’üÂ{ ½äR–å~›¾K|Z´TFÌžé>ŒXz#Þ÷’"·‡+5>Ogz’ÆH˜\pÆmšNä¹ÕVSço¢i;¥™.Åoë.ƒ&·Ò,÷Oœar?k›nå)–÷PjÁu ³ÞZ’ª´ßWœ¯YÏkJËn‘q»fþÿiºù&`”Ú2Bæ œƒ‘%¸…©ú 
rÝ':ÍjL«jX­k,©h5f’j~ §9‘ûµVÅÂçD‚Zg%|¢ê-dŽÞJˆâ%ÄpÞhußqµs(F©6 b”[|¸gšh9o¢Ë{`Q*Û’T6ÂF ?‚ë›Ú7¶;¿{aš÷zšê@*»ÿ3mϱzÙ/Ü4¡Q˜)›Ð‚TöÇÊî¢ãUŸæ|Kï¦øŽ‚4óCÔ<‘5(©~EpéŠG–_r+€RlA+µ¢Ö¤÷̼ªë„V·ÞÍ2]gòƒÕf"Ü‚sá£*»°BÅ:TŠG(29PrŒÁE †Åµ$Í|¥Ø~FIæ¿$Ãð*~Qg'„Rf?ÕtßBÌÐÜÆ® @ºì7aþ5bÿ:á¾/ê¦Q¢|F@C  %áÄ°ÝÓs|Qt<ÃBÀL±y‘û3M6^ÎÓMÃA‡’ $N|6ðQü@ÍGøD¡mØRƒsDVa†élyî3 Ño°ÀÙš%¹Jµ™X÷ÊïK/⥠+îˆP}ÂKš$Ft(Bð–å˜å™%ÁEZe¢äJ›Q¤J±3ä‘ÃTÆAÇI@©Vø†T’Vd«˜œ "ýƒÏY‡š¨3:Rj+ˆTnÀ,—ÞwÛÏ0vÝG’âû6hm'ƒEV"¨äV©^¥…$¹î'€Rj%€è)„ೈóˆc)E×¹ŠÙG˜Zp|”ä |LqD-¶Å®»Ç’JM§:ο4Ãð^²dœë˜R”zkprM#ðÂéõ@3‹AI4ìhXƒnÚE +öfr´B“#TvaŠ¹³j¼gx¿hõFIК‚• µ cU›–Ì_"¢ç­¢‚Ë€ÓDö ³ÓLÉûÆ­7;Hó@OeÇ/·”(pkÖ“,·ì&†Vjz ÌV³ÜR˜c~M§Ü¶ëx¨iJsŒª²çdšë¿‹³Ìojãx@Pö‡š©´-Xl;c¹>d9g]ÏšM»Ç¹þφŠ¥m¿‘WÍÆNSN7C8–Kñ¾Ûþ¨Üè:¶3¿å» +1ü&Áe &ìR+F×.PÖkx!¢opÇ+¸)àK`…O +Šu죞÷ I¯¶$õÌ‚¥ÜÆ ¯ü +ª4œ¥8 ¯ú‚—-ÂÜbSfÙù‘œo˜3~BÁ/;  ,< ,|ˆ$4 >’[V¬aB­µ_ÕMWa² !Ò !ô +ßÓØ +!ÖÛ "TZ‡ïè=!¥YÕ€eÊåò‹bË|À¨28NdP¼i4Gò’cy_ólב ³Ü&€á3Z@F¸$ ö–«íJÑÜÝ5”¢º–":$2‘¤–}GŠžÛ°£D†¡†H£,÷UŽk;6Leb¾ì;Gu 8Bñ?Hñ)X\ 3L‚“©XŒÚÍSì·y¦ñ:|¶î2|]ø@±?CJS K1‰MQ:ƒÌÒ»¥¾Ë vÉ͆õCˆb? ÓÜr¢é0ˆ@·"Xaƒ•YZ™ÅømÙ) ù²;ˆ ÕS¤@­­Øùâ{°±ºwÐABYVÝ)(azAø4b§ª¬AŒ¥Ö*|NNºn 4Qm!l¦Ú>Ð(Ío¢ãþ ž¡7/hº#€Npâxß n S™$º áÕˆñ+¯#]×EÅô b‡$( zW®Põ2^·!N0>–Z¸6!ˆT‚ɲär«Cõšð‚Å +Qv©±ð1‘Iø8^1øø!¢cýÒ€“$w0#Ô¶±†í[ø’Ìð€‚ea‚‹ Áò'€SqlŠÌ"Ì ••øU­ÅV¥…È1j·Ø8ì(‘Q@ƒTwàbDŸ@†a)þâHÕ¦ƒ Û3¹í:-W3†,˜ITkM…ÐéìAŒW­ÀŒå Ï, 4LqÈ-5Ÿ¬X®0H-!'[µ ·±àa"3ä&…&R÷4Íý噟eÓ‰è¡b«‘šýÂ+›Ž…ÎÜÉ(æüîç•Ë{¢Ùï?ŠáûN”=—ed¦÷¼Âÿ.~^j.€a¸ Yÿ‚„×q·h ÅñŠäx#BqžC™Î²ýŸVß áåɬCÌFMS™I\Šœ&µ6Mh%z Ð~¨ê¿‰rŒWRÓ,ÉùÂ3БEC@ÔŽ`FUü@ hB‹v # ³÷HÑr!JšP Xè\i’é9*Ùµƒ—¬z”,zÉ•¼‚ç©^ó4÷}Ò7žh¬BˆC3P°6¾¨²(ß6$O±`üDpbÆë`ªÕâï$»à&ø(6‘؉ªŸÅò0†WnE-89Qh*v¦Ø¨`ÍnÀé¥VM·Â…F!¦(΢†jà –Ý‚ Ó™ˆž&´ŽŸ§49Ogv¢Òdšâ½Ot¬Q“ôÎ0sqcçÀCDg se§.‘É@­Ö@’Sh~’Üí/¤†ñ$t˜Æ8ØD¥y¨±‚û€ãÅA¼Â8«Ú^·Ü`šä|÷K§ñÛwüªÜ<Ì Õ3Ä ÑYø€åE†_ze©‹açû–a”B뙪ëp m¿pŸ“ï\@+ +DŒXl˜ÐNìL©Á†÷t,½$0ѦQülåMôhÝGàL±è©ZË‘šéN„Qk` ^ Ë«´ï{œ«¶ŸX~ùuŸ‘†íS”^z“â—ÜŠž©5!U.4[°F°;¿§L¢\hˆÚ`’êT´h9Ió╆™¬X†î + Yƒë;F¸ø‚`$Êua&©^ÂGµ¦ƒ ë·Š3¢¤z´š‡$±ì\ºóH®k<‚àRšÈ:†^÷¨xÒ ËE 1š‹è¡bcñëÂwÃ{"ù/#‰eabű0Ç{ 1à6ž"õˆ³ëmCOÑØ2Ût™.:ƒ 8Mâ'eÖ‘ÄR+yj¹uйŠAÐâL +9b™=ТXU"”3ˆÜf¬ØL’gÿL³]gã\Ï©ð‰ÿ=TöÝÏʶóØiš?`Bg‹œÒíB¬©á³ˆ6«m¦YÞëvk%¶)Bªy-v¬%;Kè¨U #Áô ¸ßbçêÍO–=ŒVˆo¼ÿjÏnL±ò»,X1 §¶Ôzf·¬ŽåYx·5ØÑ;øœÐP«Ö~P2=*•-)`œÆ ¬|×3„b¼ 2ý?’÷3‚a= + )6;Sh n¨ÐlŽå}‹Ëí€4^)I)cØJñÌïjÁ½ bÅ¥øÙ²Wxa"“ñ³¥VÓ\ÿ#”P¿$¬$ÉCxÄö0t¶î)tªÞ\ü¸à|¢p;¨Û®ƒŒ\åXæ·Ü¤`ŤdË ¤€zE@Z{á ó{¦i»lÏÄO{BM6ìò Û)@r/by -:BÉRYš­¸—cÛîy5³ÀÃEÆߌ_ÚŒ3MWÂW·ÀGqK&O> 8! 
‰zÍ Šá}R¸] +˜ ób¢(L±=PÜ—BõÆÀ“+‚ 0YŒgfãw[/ÝÆ D'"Ȩ8Mu;^p!Â3KRãçdÖAz¿P"ͽI_ø¥$6 &29Qh-x¶âJÄ4E@ùºIølÝYšd¿GÑK®ÁŠ{ñ î…ëÁEèÀ h!X2Œ¡Öý†ZÆK‰ríw°a}§˜žå–¿~áy:×0¹ 8Am D¬d B¼j,HtÀð^ÇJ¶û3¢Dhá¦zµ")UÆ'ÊÍͱƒ—îš…ðjmåÎÓPÅöB¨4DŠQ`¦XI«5?Pl%|Yq1†bz@-¸@+6@Sh;Ôò¾BÖÜ`¥{6±ƒåöáûGä|åSü¾äD’bzhXbXÅVCë·é¼ä¸bçŠm‚”"ù(Hs8Vkǯ¼²ËNêìÁŒ·ý±´r›Á¼jëPV­}ZªKЯ+è ¹3ôÍWŸÆZœWp „@ÅJŠÜ&z¸ì-~á¼Ér¬q¢÷/Lµƒ Ú +óì—¢û?SwS;Ïc^ßujšÊLÜT±Í,Ýx;Î: åúϳlûy˜s;Ÿ¨»ŽH5Ë9É禂ØŨþ¦êmÆÎþ…Ûˆ^ÍÐÕÊï‚Ø~å½7Qz¹!£ð~òi†×aÆJm‚ŠÒ|P¬Ÿiªíbše?X«VsJFãÅ•Ù‡%9p +Ç3±3ÅI– A‡R  •íâgëþ•×€^ÍÀb&ȼ€„.^ðhg Åî ˜(jqh9ªËrÝKŽã~ÓZfV{fAЩž»ð¢Çûbg©]ƒ„㙸éÂ[hZó ƒå¦r\ÏÙ8åx-ʶòJ·1çIâiý‹ÒófÍÿ=\z;[÷ )Gô?¬5éºo¼Âëdžbv-W/¿«²é»Žç §g "‡jÍ‹à–%‰ÕÆS=ÿ;„Xn°µUäT­p)Š7™Š%HùºEðdÝ5ØX¹a°yBIV­ ¹_÷ :Kcj¦ÔJÀ`µ…° Ç‘Õû¤µŽGÕ’€d(åÖ²”BÙ–õ,˵Ÿ‚Kƒ^wÔpÔ/™¥˜¾ÃÃ(·àBŽ\q0ΰ{ sÌ?Á3ÕÖ cU*Ûs+U1JœZ"t¤È‚Ò9™¢w$ѯ?Pf(ɳ +ŸUÜ/\‚íZ‚#y„˜#ùHrë­úmã5(AVy¨“$Ç÷$¹?BÇ +nÃÌQ½Â 6ý€Ið*?(E'3é>EÏÒ؆&³D­¸Å®8’áÖ›GñÊ-d¹%"È4V­"‹´r{STFÁKR ®8‘áz®†)§¡óŒÛ؉ší&x¡^µªRëLÙü˜§¹î,L®%àö«6@­8虨ª¾+ñ›:ûH~¹™×w3€Ø4bj-É/¹˜fúÏÂ,û=ŒWpÈ-ÚrJ-‚NÐZÄïi †‘+®Vœ b×ÄÙgcã›F©•,¹ì3Õ¯< b•Ú…œ$¸¦8 ±<ÈŽÖÍ3=wUÏí@Ót>Îôÿ)f7I†ÝžØùuKVbãèÉb{áû‚£AÂõ‚¶k'é•™ÈþHAfx?N9^²Z¦§íŽÙqµoÿÇO¬ÿPãDæ!är+VÙý,·ŒÆ‹¦tŽÝ©ÒjÓ¶ÿ7Ï÷ÜŠÒ¼_á»Zëð‰ÝI„쾌òMg¼¶ç®[·‡œ(³­7%û®ãÇKŸB§Ê ć랢&JM9ö‹ ³Þd”dþŠ2M÷¢‡ŠíGª¦^ÝwÄp>…¯ Ïâ—eA~ÉåHË|!0=ïÅ(ÆÈ©BÛY¶ë¤Ö3Eã6ržë;'»†OØ}ÅX\Œ_nY%ËÛ E–ªŒÎ -7¨ºÿ«¶ý$Ç°8D-6¤–³Ë–jÏy‘$ÞÅÍ–ýÃ'æ‹Çü–&ÙOíÆõ‚Õ·Ý‹àÚ„oØ°ÿ×<Ót%|¦Ö0È(ÕSø`ŵø Û¯$Ñu=U1þuË.â'êÌÄÏ•šM³]—Â<ïo¢iýúö«uÏlÜƲ[Î+»i½‹S|·q–ý9Nu˲}ׇþKøx*“0bAód曪â6T€Ukt£&°h»>ˆ[p,†YxB-;Š’Ì!†á?‚Zm/Ç-·›©Ø^ÃMÒƒ!8„/›ƒ +R„ÍÕH± O#ãaËtLŒT3|Mi —Z– 6!9 N0f(¹Š)p¢‡$»ô H¬¸Æ'¾(ú)3 6Ks /Fn 6JôB«5ä—½˜]û}°dþ‹á×iîÛ@Õ–f™ÿâ',·AæÊM ÕZ /¸¡µ]÷«²ëR­Ø@ðD‘uøP¡}µÞV”è:!ù?" ŽÄNœ‡1|fËü%——¡9$Þ·“!÷ Yÿ3}ëAÁkH¯Ø_Eщì„pi¾åzsaÔ‚#„Õâ‡J­„p +fJöÇPÃø@[ñ +«øâÀ(:ÈL®#Þmá*@9šû)|´î(~°âLø®ÚT±Ôz¬å·¥!3?QPßG5Û“Ö5hÊî §r»œ¦šß̾÷Ø.}/ô5«Ó*Z»±$Tf·Üªï!F­²¾09 °<:ÏÛnéx¿eºR#U‡‘CUæÂë·Ý2½ŸiœN†©ÞÃËøãüÎæXÇ£IºÿB©úßrÌZKIŠíVk™Þ7ÛvÁ+64Pj*n¬à@À8¥­(Ïu>Ô·ÞJ¿Á²Dÿu¨hþMt,‚ìzk!Äj;1ŠñMŒ`|%˜>©eÿ­]µ?Eùõ£èµFä5ËÑxqšù$DZ>†¹¦[žû1FöÜÎÓ=×¼ÒñT·ØBŒbv”d9ïÂLóMŽb»IrŒ!ŠÙwžè>RËæÅÿÓ s¾¦*¶wµi £õ +LŽæ?I2,Gó,ëÕ:^pêžkY†Û€Wö¾åI–KY‚كѴŸæ™¦“1$÷gÅ|2Hs8IuÂ.8ŸhÚbì‚ I‚­A»p=T;ï£Ó{9`v“¥×Z’‹–ë¥ùYðšNxeÏÝDÙ_sž×vñ™ý%C4ÝǯKîä7•ö¼ZóðÂK!ZÅy¨i¿ Slÿò,ów¤ì¹¤ĺéºà¹žTLÿ€ óÊ ø|"hµv²$ûQ–å¹£ž…°Š-¥ù…WyR¥³ç= Ë •®9åÙÕæ"H•öç B S1Ie-†[wf2´2;a~µeØQ‚»R‘ÕDËzH-7B¨2A,6ãX®ÇŠîÿLÙtD+ÜÅùÕ6#ÙÅ&#èg"‹˜ý…¹þëHÓý™çY_‚{û©ŠíÒ/û̪û(Êòޘܗª¶6]wg\ëW bF›FOÑ„¢ô‹®8™æšîæ©®ƒq†ña›Î\ _qLnyŸüŠù)ÄdÇ.†WfJ-¶’gVÛ TlÒìÂÏTÃ|1S° &Ë$™êÛ +ôKíŒô–Ç}ßé8ÓtJêÛjß5±ï?pK~#vÏmÉoZmi©ÝV¥«™~‘0¿Ê€ªè¸­w®§":ûŸ—Ðþ\Hdÿ"VmϱŠ×¼Ýøþ»=³Q¤ŽÑðEåx*v¶Ö4¼\¡uÜh±áÇrH^3½,!ûÏ«™Ý2;ßW ±á¥cu)z²Ö4Ì(Ñ3À(•ÉòzS +³S/©åe·duÃ(|©[vCˆ{¢´nA‡)4 3Oô8Te'€Zi¿kZ® ¦f¡F¨ì›Šµ)µï:š§Ù_â +M xÝa‰L„TZ Rì^5”†GƒÝ¢ÓXÅƨÚ2þÓzvƒ†‰îËz¹±ðe©i€Qz§Óú+Mω¥×‰$Åp3Œä¸Ÿ©û?¤Îïvœj?ÈðJ-…ɵ–Sƒkfé~p–ŒŽ…}ŠŒx=ç“Ø·ßÇIÇK‚ÅY‚0½Ø€°éü\¿³Bªçµì¿”íßyªù1DZ›O:¾½d<°VsÁT*û†ºÖ\§r½šd6Ë‘šÙ|Íóìÿ¿Aï¿“‹æÇâjc1²ÿMmüî +©že4v§ rÏÉ(Ç|0Ul2|ÀÚ†Ô4¾(É gu¾»\»ÌbÀWð¢ÒV|‰$6F"ŪÇHJÍW¥æ#L®´èK«lgûÕ&ÜžíL­ûŸUïmžèýÏôMd]÷Vö\íWif¥É`·Ìb,­ÎX¯ÐD–[j#G¯6‘d˜½&zÖ÷PÑyç— L­†ú…£õj•ÁXR‘,³În ç¼2;Ç[bézÏíÈ5Û[¢akÁëûïSeûy¦nÊò,rÃÝ@×ø)·î÷b’÷Øq?®Yí§Ê¾©ï9™çyó<óW’ê9c5nc%…ÖAÆÚåâüJk’åF’bö¾­6>¬6`OÝÊñª”âuINcý‰!6 G›õN…­ñt¿Ü@ø0Ñ9Ü(Áe$³Òx°`oN¢®¶çQÜBÍv¬h+-§)>Á¥(ã·µ&ô¥û‹„Èì-T²[‡è«¼ÄÚúš†aeiÇ]U௰—I+l\;o£ÄpêÒx>£½Bã|¯ÐH.zIˆ¬ßnJ£ƒ3±Ým¹es/Ïô½GÚÖg¥ÙÕóB¯9ÍøM›M•Ðp› /õ )V:¦ëe~ÂB«ad•²†H_j(’y̧Švû†­‰,·ê>Ù²• ‡õÊõbIizL«û"ˆ•æ1ôj«qžá~®é¸«X­ºeÛ§Þ¸=k´F“’JŸ-‹Êh8V3[NõÜ̶ïÍo%bß‹ŽÜy¤¢9ßËd6#’*¿•–Úf1R04žh{œÖë‚ÑúÎS^·å¦áI ­Ù%õغ];T_‰¹K¶VdÔ¦0UJ¼‚åZƒÈcPKs6+7,þåNJ⇊äó ÇKü\µ#¤$±+˜€Ó2~¨ÒšÜ·šÞ÷LÕ{›­vê‹^ÝÂ*û¬¨²Va¸:©w~8ÁS_kòMuT\O!í”–˜f),gV¹…Êܧ =BG*Þb<ç½]:—ïÈè]ÇS sû¡ïä«vþk<öâ$Uu2ŔԢu4ÃÉ«Ý€TŠÈªJ[Œµ×XR©‹­nQJ–‘TÍÔ“Ž‘UYf«¥®év½—Ebc_Sk6±©vÛo+F 
Q~Í8†½yÏû™f›N+Äö“­Òù¯’x­DÚ%ŸÙ‚Á‹P]g£*2ˆeR†"‡b—NÐW«D“•Ëf´\n瑸þó;K¡n™½LÃà:ì4Õ=àH‘Ù4×}pP½î+$÷W‡Øwj‘úízÄvš*›%ºÖH&/õ Q–¹„hikÊiŠÈ•……V©*…Q²Mâ› -6Ž6¬lÉeç½ßµß§ºÅæ2ÕB#j×ú$£ø()ÍG"uÁ›²Êv¢¦ØA.(1ìI‹'ŒuNªj¹BºŠQt¤åÄ…6m­{–°Ô@F]ïæRxŽòªÛP³â6R²Û×q %ºJT+>m"£R[LBؤ +ðÌ- …¤™å«Ñ„]T}i+yr*$š/(¥Ÿ¬+ªfUדÕ8ÍõwS7™ù±ŒÆüBèb<ǯo9¨'¹šø¬&ëx]-p5‰„™º!c£éHÙh +55–()sfÆZ¡‡«é‘$|”Ù¢Zá$r²ðG‘X£¼„G`k-RÚìKÛ +¹ï¿+ÙE’i­›ÝžCg6æYÍF.cÑ\òòE}qYÇaSTu”MØÉÉ×UEÃ’ú WI­@EM¡(=]©05iÝ@Uue¹ÄÂF_ba,˜Yû5'ú¦Ý!yÝjhµjyÒ.Ýÿ\*×Qa3ÏWÍVÔNÙÉ{Bêêä ”¾Èš]ØKY•®&"’«$£/©$ qUS ىɇŽjú}UQ¹È:eõ ÅEf6Ÿ-ØZNuüöˆEÖbÝ"Û¡r"Çøº}°B&-íØ8Ð<‘A€ÒƒÇ̆ +¸h €·†4uhBUv´»Pµ&nt ë(¥©©I…h*‹ª”»zåzßÔü¦=¿!³ì dQÁà$Ôi±¾gÈŠ¼¸HéîëBÕäôÄò +÷½[ ­âtÌ,ŠSpç—§’R·]‰yɯg'óŠ×ö["Šø“Ah†ÔdFßš“ WH [Ϥˆ.*Ôc,´e1' á ›”|ÂRSЯ-­¤°/q“˜:jD^KVÑú¤+¶ˆ”QR‹¤'Ñ‚1¥£` >8P&p€á‰ Xõ¼Pp^òh€C`¨q“!ð"+Ð[jÛÖjB +ûúÊ.ëACo|´ ÷â·eæBý*»j‘ÙZøÔ̘{ ±Õ¤ ¿ê`p?m&˜Òi°d3’4D@"(0p…áÁ‹Ç…) ˜ãmRWÚ±Uäí £zº¹z’B!RªÙv½W¤^u™©XšÌÌì«„VS:}¹‘Q_h)®¯­¢ 1w9ÃÌ%Â(o“º8.¸7*ŽxZ>Tq>\M•(²ºWVW+ 0/V'îÔÅÖ³–ˆÐ][Ô¥ñ›Ê¨l¿Ïå‚]´š‘)üvª«¨NÎDÛ]Õ¢‡#Ÿ”J7$ ɘi±²„„'” _Ρªâ' ša&Ïš9c_¢Ö )q]x}¹Dñ”•qêJìªþÁšÛx¶á4œ¥©ô‹­jI64½¨°ŒÑ‚ãÅ>.pQ€àÀÙ0£F„lV$ w”‹=&вÀ¹(çU—ÇEä kiáx¿Ü*ŒWë‘&×üˆÉÌftÖƒAw­ÆÃÔ9#D%'8|#(à–˜Àá-YAôc ˆ +ü¤èÀalA‚(Œ +tKX@p#rak,Qqƒj©¤¡˜ÓŽ”RTéK rÜŠ¯ËkØ"ôòË)“3ÉX_³ÄžD +XÊ› +NAšÞš¾.$LMf |A¢+ÉŠZ@0!ÒTä‰òõ¹‚œ…ÖÌ™ºD³‹’”Š×JL#c +‘×|YóÛ + E›5ñUÒ"!¥n¸úM°=ùX`úq"¤—¬tZ¢yáx1á¡€á  ±ÀÀpFöPør"Cpi¡'룭Q>¥TÌÊâŠén©[¬Zp–ë]ITV£Zy©‡aKV2žÒ‚n^ +ТtpüNlàP!ÁÄ|xp€Ô¤EÕ‚`H$t%{RÌ·J”´"k˜I¯RJ†YIEYuM‹ÌiÐ(°¶†§ŒÎN’ +µŒ0c‚8N(`@@u†Š¤&8’*B°ô’ÂàLÌ©@gú6=…)àY§MYXŽt¤’Eõ£„µ¶ÁŠ­¥4ÅlZ0ZM‹È ÄÐÆÈ– IP ~  i D')8 1RpüˆàÀácÚ¡‚€ïˆGX‘ xSB"Ðâ‰Añ¶E¾°%][äF^XZ¡Z¢+’éUÛ·§ù uÍÁ^\f#û‰Š¦ŽdbnE>¹ õ.É ¤3NpOtà¦èÀ,‘ ¾@ñ“Î̃0ñס=E¶0û*kâ”°Õ3*¥,0óØýdf×r‚»)¡¸Ò:ã¦'õ¬2õÏ%²Ï R;(p\AZ@´"B +f8P`@Óƒ…D,&­ŒàŒ1B©ÇW;(*ܸÅ7¢çŒ\)¨¦«é ¦ê˸5ǵX¡A\¯R¨ ÙŒ0\ R‡ ¾#8N(@ À)£K.( |Kf$àT#3ÁXøƒÎ^x;)ܸE _ +ëƒsÚª¨‹¾Èª-/-Òx®äDNCјÞT`"`ЧQ.йà€ÃÙ‚#§ƒ0ÀÑrAˆ\@Ø¢ˆ0\a‚£‡”`RXzA`;Òbªœp)r¬uáµÝ¢PÒu¡ÄlîÐíˆ }¦âD$Å ™| ;â"mŒ™´ `¥ƒC™bG‘„ŽjŒlI<8~BlàØÉ0ÇõƒN3#˜ƒ2¸6Æ¢;,ò&ëM•UÔŽ’V˜ÔXÞ$UVû6Ó’VWh1¥¨D><0]ˆ ÚBÇ’E Žš8^84àøI[À!•ª&!È. 8š1°fö| _‹+޸ʸ¢jOüiˆjƺšfi•_‚¨ÔE¦-öN×– ,ê6æI +Ç  F8°#08d>€à`ရ8Ž!. 
¸²”`m èå½üH ŠÆ˜%mÆSKÕ+¯°eW­-*¿ñx®8È”°8š H@eaqÀç……'Ÿ¯:ÊT±¶N?t9›ĤL–²!mÐݪÉ[·š‚zsuI™¿ÄO᯵l, v㧌ÈþKIð7*%+ Ò|Fxp€›""A+è# … $Às¡² +Ž 8<"H Àf ¦#$ÆÄß„.¢O$žåè‘„NdÍISú6åPNL©,´Ê5+þ²Í‚« -}Ù\%a•À’f+ˆ‡ wf8¢)8py˜@€×¤ƒ€é‡ ŽÀŠÄ8@jfÅ,ž!# üœ0¡€•ä#ƒ¹9d9#ÊNÓ¦–ŽÚZ[AR`ì%ølÅäV{Ú·«oª)eç(Œ¿ÀŽ¼àp¹ e¾#ˆÒ G&& +ð©Hñ +øÓb Ûä 5ºPô©pk¡:t'¡e{*jz®º¢ƒ•S–Te €4±&ìL±:l©Q.VW(5 0Œ€(±€Ò„%BtƧ' ÜŸêê´£|òÛ­œ‡Ž¦mONÕ8¨¦"ýiI)—j‚Æ¥¢¼Æâ2]« ÇÕÔÖ?»d¸!%tÛ–8\æ[§Àþ¦ž-8èt¬px`@Q@Ç hej.\%Ÿ-¬Ý\S®Œ0ö+36D„Âæݶ¸ mO¤¯T3×£ZÂáZºÚ1Â"OƒÖjÐ!óYŠ“ÒW†)ct¡›œ€-\p˜x ߈ U(xÆHÁÃÁÉ)ˆª™@‹GG‚(œšŽj^$?'ήˎ4o³ç Š+”æʺc'¾æÅ¡0™SÓ:Î*,FΑš.”‘Æ®$%04pì¤ø°€Ó ƒ¨› +–pt0H%TŸÂJ*0äØÑ)">m¢È­4ÿ¢ìÌZÒvù¶Jrzwi™ÃÒXXd²g“½Eié«Øùu Ùr³€ÏÅ‹`6PàH¾ÁÀUŽÌà–˜Œ!% +@Wd&ˆ +"'\—°ŠÁmëÓÇ–4äU³bò²³®˜Â`f¢£5ÛUÒØßìTß_Äc+\ªq‹QÕ˜©H ¥©È +êUÕÌ‚êš±ôʹjºŠna}5e¡ãEM쳬§x\ù‰,/n:ã}ÍþUDjwFBcýkÐÛ®ÕD¿?mu­«UH];GBY]¢* >D©„œ +ÆQžל£ˆ# Ð ¬Qz²$õÊ°švG¾¾Ep€xèp€6æå®KÌH=:¾”ÃhiÔC|ŠéÊÁ|eƒvÊCKQ±ª0Ö*¼N…Ñ[ž`g?Uu›—Èí&Ät.ºk¿@1…Áz0ÊüHhºÜ<8š¬h`²Ðh˜ÒøÀ°’ReÂDI(NU!Ü¡tÄt ”75á„1UÕLI…y€™ÀXŽ¢ÂÌ)®4Sª+=Ôŵ& +e™}$¥æ7Ù®5š.VY·IìMûÄuæRÔ [¸"—dš–Ö,ªT+¿_öÃ(¨ +Ȥ/š—]…Í“ÆXl¨ -"¢,š&§/o×Všì‹fu4÷Ç(Éw?/6l¶N‡…T6wÅ—;r*÷¬ÎH’M¬@Lˆ!šÇ"HÔ+J`¬ •¸‰Klît%FC6]µ_ž˜¼€¼¤¼`PuŸ:¶°M#`ܨ—)¥.WŒÞëqšð~Ít ‡ÌjJû¨ÁZSÃ,?.\ +0¹‚#`šˆP`¥ñE°Òø¬Ø’^[ÌÞ¢âXQ“ÊykÄ«„æ-z¶£¨žj)+e8ª‹eežù^±³Hc´*&¸~kj}GQÊbAàÕQ:ÚANX®ôT€ÒìÈ W¿*f.•NØ‘²Kjmó”¤r!•ã¢1‰WÖjWVÈ8or]Y%î°ŽR[bESåùÕZM¿7Âk4CUð¤%ò ”¶©EËÚÄ"›áºCäàÄœ†|m¡d¶Z%ZÔ*RϤ`Y$6¸i9Oºå®ÄÄØߦôXSHm–tÇ“¡äú¥¤4ÜÍ4l¦M·}»d|oQú¬FËåÎëÕ#1²”­RÙ(jL¦ÄÄVÛ‚éøï·¬_E”v—5ôfCêk¿áÕ®Ñ= ¥Ù5‘ù¯Æöe¦³>œŠ.Ï[‰Òq)Ï-ùNU¬îòŠG›¬ÐCZRh(Õe¥F’"qŸÆdÀ'«³ªŠÜÓd%} +¯±Iê:Ó’¼ +~Çq»õWÙ>Ê¥Ë%µî7¦!2>¿Å$ßW9­ñ8Ü©±Ó rSTšÄ‰(+åé(ËçIJÜ4%VÅŽó¢J¯5M‘ǾIh5Ÿ”¾_^ÉüÚ!¸\Ï6\–s-«i¿rüÖZ¬„Ö2“ÓJ"“›}‘QÓÓmñ‘Ÿ¨ÄjDOb4%‘˜ ù%§%ƒÈh¿¡¯¹¶j f§y$Πɨ´Ñ¼Ï3UÛI”aj5Tò[~s¥á°à¸”ÑØ^ʨ>·¢Eßð#dæ¡~­U½m}NtÂLj푔ª#¿ä¸¤$õ=©éÌfTâr» =‰¡|—ÌT£°µ#¤5žÉ]ÇáHÓnK¬|¯ýÒç°–äsEEj}ŠÔjþRí"nÏoHîý²ñNoúÌ’ó€ä*Õ+²,Ùè§Å`»êíùíW]ë×v_¨Mçs²^l1Ö®µcW}OGÉõQIñ¹æÖíRçÿ>㶗&˜—i¼F4Çu +“‹2mµ¹)©ŸVT端0=È)œÔn,¥ÞE°Í«dNû:‰¡EÆn0X¯:RÉK~…Ò"› +ã€\1·ëØme¶Vâ§ÄF¹B©yŒ®ä¾"­²ŸWŒ-çgÁqþ5轇JŠ÷µ[/7+¿#5C°‘V<ÅëÔ†é^Ém´`d×!õ[˜“œ_—›ƒbÅ|&´è‰€bÜêt±ÔJKl<³,g’ßc¢c{P;¶C»ì}wÈíwy.¹ÈÑ Q¡ƒD]e,X¬zñ;^ûb¢Ï©Ÿêue':]•RœNFqüVC5³Q¿p;ðºv›™–Ó”Ü5›ÔÑ\_u4ö7 +¢÷ÁBo<7è-§Cí2;rÃgh'úžÙEÃ!³ìû–kf7>jËS"óG»l¹ŸkùíxmßSµÆ<ŠX󧡸œrë†ûÁ¦× ^wÜSë–’רݹœ}σ [{Ð2 B±&‰‹C]õ˜jW}çZV[fÛv¡/Y?«(FJÓg™å†ærC`3=š íŸ zËMÁmüw(žgnÛ{h˜Û lmÙ]Ã%‘Âu!«/øÖ–˜ŒÈJ«Ž$ +CK…ëÏ#ñTȬ¶rã—_tÜ,-Dn«J¢ï¿Cìûó{v[~Ñn4×0²¨þÆ‚Û‹ŒÞrÃjØÚ&D"7U¨Ø}Îh(ο±~©¡P¯Èd¢b8¢mÇ©ã´šàx/‘ÛŽýÆóQo»¼šó1„_ld¶ÔfÏye8®£ØµV#-ۛܵ~˜5ßUžal"Ç®4ž©ZÝÊç®à3\•èÌæ$"ŸA9¡Ù¬žÖqÞ$3sˆ¼ö«Ý†ÀctMl‘*¯#>ÅÞ|®[g=Y°µ¬P{¼ª÷,„Xf|JkC*úGÒ«­ãXuFS ³£D¯Òè\·h°Wî¨Ù ¤ie¦B¨…Öôb+!Ží:Î7S;Ç {zï—ŽÞû$×-‡$T6‡E´FcÉuß•ß´›õÛ–[jãþm¸me;7y•uÁâF£°)Ö[XuÛ½Ü1t«Î/½i{¶( ç=ZÛ…ŽØøâ—üVí*{‘~™A¿ì»Ü– +¢Û­d¹›*‰´ŠÞñ„j?‰Æl:X­±L&sˆcÑ—MöËl:„ÖÛh¥ÐP¶Jm !®²•îÔÛxäŸ.…Õ³žÖpZNlû”R;ODÏÛ`£5{ÐÆ︭犾¯ Ýùk[ïe¶{•ÆnÅîø$zEÆbÉÔ.¡4"û|½ÌžCå»QHüfÔTFSK½÷OGðþ–}¹ù!K­´G§9ˆqª‚ÜRciŠ½y%•á5%­óI¢ðOÐÙNV]ˆè­Šû‘„ä+¢²º¬£3¼òZן´mo‰†ËJ¨WrjU¨ºŽCRåz¦¡1¿’|/Ü¢ÙŠâ÷­¥9~ +Iž'jÇn;Ô±›¯jŽË‘®ånªc7ëÐ:N¦y5¯°³ [v×qY¯\ÎÉm··è3çn·zåzb¶}O"ë{!¡ÕE»p}QPÝe4F·ÇûÆìŸC{Ó±–íÄ/™ ©*¬>…%FrE2Áa³"&´š©œ†ôŠ¥é\Ãh:Ú0[L– ͇¶Ft¤Î—0“Ü+ŽEhêÐ9nˆEß{¤e8b•_fÙqDk:ÎÃJírÄ‚C%¹ñG!1ÛöüãÃå@Ïz!¯ §":]RQ|ÿŸß¶Ak7¨"2~1‹Æw$©Èn¤c;4ÍRß©Zmû‰U9]‹3üf‚(5æSeï¨ñMs½7)®õ/I¶Þº­ÓaÝ…•Ììr°[d¾®•Éɬ•—kVåý” ½.†œÎƒ‹]Åoªî«žñÐ-ÜOBŒŠ—©â@ìZn%öÖ( ípÊì„O]„ϨýaˆÄ&‚ÃØ|¶]lÌ©1˜©¸-Éeï¡Ý÷?©mû…Ø5_©uë±_ù>Ú•çˆæû!¢5~è-¯·g¶áum?fÛù —| +™õA®X.Ð ,J®Ë´ë]̺ãX–å7aîåi¾ûdËoJ¯:®éMÇ ¹f8f·m*zß¿Gj·)‘™ÇZ¶ƒ°¡2£Å(Â'UŸ©†ßd¦OòR€Àp I0X-ø‘{ÆYÉyè•Û‡ò(þ¥—²äZ«­¸°éŠ}e“U‹ÎjJ°yÈènÅp¿­Ø.3ýZ³™žßš[ø~¸ýB£ÉRÁe¶Zs+Œ¥T½ÃUg©n©ùpÁÚŒ`rÜ\†c‰ÕœBbl?Û1ÛP›~knßú(ϯFž_´tÍ&|°ÒT’å<ØŒ  =CÉíZ6YщG[h@Sò½¦9ΣÉ{¤|ÜEÓ‹"*ÓC:µÎDüœÒÎ+vÈKeêšÁïPCivVEgw@kùM˜E»UÕùCDlû°[^S©fÕc¤ak7Ó2œOUŸŠí8P18äUZ…ï¹]˵k¼ŽV MçÛ%o~×pX¯Üï«¢ñ@k_¬¶ù6Só}Ê÷±ßø˜E˱D¿ÔL Zg>Y±}ø¿uÄÖ°Jak/šKëš«W·ˆm¿ÑnX­à*Ó.³« +Õ 
+gÖæ"ß[‹à(ˆGíjÐZ?ó<ëK”`p1Q2\¶œ–jß³àøþ+×+!ÁñCðxMç{¿cxF¬ÛOÌšã|Ù°µ wì6ôŠÛ„àðYwˆ-'Ôší4V±7fwÍwvÛ}œêø òªÎ"xuæÒ4Ë4¿àb$±Ò¦\yîq¢å~¢ò9c• oy­Ó1¯ï½–[§afÕk°Vî%§s[6ˆ}÷’ßR€ô:äYgûOÔ¢óO.Üßܾù'|Bä<à/`-§ÂE3t^”¥&"ç‰ ƒû3j,Y÷r½B{ÁŒš»c·›¬W[ 5ìFC-Ç]Õrßn™øÌ®(èÌîÚÏuÎ>í×ØvèÇzç}dÖ½÷aÉr¾¬Xî—=ÿ#¬H±Pì y­@ÃÌJšag3Nµý¦Éö›$ÅöÂ,3–¦ÙÍ&º¶óTÕwé.— bßi®]dÀ®8íÉeë]ˆi©3À§2fÕÚŠ¡ÒœÊQ©¶ª‡ –f¢ ëê‚+á«j“0#Ÿ±~+`È€q"ÚB† /0P¸p!ã…fÐ *‚H(“Ä… 2`‰Ã¢°ÈâÄŠ1N¨€ÃÅ 1Z OW[´€†ˆœ 5xr ÖÁ&‹,XHŠÄç0éÄI¡a 1ˆ/†2ŒA£ 'Œ/ˆÂ*T]œ(‰Ä0„Â0`¨h³²A|x@CL¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{@àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞpooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo¯ø|+€ +œ®;…’#y+¦á6TjJ+Ù›gšo"H•öâ,û¤sKê™LÝ{Šºk7^’ä¿ Sí§bÕjdÝ/.áv€­+¨DÝt1ǺîVMÜíØ ” ˜½Å«Í£èå–´Òñ|Ѹ]‹ ØýÏ—Æy/µ¦ +ìVív¬ŽTLn‚–k˜‘ MJˆ­‚ì¶^8ÞKPBu†Áå ŸBG]wRŒëqÙÈ +‘ñ.–Oo”f˜š*§S1ÂçbüÚ>]x LÀp’Ÿµ_ˆÛV×K·“QôZ[jásECô<(~CXÎ{ Íø©¼Îè»fc)hMoê•û«à÷ßû»ƒbËòÀ]5`nÛÄiYäp½‰èér{a¶ér ò<%–Nãˆ}ÓÁ ŠáX”ê½TKvµ[·ˆÊ¶“ë;ÜpÁ…ð¹ZsyŽýÀ©ÜÆ…’$º+¤`(zÂätœñ<`5žwå–Ùká ½®U ÃŒÿ»Ù7£d¬ø ã3@9ªCðâ«þ@SÕ&b4Ï §d5†ºl­t<¢uŽwôM³@Ê•Ó)–ûj´ÞDè„Ý…T9]’Й %"2Líï&êæW¹d6œÛx3Ï4Þ‡š.»«µ“ã¹ïÄ–ÕÐfÕ.XÁsæOH ƒ3Q³æÓÉj£mJÉ@²ìŠ“qübSi–íªü?ÍžÕè‘ÂíHètÅE02u6IÖÚ ’m'´¾ïºà: ™§¹ïA,óWŽt=™$™Ÿ©Üši¹.æ©®»ºó&dŠŬ´hnã$éµ—0óTñãæÏ ÕôŽR¶ ¢Z3]#µÿ¦úe÷8‚½å8ëyÂ7ÞÈp®·rœ×ÐÆküHé7|Óø `~CäH »+À8Åaäª-$G®s¬×@RÓ,xbÍlÌ<ÇùH­¶ß4nã\û?Šaö%Zu+°íœ#¼Î +|À;énÕ:†çG+…˜Þw³oX¯m0šé&ÌðÂðp¼J)¢ûÓ+šKk™C²œ/:ÇãbÝðÕ¶`(…㡸‘:‹à´KBJ÷ÜB¨åƼÂÿJ+|nEÐ + à UÚ pžg|º@øT+@Í’ì‚áÔçÂ,ïOü°ÐFbp0„b»I‘ýGFÓ,ÈfÕàã¤ßPJÕÖ¾ àX¥Û•ð]«ó»d6~g’$ËÍ šóI*oé»VÁPw­'vŽ'Ãdû[–é=T+¯{J»±ÄÎïdœêþG‘¼`•f…Ë®¢Ç<÷3}Ó0Zçz-ÌtŸª­ß¥Êð†¾t¿ð½O׉/·@aq;Ñxo +·[áceFr,Ç-±u Ñœ¿iÆ픂Ðn|¹f4f–ìþÇP ã&̾4ó} p¼¨¼MÒÌgYŠá}Ôö\Ó×í¥nP†é>†–)5 0Qo#n”Ìfí»¨5íèµìË2ÍïvëxWDk$©cy2€Îú øPn·ã¤çñDãx6MyÉq½—i¶ùX-Z žæû_³dû_ø„Ù%|µcðœù#?·Ý1ý?^×t!I®5é9/®Ó˜jÏ.ŠÖ+01¢ë<‚æÿʱ]烒áI³fxÖ-ý±JmƒŒ}h¬FÍ*z–ÙÐqÒi¬ Õuh–ÆFŽãû¯êÖëXËp3”ZjN¶e %Õ°Œ U›Ðº¾;^Ùw;QrÿD&gíÂérœî¹@ar"f¸â"Ì(z“ÍéùSÝjƒ¹Z¥±DÃqÆ­šÏä¶éÙ·]‰±¼?Ñãu! 
ÛÓ<×vB+oXë™$Íýd¿¼ŽÑ bÓ.(NÇjÐ0×t¾h<dxWÁÆ«Ö±n±ù\Ãö+€[h@Ѹ_š-ÓcVÑjH­h6®Ø3Ó-=&*· ‚ßY¿ï;kÌ¢÷0vÄò&f¾ì2J·]NTì—±V¹•Å3ì,É—Ù¹žÑ:×CIšù&€®Ö`šê¿Ð*Ï“në4¤\zN”í_aš÷#ȱ^ÅX>FÐü?Ažÿ/Îò>ˆUû{¨nºãŭ/J¿r|ç™À÷#¨T™©ðƒšÊó¾^³I+–äšN Óš*¾o1Um.Ê2]›ÎÓ‘ªï^˜êºhºN(ˆ~Ci%³ñ{žU ƒ<«ññC–ç|ÓåHÙþ£×¬7n×ü&{ÿ›Òi©sQ¬ÙÁèüÆ ò\§…ë±rBݵÐkÜ¿ÁÇhía´:²²û5η¿¦ˆÎ«ðX™…pIûpCÅ&£(¦§yªùBkœ?¬Ò÷Bé|Næ·›9ºó–Ô²ÜkÚ £t~÷3•Ói·h4¸Ú´ A³œ  4žå(¯óRïe¡÷DéX^“ +Ÿ[1–ñ*~ÂoÔ+™žØÍ‚du,ÅhÞ—ÈóGäX½…¹Þ Z8]:É­‚b˜M„N”œ„èÕvÕºï’T¶ÈNÖÚ ›¦z +ÛTnä5«áÄÒë^–l{S÷{±lX«oð1Éô^’h:BqÞE0,ÎGêæS³dx2€bp3€bq?ιòyVcø4«a“”ß¹ð ã[¹Úb­Ì–\´ÞɈ^W¼Îé|Pù 2ÝG²çѲÅ(š #Ô¬ ‘¼ßðS§‘~á}ÓuÝ âXnÅOXŠ1í—YºóTi¼?Yo0‚Xk!H07"¶Ý +Šã³m3ϳ݋`—}DOÖÚ Ÿ™¿Rl×õLÍ{#ù΢\û¢c6b’r$n¢ØPÇœ\Ç|Ke,ŠXn3Õ°¸ëO3=çEŠä< ŸÙŸRtãÙ0Ùw À¯¸#º?bLÏá@ÛvH+=ÏĨõ*~Är5NvÝðÚ¾‹‰ŠÉ±æü+·RÝü#¡1;¥¡±;ù~B'—Â'|߶ë®^ùÝSû¦Ë¡žñ5Ô°5°œŽè‹V{vA ¯ü-˜a´tÕvšÊN’á¼Ó«®Ë¡’ó0ϱ½¥)¾Û<Ût"ÀqÞDHÎïDßt¿jÜnh•ßåL×þ“¥Y/¢G NDÏÖýÄO îÅYÖ¯î:kw®·í’U Ó¬ÓhÆñD~çý +²M·ãœë)¯u=aužw¬’ÕðÒo˜Ùw'Ió\ `ÖýC[‰p^ÆÉÆsrÝzECõ>ÒøÝ‘zvc(E« )e ÀÇÏì5–8Eo1Ì*ã‰ÂýÛo½Æs+¯á3…ã­ËÃJÿsšc5¢Ú± +`€äü ßÕ#8JòëgÚæ—Ñr"h®Ò€pÏJŒê3~ÄèúgjÞãTÍqF+ïn³‘ŲU@Œ¢Ý€Q®ÿ4Iz›åœFÒZÇ‹Z»µÖõ\‘ï0€Öúc”­å™`{îXηøãa•ó-HöŸèKF㉈,Oy¥ßÙ Ýs2A¹ž‘*¿á3mÓ­ñ9+¡· büh¡©Æ¬AéýÛïR ‹3ñ¢gaã4F2 çRyÞP*¿£i²û4Ï´] iTÚ$•L…Èî“Çy˜e™ŸM÷…W7ÝK–÷­ÛA±cxG`4Q«š •c{RTïA€ÈyÀq~†Ñ˾²ÜrûUÏw¨¶^ô5£ÑõšÕøQãwK,=Ï)¿ÎñTüŒù0’Rk:Ù.ûÌT ¿’Dë}œîÿO*·+Vë4\ìlÉ=ð¢J®àÂDö!ç…Ö7ÝP‘{îÈUÏíHÓ™eYßr<ïcŽõ'¿µÝ‡¨²>[i8Òö¿h¥×˜ñ3û¨É3¨|Û f˜Ê>Œ^ù>¦±¥Õ™ + §zεn?MÓm'ˆ¼ÏàâT&å©,dˆîã4å4€¤r1Oôþ£XÆË(ãw0Jw]ËÒ¬'vÅ­ð¹Z31–óSíYk·Ž7íºûÉíZo2‹3 –õ2€ã½GpÌw!žû,rÂn%Fô½É}÷†Æ섾fzH+]•ÛØô œ*7”¤xNE)¦›œZ“‰ŠñOpšßs-Ë}Åì l°Ò\ðˆï>Ï:â—\ÏÚ¬Í&ªæÛ4×t/J¶ÝŠýŸ¢ýK-/Êçm½c5¦Û1ÔíX ŸuýWY¢÷*xÐúgÜy«Õª]@Ŧ]`}ב Íû>®8;^v>fÿ s·]ãù¤o¼àtLO»]³`»U» Y5³€‚ÊSš Aí+Ho²?d¨þ©ezÙ­šØmšFªY.Ä5^‹.=œ+7=Zl$DZy­Ó°nË,°rÉ*`^Én¤$Ýs*HxŽ“^ÃË¥Ó ¥g7cWÚ .¶;`v–d{®ª=³Ñó|Ó­ø…á>Šd=É2,o=ÿ­×µ|­f6~ kK³|ÿQá{¨¢ø\ó + d™•ö²ßa„ô<ªôE\40«n»¦X΃vI†å‹U: ?g9–£üŒÖï’Tú]“ZFãX­ß]·dxMAh5¢V4CÞ´àn §ºbL÷?ˆh¾J’íÇ‘¦ûÃê»îI(Í‚&¡³ + Ü8^зŒFvkVAõŠv¯Y¯bLëM”ã; Ÿ3rÓ‹bés:Ó´Ö+§[Ë»bÕjü¢cw4Îuäøu_áóŠ3NßtÒl›n…Ê줆ûUÓt>Óò§úuOyvÅ©rÁÈù²_¨ÁZƒ‘†íC­Ù_òäzsy†áy í:åÙïÂDû¨ï¿}îê•÷„È겊Ôj ¯q:šçšOÃt×Åø)ÿkšñ<è–ŽwõÆëŠVu'êþs»dyNDez>皎JÙ„§²¿09ž©û®©…ãù\Éüìx¯Ü¶í~¨m»gûîäXþ_ Q2» ƒ4A캧4Ãø!"7Ýr˶#1–÷1€c½QÜwiŠý:Q1ÿˆEó}®è»ÐêÎ? 
¥á¿dv3Nv_ÃÉ +­6‘á¹ÿ’TÏ ­ê¹ßUÍ籢嘌äuKDd7†S:ÞŒá9Ȭw1Êí~Pzž‘êžkbÝtJn›êmûÛõ¾ˆ…ÿw¦kº 6IddšÈTìºhÚ¼®éZ ã|Ž”½÷Iãv/˶]HíºçF†ñ:¥¼N‰•çE»ï¼(š_¬Òk¸0Í}9MpE05ž*»Ï3}ϱn»š©yãTóy¤ð:œh{®Hëý¢ð;(»Î7…çáHÓtÈlœ° +Ï›‘žý-Ì´ÇIÏ#JÑ.hVÓÀ¼’Õˆšý#J±¤8ÆÏ(†Áý¦ò»éûŽäXÖÇ0ßt2M·ßgú¦;bãzb‹ hA£] ’„nû„M—Ç9¯±¬¢ÙáÑJ£€â§ô¼›§›N©ö“øx±q˜‰ªwqµ¹,Éø¤µN÷n»¤¾ûFkÜ/õ½  éL»1"c¥Feè á _ÀcJî`•6óDÿ‡¾f5h”ô9'•ŒnëÈì.l©N§åÒí’V9ïùþ³ Ûs/xÈvèÕ,oÇéî‹àÉb{1®ÿÂiÏx…ß ­ìº(¶ì6kV‘÷¬¥ 3`GizĪünÆÉž£Q¶ë„Î7Ýó:·ËnÍôº[4#H1{Kslÿ~ëuÔnû?ódïe”o Ò=—b$÷=~]v ¨›.¥øåö †èŒÒ ‹ëØq*{ðÑôš`åJæÀuVB4óeŠï;£y?Üš÷0R/¸Ƨzp+Æ+­n»›ª³2NudoíÒç’ˆêsGDó¿ö;Ÿcq’ãœØ¹ßjˆífu,"Ç°ž&Zî«(Ëý 1Ui#Åñ½Ÿ³‚Ûü¤ø.âSΣÕÿ’"»Ÿ¦CÁƒÕ&bG*­Dù…OnÛw<ÕôŒb\ ¡Ö›ÇO+.åxöÓ<×v1Ðñ^¹]ÏÅDÇøc¶=7ÝÖiðTÕs&|´â>à\µÍ †Ýu¦l¾µ;ŸËzã|©Ù Gª¾_µg4–Úø “¼¿0ÃD÷ŠÝŸ]øøh½£BËm‹Ò{š©9ŽÅiŽ[já~à4N§HkMDÏÔYˆQœ÷MÕt<Ö0\6¬öÓ¢ßz¬j½ŠR,¯r<ÿMÜtÙ%¤€ÕÈ«·–)ÞÈUç•\7›–óYÏrÂiœjVã¦)¿†åNð@™ÉDÏ} õ÷Cmß­(Çú˜§9¿3]ûeÇx”äyO̲ëŽÖºôªvQzVA +³þ„š_2|Û §f¼0Ë|–¤ãDëSœb¹fYß­ÓøyŽUÇœª÷:Q¸Ž”=4Û«ô»(¼DxÞÀù‚;1ºërœt½g¯ˆ}ã!pá…úŠ¥²#uö“DÛŸe`VÏ.øAÇòL~ÀÜ8´™«Ù4*ɱ\†›¦zˆ¨zM­¯ZÏô¼[5Ñë ¢-› 浬®y-«KâÞ€b%jL Ï6]Šqí÷0†íÅ+|îÊ»#Yf­Ù@ÑrÛìšUlšÇêX $6®¤ºñ>‚á{š©²'ºâ›Z{³ V¯V¡Iµs<ë¶NGJÖµj?J2W -™Uÿ{ é†”¡µ"öM3ÍòžÑø‚ëyÇ«{®åX–KñójsVçÿ1R›ÌkÜÆDZˎ#Eÿwžð<=f¾1íRÇò’Ø:ž4Ý'RÇð€ÑºÈMרÌGÁcå框JM†0L.huãù¬ë:G/¸§XÎ7u×™$Íú”¥yÿ¢Øõ&óDóc¦dgšÆŸ Ão7Ís¦™¶¡3¥–a'é‚%·©–ãj¦g9ß5ÝŸ‰žýp¦Ò<È(É;ŒXo*Ñ/ü/‹Î#·ì½\¾-¡õ_¡8J¯ã‰ÆíH„bù:Sj8Sõÿ§Ú¶#^ã4|¦ï»›éy¯SEïQ”c¼ŒÝ÷±ªçˆV9Þ±ZV£YU»àæHÏãQžU€³,³!£œç‰ÐÙr»03Uæ!Ûa¢d½Ï5ÍÿQåv/Év]Š0{ Q½¿¶éHŽá;K­7^åuÎ,‘·‘rLÛ‘Ûv$Ŷ‡ý÷Mßw-Às^*}»À{Å %³kižåD”á7$ùÞ×Y­i4´ˆÜ.(RÙ}å\5[§QÆ (£c9¢ï1~å{s¬niEË[ZÑòªY4h¢Ê>j¨ÐŽË¶]²:¦çsEßu¨êP2;ð¶í†ÔjVƒÄؽæX§áÅ @«M»QÕ–áUµcyÔm/X…óV7~ iŒN¥ÈÅa%Hí¡¦Š È9VÁqjvÁÑZ·1ÝÖï˜Ïu^CLÒئ©v+ò¢Õø0ÃGÂF,ŸRtãe7˜WùݶkF£ª=³Áb8uæc-ÛÔyŒŸÖZˆ,®æ©®;AšéDŒå¹—¤Úî'ŠöŸXqVAàÂçmQ^Í‘Ùuf9ÖŸØ¡J£ i~ÂWåæ!,ëC†j?×;·ÑÌÊó~Ð7Uû¦ó‚éz+Bó^EΖ›‰0<°»‹ŸòÞY5«‘„VAMt¬‡”"›zátA^´ÌjZ6Nùó¬ß­‚ßµý—rçu5N·%Éþ«p<$"x £õWUÿYšf¿‰-¸Ðô?Êß­@VÕg¦d6iZÏ¢\ÿUüºÚ\‰ó!n¼ì`¦ÒˆS2Ì,ïåyο0v©½8Óû&:ßâ<ço¦i¾O•ý—¦õÆ«|Ë-ËûYÙ}Å­4À«2•f¸-gª–^ß}_uÍ7QŠÙYšå{”ÿ{¿c4Z˜åÿŸ«´C­6%˜=gºž[Y¢û-€\p&|Xl.Ͳ>æ‰æ{Ãö¬µdY®ÅY–ÊžõZ¢µÉ]ïy¦oþtK¦÷õ–Ý@bÝz*Îq^¥YÖ»8Ñ~¥¹CE÷ƒÛô> .ç`õŸ&ªþßDÙsÃ*Oè‹V“šÏl½FÌ3ÝWY¦÷)Ç´ŸCÍÖ›…ª´Ä«4 -ÙÎr÷¹Cì¹(8ý÷aÑû‘%XŠ –ÚÉ-÷ÜÒñ¾ß²ßï€Vw è;/H•ëùHãy)Fõ¿Ãg=÷”¾€˜dóKÔ‰g¥àÆ«|Ž(=£áÓ„ÏÅ(Ër3Ju»5£áå1 •ÑF×z£˜IQ-÷ƒ–éa³l6ˆ¶q\±l6ŽWyÝo:¯“¿àJ(N2Ì´¾Äù…öC]ÛŸÖ²º,–í®–í8û€gûÏa'I½äm»aŒÖõJŽf= Ó|‡A®ÿ2Ƕ«vc)èÌî*i>GÝÒí¬ÛúQ™Þ÷;Ï{iªõB1¶?16㻌¶õB^3¼áÞw1žíEÛ7 ¨„Ú,ønÍòŒÔ9ÿ¤¨ÞƒÓÀ®8¨wí§vãv4Íuÿhã=³côDá4XŽå?ͱ]§2LÏQyjQxÄßMøT¥m¨‘:c)ªÿUìYÔ.üǹžsIªçz¢ì:›§›®y%ÃÍhÅì*z¾Ü^ŒîÿrªVÁÔº€ óm“dÿ[–ì9”eyÿÃ’ñ[¡öpK–{zÙz#¡1PTý7ñÛZ3Q†Å Ñó۴ŶÃ@f± Ãî ¿sŒßynçé®sq†á-ðtÁ„Y´KŸKÁãÕv¢§Ëí„È®{!Òm€™çh˜ï»è».ñK­f*vëÊóFì„ï&|]p!E´pýq#¶wøÆö%–~‡âk[båû&!4=fö}7R$ï]‘ó.€Êy4`ö2[k>Óö¾ˆuÿi¤æ½uͯ‰šù%Ær?DŒ×Ý —Ý‚K\§²'ÚÄÂïr¦ê¾ÍôÌwy–÷1†á»ñÜOAºéZ˜l¾2;¯›nëxÍmÜÎÆ:ÎçXÇz’e—]Ù%úŸ¨³Å/·ê9´Âíx¦kºêy̪õÏ/ÛÏôªû?X4ÿweû“[·XeÏu}ÙMø`©ÉPj©ÕHÍr*Ì0ÜIsk-†Žã¹žó:Ö³œïªÖ?µõ;é¶~·C]ë¬n?ÏTíGn×ý*¸í'f×þ˜¦úoê ðä€ÊM“›2ÓÓz‹m»à†YÇ[)ªù;Éxvú&€/–­‚iöMç«šùx”Þ1HóÝ«m³@Xß­î~Ñ:ÿG³duLk½Î8=ÓcZÏj¥ô<’ãY´žáÂHjxRBk4„ÑyLS½—aºÿ5N÷ÿƹÖ#uÓò^·ØBø\©9µë½XÒ™^W›v#)£ÒówóTû¡Zz¸ ÏHîïø‰Y©Ú/Ó¯:u+×k³jzA^³;+wNwÄÂç°Ýú –óƒT1Û + U†¹†‹ZÏh0«d7Tøhµ…u¹Ý0Ëzšf{NË%£ô%³€E‰þ›ÀÑZëà ³£Ñ~—ešŽGº¶{jé5p¦çþNÕ¼ŸnåuHà3½%NÇâ‡ë-W íŸ1<ówžòº-÷¬Æ+¯Sfås\.ZŸg=¯…Ž8¡D]"'K­¤¨þç8áx^0=,ô®3ibÑHÅjÂkº¯"GªÌAÈVŒÄZ™*µ>¯7Ÿiû_åÖõRšd|Î4í?fÛvRï[†i¶ƒñ ïSøhŵø}áe$·ÖŠÚt\ËÖ»<ËøꎧDT¿À(ÈlAê»F  1ß#xžkñï} r<še\ÏC8ÎÏ,ÙsB©YDiÚ.È9Þ‘Ý_^éy.Ð0¸$¡±¾U‘ÙÝK³¬wf×QÒñ Ö³Ç*ž¨ünGÚž+A’õ(Êó8ßuÃy;\ø.Vn#Àq?fÉÎcQžé~Õv^ `ÜŠ!×Ú ¥×›‰áŠßV[ÈÐÜQ¾õr ñ¼ëúÿcmÏÙLË{)˜½¨]Ó…È©j[ D‹– e›6„åöaËÍ8Ý~f¼Ž…ð ?² ßW¢a;VŒ¯¹†ñ$L0;ÈqŒoaªõØî^´+Ç#~É|«VÊ nÆ‘«íDÐ +Å«-$ötUç‘Û6ßÉ…ÿ‡\´œOö|·¡–í*†Wh)Ï°¸ëØ̪ùQ®üîgI¦·òSÎwäp¹øHÕ³V9]‰oë æh¶[±k6¢ºrp>Õ2³§B¦© …/ënÂÇõ6­žÕˆ¿€ægS «…â|"ð¸k¦Ç"|ßyÉü!™¯‚dû­Üx ßÿPÓøg:/ŒÎé¤V³¼)ÖLo8¥ÛM³t»§œŒ¤–ÚP‘:ZË ©ð¾ 
óM'#g¬ÿ=ë4œ×2ÒíX^kŒ)?¿¡-©‘¸¬))žZ×øà•-¼²õG,¼º¥Ó%·k9Â'ö„š!°ëÖŒN)͆òÊžÁUV¥i¬…˜%¹žãzçxKlÛÎIÞã8ÝtÒkiÜndHΓ Åð8Qµ?˜]û­BkýÑ»_ ³íPÓúŸTíGNãwÉé™&pÂw ™”Êñª\ù8hNw6Ó£YºçXð„áM|YiÉè»®Š5«@­ÛÈqªç`”â¼  ©±5IvÅÀ*ñB¤~!z½üÄûÂ09š'ÚŸeÓ©0ÏúèV~·$dv#º}Ó¡#µÖÂç%ò ï?‚c¿Í´¼_zÑr5Ñ0¸£ÚÒÛ®Êeßù l»•$z.Ï×ýcXþ›ÓuÉ*Y9PwŒ­?VÉh¼Ïÿ`ؽãWΧð‰í?êÛ®¨]ÏyÁò9pœŽ§zÖû¨íÔŠfQ:Ï£Y®û.J÷\1íGá#ÎË–óÐì˜ 'ù¿Â÷¨m¼¤NƒÆ™¾+9ŠýÅ°þƒ–¤zˆž,´%Ü +âUZÍS½Ÿjë6Öx"C.< ŸÓXÊrŒI†ÝO­Ô~¬ë? Q,_âæŠí¸çC%ÿªm»è›îC˜î“è Ó§ø çOø¶Þ>á;MUÌQzÝS–å=ͳýO^ëy>*›.fúuwq„2YnµÙTÉr3”[n|Rf*„\i?Vö^ÙeûƒÝ1žf«µ¦BÍRyf¥‰,¹Úh¤å¼Ïõ¼bÙ`Õý'áÃrK¦ÿÞjÚ /¢· œ¾o7"¬€Ã`Ž^hWEhySk}D—[Š°=ËOº.ù³€{M«À +Œ'Àw‹v£ÃPz$Iåö‚Ýr&Á«³ )_8’¨·<âðIVANC_4?V18À¥öÌSü6wW›f#Çùþ³ø}½uÃàdšì¾êþ ëw=W´ÈEà ÑÿAÚ¶¸Õu ¿0>ÄÍÓØ +ŸÓ¼ãfIÓüZs‚Çp5ج2˜êWZq«¶#³pþŽóM‡bg*íC Ô˜ ž«2%Ù­)èLÏÊåæ"E¶#Çô7K¸^Ëý÷I×}ŸÔM7‚ã¦oºj»¿Â4çA„èüÈ-'¤Âí|ªê¾ÒkÎW…Òü˜éÛ³ËÖ Éë„X5?EiÞKP±Z;óuoA¶ív¢p;¡.Z /·,/ÄøÕ¶²»# Ñ`Zåx+ų?˘ª6:\v>XiB,ZîCP î¨^¥ÿ(Äp> ¨WN® -Vl1ˆaxcVÝgbÕüĬ3i˜ýÊ•çaµg ‘ý:ŽWX¾f=Yn.ˆ^o`"7tŒ'I’ñ)Jô>iãõHÛv*˳¿Eyî—(Ëw軣„Û=±ŽGè@•é@Ùt@ÒwÑ*§Ñb\ã¹è “39ŽñË®šÏýŽá­Ïý(Kõ.Osãš³„Û¡tÀ)<Ⱥ¾ó :ÍGü•Võ~„HæSPBÇ ÒDv‚4ו𱲇 Ãùë8S{‹zÝýìWŽ—2ü’s`Ò +‚W°E®6iÚïÂ,ów¬çýu¬'qzÅù´b|ŒþÃð±B«€‚¤vpE·à–ÿ`Cþc ÙŠk»û¶ãþL¶ëîÛŠå|¨î98XnVŽê JÀlflÍh˜æ.Ñ°¸–h˜ý ÷ÒTó[šd¾ + §3 4ÝôðšQÛÃÏSýEfŸ¡Žñ@×õ_hçÍ8ÙuÙ­ äUÍGZÙýšæZÿ’Œ÷¼v ÀÔj,Éÿ+€Tò”#ÛÎݪѨfÍî’RxŸ§¹æÃøeÁm°QB¡“ÇrDË£g5F~ê~ rMÇ´Ö+8Néy3Êu9Î[ˆaÃ`ÓDÆX•&¤ªùß.š Ÿ©›¦yæë8ávJÞ4Pච+ªW2=a4nçÁ&*~rä*S +2£s^ÇîHÜT­Í Ýx8KxÞJRM‡Å® …Óˆ´b›à&¶2í*óeÏm>ëúò+ãgøºì.E50:§ä-»(Jö›j‹aÄRãvÇð| nºÂ.8>Vk(x²ÖFˆh¾Jíÿ‘¶ë:´hà NžV7Î2~š=«!H /Aˆ×LÁ„K)†á‹VøÜ•;†ç¼¦ù$tˆÒ#G-3p×ÌFQ—Í굌†ÊQ¬gA¾ë”Ö±<$6N$·;Af¡•ð=½;Š\i<Ð6Ý4‡Ä‚3a#”ö°ã5C˜·9è4Ño¢g> „ÙJ²¼·aºñr”o7̱Ÿf™õ†Â†ˆ-IUëKÄîW¿ì:¦zî„NXþÉ‘<LZ0œOYŽ÷F+»Îd)†ZÛÿ›©Y?c©…#™…†² ‹óvµ­0Ïù"• 9E«á1˵h~WSü.ÉÈý—šó+vÈû*Ukd¬ÖZšeýûžK9¢û'~¾â`žeüô ¯[fçx<Óõ\ à×ÝÃUÛ +cUZˆóŠ,¸-ëAŒb}‹Rí߉ºíXšä9Â,¹.Û6¢¶J4ì–ä®ù;Ôr¾ã¨õ&ƒ¨÷ÛŠÉ5•ýס²ŸDNÓØ)`t ¡—}Å9ÞûpÃö£·¼OÑUß³”vñ#–¦œ¦@gI€ _‰ž¨4‘æ•ZÉô +méEË%¯s¼<ŠÜJžÎNètÉ¡øù껆õ%ËpœÓËí%:Î;µq<:Wm ¼Èž¡´b€Ó +CTÙ +`¿òã§Al¾ý.jH Fi¼‚Ÿh|.†Ð«Í÷lÿ—ÔúÝw›¦×ÍšÝé(Ý} ³NgÓ|ûÓ1:#U>'#˜u¶a‡ Þ‚ãM¶pÙ¾cFÍn\¯kø@éwJêY ."8ô<ãx"|Xi)~Te:ϯ·d؉öû¢qÍõ‹%ËûEÝsBêüÎè›V²*·;I’å~¤î¿³¼—„ÕFÂç$çɆµídÃj:×q[¸Ìnè;–W´ºëz¨é9—fùߢ§ ¯á%iÞ‚»ïHË{ :Gì&ù^ÝŽåA³ó<¤¹äu‡Äe¿SÔ!³ÄNáÿ›çÚÂÇ Îe¨ö§‘å€Ñ9ÞrJ¿ƒ)–÷(Ã/»— ³H±ªNdäÞëH×ö$:ä·µfÃTÿ™Xz^3¿cfÕÿ=Iî,Ó,d•¼˜móCð8™9‰†9è:1yjIXY¢Ã"ïƒSxÞ +ž õ‰¥ó5½7A‚Ù)¬µ'Hùº‚=PÙ®W·îGíÚ.E(•V"§¨Ý2ÝJ^×þ=[w *FcZŒâB1üˆqœVÛt2R19B#w ìͦ9ÎÃà™B34ÅÖ²Lóyžõ;"Y.ÉEó•’Ö~ê‰ó+­¹û{¢q<_p z²Ô|¦h¾0‹î»DÅö)úîã…€©zƒ£™%AÄi…âg©N’˵é½6,®‰Å¦"¨uO¬’;GÂF‰~B§©¢ÜzarÁá`ÃqÊ¢°¶£Ñך¯ûåÆr Ûaü„ó'~²î%|SlG¬6™jWN6,Nks‘ÓŒØ| ×½ÁÇ·À ¥–†š$wÎU|Ÿ©ùÈì›.Ÿþ§c7€U¶ÞˆŸ(8 @Àl +Gj@Yi!I1¹g÷·¢óSø|áC„ä,ÿA,ûEŠé9fºŸì®ùEEñÿSY^ÍÓÌJÉêœ×z–Z!¯Y^4Kÿß4Ûz#{?¥šáyµk6’¸öä,Õ{ .ÙðŠß;i¨žWÅ’å)§cuB(îšE£±¬ÒíÀÚ´aE÷ ©r¼À©1gU]ªi-÷ z÷mžë ²Ì)žû0Ì4]M³=WŪÝ8)–åBŒ\kB_²<îÍFOtŽ·mÿWŽe9¡˜ÝÐÖZŸƒnás É®3è—Y8>GÝÂñb’bz=Qd$K,´ŽcÔÜåù¥Æ$TVw¿åDÕ{cNw¤Æñ€œe˜Óÿ›gú/‚ô²G §s¢d9ìnó“»³Dß0ãtfB' .¥hî÷8ßsÐ,ÞHÌm„ÏêL8¥ë… ÅøŽ_I0­?±ƒµ£\ÓÍ(ÙuÁêºÏbóCbk/Jv~»5£‘œÒiÌÏ~>Xl)zœÆ<à4yÈ‘*ƒyŠí4Óñ}ÂÌ— ‚GÉbÜj3)†ó(|`y9^q0~¼îhÀm"Ý&*ÆgàI‚SÃ5‡»î#@^û>Xrǯ~Ì3Ì?A%{Ž€ä¹Õ2Å2Ù–óh Ð0¤(ÅÈÑrƒa²é|¦m;žišnÅOS™†¡öÉóë^‚‡JmÂJ|F° ¤Æñ¢V´ +j˜ì¹¦˲ë F‘‹ícÙ¥&·õ7Pµ„ –Z…&:‹2ÿÅOL†0li–ùÃ-8F¬3eÖgZÉò"n+‚è¤À¡R„–"÷‡1LN¨Eï] \k3’^m2Î4G1k?á¥ÊF pŠ‚• õŒ¤É“ ®Dš¥–RÅ"sÉb…Ân[°<‰¾ƒ`rE;0?x!’B™™<»ÚŒß1Ó¹ˆ‰ ·S»£±ÜZè0•¹Êrsy†éc¢aøhºÿ3…Ó yÓ,8^Ýx,„ToZ–ÆDÌH•uÃðG2?Gº®si–û"Hqþ‡ +¿Ã‰¶ï`Çü7\n.€Äî9Ò5ßÙuû­_ø_F*¾oµm4’ÀkwJGî½/+æÆc5¿}µk4v’ñ¹’_—ÛÈ/ ×Ã4Ë#FÇô|M1¾è›ægƒÎû•g˜[sÇbÍìÂEn7´ˆÖlx»õ»d•Žw¼ÎóX¤]n6[¬3dÑWÚ ¶ í†Z¾­í¹*¶ž—äM«ZE»1„’ÑHY–å`¢ao]oOç™Ö§,Íw¢µ=Ç´¾çœÖ7Ó +¿VÙs0„SezˆÐÖnº)¨ìîÅÏTš3š` `°)z°ò]ð=‘éDÅìKBgy*Í0·"0YTš èÕÌHPÝOá ljءB«@Ã4c¨Õ–ĦçBv¾h 6U.§Õ,½Êiø@×w0KòžÄ—ót÷¢ðÿOºö¿4½ìtÚ2É4~iÛVÁ6K§q2ëWàlµÅxàXnÅOøM†éö_·e5^ü€Ý-¼X­}°±R{9ŠéIˆc>Šñü_bë7Z˜c}"Á£E¦µÖœWi’õ,„]q*|^qÂ0ý `—ËòüGñS¥æ@FH )U–ä²íVŠè; +KgP¼m +b¾o,AïO€Â-sȱJ{ 
³ûXÕyä×|g‘†Ýh¤c{ÏTýoYªé<ˆcþŠŸ—]‹‘üA ÓÙŠŸ*5©v~]ÿÀxÏ/†Td'Ê«µÇ,8¿-7@°81Ug+FsþÚ}× áã´ÆàeZÆpã]›(¿ô&xŽê” «&@qjUÐ1‚zsq~½-·ï»°~ï:L;D•Z ÕZŒb8neY¾¿<»ØLœUf@­~k„ž+~Ç@Zq^¨ÚBèpÙAŠeý “­7Zët8Ñ4Ÿ%Æ ³é?JÒüï–éHèˆýGðxáS¹â^½âv¢ïÿŽ>wó\Ϲ(Ëú虯#m×¥8Áð!̬¶§×Û[v×}ëW˜ä{HÒ‹íÊ…÷ÒwŸ„OkDÏUšòíÒ¾]°½žÑ Råu\l]ÃŽzEOÓ›«%«;êžÝ½(Ùû¡Øn"GÊ,(Öß@Ûx'R®7_l­C¹ÄŽ ƒõº©~±IÁë}«Æ£$ÁàHäL™Áø‰áS’i¾ñŸ3bËr ͪúOkvû~Çî”Õ¹ŠÑ¬7Ñ“ÕÖ¡ÆhL‚>OP¬bCª{®„ojìÔT=„ŽY°mßÍ8Ë|çxïÒ4óq¦eýRѼΠ†Ï­Z•,Åâ¦Ø1½ãtŽ§eÓ ¯í9¡5íA#TÞ óÃ~ß}”å¸/Bû[ðtÙOŽä¾Qê®sRÝtÔíÛ.ÆqʬCš§‰ÆmÓ*ЀãD§ ’ÔîÀâ4‡Á#–W1²íŠSz^v‹VAK“Œ‡a¼*ÓvçyÓkœFËÑì'¡³õæ¡æJ-Ÿžû©¾õ„Û2¿ &ëc Ü€ŒÌÎm¼n‰­ëå<Ñÿ9Qip®àJôlÙ_¿ðÕnˆayš§ÚΧʶVßw*ÉòþAL–¼¢u¶æn€„9U‡¥–rLÏÑ0ãyJk]¯æiž»³tFHÊlˆeÏðUÅM0"%Kpb§H‚Ĩ%–}+æïÁŽý:ØqŸš 5‡#5KtËÍd¹e?¬R{! ÛMìdµM°n;x)ŠÏ0n½ý´f>ŽµŒnçx<Ѷ^ÍÓ\bäºs¨‘:ÃPc•fá&êìP³Ëæ Åî"Äñ>&é¦rTÛ…Å|c^FJÞ»HÃq4T®¶!wœo‘vÙ7àX½Up±‚› …i¬„Ï–[ £—šŒá—š²êæ+Íð¤„Öl¼ÊZ ±EVãdû­Ù4Á*Üÿ"ˆ¥&!Fˆ "çÉÌÄ&'­¢Õ ÚÚ ’DÇ© ×øeÕ¬Fw«fÁqZ§B¼žCwø!r§8>Ñ[¤Xi<Ö.48Y, -R(åuÞGJév0É3_†¹þ»Ñ +/\ö„oš…)n㑲û-„_p1‚cû ÒíBéÄ~í_ü¶ì,ÉóR¼Ò›˜Mó}¤í:¿¬¶?TgÂ.8%úÿc=û‡Ú°»ÎµË E±Ižòôb‹Ôr±3eƒ¶¦â$ÃåPÕû$Vn—¬Âé¨Y9^ˆq¼GRë5¬×6˜ á²Sp“ #¸Åv]Ï!­òº¥ 5=вžAIj†æ¶ÄÊóZ„äÿ +.8“!ZŒ’åÝ<×{#¶,‡õ®ëbü<™]°azï(éw>K´.C6žHÏÔšÐR<¤Uæâ§$Yb™9¹oº¤øÞ1ü‚)zÙO’aýdWž¥é…bË} 2Ö-9[*àUÌl†)f±#µ¦Â·eQºí´Zµ +²à÷]PQºß%Bóc §Êb­Èv®bx,žÇjö»0Å}«¸Ža—ýÊþG·ò:eÛ óëÍgÚ¶ó@SÁE–Øåöbøe§P“4§@F¿4ÁÞ€¦j:,ØrŠ V[‰RL/ã<ÿ;~[p1Š\o-Mt¦ù¾#AŠû7Q3 +£¸‡!ôŽµÜß™ªï\–êºÂñ^P—0,ÿQ$Ë­ò»?ª¶äV\Mtìo!”rCñ34oñ£t¦Âç”F§ ¡ö‰Kíåùu·ps4ásKaŠáÕuÌ3܇q¼ÂËHÇÿ˜h¹O²»o¿ò<’ Õ+Ût(Ýô'Fð<\qC/|Šá[ÕÌ_ŠíE¬»_ĶëRü\¹AXA¢78 j«øeµ­ê’£Ã4æ"¸ÕfbeÆÂ4ïWè9âxÿQësªcµ¥5ž'òšÍiµtð Ÿ›Ðu£GË j5ÓãiŽß`Ÿ€ò¨¢ “¤Û¿%sÐt•{ŠX®DXÈØžÈ-òÅ|]è ¼E_Ä.ØE~Á-²‹|»è ¸E_Œ/Ø…|Á[ä‹ì»è¸E»ˆ/²‹|[ä‹ù»Ðx‹¾ˆ]°‹ü‚[dù"vÑp‹¾_° 9~Š¯èU¸Š¾ÂSt½"WñxŠ¯˜W¸ +½Â§è]Wñ+xŠW‘Wt½‚§è{Wá+ø_‘«p}…§è*zE®â+ð_1¯pz…OÑ+º +)Ù PüV¼T¾abc12d12-bde2-4512-ac98-772bdaf169dcf661dbea-1926-462b-b850-c31cf2a-7918777687548l56.209637h W n Q48>é¢:U‘°.ª'¼ª¾ì+1Z4Òy…®°vQ¶Ñ¨˜< ‡ H 0Ý ;ðCTg\/Ŷ»’ÁEÊ0¸ìïì¸Lc©5çxìØhl‚ +ÎKt¥¨öàÁ€31e[Çùjy\ly56c3d710-d1a9-46e3-aa32-e827611835d9dbf-3ab4-42d8-a661-4a48c49ef7918055773925682v`Ë£:ªp`Õ VŽÆ¹Š×ÐåçQ½­1TPç[£Q +äté¢h€ä0èé¢Z¤:a%7º­m‘XTw!š.ª“‹Œ0æCqtpL”ÿba001d1-9e5c-4c24-a0c1-3aa654b077c8944eea1-9c3e-9ffe-d04c1fef2ee4087955035305023549664382977-284762f13b8-4b68-9c60-462c2ca4dfc5e6-7186-4489-806d-ec42b305e92d3241882m10SVGFilter / : /XMLNode : (fxmlnode-nodenamvalu1t/ArrayeTurbulence;childrenbaseFrequency(0.052attribute; ,numOctaves(stitchTilnoStresult,feCompositope(ininSourceGraphiinw100%hyxxobjectidAI__id/Def ;4fractalNois€ ¨A"¢‘‘¤I¡Ýa„D‘141%æ@@)# +£8£Dˆ @E ™@&`µ²ŒD8ðŸ3’¨Fr=6жÀ‹¸¢p&4É`W$J¶²cÁÜ´iÓ›9@³Éviw~¬§ýŒ){IÊÂG6ütvø´G-Bø +™µoÙi1Zbá]݃h+WøËÕ‡A8-ö+à‡©×/'QGM +c¸ÈçÌUâý³)žnŒ6Cx¨¿D>œYi\¦E™CiÆýA6fš“`ðM†óg=Ê虆²(GµÊÁA«žºmeJ÷L°v‚‹ ·ò«G²Ôeòdd¹ÝýÖuû\w¡ ~M®*1¤G2¨¯å#mgáô>Ü‚Ž«SŒ9­y r–< ÀW©§`ØÿjùýàZÀê¶vñkB#lÜ6Áfp4¦Ø NùÓ«ã"Óqñ—_š~.ªAùrF†Q}åÿƹ¨.ðDWs¢uر œ'¯]•ÊtR¨.–±ª†ÿ=ÝVª¾L‡Ëˆ™Îwã¬^†:'k3ÑÌÒ‰š‹5jÑò¨ÎÁ/#Ÿ1‘åøEò àG÷Zuä“/p´E;-5õ6Í qêA$éü6JuçÚù Y—G•ìØëÒ‘r3í*œM +² )M£t F<4QôoÙÑìç<>¸³‡w ##®Œ[¢ˆR8ƒ‘®–/)¼ÄOõ~ú¢58½¿ô«›ÍˆPlŒæ;¬8ÇbÀ1ãá,¢¢A@º€âX hšÐ]€ƒÅ±(À±,@°è}° pb5 +Xx4‡ÆÒX Ì Œ&Q@|PèºÂ[‘<<@Kah¡ÁREÓ0 Š¦a +hüAU¤¦²ƒoŸ”bq𮪮/Že«H ŒÅ +Š,̲PÐP ‚bq @¦YDš Ec°40 +eq,@ a˜åÁ—GÃÂÂB‚ò)#<(ÌÌâ°XÀO})no”Ï÷TùTÁ÷”ö¶±XXň²0,ŽHÃX †áÁÀþD€ƒÇÒ(0M“8øY…µÖZk×Z+ñÚµ’ûËâX`˜Åƒ"£8Ècq(KcÁ °e=tÁ¢@£i40ÖRü@±8" ÈB¡Pc¡4Æ¢¡œÓŒ==]8 +Ž†iX ³4À,†Â yèV胂¤±0–¥0 …fGE¡0 ÅAnÃB¡84LÀÁÑ A£P4À,‘eÆsM  Y ‡ÒÍ˧Á6h°4 òh4É£Aµ0<H€£aš… óy,KYp4<ÜáaqUUƃ Æ¢YÂ`ÁáP˜†µ04Ð(Šå±° À²°ˆ, +‡¯£ ¡,Å€³jí9ãñž»-tûè,ú=œªïQjWÜšº‰ºýTV·{êöü´æ.ò÷v”j•ãÆC[–à­âN_DCƒõ_¹RÖÄxhõ•BO^T‹‡ñÔY ‘ÊÅSøÉØëšh Œ÷Ÿ±‹päóºùè o-‘jUϺ"Äh Ê_„x«\S%¸'Gý£%‹gO?kJ„÷{R9¥£-[Ûxpsñ’Â[„†ÈG”Œ–œþÿlLŽ‹Ð«lÁH1vZTyNÊFK¢ÅSx¼Œs0á/õ¤óã&æC®rï]­!Ô’CxGéù^Ò“%cÜ*ï+B$ì^«íÜÅ^K8û'/röâ7¦]MU›¯ 
+µ××ÎðNB{ZíBü*uUöZâZažEjÉ#^:Æ8Þäú­ôüÜÚL·û©wÞ®Rï5ÿü¨ÛG©§Œq«šo ÖSWS¯Õ½Þ§Þ®%שºÚ©Ø{IñÒZ┫ÏÅóÏ~”‹wø:©î΃‡pï¿Q6Þ×ó¤ù¶Îý%{éïwOåÛB»¶`‰£äœß»wç|Gíý¦ÎU·– ½gÝsº—RêXÂæ®1Ò-Ÿ:?w”%™®YjKø~Z•¿Ì™Ò¹<ÌLâ¿û‰C­q2Ÿj¾Vt´Oñ%Ìf”2ýmâ\¼v½ö±çã-ê”з¿IþRŸi/-ÎÝ„ÆH(QÂîæYL8óûŸÔåM´xšáè ÇFŠ2z{úV©¶Pžó{öµlٲŶŒ±å·ú\oïZýç¼ô‘¯÷Zh­úúr[¶¸Ù–-[¶U[¶l¥¤§’îI.)LÉAûé½~ë¶8½ÓÙí +%[$cÉgº¶e+ѹÓ܇ŒHÎ1JO–\B­6¡=É–­¼³ÉòHbKÜO[©=ÿ ÛsòýNR‰,IhWãmÁ‹|Kö:­­¥Ôõ&KŽ÷×èŽJœ§VíEF^~žªîO¡M½tÎÆ8©îW/­òÃjÑÚ¤²±U5Ó¡Õ;¯«—ü“ºïÜõ^”-sº¥PúãGÙÝ3¾Ry:ªüãOçŠ0ÔAGIê”b”œ8wWù:éš,åèSöC躈Ýy?fôkt¨±Qö´ü§Ô?ûO_ê<*]÷Yÿùš[óÓA|vÓ/!´äþŸ­T¤›øOÒÞãäÎǤåŸz¿gÂCªwzY$Y¶,‚ŽJ«Ë:ª$!zíGclJ9|–­M˜ÝI[·Sº{&ÇDk‰î=œPJoLh‹Ý?Œ0¨¤S e¹d4 !ã@( $ÉD3Ò Çù²n,0‡CâYD$ŒBa€@¢`@ +㈂Hb­60gœaêÄ kšW'mT²‚F9Y„Taá$è“@lÂîŸk±Ž³TO;ÕE +ŠvÐgŒ§y¹N¦Öb®ã$'®0XègÕ½eÓ`ųþè”Üàl[.žÚÛRï­Bóã% ~­†2ÕÕ`âÉ>‚tÌqüþÄ2—&WI0:Úvôì>/ØŸ0€ó\m‘’ý™ôe½Ýl½D»6,,˜ 7챌‡oQÉ+ômJ#|0žØo£úLEB3¢lÔËëH/xEm0V'ýÔ7Ûq£ dsÑtò§àäIƒy³zNÆZÅ<ôMžÔZ»ôÚ©a2ìœþXcÊXdºˆõœa=€ $|ñ,ŠþÊ–sM°Ü§A@u‹²,qÖÀÐ1gï`íRzÿvÀ½ìkÐtBVWÉ4 ¿^=YÀ£²ÊÄZ…6TÃ8Ë–ì¤Å“9*†ØNŽ`ç Zc°¼ùüQ`û‰PiL£C~nÔL?<¶Y•’.éÌÒá¥/eI ‡ ª•Ü¾ÏQû.PS Ã—íf¼²ÍG.!ÒÔã"ñtwíˆKét¶¼ B²šcÈ| hSwÍ]W;J… †.]»OéÑ¿ ô8ôÅ‹€±ÇÓ× •^árËE©r[|P_/"R)ÖY¯˜½%Ú~ªé{TÁÖâx5T·–Èfzí¦Fµ‘P÷‘0îûëö)ÛÒWe­Ýz¶èM×N"IÇ!åô!Ùƒ“~o  Šµ7³Ž_ýÎ-c›û¶‰-ÞFßñÙu +èsà8MOA§õÏ€s³8FÖ1nÇž²ü:˜_žKÔãuÿ’[Gu4QGúòU·'¥ÒM¬ŽMÑR# +Í5cÆ“ÒCòà%‘Z´8F-ÅY€ÞT› +ÒYÞ8 ¿§G”jb4´­¥ðš½âIY§œÞjÓ¥D˜’Ñhç•lãï¡ZULÈ7¿'|“E³ŒQ¤¦=1£a°…÷B&ô…<þ¶×?tÊÏ+ù‚Ä|*u;PcGË@ÃÍUùNö+Ÿ?=× ›îóBëˆã„› 4œ*j•¼ÂOŠäsœæ¤Ð ˆÂSÿd Ÿ¤B7Ó^7|À/,9EvBÅ·ý +¹Üë@J(A`"›Roo³i:9#é²?‰µQ ¹¤Nü^uS èJ8QÔ—øÛøãpÓl‡˜.Ôl!ëŸ={jd@ÑÛMS°_ÁDÏ™µ@­-_y72™©ÙLäÓThŸ+ãX‰œSÔ’!“c -G¢M6"–ª%‰xÔÝ¿ã*µYš4k>7 ,Çf·Zäæ p19pÝLT˃ŽSãëxzéÀ+@Äjâ&Ö_Ň{ äÓKù ­ÊOeÜzVt]°û<ä¢Kg(lþ-íÿ=éS2‰¨9› I¡H~Lb¼2ѲjA(;œŠà¬ø€&xlo›Q€#†Ì‡ÐÖF±ÇÕÑ ž.ž½¼¹áý`VÔ &b`5ÅXU áÜzÅ¥¬PRþÑ{ + R‹aÒcÛ&^@i—S%h¹Î"#ývwÎ1ÊW0A-IŸóÈSþ÷~G³]–¯ýàLÑàT†U  ÛùàMŠ8SO3»Ó!ÀÞ*è6½¤«<ß‚¯ßA=Tþ”:ö®ç¨/”NØ®îý“N® ÑÁ±EýƒF —€_×§Åwì÷¬c"rgb'À(>[1ªz[¨Ì){ävô”V YÌÔ¡u>BµVSò7ý«œ78I™·}è dTº»Æ"ºÒ5M€Ó¸~“z·7ÓEdP:é!^¿ñIAÇrI°¯Àùêl¹fbš ƒ†:Kj©ñ¼8éãK}WA"ð‰¨>=Iûuòš³“#I'ºFèñ~oàe-u·L‘>àd[Â(@OÊHþØ>îX‹ï Ñ,Rz8½LizAZ¤ƒC¡D‹¼"$jÐRÜk~Ö2Ò§6þ 9y©'E.вÎ4÷ù©¤†Œ6Ú’`Àñá)™PXŸx½“ŠHôæï;Sc¤tCRë#‹­·G‘a2Gb4ß¿å>)ccÇ™2%!k6J›ÑO畺6¹ÝMÞ–|¹%ÝšÿÁ½ƒz(^J¾Ž²HÜÊeëÕâÏÞÖ!#µW«o¾h!Q“ üxuôâc‚l»Ò5–tÃJÛSè‡Ô²)˜†®Á+r*»õƒ™©/?Ø]f6Qd’å[Ðúi®ÖF+³&´Ë°LŽ£­ŸÖ)ïgP#õTü“Ò*ªI›tƒ3ƒ.#] œŽÓú"[)‹óÀÊËÝÕ•K³·íÌeõó=Á7¨¼‹f„’ZØ:Tˆÿ°–¡Ú,;ÃCJ³ðS ñ";%Í|—tÑ%ú†Ãï1C ‘ñ›7 ÝGvŒR;h™Ú“ ›œ]i˜£úô,x#â2^ ì‡å”D¤í3ÑJ’•U¸b ¡è^ˆ÷Î"k„ßzDzÝïî…ôÉHáiÈ…cIè—1ȉ¿§­Íˆ®5Ã9ÚM`æ´³x±•íöŠ’Š4gfÙZmËZ_Ø›Œç˜¡kÞ²ÄDV +™f\EâÚ¯™®×(N4wGÐ_ƒÜ¼Š˜Ú°ò9*1NÍöWœÅVylëIP3då 7 RÕ&°¥§®tô?/9til6ÖáVgé|ÉÞl¬ ç•>!Þ‘“¡Ž :BŠth¯÷œsNÒ§aO”V¦ÝdmÈ¥D=ÕUÊ…–Aª‚9ØVlÉk@£Š‰&mN§Ì‚—V¼Ž–ÎÕˆâêÇ­ÄqPF±@AÒUæ®°ÒD^Ùû(Á¿ø+ôe¤«&ám9ך[q¬¿[=ŽÝ—Í2}ª¶Óe·'bŠ ö³GÝg¹¸#Èð䪵vÒÅözE,—Dº ê"ZÓxF×¾óßñÏFMb¹‘bt'ÍlÅ”%BER6>‡-F£cß•ã´Õ†™ÕÒÐ34—).4jZ·Eö›Î3†Fj*‚Tñ`[ŠmïËÊ\¾ëN!{2Çž³]äÑUjgˆ3²¬§™]²û¶LV¨,Y*”ŽZ  <àH²bË3öt҇“²«oÃ&µ¾ h¦Kß>]÷æô±oÕa$4-”Ø%=ÉÈõ`/°ãz‘šV!Â2&1 +Ë´§S¥WFÕ5{@`Õú…œÏwÿ<Ýnººp¤@ª”*™ÿâV4èÜz-®OåÛ?¥VÅž$•4Ö˜ïÐ Édv¶¤)Ÿˆ$WÙ92•ÎrзãÇRö.O°­»œÿÍBÿzr¥ŽöäÆú¡ú3TiýºóE £fâ@ÝlF € Ôæ¬ìe•d+I¢ˆrªÞPu|]õ¬Á€õº{F ªùE¶7S·ïÂɶoŽši N¦b+»OºãYˆ“’*îÑg ¢7–ÏPXǼk²À‘r9ÿ¾Pø2BËÔ–+I­ø>žRÉ‘K}ÐÂlrÌék'qŒâ€”Ò¶(¢¯±Ë@«¡ù²K“¯Ù; a¬`•ƒ­Ô›†‰x†!Ù–{¢;<™/®}Ojé5}n$òÀy4+^ÌâI¹mÑ(P_Ö|%ÈóðI.nòÝ€W)éÏS„I|G÷íÄ$ÁÖ\ŠXíÁ} Ý_T¤È¬êÆ=Lå^”MuÞ¢˜mîÓ±¬ù’§ž,^ ;7–ÁMò ›Ã:2Ê~.§}¡øÊgHÔ“¡¹Jñ¡äW¢{«VmÒM.\³yÝéà¿jòýŒ×g¸…¬$ðžr^Àô|?>zI,— p¸?ŽÂ?•f»òIõà°pHF-µ[*νfƆv¶Wy+|éÂH6Á¤HQ½,ùñˆQ ðÈ6\¢{£èúA•\+®h¾Ýc„ëb¾²»îútV>Ø‹™²/äãvŘžmXÈõŠR2Š‘¤-†àèÂç~°núët5„î*IícšBrx:Hý›,T\Øò½±¥l>}Caëã˜:å[.Q)p EM\Igø2*?”#`8‰Ø 3Tª -Üä¢ÿâ$ ØFmðDZÔ6eêJƒu-OŸÈKÖ$׬f³õõ¥JÊË<”Æ;¦ÚqN±­'!¤þIëð¹*)a²¥Ò¨ù€&F€äÙQ˜yUòqT<çòO¥¡n 4I—$ 0jO7/}$uÔ +ˆ¾*×£6€BÔ{º$—Mâ4Yd0|‚ØZ¼?€'ÙʸŽ¨>Ò"*|ßB“"sùïh ¾¸1×€B{üÔ¥.y…YF®d±C­€«…¼)¤Ód§çi uHmŒ´C’ 
þÚÍW²Ú„xmÆ°)j^õØ4™<:¥N}»([XòÑãép„‚#pÞÑ(Äj49]g^új0Ä) „´Qi¶ÖðÞ$Þqšÿ`ÍÀ×*qêР‡GÓHñË C¼Ê'%vŸ¬ ¶Éí$j–øº©µC³ðf)…4à°HZ»†S(›„¢Öü%ÓqY³Ba®œL˜Ù[­’ƃ€(K:öÉjÅýoå±?æBÎì*‰?ÙQ ´]!.. ®.jg©€‡æê½í4 +A6þ®œíýEéTŽqi6&â½ò¸)Ç&Ÿn¸Ayo'u7LRV|À€€$•(½…ñY”ÑT?z¨IœÂª­¡{¬L×Pð8e¨–Q·þ[4ãG‰1Mbd˜$¼Õ”¤s‚¹&Ù*œõ3% r6TÀÓÑMÆ»×AXk +¨'ëxœÌŽÊ‹åq{)›¨0TÐÚTç[`9Ê +¡ú³™$'˜ ¸ ž¬xÉŽ›{ ДÝpËÏJ&¸úÐíìB€fMˆH±3¨½…n‚?Ad%ôz·qæ3.|{”v¹Âèo¿ø§Ú’#ˤU‰3.&ƒ’'MDx+H¥ÿËÕf/䈣b„{Q<ÿZïËò0ÑJf_ñs†î^¿^¦„« oÍ•·¹ekJ8`¹×ʉ‰¸0Iÿ=~ÔßuòSÚç2þm9©ó!µj¤5!S678Ѭ¡½©Ié–Ú·/´,NºNcøš( ³ÃŽçCJ ¬ÇÚ+ëG¤a:ïpéÄ +íæsŠ¤7ÔôÖÈVÊ“ÈÀ˜åH:¾†0.šhŒ¢‚ÖåÐxU–¢8DH†Û$ +ÓælÑ‹«’ZW…™¦-ÂHC1Nu&Ÿ”)TÉš“žM R[4E…˜ ÈP Ê.„Ì©ÔϦsRzQÍVŠ‡1–œ­]±ZÞ§µ¨OtˆDvsÉšBÑÝâ ÉÖ2X«îÜNQJ•ÔPÅ-\ÐŽrB-`ò šð½m(ªñ¢¬ñÜüöàŠD¬8w§b¦XæBG'£P,† ZJÂçˆÁ„2ãmíºû9*ïÔŒ¸ýÁ¬”/ ÿ‚ÂÓ2Ä»¹šdù"ù“_¿N¢ÉmQ¤/9ÍÍ^aËcÔn˜«\¿û™„$-©ãÇ®ÑèÖÓ6Q<×ÞáäPèeAŒïBÊ4±HÖ‹HÖÀ~H(’[B»Ú¹˜h®ÌãÞ ³ôC¹ýÜoNµmÌ5„Ýψ˜$UpÑž×ôg¡¢} â¾DÖöó²žyÎQüâ]4‚¥%8­Ù¦Ÿ.q*§¡Ôiµ(¤íÍ”C¡F&®Ê c á#n&užW›9°o”ÿ9þ!ä n½kõÓi½BC«LŽ’Ð} ФŒ)\¾Ž¬v‡f dÐ.*ÊÒ)Ö’äÒ¢èGмâ x‡ ¯­3þ!Da€—ŠyI&E +Òn$³JKQù§JÇñÚA€T— £‘ ŽÈ‹† iI9þ/µ™aô:m`ÛiXÛ•Ý: Ð8Û4š™ÅΛ‚»@*}ù>™ñÈ&Q®&ë? +©lÆ4Ö¼RšPDÃ$¢d’ÓÁÏ£dÕqX‚¯6ƒ Ø3,KÑátQˆ2}%óC{bùk°nTY6'Ÿ¹âúYÈDpZ´ä™AŽØêBÇ%r—<Ìrzw‹mI ¿ä ~'QDVŒ²/” вa!²,b^$½Tà l½_pÝmïñEáÐͨÕOÌÁNÏpQáƒ󄱄¶4w²/ð?™†¥qeÑÏH¨Çs’*—¯›úÄ"íŽÀ¬P„q 3ÔŠú¨A‹P~ˆ¾JÉ‚¬q8ZXtüËÄ~ è^“ÈO]9¬žÅŠ­%Ða­¬Cùqüà1å(t‡x˜ø,SU;†(gÇO¿„eãÅšÒÈg¡ž\ð2Ô¯Â(èý +b§ÿ1–Öʼ#“fÀ£ì³ ˆa!#rñ¤f}"³1¤Û„ ÖJÜ/òhñg +ÚFÜu¤ä8§A*,¿QKá*u \ÿhXÖE¡Jh-õ¸Z#™A:î"JéߤœÅø hå#…±Bð&Äv*FéDôÌŽ,\ÁÒ#µè™XE&ƒÏÈ‘£¶GÙÀ%Ø‚u(+ +@v\÷rrÐ;4^  O>Kvë?ˆ"VñMA†’¬|}%Á¤8á«WÛC©*›5ÞÊ÷hm”{U¡™x$@ƒ2` ‰8Upc ^¢AVí'5]œ°G„Ï1¹•"úËd4N>Àé€ËThUÁÖ×J‘ ŸÕÇ‹§$GÐù´£éÛ­ú°>nØf/1^àGRøj晇´ê5ô¦”) X„nTl3ÍQU +!KÏBn.Né´À @@B`'Z¬5Э?ZÂ?š¸KÿËZlg×Ç÷QÄ׆QJÛœUƒ™‘áÖ¢•±¢ßš‰&Œ}ÎÁ‹æpÈ1-‡tôCvdA0@¤xÒS&|«¼¦–ü$Y,Q\íºÖ;ÒtW\iÖ¤YÝ;˜µ)Ib°¾-T g{3ŠëO!øâ ˜K—Ô|_úØøI¹£Ã¼º…K·Ìˆ%ö†““ÀX6îh./ïdÄXà¤ôþ8!ǹÏR0ÀÖQHÌšQ§Ž]J«‡wób™“ÓV>ºd)x¹Y† Ø›;ßo»Ž:ýÇÇC£bé祃8ëJŠùŒõè+‰9×Áß'G_StÔY7 =å 2«‡ò¢/S°®>Ê7Þ‡ãÆ€1-ß3Ó5©[m-L.t +ÕÀ|Mª"Êîv§@ ^˜7ðÇíA(~Õ¿™#••Ú‰‘Z i^iS¤Úû±MH•W¢ÇZsÞL‚VnMOì£Ñ”Ñbõ垀(‰¨ÊG›ìnÿðƒÜN‹ºç˃éŠÅµø2ˆrZízGçÀ¯ïâ–q¨¯$K÷, qTÐs3€¿Áíæ¹ï/è-›!G#§>%噟Å éq~U_$Èœˆk½k]zŸ‡ jç8x}#­$Õ>gžx«<¢l…t‚Aº•)‘Ñâ‹­£^QÀE/%qrà+ݽèv]‘BªøcŽ^îF—u”À6«nÒü´iˆN" +â}™âÓ$ ¼øš=7ÑÆü€0s=ï +oÔ è +èÛ¾çAD/Õs˜ºä…ZÒÄ%@}/‹P&œ¾œLYŸXÉØH ªuÑí±½w>B5L@$;C<³x¤â˜~TÈÄjÆÁfczxÜÅ$_Ô¸d7#—ìf™ƾ]cƒü/›lPt/ø7ÛÆ!%¼”1“Rüä½ÁRÜ4 >>D E'b½×Ì-奰âìÚ H;^]“r‡”eá±&ïªá(° J`)M\&¶4§8{ëE༴<ØO"xL›i¡ß<þË+$|Û?ÿ¨Uר«ÞÇ4ž¢y^©÷l&xÒ…6uUÆ­hâ#”¹YÏùmê˜úÄ¿YˆúËÃg‹‰À§Þé+yé† c¢¯ªR&°ü‹‚b Vq+ؘE쟟sF…t°!Q{žW0÷±Fس íÝÎŽÂc 4Ž'¼¸™<"zs!(+Ö hNZNuãqŸiDL÷ƒò;£ß× Y—8³k—^2C›]vÊû¼dö^°‚âH7ÿ³ËÇ=%¤óZE½.“âæ—TåÞšR–$*5[ÑÑ uVØ)‚[§SÖ42å‹Ò²™SRÏÒ†lf‰¬8]qQ$·? 
;#ö}3Ùä_!—¸…^h„ØŽš#§*Vସ>+Î&Œ!G¨CåI%í<[V;bãê’#`ëË^"Þï¥üzgܼûrT!à3ïü#";ÅÒ‡Ž¿æðâ÷UÚTfOálèfÖp!Úœ#è~2[[V§2ãj,v# ‡gŒ½œ¾ÐÝê‘ç=f½%تÑ4€s5ˆ³­”á§K¿]ô@¤Þ*Û»×½ólâ>ˆµù¼dCCôæb‘¼ƒÎpÎm$>ž%®´öÿ?Ó;iÞ„{öû³þŽFþqîѧ:9Õ bÂ…îÈDZ¦;|L¸Xw©Cú¼¦­¾„ããÎ!¤kÕõžžº¤y³ÅNñwrb¾)ùØ/z©“ÏU¦ŒrüøCUVÝùú uÜõeéó]Ëðë ô€Ê`44Qó öîi +Ư/¶ç—D˜Éá»ÝìŸqyÄUO8õÚ]Ümß-¿û™Æ ¹ñ:O®¢½®ûKžÇÎÆ* +ñæÂ"[×ݵc¹öSŸi•ñ» +€‰Ñ—¸Rs’¾Àr@e àâþ{÷« 6¿û’/²Ác2vóñ쿽Jû¤úÿ°å¡¢ù´S?ñ¿2Zò‹,_³–RåHÈíKz²Â™ì«ý׸zߦ¥c+«Ì ¾¬K’ n ŠÉ1Æt‡–bËq:w¸f½ ž±­qß¼=pâ\žæ[À—÷Ð’I,pl¯Û[±g~Ë)ßá$÷Ã{±üÚÿßMÞÜQ¿é[+|ïƒRÏÜ’5]‹WÀMšÜ¥'¯‡QËæéïýïx¿höUòþ`ôgUó6\þe^¹é%9=‰¹ÚªÜ%M_z bTƒ–˜sxH­Uä]ÑÛo2˹0ç¯×ãàáRâÛ¿ŽÐ°Ç ®WKŽó“°iµϦaHÎMÙK%ݧÁ9Ž‰Ùź£ßu+À¯v{ÜmàbL-©“¸ÌGS<%DÁô­â±DRN БÿOîûå$´ý¨¸7ÁÁëkK–ü.Xú-®è¨éåd²x¦mï7÷b¯àå¢Ý+L¦<4VrcýöàòüÌ\YvÑ€¹UzÐÛ4ç†ø\â VÕ†§ö¾ûrkÛ[Ÿô‚—Ál)L¸8Ì[Z„p†À³ß?uÀö£¹GžO¹a+8Þƒ-üÃoðOjuÌ–p½FíZå:ü’E»öb›yæò Ç Øêú‡%Á{K~I]q8]óÛÒA Ž#Ôpkȯ’üf<ÃVÊÜñWÿ/ÞÛœ^µï†ò‘ù%º€»ê8_&ëTwÉ¢-jLµÔÅr^…úÃúJE´'ðî1yö:Cwvp¹-û%L犑F鬩äÚ¸Û9û bÂõttsõDé\Ö¯ÎüÁ;ƒ~o28Oy³ÈóK—®G6«Ó¥šTÛ·â‡iÏïß÷¥50gñ7¸).ŒÊ=¨¿^íKà‡Ã÷Xl«ÑFgõªˆ›YÐtÅf—µÅz˜U¢,Ľ ?Iõ¢À ÙÃœüᦵKyDÿ0âbR+0ï@Môî:ΣÖ]«:þŠ1p‡‡Dt¹@›I1‘Dnùj*J0jeq#q3‘®Œ€!¨,@Ý]RW´Ì=UmRlÝ`×âÄÔˆ-FÎdæ¹1!NËêŠàÀêéÛºÕ\'bê¨ +mHÑqÎTÍ/I yË:5ôGsÿ/½è%Yñ¥©Ï 5MÖ´QŠWZ妳£¸Î‘¶ƒ ßý€Ñ ¢¦¦L<ÁZŠµp%€ùùGl«àìWÐÑNˆ6m‡c"0'‚'"»çÕË<:”©ÛWÓõwÑO¯®XŸËÖ‰eõÄ4&œŠ™‘½ä+·a²„ÁSµJ¼ä‹+`†9ôçy3X);â%c¶ÃI•ÓŒÚI‘aíþ__èÐÑ^Ïä©÷Í3ÁÂ@\®ÈÏ@é6cµî鮹@ë9(t•²ý‚›„Óö¯zƒ|.Öt^6#XÐÚsni¯+£8•0Q—õÁ~ˆÇt±g¶H§pºÕK`PÖglÚ™L»zŒaRP²H32—œ;¦#®LÜB‡S#³Š[Îen‘>˜'½Š<Œ3i#}‰Š:tKJF)æ"­ém—>Ã[Ògq?»ÕBNÌ(ˆ·òµM@þ%çþDñ3YY˜³¼pXÆ1" V‰‘_J÷k¢}1œR{üÕyßÑò&õªûfÙ¦!Êòu•x4â0™ÜØj\¨ D~,™ž` \ +GË^@K,Ÿºà€MpB† ˱,+ÔÜTÇ VR)ݺÃK¼šs¯=¤¬ xÙÖJÿY?<ú«K.rÛ\F0|ÆU±ðýÖ:gÙ9Æ=“„ôi»ÒŸ¨Ä¯°Ìw£ ö…°?zï ª{sL‘¨¯Á#Ô ©×¨xÓY.ÑÔ™;Ú·²ÎRßïÙëʳKï­IŒðêÍ•t :n€Ûãr9~ê3¢ñܶ”™±xámÀý˜x"gž'M¸§)‰HÎ3EYÊl-™Cue´]´ÿXxxy±—O@„2tŽøï™ÊZëÔ{€JBlÈ0 +¯ºáô˜ì° Ó{Œú— 5 ¾Ú°µ«Ì^¤ùœ“DHfzÎyÂâ©Ðš¹$ñø»³ì4žF¥+†oྪ«Ñ”Ȇ­è鄳[Üôt10Éóipo¹.W©AE wÕûK–Ua' Ÿ%HÓ 2~zÇᥪï¬CïçÂQ™0Ÿ¤jò§&™ }-‹Œ@id£\¥ÂwÊÑaì;;ê–;̵TK’° D°ïä¥\-_Þ³>Nhï>ßÛ÷jÚìnùÊ’©´|'ä$%suZ5A_ ;çþËkÍ©!·„‚¿Ëǵ·· qu3’HüájbxCñ² ?ø3ô¢l£³¤nUAÝIæ)£õý7Uh©g;=·…6ÑuœiG.||s4|¤XQ;h¡WTOQ z}¥ÄñÅš$\a€˜›e:m«†Õ3˜­ÒFUc ‹Á±IÃÐ+ËÆ ¶ÊqA=ždAmT†48âK%ÊxEÓ7_^V”‰¥;µHöœ»ˆ›ÂYœí¢LÌ(“Ÿýn¯há ï¦ý–Zúha¶ PàªôGe¡ÖÏIçŠ"¤f]6ñ.c=`Ð1ŠÌ1È{Òc}ûQ„Ú¾a2p³kÉ·‘  +^3j‚l u3úÙ +¾±…/§É,¸Š¾¶ß¨Jh£\í}IË8é°©#Ëêy5GÔ÷Ò}wOFÑT–saÔPšüýóʘEŸ· uŒ)ÀæŒç8rÈ·´Mˆ†!°"Ó2¥êü}vöt„“Ì\›Þ? +3Bæ /óEjl£l1,kЛµX8Wk˜ÆŒXVÔ ½ÜH%>œ¢Òƒ'—„È×,)¯¡ñõ/Ñ}’'#´Ó@±ˆt‡ ‘WÕcÀpw-°•$¼ +®¡m’'2KÂnY©…ÑE›¤û:vg–ÔL2ìì±JTº&£Ü"œ0ä(ïòîÃó«5FØÔÁÃeàü=bþkØÞhýzi Ÿça¼šThXoxMèQë7¬c#]U:ƒ´XĆw2`SO_^cîÅéÏ„)·}ñB¬H­Ä º‡ˆi_õ Êsù : $´­lŠ¹ù +š²‘¥;6}åÚºc^¶nÐÙ•e”S- +b¯¹ÐÙ%!’pÆRø„‚ÿà ûµo§p–%ïQ–!RÉëËàPéÀH%ý§ø_.ž×õŠï#}—ßI=zô½5f? 
äý}„ÔÖB‘¾ÒÈ”S-K=hÒ\Fnôkœ";¤ÇÙ7 òP»0{9JÕKÛ—FBÐU-,Ì´ÃúÊã*Š5’¢dw_1Ãå¸sþ}»"<®êµšZŠUðzP%¢ïÈ»—sÆ4Ç÷mž÷óÎÊÜ46Ïq§µA+Á¹ÑNÙ'CE‚Û?ïÂå&V"*×4G0ö£6ø+*ºùÉ°m€ÕŽû­™ùjGB/oÄ @¹³6üœ8ú/e¨wÞϨ$Wzsà×ÆÛmÑ£Ô‹>æÊžwÕæúÕöÜ5Os±µ ¬ÑÍO†lÁªçý[²éSh0­¨ÙúLELŽ Ñd DücµÃåíü~à&ptZ è´oÆ×Û¬ŠÕƒ ƒÀø’4/±‹0 c΄Ó ×t–戮(¼˜B‰Z9†Žt¶Ô'bzíb¬n>…/`QU”¬Ïw}‹#ÝGhBfIÇí’B™f}Ý(³* ÈwÂŶp¯|ª¬œ3þóHÄ¿UAËå°úŠŒ¡˜DÒSl]öÕÏf*]PvÜ¡Ö#´BÄg!hnèÍbtˆzᯟ›Rª¡÷ùjs°°»bVÖT +.š¥¨X ?1Y; +S[™7ð?…C÷ UøÑò`«iNAÔý]1¿>ió:"Dì…ÿb²´¸ói\j5Ü?š™‰b­þ§¤ endstream endobj 7 0 obj <> endobj 8 0 obj <> endobj 9 0 obj <> endobj 10 0 obj <> endobj 45 0 obj <> endobj 46 0 obj <> endobj 47 0 obj <> endobj 48 0 obj <> endobj 82 0 obj <> endobj 83 0 obj <> endobj 84 0 obj <> endobj 85 0 obj <> endobj 119 0 obj <> endobj 120 0 obj <> endobj 121 0 obj <> endobj 122 0 obj <> endobj 156 0 obj <> endobj 157 0 obj <> endobj 158 0 obj <> endobj 159 0 obj <> endobj 193 0 obj <> endobj 194 0 obj <> endobj 195 0 obj <> endobj 196 0 obj <> endobj 230 0 obj <> endobj 231 0 obj <> endobj 232 0 obj <> endobj 233 0 obj <> endobj 267 0 obj <> endobj 268 0 obj <> endobj 269 0 obj <> endobj 270 0 obj <> endobj 304 0 obj <> endobj 305 0 obj <> endobj 306 0 obj <> endobj 307 0 obj <> endobj 341 0 obj <> endobj 342 0 obj <> endobj 343 0 obj <> endobj 344 0 obj <> endobj 377 0 obj <> endobj 378 0 obj <> endobj 379 0 obj <> endobj 380 0 obj <> endobj 413 0 obj <> endobj 414 0 obj <> endobj 415 0 obj <> endobj 416 0 obj <> endobj 449 0 obj <> endobj 450 0 obj <> endobj 451 0 obj <> endobj 452 0 obj <> endobj 485 0 obj <> endobj 486 0 obj <> endobj 487 0 obj <> endobj 488 0 obj <> endobj 521 0 obj <> endobj 522 0 obj <> endobj 523 0 obj <> endobj 524 0 obj <> endobj 557 0 obj <> endobj 558 0 obj <> endobj 559 0 obj <> endobj 560 0 obj <> endobj 593 0 obj <> endobj 594 0 obj <> endobj 595 0 obj <> endobj 596 0 obj <> endobj 629 0 obj <> endobj 630 0 obj <> endobj 631 0 obj <> endobj 632 0 obj <> endobj 665 0 obj <> endobj 666 0 obj <> endobj 667 0 obj <> endobj 668 0 obj <> endobj 701 0 obj <> endobj 702 0 obj <> endobj 703 0 obj <> endobj 704 0 obj <> endobj 737 0 obj <> endobj 738 0 obj <> endobj 739 0 obj <> endobj 740 0 obj <> endobj 773 0 obj <> endobj 774 0 obj <> endobj 775 0 obj <> endobj 776 0 obj <> endobj 809 0 obj <> endobj 810 0 obj <> endobj 811 0 obj <> endobj 812 0 obj <> endobj 845 0 obj <> endobj 846 0 obj <> endobj 847 0 obj <> endobj 848 0 obj <> endobj 881 0 obj <> endobj 882 0 obj <> endobj 883 0 obj <> endobj 884 0 obj <> endobj 919 0 obj <> endobj 920 0 obj <> endobj 921 0 obj <> endobj 922 0 obj <> endobj 961 0 obj <> endobj 962 0 obj <> endobj 963 0 obj <> endobj 964 0 obj <> endobj 1003 0 obj <> endobj 1004 0 obj <> endobj 1005 0 obj <> endobj 1006 0 obj <> endobj 1022 0 obj [/View/Design] endobj 1023 0 obj <>>> endobj 1020 0 obj [/View/Design] endobj 1021 0 obj <>>> endobj 1018 0 obj [/View/Design] endobj 1019 0 obj <>>> endobj 1016 0 obj [/View/Design] endobj 1017 0 obj <>>> endobj 980 0 obj [/View/Design] endobj 981 0 obj <>>> endobj 978 0 obj [/View/Design] endobj 979 0 obj <>>> endobj 976 0 obj [/View/Design] endobj 977 0 obj <>>> endobj 974 0 obj [/View/Design] endobj 975 0 obj <>>> endobj 938 0 obj [/View/Design] endobj 939 0 obj <>>> endobj 936 0 obj [/View/Design] endobj 937 0 obj <>>> endobj 934 0 obj [/View/Design] endobj 935 0 obj <>>> endobj 932 0 obj [/View/Design] endobj 933 0 obj <>>> endobj 900 0 obj [/View/Design] endobj 901 0 obj <>>> endobj 898 0 obj [/View/Design] endobj 899 0 obj <>>> endobj 896 0 obj [/View/Design] endobj 897 0 obj <>>> endobj 894 0 obj [/View/Design] 
endobj 895 0 obj <>>> endobj 864 0 obj [/View/Design] endobj 865 0 obj <>>> endobj 862 0 obj [/View/Design] endobj 863 0 obj <>>> endobj 860 0 obj [/View/Design] endobj 861 0 obj <>>> endobj 858 0 obj [/View/Design] endobj 859 0 obj <>>> endobj 828 0 obj [/View/Design] endobj 829 0 obj <>>> endobj 826 0 obj [/View/Design] endobj 827 0 obj <>>> endobj 824 0 obj [/View/Design] endobj 825 0 obj <>>> endobj 822 0 obj [/View/Design] endobj 823 0 obj <>>> endobj 792 0 obj [/View/Design] endobj 793 0 obj <>>> endobj 790 0 obj [/View/Design] endobj 791 0 obj <>>> endobj 788 0 obj [/View/Design] endobj 789 0 obj <>>> endobj 786 0 obj [/View/Design] endobj 787 0 obj <>>> endobj 756 0 obj [/View/Design] endobj 757 0 obj <>>> endobj 754 0 obj [/View/Design] endobj 755 0 obj <>>> endobj 752 0 obj [/View/Design] endobj 753 0 obj <>>> endobj 750 0 obj [/View/Design] endobj 751 0 obj <>>> endobj 720 0 obj [/View/Design] endobj 721 0 obj <>>> endobj 718 0 obj [/View/Design] endobj 719 0 obj <>>> endobj 716 0 obj [/View/Design] endobj 717 0 obj <>>> endobj 714 0 obj [/View/Design] endobj 715 0 obj <>>> endobj 684 0 obj [/View/Design] endobj 685 0 obj <>>> endobj 682 0 obj [/View/Design] endobj 683 0 obj <>>> endobj 680 0 obj [/View/Design] endobj 681 0 obj <>>> endobj 678 0 obj [/View/Design] endobj 679 0 obj <>>> endobj 648 0 obj [/View/Design] endobj 649 0 obj <>>> endobj 646 0 obj [/View/Design] endobj 647 0 obj <>>> endobj 644 0 obj [/View/Design] endobj 645 0 obj <>>> endobj 642 0 obj [/View/Design] endobj 643 0 obj <>>> endobj 612 0 obj [/View/Design] endobj 613 0 obj <>>> endobj 610 0 obj [/View/Design] endobj 611 0 obj <>>> endobj 608 0 obj [/View/Design] endobj 609 0 obj <>>> endobj 606 0 obj [/View/Design] endobj 607 0 obj <>>> endobj 576 0 obj [/View/Design] endobj 577 0 obj <>>> endobj 574 0 obj [/View/Design] endobj 575 0 obj <>>> endobj 572 0 obj [/View/Design] endobj 573 0 obj <>>> endobj 570 0 obj [/View/Design] endobj 571 0 obj <>>> endobj 540 0 obj [/View/Design] endobj 541 0 obj <>>> endobj 538 0 obj [/View/Design] endobj 539 0 obj <>>> endobj 536 0 obj [/View/Design] endobj 537 0 obj <>>> endobj 534 0 obj [/View/Design] endobj 535 0 obj <>>> endobj 504 0 obj [/View/Design] endobj 505 0 obj <>>> endobj 502 0 obj [/View/Design] endobj 503 0 obj <>>> endobj 500 0 obj [/View/Design] endobj 501 0 obj <>>> endobj 498 0 obj [/View/Design] endobj 499 0 obj <>>> endobj 468 0 obj [/View/Design] endobj 469 0 obj <>>> endobj 466 0 obj [/View/Design] endobj 467 0 obj <>>> endobj 464 0 obj [/View/Design] endobj 465 0 obj <>>> endobj 462 0 obj [/View/Design] endobj 463 0 obj <>>> endobj 432 0 obj [/View/Design] endobj 433 0 obj <>>> endobj 430 0 obj [/View/Design] endobj 431 0 obj <>>> endobj 428 0 obj [/View/Design] endobj 429 0 obj <>>> endobj 426 0 obj [/View/Design] endobj 427 0 obj <>>> endobj 396 0 obj [/View/Design] endobj 397 0 obj <>>> endobj 394 0 obj [/View/Design] endobj 395 0 obj <>>> endobj 392 0 obj [/View/Design] endobj 393 0 obj <>>> endobj 390 0 obj [/View/Design] endobj 391 0 obj <>>> endobj 360 0 obj [/View/Design] endobj 361 0 obj <>>> endobj 358 0 obj [/View/Design] endobj 359 0 obj <>>> endobj 356 0 obj [/View/Design] endobj 357 0 obj <>>> endobj 354 0 obj [/View/Design] endobj 355 0 obj <>>> endobj 324 0 obj [/View/Design] endobj 325 0 obj <>>> endobj 322 0 obj [/View/Design] endobj 323 0 obj <>>> endobj 320 0 obj [/View/Design] endobj 321 0 obj <>>> endobj 318 0 obj [/View/Design] endobj 319 0 obj <>>> endobj 287 0 obj [/View/Design] endobj 288 0 obj <>>> endobj 285 0 obj 
[/View/Design] endobj 286 0 obj <>>> endobj 283 0 obj [/View/Design] endobj 284 0 obj <>>> endobj 281 0 obj [/View/Design] endobj 282 0 obj <>>> endobj 250 0 obj [/View/Design] endobj 251 0 obj <>>> endobj 248 0 obj [/View/Design] endobj 249 0 obj <>>> endobj 246 0 obj [/View/Design] endobj 247 0 obj <>>> endobj 244 0 obj [/View/Design] endobj 245 0 obj <>>> endobj 213 0 obj [/View/Design] endobj 214 0 obj <>>> endobj 211 0 obj [/View/Design] endobj 212 0 obj <>>> endobj 209 0 obj [/View/Design] endobj 210 0 obj <>>> endobj 207 0 obj [/View/Design] endobj 208 0 obj <>>> endobj 176 0 obj [/View/Design] endobj 177 0 obj <>>> endobj 174 0 obj [/View/Design] endobj 175 0 obj <>>> endobj 172 0 obj [/View/Design] endobj 173 0 obj <>>> endobj 170 0 obj [/View/Design] endobj 171 0 obj <>>> endobj 139 0 obj [/View/Design] endobj 140 0 obj <>>> endobj 137 0 obj [/View/Design] endobj 138 0 obj <>>> endobj 135 0 obj [/View/Design] endobj 136 0 obj <>>> endobj 133 0 obj [/View/Design] endobj 134 0 obj <>>> endobj 102 0 obj [/View/Design] endobj 103 0 obj <>>> endobj 100 0 obj [/View/Design] endobj 101 0 obj <>>> endobj 98 0 obj [/View/Design] endobj 99 0 obj <>>> endobj 96 0 obj [/View/Design] endobj 97 0 obj <>>> endobj 65 0 obj [/View/Design] endobj 66 0 obj <>>> endobj 63 0 obj [/View/Design] endobj 64 0 obj <>>> endobj 61 0 obj [/View/Design] endobj 62 0 obj <>>> endobj 59 0 obj [/View/Design] endobj 60 0 obj <>>> endobj 28 0 obj [/View/Design] endobj 29 0 obj <>>> endobj 26 0 obj [/View/Design] endobj 27 0 obj <>>> endobj 24 0 obj [/View/Design] endobj 25 0 obj <>>> endobj 22 0 obj [/View/Design] endobj 23 0 obj <>>> endobj 1049 0 obj [1048 0 R 1047 0 R 1046 0 R 1045 0 R] endobj 1082 0 obj <> endobj xref +0 1083 +0000000004 65535 f +0000000016 00000 n +0000001977 00000 n +0000064028 00000 n +0000000005 00000 f +0000000006 00000 f +0000000011 00000 f +0000430892 00000 n +0000430962 00000 n +0000431042 00000 n +0000431112 00000 n +0000000013 00000 f +0000064080 00000 n +0000000014 00000 f +0000000015 00000 f +0000000016 00000 f +0000000017 00000 f +0000000018 00000 f +0000000019 00000 f +0000000020 00000 f +0000000021 00000 f +0000000030 00000 f +0000452525 00000 n +0000452556 00000 n +0000452409 00000 n +0000452440 00000 n +0000452293 00000 n +0000452324 00000 n +0000452177 00000 n +0000452208 00000 n +0000000031 00000 f +0000000032 00000 f +0000000033 00000 f +0000000034 00000 f +0000000035 00000 f +0000000036 00000 f +0000000037 00000 f +0000000038 00000 f +0000000039 00000 f +0000000040 00000 f +0000000041 00000 f +0000000042 00000 f +0000000043 00000 f +0000000044 00000 f +0000000049 00000 f +0000431183 00000 n +0000431254 00000 n +0000431335 00000 n +0000431406 00000 n +0000000050 00000 f +0000000051 00000 f +0000000052 00000 f +0000000053 00000 f +0000000054 00000 f +0000000055 00000 f +0000000056 00000 f +0000000057 00000 f +0000000058 00000 f +0000000067 00000 f +0000452061 00000 n +0000452092 00000 n +0000451945 00000 n +0000451976 00000 n +0000451829 00000 n +0000451860 00000 n +0000451713 00000 n +0000451744 00000 n +0000000068 00000 f +0000000069 00000 f +0000000070 00000 f +0000000071 00000 f +0000000072 00000 f +0000000073 00000 f +0000000074 00000 f +0000000075 00000 f +0000000076 00000 f +0000000077 00000 f +0000000078 00000 f +0000000079 00000 f +0000000080 00000 f +0000000081 00000 f +0000000086 00000 f +0000431477 00000 n +0000431548 00000 n +0000431629 00000 n +0000431702 00000 n +0000000087 00000 f +0000000088 00000 f +0000000089 00000 f +0000000090 00000 f +0000000091 00000 f 
+0000000092 00000 f +0000000093 00000 f +0000000094 00000 f +0000000095 00000 f +0000000104 00000 f +0000451597 00000 n +0000451628 00000 n +0000451481 00000 n +0000451512 00000 n +0000451363 00000 n +0000451395 00000 n +0000451245 00000 n +0000451277 00000 n +0000000105 00000 f +0000000106 00000 f +0000000107 00000 f +0000000108 00000 f +0000000109 00000 f +0000000110 00000 f +0000000111 00000 f +0000000112 00000 f +0000000113 00000 f +0000000114 00000 f +0000000115 00000 f +0000000116 00000 f +0000000117 00000 f +0000000118 00000 f +0000000123 00000 f +0000431775 00000 n +0000431849 00000 n +0000431933 00000 n +0000432007 00000 n +0000000124 00000 f +0000000125 00000 f +0000000126 00000 f +0000000127 00000 f +0000000128 00000 f +0000000129 00000 f +0000000130 00000 f +0000000131 00000 f +0000000132 00000 f +0000000141 00000 f +0000451127 00000 n +0000451159 00000 n +0000451009 00000 n +0000451041 00000 n +0000450891 00000 n +0000450923 00000 n +0000450773 00000 n +0000450805 00000 n +0000000142 00000 f +0000000143 00000 f +0000000144 00000 f +0000000145 00000 f +0000000146 00000 f +0000000147 00000 f +0000000148 00000 f +0000000149 00000 f +0000000150 00000 f +0000000151 00000 f +0000000152 00000 f +0000000153 00000 f +0000000154 00000 f +0000000155 00000 f +0000000160 00000 f +0000432081 00000 n +0000432155 00000 n +0000432239 00000 n +0000432313 00000 n +0000000161 00000 f +0000000162 00000 f +0000000163 00000 f +0000000164 00000 f +0000000165 00000 f +0000000166 00000 f +0000000167 00000 f +0000000168 00000 f +0000000169 00000 f +0000000178 00000 f +0000450655 00000 n +0000450687 00000 n +0000450537 00000 n +0000450569 00000 n +0000450419 00000 n +0000450451 00000 n +0000450301 00000 n +0000450333 00000 n +0000000179 00000 f +0000000180 00000 f +0000000181 00000 f +0000000182 00000 f +0000000183 00000 f +0000000184 00000 f +0000000185 00000 f +0000000186 00000 f +0000000187 00000 f +0000000188 00000 f +0000000189 00000 f +0000000190 00000 f +0000000191 00000 f +0000000192 00000 f +0000000197 00000 f +0000432387 00000 n +0000432461 00000 n +0000432545 00000 n +0000432619 00000 n +0000000198 00000 f +0000000199 00000 f +0000000200 00000 f +0000000201 00000 f +0000000202 00000 f +0000000203 00000 f +0000000204 00000 f +0000000205 00000 f +0000000206 00000 f +0000000215 00000 f +0000450183 00000 n +0000450215 00000 n +0000450065 00000 n +0000450097 00000 n +0000449947 00000 n +0000449979 00000 n +0000449829 00000 n +0000449861 00000 n +0000000216 00000 f +0000000217 00000 f +0000000218 00000 f +0000000219 00000 f +0000000220 00000 f +0000000221 00000 f +0000000222 00000 f +0000000223 00000 f +0000000224 00000 f +0000000225 00000 f +0000000226 00000 f +0000000227 00000 f +0000000228 00000 f +0000000229 00000 f +0000000234 00000 f +0000432693 00000 n +0000432767 00000 n +0000432851 00000 n +0000432925 00000 n +0000000235 00000 f +0000000236 00000 f +0000000237 00000 f +0000000238 00000 f +0000000239 00000 f +0000000240 00000 f +0000000241 00000 f +0000000242 00000 f +0000000243 00000 f +0000000252 00000 f +0000449711 00000 n +0000449743 00000 n +0000449593 00000 n +0000449625 00000 n +0000449475 00000 n +0000449507 00000 n +0000449357 00000 n +0000449389 00000 n +0000000253 00000 f +0000000254 00000 f +0000000255 00000 f +0000000256 00000 f +0000000257 00000 f +0000000258 00000 f +0000000259 00000 f +0000000260 00000 f +0000000261 00000 f +0000000262 00000 f +0000000263 00000 f +0000000264 00000 f +0000000265 00000 f +0000000266 00000 f +0000000271 00000 f +0000432999 00000 n +0000433073 
00000 n +0000433157 00000 n +0000433231 00000 n +0000000272 00000 f +0000000273 00000 f +0000000274 00000 f +0000000275 00000 f +0000000276 00000 f +0000000277 00000 f +0000000278 00000 f +0000000279 00000 f +0000000280 00000 f +0000000289 00000 f +0000449239 00000 n +0000449271 00000 n +0000449121 00000 n +0000449153 00000 n +0000449003 00000 n +0000449035 00000 n +0000448885 00000 n +0000448917 00000 n +0000000290 00000 f +0000000291 00000 f +0000000292 00000 f +0000000293 00000 f +0000000294 00000 f +0000000295 00000 f +0000000296 00000 f +0000000297 00000 f +0000000298 00000 f +0000000299 00000 f +0000000300 00000 f +0000000301 00000 f +0000000302 00000 f +0000000303 00000 f +0000000308 00000 f +0000433305 00000 n +0000433379 00000 n +0000433463 00000 n +0000433537 00000 n +0000000309 00000 f +0000000310 00000 f +0000000311 00000 f +0000000312 00000 f +0000000313 00000 f +0000000314 00000 f +0000000315 00000 f +0000000316 00000 f +0000000317 00000 f +0000000326 00000 f +0000448767 00000 n +0000448799 00000 n +0000448649 00000 n +0000448681 00000 n +0000448531 00000 n +0000448563 00000 n +0000448413 00000 n +0000448445 00000 n +0000000327 00000 f +0000000328 00000 f +0000000329 00000 f +0000000330 00000 f +0000000331 00000 f +0000000332 00000 f +0000000333 00000 f +0000000334 00000 f +0000000335 00000 f +0000000336 00000 f +0000000337 00000 f +0000000338 00000 f +0000000339 00000 f +0000000340 00000 f +0000000345 00000 f +0000433611 00000 n +0000433685 00000 n +0000433769 00000 n +0000433843 00000 n +0000000346 00000 f +0000000347 00000 f +0000000348 00000 f +0000000349 00000 f +0000000350 00000 f +0000000351 00000 f +0000000352 00000 f +0000000353 00000 f +0000000362 00000 f +0000448295 00000 n +0000448327 00000 n +0000448177 00000 n +0000448209 00000 n +0000448059 00000 n +0000448091 00000 n +0000447941 00000 n +0000447973 00000 n +0000000363 00000 f +0000000364 00000 f +0000000365 00000 f +0000000366 00000 f +0000000367 00000 f +0000000368 00000 f +0000000369 00000 f +0000000370 00000 f +0000000371 00000 f +0000000372 00000 f +0000000373 00000 f +0000000374 00000 f +0000000375 00000 f +0000000376 00000 f +0000000381 00000 f +0000433917 00000 n +0000433991 00000 n +0000434075 00000 n +0000434149 00000 n +0000000382 00000 f +0000000383 00000 f +0000000384 00000 f +0000000385 00000 f +0000000386 00000 f +0000000387 00000 f +0000000388 00000 f +0000000389 00000 f +0000000398 00000 f +0000447823 00000 n +0000447855 00000 n +0000447705 00000 n +0000447737 00000 n +0000447587 00000 n +0000447619 00000 n +0000447469 00000 n +0000447501 00000 n +0000000399 00000 f +0000000400 00000 f +0000000401 00000 f +0000000402 00000 f +0000000403 00000 f +0000000404 00000 f +0000000405 00000 f +0000000406 00000 f +0000000407 00000 f +0000000408 00000 f +0000000409 00000 f +0000000410 00000 f +0000000411 00000 f +0000000412 00000 f +0000000417 00000 f +0000434223 00000 n +0000434297 00000 n +0000434381 00000 n +0000434455 00000 n +0000000418 00000 f +0000000419 00000 f +0000000420 00000 f +0000000421 00000 f +0000000422 00000 f +0000000423 00000 f +0000000424 00000 f +0000000425 00000 f +0000000434 00000 f +0000447351 00000 n +0000447383 00000 n +0000447233 00000 n +0000447265 00000 n +0000447115 00000 n +0000447147 00000 n +0000446997 00000 n +0000447029 00000 n +0000000435 00000 f +0000000436 00000 f +0000000437 00000 f +0000000438 00000 f +0000000439 00000 f +0000000440 00000 f +0000000441 00000 f +0000000442 00000 f +0000000443 00000 f +0000000444 00000 f +0000000445 00000 f +0000000446 00000 f 
+0000000447 00000 f +0000000448 00000 f +0000000453 00000 f +0000434529 00000 n +0000434603 00000 n +0000434687 00000 n +0000434761 00000 n +0000000454 00000 f +0000000455 00000 f +0000000456 00000 f +0000000457 00000 f +0000000458 00000 f +0000000459 00000 f +0000000460 00000 f +0000000461 00000 f +0000000470 00000 f +0000446879 00000 n +0000446911 00000 n +0000446761 00000 n +0000446793 00000 n +0000446643 00000 n +0000446675 00000 n +0000446525 00000 n +0000446557 00000 n +0000000471 00000 f +0000000472 00000 f +0000000473 00000 f +0000000474 00000 f +0000000475 00000 f +0000000476 00000 f +0000000477 00000 f +0000000478 00000 f +0000000479 00000 f +0000000480 00000 f +0000000481 00000 f +0000000482 00000 f +0000000483 00000 f +0000000484 00000 f +0000000489 00000 f +0000434835 00000 n +0000434909 00000 n +0000434993 00000 n +0000435067 00000 n +0000000490 00000 f +0000000491 00000 f +0000000492 00000 f +0000000493 00000 f +0000000494 00000 f +0000000495 00000 f +0000000496 00000 f +0000000497 00000 f +0000000506 00000 f +0000446407 00000 n +0000446439 00000 n +0000446289 00000 n +0000446321 00000 n +0000446171 00000 n +0000446203 00000 n +0000446053 00000 n +0000446085 00000 n +0000000507 00000 f +0000000508 00000 f +0000000509 00000 f +0000000510 00000 f +0000000511 00000 f +0000000512 00000 f +0000000513 00000 f +0000000514 00000 f +0000000515 00000 f +0000000516 00000 f +0000000517 00000 f +0000000518 00000 f +0000000519 00000 f +0000000520 00000 f +0000000525 00000 f +0000435141 00000 n +0000435215 00000 n +0000435299 00000 n +0000435373 00000 n +0000000526 00000 f +0000000527 00000 f +0000000528 00000 f +0000000529 00000 f +0000000530 00000 f +0000000531 00000 f +0000000532 00000 f +0000000533 00000 f +0000000542 00000 f +0000445935 00000 n +0000445967 00000 n +0000445817 00000 n +0000445849 00000 n +0000445699 00000 n +0000445731 00000 n +0000445581 00000 n +0000445613 00000 n +0000000543 00000 f +0000000544 00000 f +0000000545 00000 f +0000000546 00000 f +0000000547 00000 f +0000000548 00000 f +0000000549 00000 f +0000000550 00000 f +0000000551 00000 f +0000000552 00000 f +0000000553 00000 f +0000000554 00000 f +0000000555 00000 f +0000000556 00000 f +0000000561 00000 f +0000435447 00000 n +0000435521 00000 n +0000435605 00000 n +0000435679 00000 n +0000000562 00000 f +0000000563 00000 f +0000000564 00000 f +0000000565 00000 f +0000000566 00000 f +0000000567 00000 f +0000000568 00000 f +0000000569 00000 f +0000000578 00000 f +0000445463 00000 n +0000445495 00000 n +0000445345 00000 n +0000445377 00000 n +0000445227 00000 n +0000445259 00000 n +0000445109 00000 n +0000445141 00000 n +0000000579 00000 f +0000000580 00000 f +0000000581 00000 f +0000000582 00000 f +0000000583 00000 f +0000000584 00000 f +0000000585 00000 f +0000000586 00000 f +0000000587 00000 f +0000000588 00000 f +0000000589 00000 f +0000000590 00000 f +0000000591 00000 f +0000000592 00000 f +0000000597 00000 f +0000435753 00000 n +0000435827 00000 n +0000435911 00000 n +0000435985 00000 n +0000000598 00000 f +0000000599 00000 f +0000000600 00000 f +0000000601 00000 f +0000000602 00000 f +0000000603 00000 f +0000000604 00000 f +0000000605 00000 f +0000000614 00000 f +0000444991 00000 n +0000445023 00000 n +0000444873 00000 n +0000444905 00000 n +0000444755 00000 n +0000444787 00000 n +0000444637 00000 n +0000444669 00000 n +0000000615 00000 f +0000000616 00000 f +0000000617 00000 f +0000000618 00000 f +0000000619 00000 f +0000000620 00000 f +0000000621 00000 f +0000000622 00000 f +0000000623 00000 f +0000000624 
00000 f +0000000625 00000 f +0000000626 00000 f +0000000627 00000 f +0000000628 00000 f +0000000633 00000 f +0000436059 00000 n +0000436133 00000 n +0000436217 00000 n +0000436291 00000 n +0000000634 00000 f +0000000635 00000 f +0000000636 00000 f +0000000637 00000 f +0000000638 00000 f +0000000639 00000 f +0000000640 00000 f +0000000641 00000 f +0000000650 00000 f +0000444519 00000 n +0000444551 00000 n +0000444401 00000 n +0000444433 00000 n +0000444283 00000 n +0000444315 00000 n +0000444165 00000 n +0000444197 00000 n +0000000651 00000 f +0000000652 00000 f +0000000653 00000 f +0000000654 00000 f +0000000655 00000 f +0000000656 00000 f +0000000657 00000 f +0000000658 00000 f +0000000659 00000 f +0000000660 00000 f +0000000661 00000 f +0000000662 00000 f +0000000663 00000 f +0000000664 00000 f +0000000669 00000 f +0000436365 00000 n +0000436439 00000 n +0000436523 00000 n +0000436597 00000 n +0000000670 00000 f +0000000671 00000 f +0000000672 00000 f +0000000673 00000 f +0000000674 00000 f +0000000675 00000 f +0000000676 00000 f +0000000677 00000 f +0000000686 00000 f +0000444047 00000 n +0000444079 00000 n +0000443929 00000 n +0000443961 00000 n +0000443811 00000 n +0000443843 00000 n +0000443693 00000 n +0000443725 00000 n +0000000687 00000 f +0000000688 00000 f +0000000689 00000 f +0000000690 00000 f +0000000691 00000 f +0000000692 00000 f +0000000693 00000 f +0000000694 00000 f +0000000695 00000 f +0000000696 00000 f +0000000697 00000 f +0000000698 00000 f +0000000699 00000 f +0000000700 00000 f +0000000705 00000 f +0000436671 00000 n +0000436745 00000 n +0000436829 00000 n +0000436903 00000 n +0000000706 00000 f +0000000707 00000 f +0000000708 00000 f +0000000709 00000 f +0000000710 00000 f +0000000711 00000 f +0000000712 00000 f +0000000713 00000 f +0000000722 00000 f +0000443575 00000 n +0000443607 00000 n +0000443457 00000 n +0000443489 00000 n +0000443339 00000 n +0000443371 00000 n +0000443221 00000 n +0000443253 00000 n +0000000723 00000 f +0000000724 00000 f +0000000725 00000 f +0000000726 00000 f +0000000727 00000 f +0000000728 00000 f +0000000729 00000 f +0000000730 00000 f +0000000731 00000 f +0000000732 00000 f +0000000733 00000 f +0000000734 00000 f +0000000735 00000 f +0000000736 00000 f +0000000741 00000 f +0000436977 00000 n +0000437051 00000 n +0000437135 00000 n +0000437209 00000 n +0000000742 00000 f +0000000743 00000 f +0000000744 00000 f +0000000745 00000 f +0000000746 00000 f +0000000747 00000 f +0000000748 00000 f +0000000749 00000 f +0000000758 00000 f +0000443103 00000 n +0000443135 00000 n +0000442985 00000 n +0000443017 00000 n +0000442867 00000 n +0000442899 00000 n +0000442749 00000 n +0000442781 00000 n +0000000759 00000 f +0000000760 00000 f +0000000761 00000 f +0000000762 00000 f +0000000763 00000 f +0000000764 00000 f +0000000765 00000 f +0000000766 00000 f +0000000767 00000 f +0000000768 00000 f +0000000769 00000 f +0000000770 00000 f +0000000771 00000 f +0000000772 00000 f +0000000777 00000 f +0000437283 00000 n +0000437357 00000 n +0000437441 00000 n +0000437515 00000 n +0000000778 00000 f +0000000779 00000 f +0000000780 00000 f +0000000781 00000 f +0000000782 00000 f +0000000783 00000 f +0000000784 00000 f +0000000785 00000 f +0000000794 00000 f +0000442631 00000 n +0000442663 00000 n +0000442513 00000 n +0000442545 00000 n +0000442395 00000 n +0000442427 00000 n +0000442277 00000 n +0000442309 00000 n +0000000795 00000 f +0000000796 00000 f +0000000797 00000 f +0000000798 00000 f +0000000799 00000 f +0000000800 00000 f +0000000801 00000 f 
+0000000802 00000 f +0000000803 00000 f +0000000804 00000 f +0000000805 00000 f +0000000806 00000 f +0000000807 00000 f +0000000808 00000 f +0000000813 00000 f +0000437589 00000 n +0000437663 00000 n +0000437747 00000 n +0000437821 00000 n +0000000814 00000 f +0000000815 00000 f +0000000816 00000 f +0000000817 00000 f +0000000818 00000 f +0000000819 00000 f +0000000820 00000 f +0000000821 00000 f +0000000830 00000 f +0000442159 00000 n +0000442191 00000 n +0000442041 00000 n +0000442073 00000 n +0000441923 00000 n +0000441955 00000 n +0000441805 00000 n +0000441837 00000 n +0000000831 00000 f +0000000832 00000 f +0000000833 00000 f +0000000834 00000 f +0000000835 00000 f +0000000836 00000 f +0000000837 00000 f +0000000838 00000 f +0000000839 00000 f +0000000840 00000 f +0000000841 00000 f +0000000842 00000 f +0000000843 00000 f +0000000844 00000 f +0000000849 00000 f +0000437895 00000 n +0000437969 00000 n +0000438053 00000 n +0000438127 00000 n +0000000850 00000 f +0000000851 00000 f +0000000852 00000 f +0000000853 00000 f +0000000854 00000 f +0000000855 00000 f +0000000856 00000 f +0000000857 00000 f +0000000866 00000 f +0000441687 00000 n +0000441719 00000 n +0000441569 00000 n +0000441601 00000 n +0000441451 00000 n +0000441483 00000 n +0000441333 00000 n +0000441365 00000 n +0000000867 00000 f +0000000868 00000 f +0000000869 00000 f +0000000870 00000 f +0000000871 00000 f +0000000872 00000 f +0000000873 00000 f +0000000874 00000 f +0000000875 00000 f +0000000876 00000 f +0000000877 00000 f +0000000878 00000 f +0000000879 00000 f +0000000880 00000 f +0000000885 00000 f +0000438201 00000 n +0000438275 00000 n +0000438359 00000 n +0000438433 00000 n +0000000886 00000 f +0000000887 00000 f +0000000888 00000 f +0000000889 00000 f +0000000890 00000 f +0000000891 00000 f +0000000892 00000 f +0000000893 00000 f +0000000902 00000 f +0000441215 00000 n +0000441247 00000 n +0000441097 00000 n +0000441129 00000 n +0000440979 00000 n +0000441011 00000 n +0000440861 00000 n +0000440893 00000 n +0000000903 00000 f +0000000904 00000 f +0000000905 00000 f +0000000906 00000 f +0000000907 00000 f +0000000908 00000 f +0000000909 00000 f +0000000910 00000 f +0000000911 00000 f +0000000912 00000 f +0000000913 00000 f +0000000914 00000 f +0000000915 00000 f +0000000916 00000 f +0000000917 00000 f +0000000918 00000 f +0000000923 00000 f +0000438507 00000 n +0000438581 00000 n +0000438665 00000 n +0000438739 00000 n +0000000924 00000 f +0000000925 00000 f +0000000926 00000 f +0000000927 00000 f +0000000928 00000 f +0000000929 00000 f +0000000930 00000 f +0000000931 00000 f +0000000940 00000 f +0000440743 00000 n +0000440775 00000 n +0000440625 00000 n +0000440657 00000 n +0000440507 00000 n +0000440539 00000 n +0000440389 00000 n +0000440421 00000 n +0000000941 00000 f +0000000942 00000 f +0000000943 00000 f +0000000944 00000 f +0000000945 00000 f +0000000946 00000 f +0000000947 00000 f +0000000948 00000 f +0000000949 00000 f +0000000950 00000 f +0000000951 00000 f +0000000952 00000 f +0000000953 00000 f +0000000954 00000 f +0000000955 00000 f +0000000956 00000 f +0000000957 00000 f +0000000958 00000 f +0000000959 00000 f +0000000960 00000 f +0000000965 00000 f +0000438813 00000 n +0000438887 00000 n +0000438971 00000 n +0000439045 00000 n +0000000966 00000 f +0000000967 00000 f +0000000968 00000 f +0000000969 00000 f +0000000970 00000 f +0000000971 00000 f +0000000972 00000 f +0000000973 00000 f +0000000982 00000 f +0000440271 00000 n +0000440303 00000 n +0000440153 00000 n +0000440185 00000 n +0000440035 
00000 n +0000440067 00000 n +0000439917 00000 n +0000439949 00000 n +0000000983 00000 f +0000000984 00000 f +0000000985 00000 f +0000000986 00000 f +0000000987 00000 f +0000000988 00000 f +0000000989 00000 f +0000000990 00000 f +0000000991 00000 f +0000000992 00000 f +0000000993 00000 f +0000000994 00000 f +0000000995 00000 f +0000000996 00000 f +0000000997 00000 f +0000000998 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000439119 00000 n +0000439196 00000 n +0000439283 00000 n +0000439360 00000 n +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000439797 00000 n +0000439830 00000 n +0000439677 00000 n +0000439710 00000 n +0000439557 00000 n +0000439590 00000 n +0000439437 00000 n +0000439470 00000 n +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000070048 00000 n +0000070507 00000 n +0000070854 00000 n +0000071040 00000 n +0000069250 00000 n +0000069327 00000 n +0000069414 00000 n +0000069491 00000 n +0000452641 00000 n +0000064612 00000 n +0000068085 00000 n +0000080410 00000 n +0000080180 00000 n +0000080295 00000 n +0000068153 00000 n +0000068683 00000 n +0000068735 00000 n +0000069928 00000 n +0000069961 00000 n +0000069808 00000 n +0000069841 00000 n +0000069688 00000 n +0000069721 00000 n +0000069568 00000 n +0000069601 00000 n +0000075263 00000 n +0000074148 00000 n +0000073114 00000 n +0000071410 00000 n +0000071714 00000 n +0000073404 00000 n +0000074433 00000 n +0000075730 00000 n +0000080488 00000 n +0000080784 00000 n +0000082499 00000 n +0000148089 00000 n +0000213679 00000 n +0000279269 00000 n +0000344859 00000 n +0000410449 00000 n +0000452697 00000 n +trailer <]>> startxref 452884 %%EOF \ No newline at end of file diff --git a/flashattn_memory.jpg b/flashattn_memory.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d0b6b19e8bb1664e7fd2ce49c4dd5bcf1114df10 Binary files /dev/null and b/flashattn_memory.jpg differ diff --git a/flashattn_speedup.jpg b/flashattn_speedup.jpg new file mode 100644 index 0000000000000000000000000000000000000000..af9d1cf8edf979c65f6d24587a365471224d53d4 Binary files /dev/null and b/flashattn_speedup.jpg differ diff --git a/flashattn_speedup_3090.jpg b/flashattn_speedup_3090.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3e608e9f71e099c58ec93666333e620430514fb3 Binary files /dev/null and b/flashattn_speedup_3090.jpg differ diff --git a/flashattn_speedup_a100_d128.jpg b/flashattn_speedup_a100_d128.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ac6a677baca7b09fcfa62c6287cb5bdaf5245925 Binary files /dev/null and b/flashattn_speedup_a100_d128.jpg differ diff --git a/flashattn_speedup_t4.jpg b/flashattn_speedup_t4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8dec610efe67b16b322de621e6f101bbd96008c8 Binary files /dev/null and b/flashattn_speedup_t4.jpg differ diff --git a/flashattn_speedup_t4_fwd.jpg b/flashattn_speedup_t4_fwd.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4cbcc5b7749e7f35fdd0cf5ad2397de2e9f82bc6 Binary files /dev/null and 
b/flashattn_speedup_t4_fwd.jpg differ diff --git a/flop-count.yaml b/flop-count.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee45b91585fffee5424f12b311641f42491e235f --- /dev/null +++ b/flop-count.yaml @@ -0,0 +1,5 @@ +flop_count: + _target_: src.callbacks.flop_count.FlopCount + profilers: ['fvcore'] + input_size: [3, 224, 224] + device: null diff --git a/ft_attention.cpp b/ft_attention.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b307cffc2bcdd3bff117d3581d97849d594523e9 --- /dev/null +++ b/ft_attention.cpp @@ -0,0 +1,232 @@ +#include +#include "ATen/cuda/CUDAContext.h" +#include + + +#include "decoder_masked_multihead_attention.h" + +#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + +#define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, NAME, ...) \ + if (TYPE == at::ScalarType::Half) { \ + using scalar_t = at::Half; \ + __VA_ARGS__(); \ + } else if (TYPE == at::ScalarType::BFloat16) { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (TYPE == at::ScalarType::Float) { \ + using scalar_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for type '", toString(TYPE), "'"); \ + } + +template +void masked_multihead_attention(const Masked_multihead_attention_params& params, + const cudaStream_t& stream); + +template +void cross_multihead_attention(const Masked_multihead_attention_params& params, + const cudaStream_t& stream); + +template +struct SATypeConverter { + using Type = T; +}; + +template<> +struct SATypeConverter { + using Type = uint16_t; +}; + +template<> +struct SATypeConverter { + using Type = __nv_bfloat16; +}; + +template +void set_params(Masked_multihead_attention_params ¶ms, + const size_t batch_size, + const size_t nheads, + const size_t nheads_kv, + const size_t memory_max_seqlen, + const size_t headdim, + const int timestep, + const int rotary_embedding_dim, + const float rotary_base, + const bool neox_rotary_style, + const int q_batch_stride, + const int k_batch_stride, + const int v_batch_stride, + const int nnz_heads, + T *q_ptr, + T *k_ptr, + T *v_ptr, + T *k_cache_ptr, + T *v_cache_ptr, + int *length_per_sample, + T *rotary_cos, + T *rotary_sin, + T *out_ptr, + int *nnz_head_idx) { + // Reset the parameters + memset(¶ms, 0, sizeof(params)); + params.q = q_ptr; + params.k = k_ptr; + params.v = v_ptr; + params.q_bias = nullptr; + params.k_bias = nullptr; + params.v_bias = nullptr; + params.k_cache = k_cache_ptr; + params.v_cache = v_cache_ptr; + params.out = out_ptr; + params.cache_indir = nullptr; + params.stride_q = q_batch_stride; + params.stride_k = k_batch_stride; + params.stride_v = v_batch_stride; + params.batch_size = batch_size; + params.beam_width = 1; + params.memory_max_len = memory_max_seqlen; + params.num_heads = nheads; + params.num_heads_kv = nheads_kv; + params.num_heads_q_kv_ratio = nheads / nheads_kv; + params.nnz_heads = nnz_heads; + params.hidden_size_per_head = headdim; + params.rotary_embedding_dim = rotary_embedding_dim; + params.rotary_base = rotary_base; + params.neox_rotary_style = neox_rotary_style; + params.timestep = timestep; + params.inv_sqrt_dh = 1.f / sqrt(float(headdim)); + params.total_padding_tokens = nullptr; + params.masked_tokens = nullptr; + params.prefix_prompt_lengths = 
nullptr; + params.max_prefix_prompt_length = 0; + params.relative_attention_bias = nullptr; + params.relative_attention_bias_stride = 0; + params.cross_attention_out = nullptr; + params.max_decoder_seq_len = 0; + params.is_return_cross_attentions = false; + params.finished = nullptr; + params.memory_length_per_sample = nullptr; + params.length_per_sample = length_per_sample; + params.rotary_cos = rotary_cos; + params.rotary_sin = rotary_sin; + params.nnz_head_idx = nnz_head_idx; +} + +torch::Tensor single_query_attention(const torch::Tensor q, + const torch::Tensor k, + const torch::Tensor v, + torch::Tensor k_cache, + torch::Tensor v_cache, + c10::optional length_per_sample_, + c10::optional rotary_cos_, + c10::optional rotary_sin_, + c10::optional nnz_head_idx_, + const int timestep, + int rotary_embedding_dim = 0, + const float rotary_base = 10000.0f, + const bool neox_rotary_style=true) { + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); CHECK_DEVICE(k_cache); CHECK_DEVICE(v_cache); + int batch_size = v_cache.size(0); + int nheads = q.size(1); + int nheads_kv = v_cache.size(1); + int memory_max_seqlen = v_cache.size(2); + int headdim = v_cache.size(3); + auto input_type = q.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + + CHECK_SHAPE(q, batch_size, nheads, headdim); + CHECK_SHAPE(k, batch_size, nheads_kv, headdim); + CHECK_SHAPE(v, batch_size, nheads_kv, headdim); + CHECK_SHAPE(v_cache, batch_size, nheads_kv, memory_max_seqlen, headdim); + // k_cache shape: [B, H, Dh/x, L, x] where x=8 for fp16 and x=4 for fp32 + int packsize = k_cache.dtype() == torch::kFloat32 ? 4 : 8; + CHECK_SHAPE(k_cache, batch_size, nheads_kv, headdim / packsize, memory_max_seqlen, packsize); + TORCH_CHECK(q.stride(2) == 1 && q.stride(1) == headdim); + TORCH_CHECK(k.stride(2) == 1 && k.stride(1) == headdim); + TORCH_CHECK(v.stride(2) == 1 && v.stride(1) == headdim); + CHECK_CONTIGUOUS(v_cache); CHECK_CONTIGUOUS(k_cache); + + TORCH_CHECK(q.scalar_type() == input_type); + TORCH_CHECK(k.scalar_type() == input_type); + TORCH_CHECK(v.scalar_type() == input_type); + TORCH_CHECK(k_cache.scalar_type() == input_type); + TORCH_CHECK(v_cache.scalar_type() == input_type); + + if (length_per_sample_.has_value()) { + auto length_per_sample = length_per_sample_.value(); + CHECK_DEVICE(length_per_sample); + CHECK_SHAPE(length_per_sample, batch_size); + CHECK_CONTIGUOUS(length_per_sample); + TORCH_CHECK(length_per_sample.dtype() == torch::kInt32); + } + + if (rotary_cos_.has_value()) { + auto rotary_cos = rotary_cos_.value(); + CHECK_DEVICE(rotary_cos); + rotary_embedding_dim = rotary_cos.size(-1) * 2; + CHECK_SHAPE(rotary_cos, batch_size, rotary_embedding_dim / 2); + CHECK_CONTIGUOUS(rotary_cos); + TORCH_CHECK(rotary_cos.scalar_type() == input_type); + + TORCH_CHECK(rotary_sin_.has_value()); + auto rotary_sin = rotary_sin_.value(); + CHECK_DEVICE(rotary_sin); + CHECK_SHAPE(rotary_sin, batch_size, rotary_embedding_dim / 2); + CHECK_CONTIGUOUS(rotary_sin); + TORCH_CHECK(rotary_sin.scalar_type() == input_type); + } + + if (nnz_head_idx_.has_value()) { + auto nnz_head_idx = nnz_head_idx_.value(); + CHECK_DEVICE(nnz_head_idx); + int nnz_heads = nnz_head_idx.size(0); + CHECK_SHAPE(nnz_head_idx, nnz_heads); + CHECK_CONTIGUOUS(nnz_head_idx); + TORCH_CHECK(nnz_head_idx.dtype() == torch::kInt32); + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + 
at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + torch::Tensor out = torch::empty_like(q); + + DISPATCH_FLOAT_AND_HALF_AND_BF16(q.scalar_type(), "single_query_attention", [&] { + using DataType = typename SATypeConverter::Type; + Masked_multihead_attention_params params; + set_params(params, batch_size, nheads, nheads_kv, memory_max_seqlen, headdim, timestep, + rotary_embedding_dim, rotary_base, neox_rotary_style, + q.stride(0), k.stride(0), v.stride(0), + nnz_head_idx_.has_value() ? nnz_head_idx_.value().size(0) : 0, + reinterpret_cast(q.data_ptr()), + reinterpret_cast(k.data_ptr()), + reinterpret_cast(v.data_ptr()), + reinterpret_cast(k_cache.data_ptr()), + reinterpret_cast(v_cache.data_ptr()), + length_per_sample_.has_value() + ? length_per_sample_.value().data_ptr() : nullptr, + rotary_cos_.has_value() + ? reinterpret_cast(rotary_cos_.value().data_ptr()) : nullptr, + rotary_sin_.has_value() + ? reinterpret_cast(rotary_sin_.value().data_ptr()) : nullptr, + reinterpret_cast(out.data_ptr()), + nnz_head_idx_.has_value() ? nnz_head_idx_.value().data_ptr() : nullptr + ); + auto stream = at::cuda::getCurrentCUDAStream(); + masked_multihead_attention(params, stream); + }); + return out; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("single_query_attention", &single_query_attention, "Attention with a single query", + py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), + py::arg("length_per_sample_"), py::arg("rotary_cos_"), + py::arg("rotary_sin_"), py::arg("nnz_head_idx_"), + py::arg("timestep"), py::arg("rotary_embedding_dim")=0, + py::arg("rotary_base")=10000.0f, py::arg("neox_rotary_style")=true); +} diff --git a/fused_dense.cpp b/fused_dense.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52a203889b239e857cb1cf34c1260243438d3404 --- /dev/null +++ b/fused_dense.cpp @@ -0,0 +1,216 @@ +// Adapted from https://github.com/NVIDIA/apex/blob/master/csrc/fused_dense.cpp +// We make it work for bfloat16 +#include +#include +#include +#include +#include + +#include + +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +// https://github.com/NVIDIA/apex/blob/master/csrc/type_shim.h +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#define DISPATCH_HALF_AND_BF16(TYPE, NAME, ...) 
\ + switch (TYPE) { \ + case at::ScalarType::Half: { \ + using scalar_t = at::Half; \ + __VA_ARGS__(); \ + break; \ + } \ + case at::ScalarType::BFloat16: { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__(); \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + +template +int linear_bias_wgrad_cuda(const T *input, const T *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, T *d_weight, T *d_bias, void *lt_workspace, size_t workspaceSize); + +template +int linear_act_forward_cuda(const T *input, const T *weight, const T *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *output, void *pre_act, void *lt_workspace, size_t workspaceSize); + +template +int bias_act_linear_dgrad_bgrad_cuda(const T *weight, const T *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *d_input, T *d_bias, void *lt_workspace, size_t workspaceSize); + +std::vector linear_bias_wgrad(at::Tensor input, at::Tensor d_output, bool has_d_bias) { + + int64_t batch_size = input.size(0); + int64_t in_features = input.size(1); + int64_t out_features = d_output.size(1); + + TORCH_CHECK(input.dtype() == torch::kFloat16 || input.dtype() == torch::kBFloat16); + TORCH_CHECK(input.dtype() == d_output.dtype()); + TORCH_CHECK(input.is_cuda()); + TORCH_CHECK(d_output.is_cuda()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(d_output.is_contiguous()); + CHECK_SHAPE(input, batch_size, in_features); + CHECK_SHAPE(d_output, batch_size, out_features); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)input.get_device()}; + + // create output/workspace tensor + auto opts = input.options(); + auto d_weight = at::empty({out_features, in_features}, opts); + at::Tensor d_bias; + if (has_d_bias) { +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION < 11600 + d_bias = d_output.view({-1, out_features}).sum(0, false); +#else + d_bias = at::empty({out_features}, opts); +#endif + } + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M. + // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs + // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91 + size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4); + auto lt_workspace = at::empty({static_cast(workspaceSize)}, opts.dtype(torch::kUInt8)); + + DISPATCH_HALF_AND_BF16(input.scalar_type(), "linear_bias_wgrad", [&] { + auto result = linear_bias_wgrad_cuda( + input.data_ptr(), + d_output.data_ptr(), + in_features, + batch_size, + out_features, + d_weight.data_ptr(), + has_d_bias ? 
d_bias.data_ptr() : nullptr, + (void*) (lt_workspace.data_ptr()), + workspaceSize); + TORCH_CHECK(result == 0, "linear_bias_wgrad failed."); + }); + + return {d_weight, d_bias}; +} + +std::vector linear_act_forward(at::Tensor input, at::Tensor weight, + c10::optional bias_, + bool is_gelu, bool save_pre_act, int heuristic) { + + int64_t batch_size = input.size(0); + int64_t in_features = input.size(1); + int64_t out_features = weight.size(0); + + TORCH_CHECK(input.dtype() == torch::kFloat16 || input.dtype() == torch::kBFloat16); + TORCH_CHECK(input.dtype() == weight.dtype()); + TORCH_CHECK(input.is_cuda()); + TORCH_CHECK(weight.is_cuda()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); + CHECK_SHAPE(input, batch_size, in_features); + CHECK_SHAPE(weight, out_features, in_features); + if (bias_.has_value()) { + auto bias = bias_.value(); + TORCH_CHECK(bias.dtype() == input.dtype()); + TORCH_CHECK(bias.is_cuda()); + TORCH_CHECK(bias.is_contiguous()); + CHECK_SHAPE(bias, out_features); + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)input.get_device()}; + + // create output/workspace tensor + auto opts = input.options(); + auto output = at::empty({batch_size, out_features}, opts); + at::Tensor pre_act; + // If ReLU, cuBlasLT stores a bit-mask (1 bit per element) + if (save_pre_act) { pre_act = at::empty({batch_size, is_gelu ? out_features : out_features / 8}, + is_gelu ? opts : opts.dtype(torch::kUInt8)); } + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M. + // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs + // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91 + size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4); + auto lt_workspace = at::empty({static_cast(workspaceSize)}, opts.dtype(torch::kUInt8)); + + DISPATCH_HALF_AND_BF16(input.scalar_type(), "linear_act_forward", [&] { + auto result = linear_act_forward_cuda( + input.data_ptr(), + weight.data_ptr(), + bias_.has_value()? bias_.value().data_ptr() : nullptr, + in_features, + batch_size, + out_features, + is_gelu, + heuristic, + output.data_ptr(), + save_pre_act ? pre_act.data_ptr() : nullptr, + (void*) (lt_workspace.data_ptr()), + workspaceSize); + TORCH_CHECK(result == 0, "linear_act_forward failed."); + }); + + std::vector result = {output}; + if (save_pre_act) { result.push_back(pre_act); }; + return result; +} + +std::vector bias_act_linear_dgrad_bgrad( + at::Tensor weight, at::Tensor d_output, at::Tensor pre_act, bool is_gelu, int heuristic +) { + + int64_t batch_size = d_output.size(0); + int64_t out_features = d_output.size(1); + int64_t in_features = weight.size(1); + + TORCH_CHECK(weight.dtype() == torch::kFloat16 || weight.dtype() == torch::kBFloat16); + TORCH_CHECK(weight.dtype() == d_output.dtype()); + TORCH_CHECK(is_gelu ? 
+  TORCH_CHECK(weight.is_cuda());
+  TORCH_CHECK(d_output.is_cuda());
+  TORCH_CHECK(pre_act.is_cuda());
+  TORCH_CHECK(weight.is_contiguous());
+  TORCH_CHECK(d_output.is_contiguous());
+  TORCH_CHECK(pre_act.is_contiguous());
+  CHECK_SHAPE(weight, out_features, in_features);
+  CHECK_SHAPE(d_output, batch_size, out_features);
+  // If ReLU, cuBlasLT stores a bit-mask (1 bit per element)
+  CHECK_SHAPE(pre_act, batch_size, is_gelu ? in_features : in_features / 8);
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)weight.get_device()};
+
+  // create output/workspace tensor
+  auto opts = weight.options();
+  auto d_bias = at::empty({in_features}, opts);
+  auto d_input = at::empty({batch_size, in_features}, opts);
+  // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M.
+  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
+  // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91
+  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
+  auto lt_workspace = at::empty({static_cast<int64_t>(workspaceSize)}, opts.dtype(torch::kUInt8));
+
+  DISPATCH_HALF_AND_BF16(weight.scalar_type(), "bias_act_linear_dgrad_bgrad", [&] {
+    auto result = bias_act_linear_dgrad_bgrad_cuda<scalar_t>(
+        weight.data_ptr<scalar_t>(),
+        d_output.data_ptr<scalar_t>(),
+        pre_act.data_ptr(),
+        in_features,
+        batch_size,
+        out_features,
+        is_gelu,
+        heuristic,
+        d_input.data_ptr<scalar_t>(),
+        d_bias.data_ptr<scalar_t>(),
+        (void*) (lt_workspace.data_ptr()),
+        workspaceSize);
+    TORCH_CHECK(result == 0, "bias_act_linear_dgrad_bgrad failed.");
+  });
+
+  return {d_input, d_bias};
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("linear_bias_wgrad", &linear_bias_wgrad, "linear bias wgrad");
+  m.def("linear_act_forward", &linear_act_forward, "linear gelu/relu forward");
+  m.def("bias_act_linear_dgrad_bgrad", &bias_act_linear_dgrad_bgrad, "bias gelu/relu linear dgrad bgrad");
+}
diff --git a/fused_dense.py b/fused_dense.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e45b8e609812a1545781011141ec80f6dc3af0f
--- /dev/null
+++ b/fused_dense.py
@@ -0,0 +1,688 @@
+# Copyright (c) 2023, Tri Dao.
+# Inspired by https://github.com/NVIDIA/apex/blob/master/apex/fused_dense/fused_dense.py
+# We make it work with pytorch amp and with bfloat16.
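+#
+# A minimal usage sketch (illustrative only: the import assumes this file is importable
+# as `fused_dense`, and the shapes, dtype, and device below are made-up examples):
+#     import torch
+#     from fused_dense import FusedDense, FusedMLP
+#     dense = FusedDense(1024, 1024).cuda().half()
+#     mlp = FusedMLP(1024, 4096).cuda().half()
+#     x = torch.randn(2, 512, 1024, device="cuda", dtype=torch.float16)
+#     y = mlp(dense(x))
+# Both modules fall back to the plain F.linear path (see fused_dense_func / fused_mlp_func
+# below) when the inputs are not fp16/bf16 CUDA tensors.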
+# The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py +from functools import partial +from typing import Optional + +# import fused_dense_cuda # from apex +import fused_dense_lib as fused_dense_cuda +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.distributed import ProcessGroup + +from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_bwd, sqrelu_fwd +from flash_attn.utils.distributed import ( + all_gather_raw, + all_reduce, + all_reduce_raw, + reduce_scatter, + reduce_scatter_raw, +) + + +class FusedDenseFunc(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True + ): + """ + If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel + with sequence parallelism: we do an all_gather_raw of x before doing the matmul. + """ + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + else: + total_x = x + + if torch.is_autocast_enabled(): + weight = weight.to(dtype=torch.get_autocast_gpu_dtype()) + bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + weight = weight.contiguous() + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight.shape) > 65535 * 32: + raise RuntimeError("fused_dense only supports matrix dims <= 2M") + output = F.linear(total_x, weight, bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + if process_group is not None and sequence_parallel: + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + else: + total_x = x + else: + (weight,) = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight + ) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, 
process_group, async_op=True) + else: + grad_input = None + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + if process_group is not None and sequence_parallel: + handle_x.wait() + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return grad_input, grad_weight, grad_bias, None, None, None + + +def fused_dense_func( + x: Tensor, + weight: Tensor, + bias: Optional[Tensor] = None, + return_residual: bool = False, + process_group: Optional[ProcessGroup] = None, + sequence_parallel: bool = True, +): + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return FusedDenseFunc.apply( + x, weight, bias, return_residual, process_group, sequence_parallel + ) + else: + assert process_group is None + out = F.linear(x, weight, bias) + return out if not return_residual else (out, x) + + +class FusedDense(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + return_residual: bool = False, + device=None, + dtype=None, + ) -> None: + super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype) + self.return_residual = return_residual + + def forward(self, x, process_group=None): + """ + If process_group is not None, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul. + """ + return fused_dense_func( + x, + self.weight, + self.bias, + return_residual=self.return_residual, + process_group=process_group, + ) + + +class ColumnParallelLinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + process_group: ProcessGroup, + bias: bool = True, + sequence_parallel=True, + multiple_of=1, + device=None, + dtype=None, + ) -> None: + world_size = torch.distributed.get_world_size(process_group) + if out_features % multiple_of: + raise ValueError(f"out_features ({out_features}) must be a multiple of {multiple_of}") + multiple = out_features // multiple_of + # We want to split @multiple across world_size, but it could be an uneven split + div = multiple // world_size + mod = multiple % world_size + # The first @mod ranks get @div + 1 copies, the rest get @div copies + local_multiple = div + int(torch.distributed.get_rank(process_group) < mod) + super().__init__( + in_features, local_multiple * multiple_of, bias=bias, device=device, dtype=dtype + ) + self.process_group = process_group + self.sequence_parallel = sequence_parallel + + def forward(self, x): + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. 
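+        # The output of this layer is sharded along the last (feature) dimension across
+        # the tensor-parallel ranks; the matching RowParallelLinear below consumes that
+        # sharded layout and reduces the partial results across ranks.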
+ return fused_dense_func( + x, + self.weight, + self.bias, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + ) + + +class RowParallelLinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + process_group: ProcessGroup, + bias: bool = True, + sequence_parallel=True, + multiple_of=1, + device=None, + dtype=None, + ) -> None: + world_size = torch.distributed.get_world_size(process_group) + rank = torch.distributed.get_rank(process_group) + if in_features % multiple_of: + raise ValueError(f"in_features ({in_features}) must be a multiple of {multiple_of}") + multiple = in_features // multiple_of + # We want to split @multiple across world_size, but it could be an uneven split + div = multiple // world_size + mod = multiple % world_size + # The first @mod ranks get @div + 1 copies, the rest get @div copies + local_multiple = div + int(torch.distributed.get_rank(process_group) < mod) + # Only rank 0 will have bias + super().__init__( + local_multiple * multiple_of, + out_features, + bias=bias and rank == 0, + device=device, + dtype=dtype, + ) + self.process_group = process_group + self.sequence_parallel = sequence_parallel + + def forward(self, x): + """ + We're doing Tensor Parallel with sequence parallelism: we do the matmul and then + a reduce_scatter of the result. + """ + out = fused_dense_func(x, self.weight, self.bias) + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return reduce_fn(out, self.process_group) + + +class FusedMLPFunc(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, + x, + weight1, + bias1, + weight2, + bias2, + activation="gelu_approx", + save_pre_act=True, + return_residual=False, + checkpoint_lvl=0, + heuristic=0, + process_group=None, + sequence_parallel=True, + ): + """ + If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel + with sequence parallelism: we do an all_gather of x before doing the matmul. + If sequence_parallel=False, then the input is already gathered. 
+ + checkpoint_lvl: + 0: no recomputation in the bwd + 1: recompute gelu_out / relu_out in the bwd + 2: recompute pre_act and gelu_out / relu_out in the bwd + """ + assert -1 <= heuristic <= 4 + assert activation in ["gelu_approx", "relu", "sqrelu"] + if activation == "sqrelu": + assert heuristic == -1 + if not save_pre_act: + checkpoint_lvl = 2 + assert checkpoint_lvl in [0, 1, 2] + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + ctx.checkpoint_lvl = checkpoint_lvl + ctx.activation = activation + ctx.heuristic = heuristic + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + else: + total_x = x + + if torch.is_autocast_enabled(): + dtype = torch.get_autocast_gpu_dtype() + weight1, weight2 = [a.to(dtype=dtype) for a in [weight1, weight2]] + bias1 = bias1.to(dtype=dtype) if bias1 is not None else None + bias2 = bias2.to(dtype=dtype) if bias2 is not None else None + weight1 = weight1.contiguous() + bias1 = bias1.contiguous() if bias1 is not None else None + weight2 = weight2.contiguous() + bias2 = bias2.contiguous() if bias2 is not None else None + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight1.shape, *weight2.shape) > 65535 * 32: + raise RuntimeError("fused_dense only supports matrix dims <= 2M") + if heuristic == -1: + pre_act = F.linear(total_x, weight1, bias1) + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else (sqrelu_fwd if activation == "sqrelu" else F.relu) + ) + with torch.jit.fuser("fuser2"): + output1 = activation_fn(pre_act) + # This is before adding bias1 + # pre_act = F.linear(total_x.reshape(batch_dim, n), weight1) + # with torch.jit.fuser('fuser2'): + # output1 = bias_gelu(pre_act, bias1) + else: + is_gelu = activation == "gelu_approx" + output1, *rest = fused_dense_cuda.linear_act_forward( + total_x.reshape(batch_dim, n), weight1, bias1, is_gelu, save_pre_act, heuristic + ) + if save_pre_act: + pre_act = rest[0] + output2 = F.linear(output1, weight2, bias2) + if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == "relu"): + # For RELU the pre_act is very small (just a bit-mask) so we just save it + ctx.save_for_backward(x, weight1, weight2, pre_act, output1) + elif checkpoint_lvl == 1: + ctx.save_for_backward(x, weight1, weight2, pre_act) + elif checkpoint_lvl == 2: + ctx.save_for_backward(x, weight1, weight2, bias1) + output2 = output2.reshape(*batch_shape, output2.shape[-1]) + return output2 if not return_residual else (output2, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + checkpoint_lvl = ctx.checkpoint_lvl + activation = ctx.activation + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else (sqrelu_fwd if activation == "sqrelu" else F.relu) + ) + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + x, 
weight1, weight2, *rest = ctx.saved_tensors + if process_group is None or not sequence_parallel: + total_x = x + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + if checkpoint_lvl in [0, 1]: + if process_group is not None and sequence_parallel: + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == "relu"): + pre_act, output1 = rest + elif checkpoint_lvl == 1: + (pre_act,) = rest + with torch.jit.fuser("fuser2"): + output1 = activation_fn(pre_act) + elif checkpoint_lvl == 2: + (bias1,) = rest + if process_group is not None and sequence_parallel: + total_x, _ = all_gather_raw(x, process_group) + if ctx.heuristic == -1: + pre_act = F.linear(total_x, weight1, bias1) + with torch.jit.fuser("fuser2"): + output1 = activation_fn(pre_act) + else: + output1, pre_act = fused_dense_cuda.linear_act_forward( + total_x.reshape(batch_dim, total_x.shape[-1]), + weight1, + bias1, + activation == "gelu_approx", + True, + ctx.heuristic, + ) + + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + output1 = output1.reshape(batch_dim, output1.shape[-1]) + pre_act = pre_act.reshape(batch_dim, pre_act.shape[-1]) + if ctx.needs_input_grad[3]: + grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad( + output1, grad_output, ctx.needs_input_grad[4] + ) + else: + grad_weight2 = None + grad_bias2 = grad_output if ctx.needs_input_grad[4] else None + if ctx.heuristic == -1: + # grad_pre_act = matmul_dgelu(grad_output, weight2, pre_act) + grad_output1 = F.linear(grad_output, weight2.t()) + activation_grad_fn = ( + gelu_bwd + if activation == "gelu_approx" + else (sqrelu_bwd if activation == "sqrelu" else relu_bwd) + ) + with torch.jit.fuser("fuser2"): + grad_pre_act = activation_grad_fn(grad_output1, pre_act) + else: + # The cublasLt epilogue has to compute both gelu/relu grad and bias grad, we can't + # just compute gelu/relu grad + grad_pre_act, grad_bias1 = fused_dense_cuda.bias_act_linear_dgrad_bgrad( + weight2, grad_output, pre_act, activation == "gelu_approx", ctx.heuristic + ) + if not ctx.needs_input_grad[2]: + grad_bias1 = None + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_pre_act, weight1.t()) + else: + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_pre_act, weight1 + ) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) + else: + grad_input = None + if ctx.heuristic == -1: + if ctx.needs_input_grad[1]: + if process_group is not None and sequence_parallel and checkpoint_lvl != 2: + handle_x.wait() + grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_pre_act, + ctx.needs_input_grad[2], + ) + else: + grad_weight1 = None + grad_bias1 = grad_pre_act if ctx.needs_input_grad[2] else None + else: + if ctx.needs_input_grad[1]: + if process_group is not None and sequence_parallel and checkpoint_lvl != 2: + handle_x.wait() + grad_weight1 = F.linear( + grad_pre_act.t(), total_x.reshape(batch_dim, total_x.shape[-1]).t() + ) + else: + grad_weight1 = None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return ( + grad_input, + grad_weight1, + grad_bias1, + grad_weight2, + grad_bias2, + None, + None, + 
None, + None, + None, + None, + None, + ) + + +def fused_mlp_func( + x: Tensor, + weight1: Tensor, + weight2: Tensor, + bias1: Optional[Tensor] = None, + bias2: Optional[Tensor] = None, + activation: str = "gelu_approx", + save_pre_act: bool = True, + return_residual: bool = False, + checkpoint_lvl: int = 0, + heuristic: int = 0, + process_group: Optional[ProcessGroup] = None, + sequence_parallel: bool = True, +): + assert activation in ["gelu_approx", "relu", "sqrelu"] + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) + # If we save pre-activation, dimension must be divisible by 128 (relu) or 8 (gelu) + dim_eligible = not save_pre_act or (x.shape[-1] % (128 if activation == "relu" else 8) == 0) + if ( + x.is_cuda + and weight1.is_cuda + and weight2.is_cuda + and (bias1 is None or bias1.is_cuda) + and (bias2 is None or bias2.is_cuda) + and dtype_eligible + and dim_eligible + ): + return FusedMLPFunc.apply( + x, + weight1, + bias1, + weight2, + bias2, + activation, + save_pre_act, + return_residual, + checkpoint_lvl, + heuristic, + process_group, + sequence_parallel, + ) + else: + assert process_group is None + pre_act = F.linear(x, weight1, bias1) + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else partial(F.relu, inplace=True) + ) + output1 = activation_fn(pre_act) + output2 = F.linear(output1, weight2, bias2) + return output2 if not return_residual else (output2, x) + + +class FusedMLP(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + bias1=True, + bias2=True, + activation="gelu_approx", + return_residual=False, + checkpoint_lvl=0, + heuristic="auto", + device=None, + dtype=None, + ): + """ + If process_group is not None, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul, gelu, then matmul. + Finally we do a reduce_scatter of the output. + + checkpoint_lvl (increasing lvl means slower but more memory saving): + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute pre_act and gelu_out in the bwd + heuristic: + -1: don't fuse gemm + gelu (separate kernel) + 0..4: use this heuristic for the algo section in the fused gemm + gelu + 'auto': heuristic will be picked automatically: + For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf. + For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16. + For H100, we set heuristic=-1 for both fp16 and bf16 as the fused cuBlasLt implementation + is slower than the unfused version. + return_residual: whether to return the input x along with the output. This is for + performance reason: for post-norm architecture, returning the input allows us + to fuse the backward of nn.Linear with the residual connection. 
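+
+        Example (an illustrative sketch only; shapes, dtype, and device are assumptions):
+            mlp = FusedMLP(1024, 4096, activation="gelu_approx").cuda().half()
+            x = torch.randn(2, 512, 1024, device="cuda", dtype=torch.float16)
+            y = mlp(x)  # same shape as x, since out_features defaults to in_features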
+ """ + assert checkpoint_lvl in [0, 1, 2] + assert activation in ["gelu_approx", "relu", "sqrelu"] + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features * 4 + self.activation = activation + self.return_residual = return_residual + self.checkpoint_lvl = checkpoint_lvl + self.heuristic = heuristic if activation != "sqrelu" else -1 + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs) + + def forward(self, x, process_group=None): + dtype = x.dtype if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype() + if self.heuristic == "auto": + if self.activation == "gelu_approx": + if torch.cuda.get_device_capability("cuda") == (9, 0): + heuristic = -1 + else: + cuda_ver = tuple(map(int, torch.version.cuda.split("."))) + heuristic = 0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1) + else: + heuristic = 0 + else: + heuristic = self.heuristic + out = fused_mlp_func( + x, + self.fc1.weight, + self.fc2.weight, + self.fc1.bias, + self.fc2.bias, + activation=self.activation, + save_pre_act=self.training, + return_residual=self.return_residual, + checkpoint_lvl=self.checkpoint_lvl, + heuristic=heuristic, + process_group=process_group, + ) + if self.return_residual: + out, x = out + if process_group is not None: + out = reduce_scatter(out, process_group) + return out if not self.return_residual else (out, x) + + +class ParallelFusedMLP(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + activation="gelu_approx", + process_group: ProcessGroup = None, + bias1=True, + bias2=True, + sequence_parallel=True, + checkpoint_lvl=0, + heuristic="auto", + device=None, + dtype=None, + ): + """ + process_group is required. We're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul, gelu, then matmul. + Finally we do a reduce_scatter of the output. + + checkpoint_lvl (increasing lvl means slower but more memory saving): + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute pre_act and gelu_out in the bwd + heuristic: + -1: don't fuse gemm + gelu (separate kernel) + 0..4: use this heuristic for the algo section in the fused gemm + gelu + 'auto': heuristic will be picked automatically: + For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf. + For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16. 
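+
+        Example (an illustrative sketch; assumes torch.distributed is already initialized
+        with a CUDA-capable backend and that each rank owns one GPU):
+            pg = torch.distributed.new_group()
+            mlp = ParallelFusedMLP(1024, 4096, process_group=pg).cuda().half()
+            x = torch.randn(2048, 1024, device="cuda", dtype=torch.float16)  # this rank's token shard
+            y = mlp(x)  # reduce-scattered (sequence-parallel) output, same shape as x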
+ """ + assert checkpoint_lvl in [0, 1, 2] + assert activation in ["gelu_approx", "relu", "sqrelu"] + assert process_group is not None + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features * 4 + self.activation = activation + self.process_group = process_group + self.sequence_parallel = sequence_parallel + self.checkpoint_lvl = checkpoint_lvl + self.heuristic = heuristic if activation != "sqrelu" else -1 + self.fc1 = ColumnParallelLinear( + in_features, hidden_features, process_group, bias=bias1, **factory_kwargs + ) + self.fc2 = RowParallelLinear( + hidden_features, out_features, process_group, bias=bias2, **factory_kwargs + ) + + def forward(self, x): + dtype = x.dtype if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype() + if self.heuristic == "auto": + if self.activation == "gelu_approx": + cuda_ver = tuple(map(int, torch.version.cuda.split("."))) + heuristic = 0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1) + else: + heuristic = 0 + else: + heuristic = self.heuristic + out = fused_mlp_func( + x, + self.fc1.weight, + self.fc2.weight, + self.fc1.bias, + self.fc2.bias, + activation=self.activation, + save_pre_act=self.training, + checkpoint_lvl=self.checkpoint_lvl, + heuristic=heuristic, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + ) + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return reduce_fn(out, self.process_group) diff --git a/fused_dense_cuda.cu b/fused_dense_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..32600e205ff09396509d77c20f70387ff90a3538 --- /dev/null +++ b/fused_dense_cuda.cu @@ -0,0 +1,717 @@ +// Adapted from https://github.com/NVIDIA/apex/blob/master/csrc/fused_dense_cuda.cu +#include +#include +#include +#include +#include +#include +#include + +/* Includes, cuda */ +#include +#include + +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000 +#include +#endif + +// FP16 Tensor core wrapper around cublas GEMMEx +cublasStatus_t gemm_bias( + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + const float* alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + const float* beta, + at::Half* C, + int64_t ldc) { + return cublasGemmEx( + handle, + transa, + transb, + m, + n, + k, + alpha, + A, + CUDA_R_16F, + lda, + B, + CUDA_R_16F, + ldb, + beta, + C, + CUDA_R_16F, + ldc, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +} + +// BF16 Tensor core wrapper around cublas GEMMEx +cublasStatus_t gemm_bias( + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + const float* alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + const float* beta, + at::BFloat16* C, + int64_t ldc) { + return cublasGemmEx( + handle, + transa, + transb, + m, + n, + k, + alpha, + A, + CUDA_R_16BF, + lda, + B, + CUDA_R_16BF, + ldb, + beta, + C, + CUDA_R_16BF, + ldc, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +} + +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + +template +int gemm_bias_act_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const Dtype* A, + int64_t lda, + const Dtype* B, + int64_t ldb, + const Dtype* bias, + Dtype* C, + int64_t ldc, + void* pre_act, + bool is_gelu, + int 
heuristic, + void *lt_workspace, + size_t workspaceSize + ) { + static_assert(std::is_same::value || std::is_same::value, + "gemm_bias_act_lt only supports fp16 and bf16"); + bool save_pre_act = pre_act != nullptr; + float beta = 0.0; + cudaDataType_t abcType = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; + + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + cublasLtMatmulDescOpaque_t operationDesc = {}; + cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {}; + cublasLtMatmulPreferenceOpaque_t preference = {}; + + int returnedResults = 0; + constexpr int requestedAlgoCount = 5; + cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0}; + // constexpr int requestedAlgoCount = 1; + // cublasLtMatmulHeuristicResult_t heuristicResult = {}; + cublasLtEpilogue_t epilogue = is_gelu + ? (save_pre_act ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU) + : (save_pre_act ? CUBLASLT_EPILOGUE_RELU_AUX : CUBLASLT_EPILOGUE_RELU); + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t + // for details about defaults; here we just set the transforms for + // A and B. + status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (save_pre_act) { + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &pre_act, sizeof(pre_act)); + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc)); + } + + if (bias != nullptr) { + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + epilogue = is_gelu + ? (save_pre_act ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS : CUBLASLT_EPILOGUE_GELU_BIAS) + : (save_pre_act ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS : CUBLASLT_EPILOGUE_RELU_BIAS); + } else { + epilogue = is_gelu + ? (save_pre_act ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU) + : (save_pre_act ? CUBLASLT_EPILOGUE_RELU_AUX : CUBLASLT_EPILOGUE_RELU); + } + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + + // Create matrix descriptors. Not setting any extra attributes. + status = cublasLtMatrixLayoutInit( + &Adesc, abcType, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit( + &Bdesc, abcType, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit(&Cdesc, abcType, m, n, ldc); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // Create preference handle; In general, extra attributes can be + // used here to disable tensor ops or to make sure algo selected + // will work with badly aligned A, B, C. 
However, for simplicity + // here we assume A,B,C are always well aligned (e.g., directly + // come from cudaMalloc) + status = cublasLtMatmulPreferenceInit(&preference); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // We just need the best available heuristic to try and run matmul. + // There is no guarantee that this will work. For example, if A is + // badly aligned, you can request more (e.g. 32) algos and try to + // run them one by one until something works. + status = cublasLtMatmulAlgoGetHeuristic( + ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults); + // ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (returnedResults == 0) { + status = CUBLAS_STATUS_NOT_SUPPORTED; + goto CLEANUP; + } + status = cublasLtMatmul(ltHandle, + &operationDesc, + &alpha, + A, + &Adesc, + B, + &Bdesc, + &beta, + C, + &Cdesc, + C, + &Cdesc, + // &heuristicResult.algo, + // TD [2022-04-29] Somehow algo 0 and 2 are a lot slower than other algos + &heuristicResult[heuristic].algo, + // NULL, + lt_workspace, + workspaceSize, + at::cuda::getCurrentCUDAStream()); + +CLEANUP: + // Descriptors are no longer needed as all GPU work was already + // enqueued. + return status == CUBLAS_STATUS_SUCCESS ? 0 : 1; +} + +template int gemm_bias_act_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + const at::Half* bias, + at::Half* C, + int64_t ldc, + void* pre_act, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +template int gemm_bias_act_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + const at::BFloat16* bias, + at::BFloat16* C, + int64_t ldc, + void* pre_act, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +template +int gemm_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const Dtype* A, + int64_t lda, + const Dtype* B, + int64_t ldb, + Dtype* C, + int64_t ldc, + Dtype* bgrad, + void *lt_workspace, + size_t workspaceSize) { + static_assert(std::is_same::value || std::is_same::value, + "gemm_bgradb_lt only supports fp16 and bf16"); + float beta = 0.0; + cudaDataType_t abcType = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; + + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + cublasLtMatmulDescOpaque_t operationDesc = {}; + cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {}; + cublasLtMatmulPreferenceOpaque_t preference = {}; + + int returnedResults = 0; + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT; + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t + // for details about defaults; here we just set the transforms for + // A and B. 
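+  // Note: when bgrad is non-null we also request the CUBLASLT_EPILOGUE_BGRADB epilogue
+  // below, so the bias gradient is reduced in the same cuBlasLt call as the wgrad GEMM
+  // rather than in a separate reduction pass.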
+ status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (bgrad != nullptr) { + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + epilogue = CUBLASLT_EPILOGUE_BGRADB; + } + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + + // Create matrix descriptors. Not setting any extra attributes. + status = cublasLtMatrixLayoutInit( + &Adesc, abcType, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit( + &Bdesc, abcType, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit(&Cdesc, abcType, m, n, ldc); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // Create preference handle; In general, extra attributes can be + // used here to disable tensor ops or to make sure algo selected + // will work with badly aligned A, B, C. However, for simplicity + // here we assume A,B,C are always well aligned (e.g., directly + // come from cudaMalloc) + status = cublasLtMatmulPreferenceInit(&preference); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // We just need the best available heuristic to try and run matmul. + // There is no guarantee that this will work. For example, if A is + // badly aligned, you can request more (e.g. 32) algos and try to + // run them one by one until something works. + status = cublasLtMatmulAlgoGetHeuristic( + ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (returnedResults == 0) { + status = CUBLAS_STATUS_NOT_SUPPORTED; + goto CLEANUP; + } + status = cublasLtMatmul(ltHandle, + &operationDesc, + &alpha, + A, + &Adesc, + B, + &Bdesc, + &beta, + C, + &Cdesc, + C, + &Cdesc, + //&heuristicResult.algo, + NULL, + lt_workspace, + workspaceSize, + at::cuda::getCurrentCUDAStream()); + +CLEANUP: + // Descriptors are no longer needed as all GPU work was already + // enqueued. + return status == CUBLAS_STATUS_SUCCESS ? 
0 : 1; +} + + +template int gemm_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + at::Half* C, + int64_t ldc, + at::Half* bgrad, + void *lt_workspace, + size_t workspaceSize); + +template int gemm_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + at::BFloat16* C, + int64_t ldc, + at::BFloat16* bgrad, + void *lt_workspace, + size_t workspaceSize); + +template +int gemm_dact_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const Dtype* A, + int64_t lda, + const Dtype* B, + int64_t ldb, + const void* pre_act, + Dtype* C, + int64_t ldc, + Dtype* bgrad, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize) { + static_assert(std::is_same::value || std::is_same::value, + "gemm_dact_bgradb_lt only supports fp16 and bf16"); + float beta = 0.0; + cudaDataType_t abcType = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; + + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + cublasLtMatmulDescOpaque_t operationDesc = {}; + cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {}; + cublasLtMatmulPreferenceOpaque_t preference = {}; + + int returnedResults = 0; + constexpr int requestedAlgoCount = 5; + cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0}; + cublasLtEpilogue_t epilogue = is_gelu ? CUBLASLT_EPILOGUE_DGELU_BGRAD : CUBLASLT_EPILOGUE_DRELU_BGRAD; + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t + // for details about defaults; here we just set the transforms for + // A and B. + status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &pre_act, sizeof(pre_act)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc)); + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + + // Create matrix descriptors. Not setting any extra attributes. + status = cublasLtMatrixLayoutInit( + &Adesc, abcType, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit( + &Bdesc, abcType, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? 
n : k, ldb); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit(&Cdesc, abcType, m, n, ldc); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // Create preference handle; In general, extra attributes can be + // used here to disable tensor ops or to make sure algo selected + // will work with badly aligned A, B, C. However, for simplicity + // here we assume A,B,C are always well aligned (e.g., directly + // come from cudaMalloc) + status = cublasLtMatmulPreferenceInit(&preference); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // We just need the best available heuristic to try and run matmul. + // There is no guarantee that this will work. For example, if A is + // badly aligned, you can request more (e.g. 32) algos and try to + // run them one by one until something works. + status = cublasLtMatmulAlgoGetHeuristic( + ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (returnedResults == 0) { + status = CUBLAS_STATUS_NOT_SUPPORTED; + goto CLEANUP; + } + status = cublasLtMatmul(ltHandle, + &operationDesc, + &alpha, + A, + &Adesc, + B, + &Bdesc, + &beta, + C, + &Cdesc, + C, + &Cdesc, + //&heuristicResult.algo, + &heuristicResult[heuristic].algo, + // NULL, + lt_workspace, + workspaceSize, + at::cuda::getCurrentCUDAStream()); + +CLEANUP: + // Descriptors are no longer needed as all GPU work was already + // enqueued. + return status == CUBLAS_STATUS_SUCCESS ? 0 : 1; +} + +template int gemm_dact_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + const void* pre_act, + at::Half* C, + int64_t ldc, + at::Half* bgrad, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +template int gemm_dact_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + const void* pre_act, + at::BFloat16* C, + int64_t ldc, + at::BFloat16* bgrad, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +#endif + +template +int linear_bias_wgrad_cuda(const T *input, const T *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, T *d_weight, T *d_bias, void *lt_workspace, size_t workspaceSize) { + const float alpha = 1.0; + const float beta_zero = 0.0; + int status = 1; +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + status = gemm_bgradb_lt( + // (cublasLtHandle_t)handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_features, + out_features, + batch_size, + alpha, + input, + in_features, + d_output, + out_features, + d_weight, + in_features, + d_bias, + lt_workspace, + workspaceSize); +#endif + + if (status != 0){ + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + status = gemm_bias( + handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_features, + out_features, + batch_size, + &alpha, + input, + in_features, + d_output, + out_features, + &beta_zero, + d_weight, + in_features); + // TD [2023-01-17]: I can't call Pytorch's gemm for now, due to linking error + // 
https://discuss.pytorch.org/t/how-can-i-use-the-function-at-gemm-float/95341 + // at::cuda::blas::gemm( + // 'N', + // 'T', + // in_features, + // out_features, + // batch_size, + // alpha, + // input, + // in_features, + // d_output, + // out_features, + // beta_zero, + // d_weight, + // in_features); + } + + return status; +} + +template +int linear_act_forward_cuda(const T *input, const T *weight, const T *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *output, void *pre_act, void *lt_workspace, size_t workspaceSize) { + int status = 1; +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + status = gemm_bias_act_lt( + CUBLAS_OP_T, + CUBLAS_OP_N, + out_features, + batch_size, + in_features, + /*alpha=*/1.0, + weight, + in_features, + input, + in_features, + bias, + output, + out_features, + pre_act, + is_gelu, + heuristic, + lt_workspace, + workspaceSize); + return status; +#else + return 1; +#endif +} + +template +int bias_act_linear_dgrad_bgrad_cuda(const T *weight, const T *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *d_input, T *d_bias, void *lt_workspace, size_t workspaceSize) { + const float alpha = 1.0; + int status = 1; +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + status = gemm_dact_bgradb_lt( + CUBLAS_OP_N, + CUBLAS_OP_N, + in_features, + batch_size, + out_features, + alpha, + weight, + in_features, + d_output, + out_features, + pre_act, + d_input, + in_features, + d_bias, + is_gelu, + heuristic, + lt_workspace, + workspaceSize); +#endif + return status; + +} + +template int linear_bias_wgrad_cuda(const at::Half *input, const at::Half *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, at::Half *d_weight, at::Half *d_bias, void *lt_workspace, size_t workspaceSize); +template int linear_bias_wgrad_cuda(const at::BFloat16 *input, const at::BFloat16 *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, at::BFloat16 *d_weight, at::BFloat16 *d_bias, void *lt_workspace, size_t workspaceSize); + +template int linear_act_forward_cuda(const at::Half *input, const at::Half *weight, const at::Half *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::Half *output, void *pre_act, void *lt_workspace, size_t workspaceSize); +template int linear_act_forward_cuda(const at::BFloat16 *input, const at::BFloat16 *weight, const at::BFloat16 *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::BFloat16 *output, void *pre_act, void *lt_workspace, size_t workspaceSize); + +template int bias_act_linear_dgrad_bgrad_cuda(const at::Half *weight, const at::Half *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::Half *d_input, at::Half *d_bias, void *lt_workspace, size_t workspaceSize); +template int bias_act_linear_dgrad_bgrad_cuda(const at::BFloat16 *weight, const at::BFloat16 *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::BFloat16 *d_input, at::BFloat16 *d_bias, void *lt_workspace, size_t workspaceSize); \ No newline at end of file diff --git a/fused_softmax.cpp b/fused_softmax.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2aaed913314d33a9a7b5092cfc27b51a28a65f89 --- /dev/null +++ b/fused_softmax.cpp @@ -0,0 +1,148 @@ +/* coding=utf-8 + 
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor); + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +int get_batch_per_block_cuda( + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads); + +torch::Tensor fwd( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor) { + AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); + + return fwd_cuda(input, mask, scale_factor); +} + +torch::Tensor bwd( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) { + + AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +int get_batch_per_block( + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads) { + return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); +} + +} // end namespace scaled_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + float scale_factor); + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { + AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return fwd_cuda(input, scale_factor); +} + +torch::Tensor bwd( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) { + + AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + 
AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +} // end namespace scaled_upper_triang_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("scaled_masked_softmax_forward", + &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + + m.def("scaled_masked_softmax_backward", + &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); + + m.def("scaled_masked_softmax_get_batch_per_block", + &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, + "Return Batch per block size." + ); + + m.def("scaled_upper_triang_masked_softmax_forward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + m.def("scaled_upper_triang_masked_softmax_backward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); +} diff --git a/fused_softmax.py b/fused_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..382f94f092cd3999b2378dfc2fa165a7c08017e2 --- /dev/null +++ b/fused_softmax.py @@ -0,0 +1,201 @@ +# [2022-10-23] Copied from https://github.com/NVIDIA/apex/blob/master/apex/transformer/functional/fused_softmax.py +# for benchmarking. +# We added support for seqlen=2k and seqlen=4k + +# coding=utf-8 +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +from apex._autocast_utils import _cast_if_autocast_enabled +from apex.transformer.enums import AttnMaskType +from fused_softmax_lib import ( + scaled_masked_softmax_backward, + scaled_masked_softmax_forward, + scaled_masked_softmax_get_batch_per_block, + scaled_upper_triang_masked_softmax_backward, + scaled_upper_triang_masked_softmax_forward, +) + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. 
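+
+    Expects a 3D input of shape (attn_batches, sq, sk) with sq == sk; the
+    scaled_upper_triang_masked_softmax wrapper below handles the 4D -> 3D reshape.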
+ """ + + @staticmethod + def forward(ctx, inputs, scale): + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None + + +def scaled_upper_triang_masked_softmax(inputs, _, scale): + b, np, sq, sk = inputs.size() + assert sq == sk, "causal mask is only for self attention" + # Reshaping input to 3D tensor (attn_batches, sq, sk) + inputs = inputs.view(-1, sq, sk) + args = _cast_if_autocast_enabled(inputs, scale) + with torch.cuda.amp.autocast(enabled=False): + probs = ScaledUpperTriangMaskedSoftmax.apply(*args) + return probs.view(b, np, sq, sk) + + +# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`. +# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context. +# So I needed to manually write two `torch.autograd.Function` inheritances. +# Fused operation which performs following three operations in sequence +# 1. Scale the tensor. +# 2. Apply the mask. +# 3. Perform softmax. +class ScaledMaskedSoftmax(torch.autograd.Function): + @staticmethod + def forward(ctx, inputs, mask, scale): + scale_t = torch.tensor([scale]) + softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +def scaled_masked_softmax(inputs, mask, scale): + # input is 4D tensor (b, np, sq, sk) + args = _cast_if_autocast_enabled(inputs, mask, scale) + with torch.cuda.amp.autocast(enabled=False): + return ScaledMaskedSoftmax.apply(*args) + + +class FusedScaleMaskSoftmax(torch.nn.Module): + """ + fused operation: scaling + mask + softmax + + Arguments: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. 
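+
+    Example (an illustrative sketch; the mask function and tensor shapes are assumptions):
+        softmax = FusedScaleMaskSoftmax(
+            input_in_fp16=True,
+            input_in_bf16=False,
+            attn_mask_type=AttnMaskType.causal,
+            scaled_masked_softmax_fusion=True,
+            mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
+            softmax_in_fp32=True,
+            scale=None,
+        )
+        probs = softmax(scores, None)  # scores: [b, np, sq, sk] in fp16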
+ """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super().__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + if self.input_in_fp16 and self.input_in_bf16: + raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.") + self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + if not (self.scale is None or softmax_in_fp32): + raise RuntimeError("softmax should be in fp32 when scaled") + + if self.scaled_masked_softmax_fusion: + if self.attn_mask_type == AttnMaskType.causal: + self.fused_softmax_func = scaled_upper_triang_masked_softmax + elif self.attn_mask_type == AttnMaskType.padding: + self.fused_softmax_func = scaled_masked_softmax + else: + raise ValueError("Invalid attn_mask_type.") + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and ( + self.attn_mask_type == AttnMaskType.causal + or (self.attn_mask_type == AttnMaskType.padding and mask is not None) + ) + and 16 < sk <= 8192 # sk must be 16 ~ 8192 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 8192: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + # input.shape = [b, np, sq, sk] + scale = self.scale if self.scale is not None else 1.0 + return self.fused_softmax_func(input, mask, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np) diff --git a/fusedlamb-ds.yaml b/fusedlamb-ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4fffbfb3e5fa97124841810a3ed66207bf1bf6a --- /dev/null +++ b/fusedlamb-ds.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: deepspeed.ops.lamb.FusedLamb diff --git a/fusedlamb.yaml b/fusedlamb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8d7b2b8ede4c8733b5dde4dca6b75a942e88882 --- /dev/null +++ b/fusedlamb.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: apex.optimizers.FusedLAMB diff --git a/generate_kernels.py b/generate_kernels.py new file mode 100644 index 
0000000000000000000000000000000000000000..119e34956f97107adddd82b57425d14d762246eb --- /dev/null +++ b/generate_kernels.py @@ -0,0 +1,108 @@ +# Copied from Driss Guessous's PR in PyTorch: https://github.com/pytorch/pytorch/pull/105602 + +# This file is run to generate the kernel instantiations for the flash_attn kernels +# They are written to several files in order to speed up compilation + +import argparse +import itertools +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + +DTYPE_MAP = { + "fp16": "cutlass::half_t", + "bf16": "cutlass::bfloat16_t", +} + +SM = [80] # Sm80 kernels support up to +HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 256] +IS_CAUSAL = ["false", "true"] +KERNEL_IMPL_TEMPLATE_FWD = """#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream) {{ + run_mha_fwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +}} +""" + +KERNEL_IMPL_TEMPLATE_FWD_SPLIT = """#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream); +""" + +KERNEL_IMPL_TEMPLATE_BWD = """#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_bwd_params ¶ms, cudaStream_t stream) {{ + run_mha_bwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +}} +""" + + +@dataclass +class Kernel: + sm: int + dtype: str + head_dim: int + is_causal: bool + direction: str + + @property + def template(self) -> str: + if self.direction == "fwd": + return KERNEL_IMPL_TEMPLATE_FWD.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + ) + elif self.direction == "bwd": + return KERNEL_IMPL_TEMPLATE_BWD.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + ) + else: + return KERNEL_IMPL_TEMPLATE_FWD_SPLIT.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + ) + + @property + def filename(self) -> str: + return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" + + +def get_all_kernels() -> List[Kernel]: + for direction in ["fwd", "fwd_split", "bwd"]: + for dtype, head_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM): + yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, is_causal=is_causal, direction=direction) + + +def write_kernel(kernel: Kernel, autogen_dir: Path) -> None: + prelude = """// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py"\n +""" + (autogen_dir / kernel.filename).write_text(prelude + kernel.template) + + +def main(output_dir: Optional[str]) -> None: + if output_dir is None: + output_dir = Path(__file__).parent + else: + output_dir = Path(output_dir) + + for kernel in get_all_kernels(): + write_kernel(kernel, output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate_kernels", + description="Generate the flash_attention kernels template instantiations", + ) + # Set an optional output directory + parser.add_argument( + "-o", + "--output_dir", + required=False, + help="Where to generate the kernels " + " will default to the current directory ", + ) + args = parser.parse_args() + main(args.output_dir) diff --git a/generation.py b/generation.py new file mode 100644 index 0000000000000000000000000000000000000000..0d9120c386596f25b544391af10dc479cf00c822 --- /dev/null +++ b/generation.py @@ -0,0 +1,740 @@ +# Copyright (c) 2023, Tri Dao. +# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/forward_step.py#L31 +import gc +import time +from collections import namedtuple +from dataclasses import dataclass, field +from functools import partial +from typing import Callable, Optional, Sequence, Union + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import Tensor +from torch.profiler import ProfilerActivity, profile, record_function + +try: + from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput +except ImportError: + GreedySearchDecoderOnlyOutput = namedtuple("GreedySearchDecoderOnlyOutput", ["sequences", "scores"]) + SampleDecoderOnlyOutput = namedtuple("SampleDecoderOnlyOutput", ["sequences", "scores"]) + + +@dataclass +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + max_seqlen: int + max_batch_size: int + seqlen_offset: int = 0 + batch_size_offset: int = 0 + key_value_memory_dict: dict = field(default_factory=dict) + lengths_per_sample: Optional[Tensor] = None + + def reset(self, max_seqlen, max_batch_size): + self.max_seqlen = max_seqlen + self.max_batch_size = max_batch_size + self.seqlen_offset = 0 + if self.lengths_per_sample is not None: + self.lengths_per_sample.zero_() + + +# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py +# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231 +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf. Done in-place.""" + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(indices_to_remove, float("-Inf")) + + +# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py +# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170 +def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf. Done in-place.""" + if top_p <= 0.0 or top_p >= 1.0: + return + # First sort and calculate cumulative sum of probabilities. 
+ sorted_logits, sorted_indices = torch.sort(logits, descending=False) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs <= (1 - top_p) + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + logits.masked_fill_(indices_to_remove, float("-inf")) + + +def sample(logits, top_k=1, top_p=0.0, temperature=1.0): + """Sample from top-k logits. + Arguments: + logits: Tensor of shape (batch_size, vocab_size) + """ + if top_k == 1: # Short-circuit for greedy decoding + return logits.argmax(dim=-1) + else: + if top_p > 0.0: + assert top_p <= 1.0, "top-p should be in (0, 1]." + if top_k > 0: + top_k = min(top_k, logits.size(-1)) # Safety check + logits_top, indices = torch.topk(logits, top_k, dim=-1) + if temperature != 1.0: + logits_top /= temperature + modify_logits_for_top_p_filtering(logits_top, top_p) + return indices[ + torch.arange(indices.shape[0], device=indices.device), + torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1), + ] + else: + # Clone so that when we modify for top_p we don't change the original logits + logits_top = logits / temperature if temperature != 1.0 else logits.clone() + modify_logits_for_top_p_filtering(logits_top, top_p) + return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze( + dim=-1 + ) + + +@torch.inference_mode() +def decode( + input_ids, + model, + max_length, + top_k=1, + top_p=0.0, + temperature=1.0, + eos_token_id=None, + teacher_outputs=None, + vocab_size=None, + tensor_parallel=1, + cg=False, + enable_timing=False, +): + """Decoding, either greedy or with top-k or top-p sampling. + If top-k = 0, don't limit the number of candidates (pure sampling). + Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first, + then top-p. + We assume that all sequences in the same batch have the same length. + + Arguments: + input_ids: (batch, seq_len) + max_length: int + teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the + logits, the next token is taken from the teacher_outputs. Useful for testing. 
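+        cg: if True, capture the per-step decoding forward pass as a CUDA graph and replay it
+            (see update_graph_cache below); the prompt-processing step is always run eagerly.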
+ Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields: + sequences: (batch, max_length) + scores: tuples of (batch, vocab_size) + """ + batch_size, seqlen_og = input_ids.shape + teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0 + if cg: + if not hasattr(model, "_decoding_cache"): + model._decoding_cache = None + model._decoding_cache = update_graph_cache( + model, + model._decoding_cache, + batch_size, + seqlen_og, + max_length, + tensor_parallel=tensor_parallel, + ) + inference_params = model._decoding_cache.inference_params + inference_params.reset(max_length, batch_size) + else: + inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size) + + def get_logits(input_ids, inference_params): + decoding = inference_params.seqlen_offset > 0 + if decoding: + position_ids = torch.full( + (batch_size, 1), + inference_params.seqlen_offset, + dtype=torch.long, + device=input_ids.device, + ) + else: + position_ids = None + if not cg or not decoding: + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=1, + ).logits.squeeze(dim=1) + else: + logits = model._decoding_cache.run( + input_ids, position_ids, inference_params.seqlen_offset + ).squeeze(dim=1) + return logits[..., :vocab_size] if vocab_size is not None else logits + + def sample_tokens(logits, inference_params): + if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset: + token = sample(logits, top_k=top_k, top_p=top_p, temperature=temperature) + else: + token = teacher_outputs[:, inference_params.seqlen_offset] + # return rearrange(token, "b -> b 1") + return token.unsqueeze(1) + + def should_stop(current_token, inference_params): + if inference_params.seqlen_offset == 0: + return False + if eos_token_id is not None and (current_token == eos_token_id).all(): + return True + if inference_params.seqlen_offset >= max_length - 1: + return True + return False + + start = torch.cuda.Event(enable_timing=enable_timing) + end = torch.cuda.Event(enable_timing=enable_timing) + + if enable_timing: + if tensor_parallel > 1: + torch.distributed.barrier() + start.record() + scores, sequences = [], [input_ids] + while not should_stop(sequences[-1], inference_params): + scores.append(get_logits(sequences[-1], inference_params)) + inference_params.seqlen_offset += sequences[-1].shape[1] + sequences.append(sample_tokens(scores[-1], inference_params)) + if enable_timing: + end.record() + if tensor_parallel > 1: + torch.distributed.barrier() + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms") + output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput + return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores)) + + +def sample_speculative(logits, logits_draft, tokens_draft, top_k=1, top_p=0.0, temperature=1.0): + """Algorithm 1 from [1] + [1] Fast Inference from Transformers via Speculative Decoding + Yaniv Leviathan, Matan Kalman, Yossi Matias + https://arxiv.org/abs/2211.17192 + + Arguments: + logits: Tensor of shape (batch_size, seqlen + 1, vocab_size) + logits_draft: Tensor of shape (batch_size, seqlen, vocab_size) + tokens_draft: Tensor of shape (batch_size, seqlen) + Return: + tokens: Tensor of shape (batch_size, seqlen + 1) + num_generated_tokens: Tensor of shape (batch_size), with value in [1, seqlen + 1]. 
+ For each sequence in the batch, the number of valid tokens that were sampled by + speculative sampling. + """ + batch, seqlen_p_1, vocab_size = logits.shape + seqlen = seqlen_p_1 - 1 + assert logits_draft.shape == (batch, seqlen, vocab_size) + assert tokens_draft.shape == (batch, seqlen) + assert tokens_draft.dtype in [torch.int64, torch.int32] + # TODO: if top_k = 1 we can simplify things and only work with indices + if top_p > 0.0: + assert top_p <= 1.0, "top-p should be in (0, 1]." + # Clone so that when we modify for top_p we don't change the original logits + logits = logits / temperature if temperature != 1.0 else logits.clone() + logits_draft = logits_draft / temperature if temperature != 1.0 else logits_draft.clone() + if top_k > 0: + top_k = min(top_k, logits.size(-1)) # Safety check + modify_logits_for_top_k_filtering(logits, top_k) + modify_logits_for_top_k_filtering(logits_draft, top_k) + modify_logits_for_top_p_filtering(logits, top_p) + modify_logits_for_top_p_filtering(logits_draft, top_p) + probs = torch.softmax(logits, dim=-1) + probs_draft = torch.softmax(logits_draft, dim=-1) + gather = lambda probs, tokens: rearrange( + probs.gather(dim=-1, index=rearrange(tokens, "... -> ... 1")), "... 1 -> ..." + ) + # (batch, seqlen) + accepted = torch.rand(batch, seqlen, device=probs.device) * gather( + probs_draft, tokens_draft + ) <= gather(probs[:, :-1], tokens_draft) + accepted_all = accepted.all(dim=-1) + # (batch,) + first_rejected_idx = torch.where(accepted_all, seqlen, accepted.int().argmin(dim=-1)) + probs_diff = torch.clamp(probs[:, :-1] - probs_draft, min=0.0) + # torch.multinomial can deal with unnormalized probabilities + # probs_diff /= probs_diff.sum(dim=-1, keepdim=True) + resample_probs = torch.cat([probs_diff, probs[:, -1:]], dim=1) + resample_probs = rearrange( + resample_probs.gather(dim=1, index=repeat(first_rejected_idx, "b -> b 1 d", d=vocab_size)), + "b 1 d -> b d", + ) + resample = torch.multinomial(resample_probs, num_samples=1).squeeze(dim=-1) # (batch,) + tokens = F.pad(tokens_draft, (0, 1)) + tokens[:, first_rejected_idx] = resample + return tokens, first_rejected_idx + 1 + + +@torch.inference_mode() +def decode_speculative( + input_ids, + model, + model_draft, + max_length, + speculative_lookahead=3, + top_k=1, + top_p=0.0, + temperature=1.0, + eos_token_id=None, + vocab_size=None, + tensor_parallel=1, + cg=False, + enable_timing=False, + debug=False, +): + """ + TD: WIP, for my own understanding, lightly tested. Only support batch_size == 1 for now. + + Speculative decoding, either greedy or with top-k or top-p sampling. + If top-k = 0, don't limit the number of candidates (pure sampling). + Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first, + then top-p. + We assume that all sequences in the same batch have the same length. 
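+    At each iteration the draft model proposes up to @speculative_lookahead tokens, the main
+    model scores all of them in a single forward pass, and sample_speculative then accepts a
+    (possibly empty) prefix of those proposals plus one token sampled from the main model.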
+ + Arguments: + input_ids: (batch, seq_len) + max_length: int + Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields: + sequences: (batch, max_length) + scores: tuples of (batch, vocab_size) + """ + batch_size, seqlen_og = input_ids.shape + assert batch_size == 1, "Speculative decoding implementation only supports batch_size=1" + assert eos_token_id is None, "Speculative decoding implementation doesn't support eos_token_id" + if cg: + if not hasattr(model_draft, "_decoding_cache"): + model_draft._decoding_cache = None + model_draft._decoding_cache = update_graph_cache( + model_draft, + model_draft._decoding_cache, + batch_size, + seqlen_og, + max_length, + # draft model needs to process either 1 or 2 tokens at a time + decoding_seqlens=(1, 2), + tensor_parallel=tensor_parallel, + ) + inference_params_draft = model_draft._decoding_cache.inference_params + inference_params_draft.reset(max_length, batch_size) + if not hasattr(model, "_decoding_cache"): + model._decoding_cache = None + model._decoding_cache = update_graph_cache( + model, + model._decoding_cache, + batch_size, + seqlen_og, + max_length, + decoding_seqlens=range(1, speculative_lookahead + 2), + tensor_parallel=tensor_parallel, + ) + inference_params = model._decoding_cache.inference_params + inference_params.reset(max_length, batch_size) + else: + inference_params_draft = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size) + inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size) + + def get_logits(input_ids, inference_params, model, num_last_tokens=1, cg=False): + decoding = inference_params.seqlen_offset > 0 + if decoding: + seqlen = input_ids.shape[1] + # if inference_params.lengths_per_sample is None: + # TODO: in the case of batched decoding where each sequence has a different length, + # we need to compute the position_ids for each sequence using lengths_per_sample + if True: + cache_seqlens = torch.full( + (input_ids.shape[0],), + inference_params.seqlen_offset, + dtype=torch.int32, + device=input_ids.device, + ) + else: + cache_seqlens = inference_params.lengths_per_sample + position_ids = cache_seqlens[:, None] + torch.arange( + seqlen, dtype=torch.long, device=input_ids.device + ) + else: + position_ids = None + if not cg or not decoding: + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=num_last_tokens, + ).logits + else: + # NOTE: careful, CUDA graph is set to have num_last_tokens=input_ids.shape[1]. + # This might not be compatible the num_last_tokens used here. + assert num_last_tokens <= input_ids.shape[1] + logits = model._decoding_cache.run( + input_ids, position_ids, inference_params.seqlen_offset + )[:, -num_last_tokens:] + return logits[..., :vocab_size] if vocab_size is not None else logits + + def sample_tokens(input_ids, get_logits_fn, inference_params, sample_fn, num_tokens=1): + """Sample `num_tokens` tokens from the model, given the previous logits. + Also return the logits of the sampled tokens. + Arguments: + input_ids: (batch, seqlen) + Return: + tokens: (batch, num_tokens) + scores: (batch, num_tokens), which contains @previous_logits and the logits of the next + (num_tokens - 1) tokens. The logits of the last token isn't computed. 
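+            (In other words, scores[:, i] holds the logits from which tokens[:, i] was sampled;
+            the logits of the final sampled token are not computed here.)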
+ """ + assert num_tokens >= 1 + sequences, scores = [input_ids], [] + for i in range(num_tokens): + scores.append(get_logits_fn(sequences[-1], inference_params)[:, -1]) + inference_params.seqlen_offset += sequences[-1].shape[1] + sequences.append(sample_fn(scores[-1]).unsqueeze(1)) + return torch.cat(sequences[1:], dim=1), torch.stack(scores, dim=1) + + sampling_kwargs = dict(top_k=top_k, top_p=top_p, temperature=temperature) + sample_fn = partial(sample, **sampling_kwargs) + get_logits_main = partial(get_logits, model=model, cg=cg) + get_logits_draft = partial(get_logits, model=model_draft, cg=cg) + sample_tokens_main = partial( + sample_tokens, + get_logits_fn=get_logits_main, + sample_fn=sample_fn, + inference_params=inference_params, + ) + sample_tokens_draft = partial( + sample_tokens, + get_logits_fn=get_logits_draft, + sample_fn=sample_fn, + inference_params=inference_params_draft, + ) + + if debug: + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("gpt2") + if enable_timing: + if tensor_parallel > 1: + torch.distributed.barrier() + torch.cuda.synchronize() + start = time.time() + + sequences, scores = [input_ids], [] + num_main_model_calls = 0 + num_draft_tokens = 0 + num_accepted_tokens_history = [] + if seqlen_og >= max_length - 1: + # Don't do speculative sampling, just sample 1 token from the model + tokens, scores_new = sample_tokens_main(input_ids, num_tokens=1) + sequences.append(tokens) + scores.append(scores_new) + else: + # Sample from draft model, which produces @n_spec_tokens, and @model + # will then use to produce between 1 and 1 + @n_spec_tokens tokens. + # We want seqlen_og + 1 + @n_spec_tokens to be <= @max_length. + n_spec_tokens = min(speculative_lookahead, max_length - seqlen_og - 1) + tokens_draft, scores_draft = sample_tokens_draft(input_ids, num_tokens=n_spec_tokens) + num_draft_tokens += n_spec_tokens + if debug: + scores_draft_ref = model_draft( + torch.cat([input_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((scores_draft - scores_draft_ref[:, :-1]).abs().max()) + + # Evaluate the draft tokens with the model + logits = get_logits_main( + torch.cat([input_ids, tokens_draft], dim=1), + inference_params, + num_last_tokens=n_spec_tokens + 1, + ) + num_main_model_calls += 1 + if debug: + logits_ref = model( + torch.cat([input_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((logits - logits_ref).abs().max()) + # breakpoint() + tokens, num_generated_tokens = sample_speculative( + logits, scores_draft, tokens_draft, **sampling_kwargs + ) + num_accepted_tokens_history.append(num_generated_tokens - 1) + if debug: + print(tokens) + print(num_generated_tokens) + # breakpoint() + # TODO: we're using the fact that batch_size == 1 + # TODO: check eos_token_id + sequences.append(tokens[:1, : num_generated_tokens[0]]) + scores.append(logits[:1, : num_generated_tokens[0]]) + # Note that @model has not evaluated the last sampled token yet, so we'll need to pass + # that in the next time we call @model. 
+ num_generated = num_generated_tokens[0].item() + inference_params.seqlen_offset = seqlen_og + num_generated - 1 + inference_params_draft.seqlen_offset = ( + inference_params.seqlen_offset - 1 + if num_generated > 1 + else inference_params.seqlen_offset + ) + if debug: + cur_ids = torch.cat([input_ids, sequences[-1]], dim=1) + scores_ref = model(cur_ids, num_last_tokens=num_generated_tokens[0].item() + 1).logits + print((scores[-1] - scores_ref[:, :-1]).abs().max()) + # breakpoint() + + while True: + # seqlen_offset is total length generated - 1 + if inference_params.seqlen_offset >= max_length - 1: + break + if inference_params.seqlen_offset >= max_length - 2: + # Don't do speculative sampling, just sample 1 token from the model + tokens, scores_new = sample_tokens_main(sequences[-1][:, -1:], num_tokens=1) + sequences.append(tokens) + scores.append(scores_new) + break + # Sample from draft model + n_spec_tokens = min( + speculative_lookahead, max_length - inference_params_draft.seqlen_offset - 2 + ) + # If the main model accepts all the draft tokens, plus it samples one new token, + # then at the next iteration the draft model need to evaluate the logits of the last draft + # token and the logits of the newly sampled token. So here we pass in the last 2 tokens + # of sequences[-1]. + # This exception is when the main model rejects all the draft tokens, in which case we + # will only have 1 token to pass in. + tokens_draft, scores_draft = sample_tokens_draft( + sequences[-1][:, -2:], num_tokens=n_spec_tokens + ) + num_draft_tokens += n_spec_tokens + if debug: + scores_draft_ref = model_draft( + torch.cat([cur_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((scores_draft - scores_draft_ref[:, :-1]).abs().max()) + # breakpoint() + # Evaluate the draft tokens with the model + logits = get_logits_main( + torch.cat([sequences[-1][:, -1:], tokens_draft], dim=1), + inference_params, + num_last_tokens=n_spec_tokens + 1, + ) # (batch, n_spec_tokens + 1, vocab_size) + num_main_model_calls += 1 + if debug: + logits_ref = model( + torch.cat([cur_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((logits - logits_ref).abs().max()) + # breakpoint() + tokens, num_generated_tokens = sample_speculative( + logits, scores_draft, tokens_draft, **sampling_kwargs + ) + num_accepted_tokens_history.append(num_generated_tokens - 1) + if debug: + print(tokens) + print(num_generated_tokens) + # breakpoint() + sequences.append(tokens[:1, : num_generated_tokens[0]]) + scores.append(logits[:1, : num_generated_tokens[0]]) + # We've evaluated 1 token from sequences[-1][:, -1:] above, plus + # num_generated_tokens[0].item() - 1 tokens from the draft model. 
+ num_generated = num_generated_tokens[0].item() + inference_params.seqlen_offset += num_generated + inference_params_draft.seqlen_offset = ( + inference_params.seqlen_offset - 1 + if num_generated > 1 + else inference_params.seqlen_offset + ) + if debug: + cur_ids = torch.cat([cur_ids, sequences[-1]], dim=1) + scores_ref = model(cur_ids, num_last_tokens=num_generated_tokens[0].item() + 1).logits + print((scores[-1] - scores_ref[:, :-1]).abs().max()) + # breakpoint() + + if enable_timing: + if tensor_parallel > 1: + torch.distributed.barrier() + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + print(f"Number of calls to main model: {num_main_model_calls}") + print( + f"Acceptance rate: {torch.cat(num_accepted_tokens_history).sum().item() / num_draft_tokens * 100:.2f}%" + ) + sequences = torch.cat(sequences, dim=1) + scores = torch.cat(scores, dim=1) + if debug: + scores_ref = model(sequences).logits + print((scores - scores_ref[:, seqlen_og - 1 : -1]).abs().max()) + output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput + return output_cls(sequences=sequences, scores=scores) + + +class GenerationMixin: + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + raise NotImplementedError + + def generate( + self, + input_ids, + max_length, + top_k=1, + top_p=0.0, + temperature=1.0, + return_dict_in_generate=False, + output_scores=False, + **kwargs, + ): + output = decode( + input_ids, self, max_length, top_k=top_k, top_p=top_p, temperature=temperature, **kwargs + ) + if not output_scores: + output.scores = None + return output if return_dict_in_generate else output.sequences + + +def allocate_inference_cache( + max_batch_size, + max_seqlen, + nheads, + headdim, + layers: Union[int, Sequence], + device, + dtype=torch.float16, +): + assert dtype in [torch.float16, torch.bfloat16, torch.float32] + kv_cache_shape = (max_batch_size, max_seqlen, 2, nheads, headdim) + if isinstance(layers, int): + layers = range(layers) + return {i: torch.empty(kv_cache_shape, device=device, dtype=dtype) for i in layers} + + +@dataclass +class DecodingCGCache: + max_batch_size: int = 0 + max_seqlen: int = 0 + device = None + dtype = None + callables: dict = field(default_factory=dict) + mempool = None + inference_params: Optional[InferenceParams] = None + run: Optional[Callable] = None + + +@torch.inference_mode() +def update_graph_cache( + model, + cache, + batch_size, + seqlen_og, + max_seqlen, + decoding_seqlens=(1,), + tensor_parallel=1, + dtype=None, + n_warmups=2, +): + if cache is None: + cache = DecodingCGCache() + param_example = next(iter(model.parameters())) + device = param_example.device + if dtype is None: + dtype = param_example.dtype + if ( + (device, dtype) != (cache.device, cache.dtype) + or batch_size > cache.max_batch_size + or max_seqlen > cache.max_seqlen + ): # Invalidate the cache + cache.callables = {} + cache.mempool = None + cache.inference_params = None + gc.collect() + cache.device, cache.dtype = device, dtype + cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen + if hasattr(model, "allocate_inference_cache"): + inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype) + else: + headdim = getattr( + model.config, + "head_dim", + model.config.hidden_size // model.config.num_attention_heads, + ) + inf_cache = allocate_inference_cache( + batch_size, + max_seqlen, + model.config.num_attention_heads // tensor_parallel, + headdim, + 
model.config.num_hidden_layers, + device, + dtype, + ) + lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device) + cache.inference_params = InferenceParams( + max_seqlen=max_seqlen, + max_batch_size=batch_size, + seqlen_offset=seqlen_og, + key_value_memory_dict=inf_cache, + lengths_per_sample=lengths_per_sample, + ) + cache.mempool = torch.cuda.graphs.graph_pool_handle() + for decoding_seqlen in decoding_seqlens: + if (batch_size, decoding_seqlen) not in cache.callables: + cache.callables[batch_size, decoding_seqlen] = capture_graph( + model, + cache.inference_params, + batch_size, + max_seqlen, + decoding_seqlen=decoding_seqlen, + mempool=cache.mempool, + n_warmups=n_warmups, + ) + + def dispatch(input_ids, position_ids, seqlen): + batch_size, decoding_seqlen = input_ids.shape[:2] + return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen) + + cache.run = dispatch + cache.inference_params.seqlen_offset = 0 # Reset so it's not confusing + return cache + + +def capture_graph( + model, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2 +): + device = next(iter(model.parameters())).device + input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device) + position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device) + seqlen_offset_og = inference_params.seqlen_offset + inference_params.seqlen_offset = max_seqlen - decoding_seqlen + inference_params.lengths_per_sample[:] = inference_params.seqlen_offset + + # Warmup before capture + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for _ in range(n_warmups): + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=decoding_seqlen, + ).logits + s.synchronize() + # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0, + # which requires that graph launch and non-captured launch to not overlap (I think, + # that's how I interpret the documentation). I'm not sure if this is required. + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.cuda.current_stream().wait_stream(s) + # Captures the graph + # To allow capture, automatically sets a side stream as the current stream in the context + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=mempool): + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=decoding_seqlen, + ).logits + + def run(new_input_ids, new_position_ids, seqlen): + inference_params.lengths_per_sample[:] = seqlen + input_ids.copy_(new_input_ids) + position_ids.copy_(new_position_ids) + graph.replay() + return logits.clone() + + inference_params.seqlen_offset = seqlen_offset_og + return run diff --git a/gpt.py b/gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..3539f8f901695b29454358972d65031f4c4fabeb --- /dev/null +++ b/gpt.py @@ -0,0 +1,1080 @@ +# Copyright (c) 2024, Tri Dao. 
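+#
+# Example usage (illustrative sketch, not part of the original file; the import path and the
+# exact set of fusion flags are assumptions):
+#
+#     import torch
+#     from transformers import GPT2Config
+#     from flash_attn.models.gpt import GPTLMHeadModel
+#
+#     config = GPT2Config.from_pretrained("gpt2")
+#     config.use_flash_attn = True
+#     config.fused_bias_fc = True
+#     config.fused_dropout_add_ln = True
+#     model = GPTLMHeadModel.from_pretrained("gpt2", config, device="cuda", dtype=torch.float16)
+#     input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")
+#     out = model.generate(input_ids, max_length=64, top_k=40, temperature=0.8)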
+ +import logging +import math +import re +from collections import OrderedDict, namedtuple +from collections.abc import Sequence +from functools import partial +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import GPT2Config + +from flash_attn.models.bigcode import remap_state_dict_hf_bigcode +from flash_attn.models.falcon import remap_state_dict_hf_falcon +from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox +from flash_attn.models.gptj import remap_state_dict_hf_gptj +from flash_attn.models.llama import remap_state_dict_hf_llama +from flash_attn.models.opt import remap_state_dict_hf_opt +from flash_attn.modules.block import Block, ParallelBlock +from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings +from flash_attn.modules.mha import MHA, ParallelMHA +from flash_attn.modules.mlp import ( + FusedMLP, + GatedMlp, + Mlp, + ParallelFusedMLP, + ParallelGatedMlp, + ParallelMLP, +) +from flash_attn.ops.activations import sqrelu_fwd +from flash_attn.utils.distributed import ( + all_gather, + all_gather_raw, + get_dim_for_local_rank, + sync_shared_params, +) +from flash_attn.utils.generation import GenerationMixin +from flash_attn.utils.pretrained import state_dict_from_pretrained + +try: + from flash_attn.ops.fused_dense import ColumnParallelLinear +except ImportError: + ColumnParallelLinear = None + +try: + from flash_attn.ops.triton.mlp import FusedDenseSqreluDense +except ImportError: + FusedDenseSqreluDense = None + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm +except ImportError: + layer_norm_fn, RMSNorm = None, None + +logger = logging.getLogger(__name__) + + +def create_mixer_cls(config, layer_idx=None, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + attn_scale_power = 0.5 if not getattr(config, "mup_scale_qk_dot_by_d", False) else 1.0 + softmax_scale = 1.0 if not config.scale_attn_weights else (head_dim ** (-attn_scale_power)) + softmax_scale *= getattr(config, "mup_attn_multiplier", 1.0) + if config.scale_attn_by_inverse_layer_idx: + assert layer_idx is not None + softmax_scale /= float(layer_idx + 1) + dwconv = getattr(config, "attn_dwconv", False) + if dwconv: + assert process_group is None, "TensorParallel MHA does not support dwconv yet" + qkv_proj_bias = getattr(config, "qkv_proj_bias", True) + out_proj_bias = getattr(config, "out_proj_bias", True) + rotary_emb_dim = int(getattr(config, "rotary_emb_fraction", 0.0) * head_dim) + rotary_emb_base = getattr(config, "rotary_emb_base", 10000.0) + rotary_emb_scale_base = getattr(config, "rotary_emb_scale_base", None) + rotary_emb_interleaved = getattr(config, "rotary_emb_interleaved", False) + use_alibi = getattr(config, "use_alibi", False) + window_size = getattr(config, "window_size", (-1, -1)) + use_flash_attn = getattr(config, "use_flash_attn", False) + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if not fused_bias_fc: + assert process_group is None, "TensorParallel MHA requires fused_bias_fc" + mha_cls = MHA if process_group is None else ParallelMHA + serial_kwargs = ( + {"fused_bias_fc": fused_bias_fc, "dwconv": dwconv} if process_group is None else {} + ) + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group 
is not None + else {} + ) + num_heads_kv = getattr(config, "n_head_kv", None) + mixer_cls = partial( + mha_cls, + num_heads=config.num_attention_heads, + num_heads_kv=num_heads_kv, + qkv_proj_bias=qkv_proj_bias, + out_proj_bias=out_proj_bias, + dropout=config.attn_pdrop, + softmax_scale=softmax_scale, + causal=True, + layer_idx=layer_idx, + rotary_emb_dim=rotary_emb_dim, + rotary_emb_base=rotary_emb_base, + rotary_emb_scale_base=rotary_emb_scale_base, + rotary_emb_interleaved=rotary_emb_interleaved, + use_alibi=use_alibi, + window_size=window_size, + use_flash_attn=use_flash_attn, + **serial_kwargs, + **parallel_kwargs, + **factory_kwargs, + ) + return mixer_cls + + +def create_mlp_cls(config, layer_idx=None, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + mlp_fc1_bias = getattr(config, "mlp_fc1_bias", True) + mlp_fc2_bias = getattr(config, "mlp_fc2_bias", True) + fused_mlp = getattr(config, "fused_mlp", False) + if fused_mlp: + assert config.activation_function in [ + "gelu_new", + "gelu_fast", + "gelu_approx", + "gelu_pytorch_tanh", + "relu", + "sqrelu", + ] + fused_dense_sqrelu_dense = getattr(config, "fused_dense_sqrelu_dense", False) + if fused_dense_sqrelu_dense: + assert config.activation_function == "sqrelu", ( + "fused_dense_sqrelu_dense only " "supports approximate activation_function sqrelu" + ) + assert not (fused_dense_sqrelu_dense and fused_mlp) + if not fused_mlp and not fused_dense_sqrelu_dense: + assert config.activation_function in [ + "gelu", + "gelu_new", + "gelu_fast", + "gelu_approx", + "gelu_pytorch_tanh", + "relu", + "sqrelu", + "glu", + "swiglu", + "geglu", + ] + if config.activation_function in ["glu", "swiglu", "geglu"]: + activation = ( + F.sigmoid + if config.activation_function == "glu" + else (F.silu if config.activation_function == "swiglu" else F.gelu) + ) + mlp_cls = GatedMlp if process_group is None else ParallelGatedMlp + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group is not None + else {} + ) + mlp_multiple_of = getattr(config, "mlp_multiple_of", 128) + mlp_cls = partial( + mlp_cls, + hidden_features=config.n_inner, + activation=activation, + bias1=mlp_fc1_bias, + bias2=mlp_fc2_bias, + multiple_of=mlp_multiple_of, + **parallel_kwargs, + **factory_kwargs, + ) + else: + if config.activation_function == "relu": + activation = partial(F.relu, inplace=True) + elif config.activation_function == "sqrelu": + activation = sqrelu_fwd + else: + approximate = ( + "tanh" + if config.activation_function + in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"] + else "none" + ) + activation = partial(F.gelu, approximate=approximate) + mlp_cls = Mlp if process_group is None else ParallelMLP + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group is not None + else {} + ) + mlp_cls = partial( + mlp_cls, + hidden_features=config.n_inner, + activation=activation, + bias1=mlp_fc1_bias, + bias2=mlp_fc2_bias, + **parallel_kwargs, + **factory_kwargs, + ) + else: + mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0) + # mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer + if isinstance(mlp_checkpoint_lvl, Sequence): + assert layer_idx is not None + mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx] + if fused_mlp: + if FusedMLP is None: + raise ImportError("fused_dense is not 
installed") + activation = ( + "gelu_approx" + if config.activation_function + in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"] + else config.activation_function + ) + mlp_cls = FusedMLP if process_group is None else ParallelFusedMLP + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group is not None + else {} + ) + mlp_cls = partial( + mlp_cls, + hidden_features=config.n_inner, + activation=activation, + checkpoint_lvl=mlp_checkpoint_lvl, + bias1=mlp_fc1_bias, + bias2=mlp_fc2_bias, + **parallel_kwargs, + **factory_kwargs, + ) + elif fused_dense_sqrelu_dense: + if process_group is not None: + assert fused_mlp, "Tensor Parallel is not implemented for FusedDenseSqreluDense" + assert FusedDenseSqreluDense is not None + mlp_cls = partial( + FusedDenseSqreluDense, + hidden_features=config.n_inner, + checkpoint_lvl=mlp_checkpoint_lvl, + **factory_kwargs, + ) + else: + raise RuntimeError("MLP type not supported") + return mlp_cls + + +def create_block(config, layer_idx=None, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + sequence_parallel = getattr(config, "sequence_parallel", True) + mixer_cls = create_mixer_cls(config, layer_idx, process_group=process_group, **factory_kwargs) + mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs) + use_rms_norm = getattr(config, "rms_norm", False) + norm_cls = partial( + nn.LayerNorm if not use_rms_norm else RMSNorm, + eps=config.layer_norm_epsilon, + **factory_kwargs, + ) + # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable + residual_in_fp32 = getattr(config, "residual_in_fp32", False) + resid_dropout1 = config.resid_pdrop if layer_idx is None or layer_idx > 0 else config.embd_pdrop + prenorm = getattr(config, "prenorm", True) + parallel_block = getattr(config, "parallel_block", False) + if not parallel_block: + block = Block( + config.hidden_size, + mixer_cls, + mlp_cls, + norm_cls=norm_cls, + prenorm=prenorm, + resid_dropout1=resid_dropout1, + resid_dropout2=config.resid_pdrop, + fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False), + residual_in_fp32=residual_in_fp32, + sequence_parallel=sequence_parallel and process_group is not None, + mark_shared_params=process_group is not None, + ) + else: + assert prenorm + block = ParallelBlock( + config.hidden_size, + mixer_cls, + mlp_cls, + norm_cls=norm_cls, + resid_dropout1=resid_dropout1, + resid_dropout2=config.resid_pdrop, + tied_norm=getattr(config, "parallel_block_tied_norm", False), + fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False), + residual_in_fp32=residual_in_fp32, + sequence_parallel=sequence_parallel and process_group is not None, + mark_shared_params=process_group is not None, + ) + block.layer_idx = layer_idx + return block + + +class GPTPreTrainedModel(nn.Module): + """An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__() + if not isinstance(config, GPT2Config): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. 
" + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + @classmethod + def from_pretrained( + cls, + model_name, + config, + *args, + strict=True, + device=None, + dtype=None, + world_size=1, + rank=0, + **kwargs, + ): + """ + Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + """ + # Instantiate model. + model = cls(config, *args, device=device, dtype=dtype, **kwargs) + # Load state_dict in cpu because we already initialized the model in GPU, and we don't + # want extra stuff taking up more GPU memory + state_dict = state_dict_from_pretrained(model_name, device="cpu", dtype=dtype) + if model_name.startswith("gpt2"): + state_dict = remap_state_dict_hf_gpt2(state_dict, config) + elif model_name.startswith("facebook/opt"): + state_dict = remap_state_dict_hf_opt(state_dict, config) + elif model_name.startswith("EleutherAI/gpt-j-") or model_name.startswith( + "togethercomputer/GPT-JT-" + ): + state_dict = remap_state_dict_hf_gptj(state_dict, config) + elif ( + model_name.startswith("EleutherAI/gpt-neox-") + or model_name.startswith("EleutherAI/pythia-") + or model_name.startswith("togethercomputer/RedPajama-INCITE-") + ): + state_dict = remap_state_dict_hf_gpt_neox(state_dict, config) + elif model_name.startswith("tiiuae/falcon-"): + state_dict = remap_state_dict_hf_falcon(state_dict, config) + elif model_name.startswith("meta-llama/Llama-"): + state_dict = remap_state_dict_hf_llama(state_dict, config) + elif model_name.startswith("bigcode/") or model_name.startswith("WizardLM/"): + state_dict = remap_state_dict_hf_bigcode(state_dict, config) + else: + raise NotImplementedError(f"Model {model_name} not supported") + if world_size > 1: + state_dict = shard_state_dict_tp(state_dict, config, world_size, rank) + load_return = model.load_state_dict(state_dict, strict=strict) + logger.info(load_return) + return model + + +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, n_layer, initializer_range=0.02, mup_width_scale=1.0, rescale_prenorm_residual=True +): + mup_init_scale = math.sqrt(mup_width_scale) + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, std=initializer_range * mup_init_scale) + optim_cfg = getattr(module.weight, "_optim", {}) + optim_cfg.update({"lr_multiplier": mup_width_scale}) + setattr(module.weight, "_optim", optim_cfg) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + nn.init.normal_( + p, mean=0.0, std=initializer_range * mup_init_scale / math.sqrt(2 * n_layer) + ) + + +class GPTModel(GPTPreTrainedModel): + def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None): + super().__init__(config) + factory_kwargs = {"device": device, "dtype": dtype} + self.process_group = process_group + self.sequence_parallel = getattr(config, "sequence_parallel", True) + assert config.activation_function in [ + "gelu", + "gelu_new", + "gelu_fast", + "gelu_approx", + "gelu_pytorch_tanh", + "relu", + "sqrelu", + "glu", + "swiglu", + "geglu", + ] + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + self.embeddings_multiplier = getattr(config, "mup_embeddings_multiplier", 1.0) + # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable + self.residual_in_fp32 = getattr(config, "residual_in_fp32", False) + # These 2 options are for OPT-350m + self.prenorm = getattr(config, "prenorm", True) + use_rms_norm = getattr(config, "rms_norm", False) + word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None) + # For GPT-J, GPT-NeoX + self.parallel_block = getattr(config, "parallel_block", False) + + if process_group is None: + self.embeddings = GPT2Embeddings( + config.hidden_size, + vocab_size, + config.max_position_embeddings, + word_embed_proj_dim=word_embed_proj_dim, + **factory_kwargs, + ) + else: + self.embeddings = ParallelGPT2Embeddings( + config.hidden_size, + vocab_size, + config.max_position_embeddings, + process_group=process_group, + sequence_parallel=self.sequence_parallel, + **factory_kwargs, + ) + + # We change the order of dropout, residual and layer norm: + # Instead of LN -> Attn / MLP -> Dropout -> Add, we do: + # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and + # the main branch (output of MLP). The model definition is unchanged, but the mapping of the + # nn.Dropout probabilities are changed. + # This is for performance reason: we can fuse dropout + add + layer_norm. + self.layers = nn.ModuleList( + [ + create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs) + for i in range(config.num_hidden_layers) + ] + ) + rotary_emb_fraction = getattr(config, "rotary_emb_fraction", 0.0) + if rotary_emb_fraction > 0.0: # Tie all the RotaryEmbedding modules to share the same cos/sin cache + for layer in self.layers[1:]: + layer.mixer.rotary_emb = self.layers[0].mixer.rotary_emb + + self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False) + if self.fused_dropout_add_ln: + if layer_norm_fn is None: + raise ImportError("Triton is not installed") + if self.prenorm: + self.drop_f = nn.Dropout(config.resid_pdrop) + norm_cls = nn.LayerNorm if not use_rms_norm else RMSNorm + self.ln_f = norm_cls( + config.hidden_size, eps=config.layer_norm_epsilon, **factory_kwargs + ) + if process_group is not None: + for p in self.ln_f.parameters(): + # Mark the norm parameters as "shared_params" so that we sync their values at init. 
+ p._shared_params = True + # Mark the norm params as "sequence_parallel" so we run all-reduce on their grads. + if self.sequence_parallel: + p._sequence_parallel = True + + self.apply( + partial( + _init_weights, + n_layer=config.num_hidden_layers, + initializer_range=config.initializer_range, + mup_width_scale=getattr(config, "mup_width_scale", 1.0), + ) + ) + self.tie_weights() + + def tie_weights(self): + if self.process_group is not None: + sync_shared_params(self, self.process_group) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + for i, layer in enumerate(self.layers) + } + + def forward(self, input_ids, position_ids=None, inference_params=None): + # If using Tensor Parallel with sequence parallel, we combine the batch and the seqlen + # dimensions so that we can split on it easily, in case of small batch size. + # Only the attention layers need to know the seqlen. + embedding_kwargs = ( + {"combine_batch_seqlen_dim": True} + if self.process_group is not None and self.sequence_parallel + else {} + ) + hidden_states = self.embeddings(input_ids, position_ids=position_ids, **embedding_kwargs) + if self.embeddings_multiplier != 1.0: + hidden_states = hidden_states * self.embeddings_multiplier + if self.parallel_block: + hidden_states2 = None + residual = None + mixer_kwargs = ( + {"seqlen": input_ids.shape[1]} + if self.process_group is not None and self.sequence_parallel + else {} + ) + if inference_params is not None: + mixer_kwargs["inference_params"] = inference_params + for layer in self.layers: + if self.prenorm: + if not self.parallel_block: + hidden_states, residual = layer( + hidden_states, residual, mixer_kwargs=mixer_kwargs + ) + else: + hidden_states, hidden_states2, residual = layer( + hidden_states, hidden_states2, residual, mixer_kwargs=mixer_kwargs + ) + else: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + if self.prenorm: + if not self.fused_dropout_add_ln: + dropped = self.drop_f(hidden_states) + if not self.parallel_block: + residual = (dropped + residual) if residual is not None else dropped + else: + dropped2 = self.drop_f(hidden_states2) + residual = ( + (residual + dropped + dropped2) + if residual is not None + else dropped + dropped2 + ) + hidden_states = self.ln_f(residual.to(dtype=self.ln_f.weight.dtype)) + else: + # Set prenorm=False here since we don't need the residual + hidden_states = layer_norm_fn( + hidden_states, + self.ln_f.weight, + self.ln_f.bias, + residual=residual, + x1=None if not self.parallel_block else hidden_states2, + eps=self.ln_f.eps, + dropout_p=self.drop_f.p if self.training else 0.0, + prenorm=False, + is_rms_norm=isinstance(self.ln_f, RMSNorm) + ) + return hidden_states + + +class GPTLMHeadModel(GPTPreTrainedModel, GenerationMixin): + def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(config) + self.process_group = process_group + self.transformer = GPTModel(config, process_group=process_group, **factory_kwargs) + self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True) + lm_head_bias = getattr(config, "lm_head_bias", False) + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + # This option is for OPT-350m + word_embed_proj_dim = 
getattr(config, "word_embed_proj_dim", None) + embed_dim = config.n_embd if word_embed_proj_dim is None else word_embed_proj_dim + if word_embed_proj_dim is not None: + self.project_out = nn.Linear(config.n_embd, embed_dim, bias=False, **factory_kwargs) + else: + self.project_out = None + mup_width_scale = getattr(config, "mup_width_scale", 1.0) + mup_output_multiplier = getattr(config, "mup_output_multiplier", 1.0) + self.output_scale = mup_output_multiplier * mup_width_scale + if process_group is None: + self.lm_head = nn.Linear(embed_dim, vocab_size, bias=lm_head_bias, **factory_kwargs) + else: + if ColumnParallelLinear is None: + raise ImportError("fused_dense_lib is not installed") + self.lm_head = ColumnParallelLinear( + embed_dim, + vocab_size, + process_group, + bias=lm_head_bias, + sequence_parallel=getattr(config, "sequence_parallel", True), + **factory_kwargs, + ) + self.norm_head = getattr(config, "norm_head", False) + # Initialize weights and apply final processing + self.apply( + partial( + _init_weights, + n_layer=config.num_hidden_layers, + initializer_range=config.initializer_range, + mup_width_scale=mup_width_scale, + ) + ) + self.tie_weights() + + def tie_weights(self): + if self.tie_word_embeddings: + self.lm_head.weight = self.transformer.embeddings.word_embeddings.weight + if self.process_group is not None: + sync_shared_params(self, self.process_group) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.transformer.allocate_inference_cache( + batch_size, max_seqlen, dtype=dtype, **kwargs + ) + + def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0): + """ + input_ids: (batch, seqlen) int tensor + inference_params: for generation. Adapted from Megatron-LM (and Apex) + https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470 + num_last_tokens: if > 0, only return the logits for the last n tokens + """ + assert ( + input_ids.ndim == 2 + ), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}" + b, slen = input_ids.shape + hidden_states = self.transformer( + input_ids, position_ids=position_ids, inference_params=inference_params + ) + if inference_params is not None: + assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode" + if num_last_tokens > 0: + hidden_states = hidden_states[:, -num_last_tokens:] + if self.project_out is not None: + hidden_states = self.project_out(hidden_states) + if self.output_scale != 1.0: + hidden_states = hidden_states * self.output_scale + if not self.norm_head: + lm_logits = self.lm_head(hidden_states) + else: + lm_head_weight = F.normalize(self.lm_head.weight) + if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel: + hidden_states = all_gather(hidden_states, self.lm_head.process_group) + lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias) + # During inference, we want the full logit for sampling + if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None: + lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group) + lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... 
(n d)", b=b) + CausalLMOutput = namedtuple("CausalLMOutput", ["logits"]) + return CausalLMOutput(logits=lm_logits) + + def load_state_dict(self, state_dict, strict=True): + # Remapping from our checkpoints that used a different ordering of layers in the block + # Previous: Attn / MLP -> Dropout -> Add -> LN + # Current: Dropout -> Add -> LN -> Attn / MLP + if "transformer.ln_0.weight" in state_dict: + n_layers = len(self.transformer.layers) + ln_weight = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.weight") + ln_bias = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.bias") + state_dict["transformer.ln_f.weight"] = ln_weight + state_dict["transformer.ln_f.bias"] = ln_bias + for l in reversed(range(n_layers)): + ln_weight = state_dict.pop(f"transformer.layers.{l}.norm1.weight") + ln_bias = state_dict.pop(f"transformer.layers.{l}.norm1.bias") + state_dict[f"transformer.layers.{l}.norm2.weight"] = ln_weight + state_dict[f"transformer.layers.{l}.norm2.bias"] = ln_bias + if l > 0: + ln_weight = state_dict.pop(f"transformer.layers.{l - 1}.norm2.weight") + ln_bias = state_dict.pop(f"transformer.layers.{l - 1}.norm2.bias") + state_dict[f"transformer.layers.{l}.norm1.weight"] = ln_weight + state_dict[f"transformer.layers.{l}.norm1.bias"] = ln_bias + ln_weight = state_dict.pop("transformer.ln_0.weight") + ln_bias = state_dict.pop("transformer.ln_0.bias") + state_dict[f"transformer.layers.0.norm1.weight"] = ln_weight + state_dict[f"transformer.layers.0.norm1.bias"] = ln_bias + return super().load_state_dict(state_dict, strict=strict) + + +def shard_state_dict_tp(state_dict, config, world_size, rank): + """Convert the state_dict of a standard GPT model to the state_dict of a GPT model + with tensor parallel. + + This function modifies state_dict in place. + """ + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + assert vocab_size % world_size == 0 + assert config.hidden_size % world_size == 0 + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size + assert inner_dim % world_size == 0 + + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", n_head) + + embed_dim = config.hidden_size + head_dim = embed_dim // n_head + + def shard_first_dim(state_dict, key): + if key in state_dict: + x = state_dict[key] + dim = x.shape[0] // world_size + state_dict[key] = x[rank * dim : (rank + 1) * dim] + + def shard_last_dim(state_dict, key, multiple_of=1): + if key in state_dict: + x = state_dict[key] + dim_each_rank = [ + get_dim_for_local_rank(x.size(-1), world_size, local_rank, multiple_of) + for local_rank in range(world_size) + ] + beg, end = tuple(sum(dim_each_rank[:pos]) for pos in (rank, rank + 1)) + state_dict[key] = x[..., beg:end] + + def shard_gatedmlp_fc1_dim(state_dict, key): + if key in state_dict: + x = state_dict[key] + dim = x.shape[0] // world_size // 2 + state_dict[key] = rearrange( + rearrange(x, "(two o) ... -> two o ...", two=2)[:, rank * dim : (rank + 1) * dim], + "two o ... 
-> (two o) ...", + ) + + def shard_qkv_headdim(state_dict, key): + if key in state_dict: + n_head_each_rank = [ + get_dim_for_local_rank(n_head, world_size, local_rank) + for local_rank in range(world_size) + ] + n_head_kv_each_rank = [ + get_dim_for_local_rank(n_head_kv, world_size, local_rank) + for local_rank in range(world_size) + ] + + beg_n_head = sum(n_head_each_rank[:rank]) + end_n_head = sum(n_head_each_rank[: rank + 1]) + + beg_n_head_kv = sum(n_head_kv_each_rank[:rank]) + end_n_head_kv = sum(n_head_kv_each_rank[: rank + 1]) + + if n_head_kv == n_head: + x = rearrange(state_dict[key], "(three d) ... -> three d ...", three=3) + state_dict[key] = rearrange( + x[:, beg_n_head * head_dim : end_n_head * head_dim], + "three d ... -> (three d) ...", + ) + else: + x = rearrange( + state_dict[key], + "(nheadqkv headdim) ... -> nheadqkv headdim ...", + nheadqkv=n_head + 2 * n_head_kv, + ) + state_dict[key] = rearrange( + torch.cat( + [ + x[beg_n_head:end_n_head], + x[n_head + beg_n_head_kv : n_head + end_n_head_kv], + x[ + n_head + + n_head_kv + + beg_n_head_kv : n_head + + n_head_kv + + end_n_head_kv + ], + ], + dim=0, + ), + "nheadqkv headdim ... -> (nheadqkv headdim) ...", + ) + + shard_first_dim(state_dict, "transformer.embeddings.word_embeddings.weight") + if "lm_head.weight" in state_dict: + shard_first_dim(state_dict, "lm_head.weight") + if "transformer.embeddings.position_embeddings.weight" in state_dict: + shard_last_dim(state_dict, "transformer.embeddings.position_embeddings.weight") + for i in range(config.num_hidden_layers): + shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight") + shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias") + shard_last_dim( + state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", multiple_of=head_dim + ) + if rank != 0: + state_dict.pop(f"transformer.layers.{i}.mixer.out_proj.bias", None) + if config.activation_function in ["glu", "swiglu", "geglu"]: + shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight") + shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias") + else: + shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight") + shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias") + shard_last_dim(state_dict, f"transformer.layers.{i}.mlp.fc2.weight") + if rank != 0: + state_dict.pop(f"transformer.layers.{i}.mlp.fc2.bias", None) + return state_dict + + +def combine_state_dicts_tp(state_dicts: List[Dict[str, torch.Tensor]], config: GPT2Config): + """Convert the list of sharded state_dict of a GPT model with tensor parallel to + the state_dict of a standard GPT model. + + This function is meant to be the "reverse" of shard_state_dict_tp. + + Precondition: + - state_dicts should be ordered in the same way as the shards were created. + """ + world_size = len(state_dicts) + keys = state_dicts[0].keys() + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + assert vocab_size % world_size == 0 + assert config.hidden_size % world_size == 0 + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size + assert inner_dim % world_size == 0 + assert config.hidden_size % config.n_head == 0 + headdim = config.hidden_size // config.n_head + + # Sometimes the word embeddings are sharded on the 0th dim, sometimes on the 1st dim. + # vocab_size // world_size coordinates are nonzero. 
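The helpers below reverse this sharding. As a quick, self-contained illustration of the convention (not code from this module; the names, sizes, and even divisibility by `world_size` are assumptions made for the sketch): column-parallel weights such as `mlp.fc1.weight` are sliced along dim 0, row-parallel weights such as `mlp.fc2.weight` along the last dim, and concatenating the per-rank slices in rank order recovers the original tensors.

```python
import torch

# Toy round-trip check of the shard/combine convention (illustrative only).
world_size, hidden, inner = 4, 8, 32
full = {
    "mlp.fc1.weight": torch.randn(inner, hidden),  # column-parallel: shard dim 0
    "mlp.fc2.weight": torch.randn(hidden, inner),  # row-parallel: shard last dim
}

def shard(sd, rank):
    d0 = sd["mlp.fc1.weight"].shape[0] // world_size
    d1 = sd["mlp.fc2.weight"].shape[-1] // world_size
    return {
        "mlp.fc1.weight": sd["mlp.fc1.weight"][rank * d0 : (rank + 1) * d0],
        "mlp.fc2.weight": sd["mlp.fc2.weight"][..., rank * d1 : (rank + 1) * d1],
    }

shards = [shard(full, rank) for rank in range(world_size)]
combined = {
    "mlp.fc1.weight": torch.cat([s["mlp.fc1.weight"] for s in shards], dim=0),
    "mlp.fc2.weight": torch.cat([s["mlp.fc2.weight"] for s in shards], dim=-1),
}
assert all(torch.equal(full[k], combined[k]) for k in full)
```

The biases follow the same split for fc1, while the fc2 bias is kept only on rank 0 (it is added once after the row-parallel all-reduce), which is why the sharding code above pops `fc2.bias` on all other ranks.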
+ def combine_word_embeddings(state_dicts, state_dict, key): + dim = 0 if state_dicts[0][key].shape[0] == vocab_size // world_size else 1 + state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim) + + def combine_dim(state_dicts, state_dict, key, dim=-1): + if key in state_dict: + state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim) + + def combine_qkv_headdim(state_dicts, state_dict, key): + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", n_head) + if key in state_dict: + if n_head_kv == n_head: + xs = [ + rearrange(s[key], "(three d) ... -> three d ...", three=3) for s in state_dicts + ] + state_dict[key] = rearrange(torch.cat(xs, dim=1), "three d ... -> (three d) ...") + else: + n_head_each_rank = [ + get_dim_for_local_rank(n_head, world_size, local_rank) + for local_rank in range(world_size) + ] + n_head_kv_each_rank = [ + get_dim_for_local_rank(n_head_kv, world_size, local_rank) + for local_rank in range(world_size) + ] + xs = [ + rearrange( + s[key], + "(nheadqkv headdim) ... -> nheadqkv headdim ...", + nheadqkv=rank_n_head + 2 * rank_n_head_kv, + headdim=headdim, + ) + for s, rank_n_head, rank_n_head_kv in zip( + state_dicts, n_head_each_rank, n_head_kv_each_rank + ) + ] + wq = torch.cat([x[: n_head_each_rank[rank]] for rank, x in enumerate(xs)], dim=0) + wk = torch.cat( + [ + x[ + n_head_each_rank[rank] : n_head_each_rank[rank] + + n_head_kv_each_rank[rank] + ] + for rank, x in enumerate(xs) + ], + dim=0, + ) + wv = torch.cat( + [ + x[n_head_each_rank[rank] + n_head_kv_each_rank[rank] :] + for rank, x in enumerate(xs) + ], + dim=0, + ) + wqkv = torch.cat( + [wq, wk, wv], + dim=0, + ) + state_dict[key] = rearrange( + wqkv, + "nheadqkv headdim ... -> (nheadqkv headdim) ...", + ) + + def combine_gated_mlp(state_dicts, state_dict, key): + if key in state_dict: + xs = [rearrange(s[key], "(two d) ... -> two d ...", two=2) for s in state_dicts] + state_dict[key] = rearrange(torch.cat(xs, dim=1), "two d ... 
-> (two d) ...") + + state_dict = state_dicts[0].copy() # don't modify state_dict[0] inplace + combine_word_embeddings( + state_dicts, state_dict, "transformer.embeddings.word_embeddings.weight" + ) + if "lm_head.weight" in state_dict: + combine_word_embeddings(state_dicts, state_dict, "lm_head.weight") + if "transformer.embeddings.position_embeddings.weight" in state_dict: + combine_dim( + state_dicts, state_dict, "transformer.embeddings.position_embeddings.weight", -1 + ) + mlp_combine_fn = ( + combine_gated_mlp + if config.activation_function in ["glu", "swiglu", "geglu"] + else partial(combine_dim, dim=0) + ) + for i in range(config.num_hidden_layers): + combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight") + combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias") + combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", -1) + mlp_combine_fn(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.weight") + combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.bias", 0) + combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc2.weight", -1) + return state_dict + + +def remap_state_dict_hf_gpt2(state_dict, config): + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key) + + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("wte.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub(r"^h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for d in range(config.num_hidden_layers): + W1 = state_dict.pop(f"h.{d}.mlp.c_fc.weight") + state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = W1.t() + W2 = state_dict.pop(f"h.{d}.mlp.c_proj.weight") + state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t() + + def key_mapping_mlp(key): + key = re.sub(r"^h.(\d+).mlp.c_fc.bias", r"transformer.layers.\1.mlp.fc1.bias", key) + key = re.sub(r"^h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for d in range(config.num_hidden_layers): + state_dict.pop(f"h.{d}.attn.bias", None) # We don't store this bias + Wqkv = state_dict.pop(f"h.{d}.attn.c_attn.weight") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t() + Wout = state_dict.pop(f"h.{d}.attn.c_proj.weight") + state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t() + + def key_mapping_attn(key): + key = re.sub(r"^h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key) + key = re.sub( + r"^h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key + ) + return key + + state_dict 
= OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def remap_state_dict_megatron(state_dict, config): + def key_mapping_transformer(key): + key = re.sub(r"^language_model.encoder.", "transformer.", key) + key = re.sub(r"^language_model.", "transformer.", key) + return key + + state_dict = OrderedDict((key_mapping_transformer(k), v) for k, v in state_dict.items()) + + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key) + + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embedding.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.final_layernorm.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.(weight|bias)", + r"transformer.layers.\1.norm1.\2", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.(weight|bias)", + r"transformer.layers.\1.norm2.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_h_to_4h.(weight|bias)", + r"transformer.layers.\1.mlp.fc1.\2", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_4h_to_h.(weight|bias)", + r"transformer.layers.\1.mlp.fc2.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).self_attention.rotary_emb.inv_freq", + r"transformer.layers.\1.mixer.rotary_emb.inv_freq", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attention.query_key_value.(weight|bias)", + r"transformer.layers.\1.mixer.Wqkv.\2", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attention.dense.(weight|bias)", + r"transformer.layers.\1.mixer.out_proj.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + # Megatron stores Wqkv as ((nheads 3 headdim), hidden_dim) + # while we store Wqkv as ((3 nheads headdim), hidden_dim) + headdim = config.hidden_size // config.num_attention_heads + for d in range(config.num_hidden_layers): + Wqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = rearrange( + Wqkv, + "(nheads three headdim) ... 
-> (three nheads headdim) ...", + three=3, + headdim=headdim, + ) + bqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = rearrange( + bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim + ) + + return state_dict diff --git a/gpt2-hf.yaml b/gpt2-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6cb22f0a6576b5bd84f809caa0677ba49d16cf1 --- /dev/null +++ b/gpt2-hf.yaml @@ -0,0 +1,13 @@ +defaults: + - _self_ + - gpt2model: gpt2-small + +_target_: transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel +_recursive_: True +config: + _target_: transformers.GPT2Config + # Mistral's config: https://github.com/stanford-crfm/mistral/blob/main/conf/models/gpt2-small.yaml + # However, reorder_and_upcast_attn slows things down + reorder_and_upcast_attn: false + scale_attn_by_inverse_layer_idx: true + n_positions: ${datamodule.max_length} diff --git a/gpt2-large.yaml b/gpt2-large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..434a61eb99141305a7f01b9fa812614a0a1b7109 --- /dev/null +++ b/gpt2-large.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 1280 + n_head: 20 + n_layer: 36 diff --git a/gpt2-medium.yaml b/gpt2-medium.yaml new file mode 100644 index 0000000000000000000000000000000000000000..786091836aa683b26e4d39ab557b1556a5331250 --- /dev/null +++ b/gpt2-medium.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 1024 + n_head: 16 + n_layer: 24 diff --git a/gpt2-small.yaml b/gpt2-small.yaml new file mode 100644 index 0000000000000000000000000000000000000000..039c9180226c1f504b43ce559ad6bd9cbdcc8cf9 --- /dev/null +++ b/gpt2-small.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 768 + n_head: 12 + n_layer: 12 diff --git a/gpt2-xlarge.yaml b/gpt2-xlarge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d67a0e4185463f178e4f4510e3b753f3c7e0c169 --- /dev/null +++ b/gpt2-xlarge.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 1600 + n_head: 25 + n_layer: 48 diff --git a/gpt2.yaml b/gpt2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c3868d06e7f9e873c3146cd85318cc0bc7bc101 --- /dev/null +++ b/gpt2.yaml @@ -0,0 +1,13 @@ +defaults: + - _self_ + - gpt2model: gpt2-small + +_target_: flash_attn.models.gpt.GPTLMHeadModel +_recursive_: True +config: + _target_: transformers.GPT2Config + # Mistral's config: # https://github.com/stanford-crfm/mistral/blob/main/conf/models/mistral-small.yaml + # However, reorder_and_upcast_attn slows things down + reorder_and_upcast_attn: false + scale_attn_by_inverse_layer_idx: true + n_positions: ${datamodule.max_length} diff --git a/gpt2_training_curve.jpg b/gpt2_training_curve.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bcf31e5b101a3856fc68633cca50f50a6a13d26c Binary files /dev/null and b/gpt2_training_curve.jpg differ diff --git a/gpt2_training_efficiency.jpg b/gpt2_training_efficiency.jpg new file mode 100644 index 0000000000000000000000000000000000000000..60be8139c7eabb53e94c3ee64cff8c7339774a61 Binary files /dev/null and b/gpt2_training_efficiency.jpg differ diff --git a/gpt2l-flash.yaml b/gpt2l-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dacda10e5e02227b571c244562cfd15a8c190066 --- /dev/null +++ b/gpt2l-flash.yaml @@ -0,0 +1,41 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m-flash.yaml + - override /model/gpt2model: 
gpt2-large + # TD [2022-08-03] Surprisingly it's faster to use the ZeRO optimizer than just AdamW. + # Still, fairscale is even faster and uses less memory. + # I think it's because Pytorch is using ZeRO stage 1 and fairscale is using ZeRO stage 2? + # However, fairscale has issues with saving checkpoint (either OOM or very + # slow since it goes through the CPU?). Fairscale says Pytorch ZeRO is the + # upstream version of OSS + # https://github.com/facebookresearch/fairscale/issues/937 + # Pytorch ZeRO as also very slow for saving checkpoints due to + # consolidate_state_dict(), but I've fixed it to save separate checkpoint per GPU. + - override /optimizer: adamw-zero + + # FusedAdam doesn't seem to speed things up here, time per global step + # (i.e. batch size 512) on 8 A100s is around 2056ms for both AdamW and FusedAdam. + # This could be because each GPU is only doing the optimizer step for 1 / + # world_size of the parameters. + # Maybe the bottleneck here is the NCCL call to exchange parameters (ZeRO). + # - override /optimizer: adamw-apex-zero + +# Can enable mlp_chekcpoint_lvl to fit batch_size 16 on A100 40GB +# model: +# config: +# # mlp_checkpoint_lvl: ${eval:"[1] * 18 + [2] * 18"} +# mlp_checkpoint_lvl: 1 + +datamodule: + # batch_size: 16 + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +trainer: + # strategy: null + # strategy: ${eval:"None if ${trainer.devices} == 1 else 'ddp_sharded'"} + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True + # TD [2022-08-03] Deepspeed makes the ppl curve go wild + # strategy: deepspeed_stage_1 diff --git a/gpt2l-hf.yaml b/gpt2l-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8a2924925b95ffedcabd55b8ed075d9063f45dd --- /dev/null +++ b/gpt2l-hf.yaml @@ -0,0 +1,14 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m-hf.yaml + - override /model/gpt2model: gpt2-large + - override /optimizer: adamw-zero + +datamodule: + batch_size: 2 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt2l.yaml b/gpt2l.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83d3ccf256af8c423dbb16aa84a989f7a634fa3a --- /dev/null +++ b/gpt2l.yaml @@ -0,0 +1,14 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m.yaml + - override /model/gpt2model: gpt2-large + - override /optimizer: adamw-zero + +datamodule: + batch_size: 4 # Per GPU + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt2m-flash.yaml b/gpt2m-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bec671e26abc67dc3b8c2ab4a51ccaf710a18cd3 --- /dev/null +++ b/gpt2m-flash.yaml @@ -0,0 +1,17 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2s-flash.yaml + - override /model/gpt2model: gpt2-medium + +# Can enable mlp_checkpoint_lvl to fit batch_size 32 to A100 40GB +# model: +# config: +# mlp_checkpoint_lvl: 1 + +datamodule: + # batch_size: 32 + batch_size: ${eval:"8 if ${train.gpu_mem} < 24 else (16 if ${train.gpu_mem} < 40 else (32 if ${train.gpu_mem} < 80 else 64))"} + +train: + optimizer: + lr: 1.5e-4 diff --git a/gpt2m-hf.yaml b/gpt2m-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e570e21b86ff1dfcc71540867b032c5d2b755a9 --- 
/dev/null +++ b/gpt2m-hf.yaml @@ -0,0 +1,11 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2s-hf.yaml + - override /model/gpt2model: gpt2-medium + +datamodule: + batch_size: 4 + +train: + optimizer: + lr: 1.5e-4 diff --git a/gpt2m.yaml b/gpt2m.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cc99335b7f5c8bf8d4114becfe19df9450e3c29 --- /dev/null +++ b/gpt2m.yaml @@ -0,0 +1,11 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2s.yaml + - override /model/gpt2model: gpt2-medium + +datamodule: + batch_size: 8 # Per GPU + +train: + optimizer: + lr: 1.5e-4 diff --git a/gpt2s-flash.yaml b/gpt2s-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2dc6956d97e074e55c514053daff39b15295c86 --- /dev/null +++ b/gpt2s-flash.yaml @@ -0,0 +1,19 @@ +# @package _global_ +defaults: + - /experiment/owt/base.yaml + - override /model: gpt2 + - override /model/gpt2model: gpt2-small + +model: + config: + # n_positions is already set to ${datamodule.max_length} + residual_in_fp32: True + use_flash_attn: True + fused_bias_fc: True + fused_mlp: True + fused_dropout_add_ln: True + pad_vocab_size_multiple: 8 + +datamodule: + # batch_size: 64 + batch_size: ${eval:"16 if ${train.gpu_mem} < 24 else (32 if ${train.gpu_mem} < 40 else 64)"} diff --git a/gpt2s-hf.yaml b/gpt2s-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b0f65cabb393e7179416509fcfede1abedeaaa6 --- /dev/null +++ b/gpt2s-hf.yaml @@ -0,0 +1,23 @@ +# @package _global_ +defaults: + - /experiment/owt/base.yaml + - override /model: gpt2-hf + - override /model/gpt2model: gpt2-small + - override /callbacks: [default, norm-monitor, flop-count] + +datamodule: + batch_size: 8 + +train: + # Use the standard torch.nn.CrossEntropyLoss + loss_fn: null + +callbacks: + flop_count: + input_size: + - ${datamodule.max_length} + input_dtype: + # It's surprisingly hard to get hydra to return torch.long since it's not a callable + _target_: torch.__getattribute__ + _args_: + - long diff --git a/gpt2s.yaml b/gpt2s.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9faf60b01a81e59263ce6634c4860b9f7a4b18b --- /dev/null +++ b/gpt2s.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/owt/base.yaml + - override /model: gpt2 + - override /model/gpt2model: gpt2-small + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else 16)"} diff --git a/gpt2xl-flash.yaml b/gpt2xl-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..717847ec20cc167787b203509ea590ded3c3f1b1 --- /dev/null +++ b/gpt2xl-flash.yaml @@ -0,0 +1,21 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2l-flash.yaml + - override /model/gpt2model: gpt2-xlarge + +# Can enable mlp_checkpoint_lvl to fit to A100 40GB +# model: +# config: +# # mlp_checkpoint_lvl: ${eval:"[1] * 18 + [2] * 18"} +# mlp_checkpoint_lvl: 1 + +datamodule: + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else (8 if ${train.gpu_mem} < 80 else 16))"} + # With adamw-zero optimizer, on A100 40GB: + # checkpoint_lvl=1, batch size = 4: mem 37GB, 4650ms / batch of 512 (285ms * 15 + 375ms * 1) + # checkpoint_lvl=1, batch size = 8: mem 46GB, 4330ms / batch of 512 (530ms * 7 + 620ms * 1) + # checkpoint_lvl=2, batch size = 8: mem 41GB, 4570ms / batch of 512 (560ms * 7 + 650ms * 1) + # With adamw-apex-distributed optimizer: + # checkpoint_lvl=1, batch size = 8: mem 41.5GB, 4500ms / batch of 512 (550ms * 7 + 650ms * 1) + 
# checkpoint_lvl=1 for 24 layers and checkpoint_lvl=2 for 24 layers, + # batch size = 8: mem 39GB, 4640ms / batch of 512 (565ms * 7 + 675ms * 1) diff --git a/gpt2xl-hf.yaml b/gpt2xl-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8560bd11d7e3efaed4d607a37bd4fbd9cf3afe8 --- /dev/null +++ b/gpt2xl-hf.yaml @@ -0,0 +1,7 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2l-hf.yaml + - override /model/gpt2model: gpt2-xlarge + +datamodule: + batch_size: 1 diff --git a/gpt2xl.yaml b/gpt2xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a43db2f85b46e1003aa1d29339060f043e2b6644 --- /dev/null +++ b/gpt2xl.yaml @@ -0,0 +1,14 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m.yaml + - override /model/gpt2model: gpt2-xlarge + - override /optimizer: adamw-zero + +datamodule: + batch_size: 2 # Per GPU + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt3-2.7B-flash-8k.yaml b/gpt3-2.7B-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f126d18b769bfa652a649bc3e19c29bdbda00499 --- /dev/null +++ b/gpt3-2.7B-flash-8k.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-8k.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-hdim128-rotary-8k.yaml b/gpt3-2.7B-flash-hdim128-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09fdee93cc89f1b1b2ad3e4e70681ef325c57723 --- /dev/null +++ b/gpt3-2.7B-flash-hdim128-rotary-8k.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary-8k.yaml + +model: + config: + n_embd: 2560 + n_head: 20 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 24 else (2 if ${train.gpu_mem} < 40 else (4 if ${train.gpu_mem} < 80 else 8))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-hdim128-rotary.yaml b/gpt3-2.7B-flash-hdim128-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5caafd150cd192a103e7c3b4a6e72b33f738fec --- /dev/null +++ b/gpt3-2.7B-flash-hdim128-rotary.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary.yaml + +model: + config: + n_embd: 2560 + n_head: 20 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-hdim128.yaml b/gpt3-2.7B-flash-hdim128.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fd391b79ce11e674c9c61f95f65756aec706987 --- /dev/null +++ b/gpt3-2.7B-flash-hdim128.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +model: + config: + n_embd: 2560 + n_head: 20 # Headdim 128 is faster than headdim 80 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if 
${train.gpu_mem} < 80 else 4)"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-rotary-8k.yaml b/gpt3-2.7B-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b259a29997e7f9bd39bb3f63d8803ae09a75a764 --- /dev/null +++ b/gpt3-2.7B-flash-rotary-8k.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary-8k.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 24 else (2 if ${train.gpu_mem} < 40 else (4 if ${train.gpu_mem} < 80 else 8))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-rotary.yaml b/gpt3-2.7B-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e1684c4f5068f42d48cea7eba45b0b612036fa9 --- /dev/null +++ b/gpt3-2.7B-flash-rotary.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash.yaml b/gpt3-2.7B-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dbfc21dbe98960c4732491392ab47f166b946f1 --- /dev/null +++ b/gpt3-2.7B-flash.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-hf-hdim128.yaml b/gpt3-2.7B-hf-hdim128.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc365d9108124bb5f5c15e76dd9a183cedf0148d --- /dev/null +++ b/gpt3-2.7B-hf-hdim128.yaml @@ -0,0 +1,17 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-hf.yaml + +model: + config: + n_embd: 2560 + n_head: 128 + n_layer: 32 + +# OOM on A100 80GB even with batch_size = 1 +datamodule: + batch_size: 1 + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-hf.yaml b/gpt3-2.7B-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff0a7a720f73bae3e0dd4303dbd870d83395fee6 --- /dev/null +++ b/gpt3-2.7B-hf.yaml @@ -0,0 +1,16 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-hf.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + +datamodule: + batch_size: 1 + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3_training_curve.jpg b/gpt3_training_curve.jpg new file mode 100644 index 0000000000000000000000000000000000000000..50cc9b51d3e2da1ab843a5e0db4f2b086969306a Binary files /dev/null and b/gpt3_training_curve.jpg differ diff --git a/gpt3_training_efficiency.jpg b/gpt3_training_efficiency.jpg new file mode 100644 index 0000000000000000000000000000000000000000..541fccf8ba24a712a2bcdf5eb5553c32d1b5ca55 Binary files /dev/null and b/gpt3_training_efficiency.jpg differ diff --git a/gpt3l-flash-8k.yaml b/gpt3l-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ccbbebfd00472d1e1fc6e5340c8ffa1137240b8c --- /dev/null +++ b/gpt3l-flash-8k.yaml @@ -0,0 
+1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + global_batch_size: 64 diff --git a/gpt3l-flash-rotary-30B.yaml b/gpt3l-flash-rotary-30B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74c6bb9ce57c7e7aad50f93588f93f976f032774 --- /dev/null +++ b/gpt3l-flash-rotary-30B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3l-flash-rotary-8k.yaml b/gpt3l-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b3ba3145b0e8bdb40acb30db4e6d86354afc0fb --- /dev/null +++ b/gpt3l-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3l-flash-rotary.yaml b/gpt3l-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2856320273305147bf96633ed445507c0bdcc82 --- /dev/null +++ b/gpt3l-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3l-flash.yaml b/gpt3l-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eebc19a85338a69c623effc80533cf07c7962bc2 --- /dev/null +++ b/gpt3l-flash.yaml @@ -0,0 +1,24 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + - override /optimizer: adamw-zero + +model: + config: + n_embd: 1536 + n_head: 16 + n_layer: 24 + # mlp_checkpoint_lvl: 1 # To fit batch_size 8 + +datamodule: + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else (8 if ${train.gpu_mem} < 80 else 16))"} + +train: + optimizer: + lr: 2.5e-4 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt3l-hf.yaml b/gpt3l-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f70af10ccb4f81ee11cdb0d977d846708d0e53a1 --- /dev/null +++ b/gpt3l-hf.yaml @@ -0,0 +1,16 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-hf.yaml + +model: + config: + n_embd: 1536 + n_head: 16 + n_layer: 24 + +datamodule: + batch_size: 2 + +train: + optimizer: + lr: 2.5e-4 diff --git a/gpt3m-flash-8k.yaml b/gpt3m-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d75e6d3a3a434213a919eb85280aeb7a2ee34c5d --- /dev/null +++ b/gpt3m-flash-8k.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else 8)"} + +train: + global_batch_size: 64 diff --git a/gpt3m-flash-rotary-30B.yaml b/gpt3m-flash-rotary-30B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04630753e5643f01281d0310b0766f0b28c8e47c --- /dev/null +++ b/gpt3m-flash-rotary-30B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3m-flash-rotary-8k.yaml 
b/gpt3m-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f217ac5211c4c61b3b789f3d572aa0908fe72898 --- /dev/null +++ b/gpt3m-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3m-flash-rotary.yaml b/gpt3m-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..adb0cb6142a2ee3ad5d82f7321d4f5f26be43053 --- /dev/null +++ b/gpt3m-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3m-flash.yaml b/gpt3m-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..830b2d5df079f2aae91e0ebdb235c753b7a51ba7 --- /dev/null +++ b/gpt3m-flash.yaml @@ -0,0 +1,16 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + - override /model/gpt2model: gpt2-medium + +# Can enable mlp_checkpoint_lvl to fit batch_size 16 to A100 40GB +# model: +# config: +# mlp_checkpoint_lvl: 1 + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +train: + optimizer: + lr: 3.0e-4 diff --git a/gpt3m-hf.yaml b/gpt3m-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0e09e4e964a6e0aec5096d45badc5f787096d8c --- /dev/null +++ b/gpt3m-hf.yaml @@ -0,0 +1,11 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-hf.yaml + - override /model/gpt2model: gpt2-medium + +datamodule: + batch_size: 4 + +train: + optimizer: + lr: 3.0e-4 diff --git a/gpt3s-flash-8k.yaml b/gpt3s-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06ce6453d103c92c2c141dcd780f2c1aca756f7c --- /dev/null +++ b/gpt3s-flash-8k.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else 8)"} + +train: + global_batch_size: 64 diff --git a/gpt3s-flash-rotary-30B.yaml b/gpt3s-flash-rotary-30B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d434480060acfb240b12c74943a3617bc77a023e --- /dev/null +++ b/gpt3s-flash-rotary-30B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3s-flash-rotary-8k.yaml b/gpt3s-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdee8766f9795fb72eafc7cdaa0e7c6e8712423f --- /dev/null +++ b/gpt3s-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3s-flash-rotary.yaml b/gpt3s-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41176eea179f279c7ac161540d9e67b00bf34d07 --- /dev/null +++ b/gpt3s-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3s-flash.yaml 
b/gpt3s-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45302fd4d3cb01b804ec5d5b99c86d24379b5366 --- /dev/null +++ b/gpt3s-flash.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/base.yaml + - override /model: gpt2 + - override /model/gpt2model: gpt2-small + +model: + config: + # n_positions is already set to ${datamodule.max_length} + residual_in_fp32: True + use_flash_attn: True + fused_dropout_add_ln: True + fused_mlp: True + fused_bias_fc: True + pad_vocab_size_multiple: 8 + +datamodule: + batch_size: ${eval:"8 if ${train.gpu_mem} < 24 else (16 if ${train.gpu_mem} < 40 else 32)"} diff --git a/gpt3s-hf.yaml b/gpt3s-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..459121759ad1ed25f943ee7857dc70ecf11b60cc --- /dev/null +++ b/gpt3s-hf.yaml @@ -0,0 +1,12 @@ +# @package _global_ +defaults: + - /experiment/pile/base.yaml + - override /model: gpt2-hf + - override /model/gpt2model: gpt2-small + +datamodule: + batch_size: 8 + +train: + # Use the standard torch.nn.CrossEntropyLoss + loss_fn: null diff --git a/gpt3xl-flash-8k.yaml b/gpt3xl-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d411906a136a8c0f41d1dfc4a3128a1c0da92463 --- /dev/null +++ b/gpt3xl-flash-8k.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + global_batch_size: 128 diff --git a/gpt3xl-flash-rotary-60B.yaml b/gpt3xl-flash-rotary-60B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48e421346257a78663221540b26be47e33b56afe --- /dev/null +++ b/gpt3xl-flash-rotary-60B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3xl-flash-rotary-8k.yaml b/gpt3xl-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4c4cbe2fff8da4ddccced6cec9b0b4107330dfe --- /dev/null +++ b/gpt3xl-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3xl-flash-rotary.yaml b/gpt3xl-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f05f70595da94de4a42ce78a42db7da338c9a6d4 --- /dev/null +++ b/gpt3xl-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3xl-flash.yaml b/gpt3xl-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f432e35539fe3832e8347f512d17b6418ffb1610 --- /dev/null +++ b/gpt3xl-flash.yaml @@ -0,0 +1,35 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + - override /optimizer: adamw-zero + +model: + config: + n_embd: 2048 + n_head: 16 + n_layer: 24 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 24 else (2 if ${train.gpu_mem} < 40 else (4 if ${train.gpu_mem} < 80 else 8))"} + +train: + global_batch_size: 512 + optimizer: + lr: 2.0e-4 + scheduler: + t_initial: 300000 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + 
gradient_as_bucket_view: True + max_steps: 400000 + val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}} + +callbacks: + model_checkpoint: + every_n_train_steps: 1000 + model_checkpoint_progress: + every_n_train_steps: 12500 + fault_tolerant: False # Saving takes too long diff --git a/gpt3xl-hf.yaml b/gpt3xl-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58f29bd6b865fb3b2d12b22c4ac0577ea78a841a --- /dev/null +++ b/gpt3xl-hf.yaml @@ -0,0 +1,35 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-hf.yaml + - override /optimizer: adamw-zero + +model: + config: + n_embd: 2048 + n_head: 16 + n_layer: 24 + +datamodule: + batch_size: 2 + +train: + global_batch_size: 512 + optimizer: + lr: 2.0e-4 + scheduler: + t_initial: 300000 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True + max_steps: 400000 + val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}} + +callbacks: + model_checkpoint: + every_n_train_steps: 1000 + model_checkpoint_progress: + every_n_train_steps: 12500 + fault_tolerant: False # Saving takes too long diff --git a/gpt_neox.py b/gpt_neox.py new file mode 100644 index 0000000000000000000000000000000000000000..c3894044172260a25c9c561fbaac8add91db5b23 --- /dev/null +++ b/gpt_neox.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023, Tri Dao. + +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from einops import rearrange +from transformers import GPT2Config, GPTNeoXConfig + + +def remap_state_dict_hf_gpt_neox(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^gpt_neox.", "transformer.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + # Word embedding + def key_mapping_emb(key): + return re.sub(r"^transformer.embed_in.", "transformer.embeddings.word_embeddings.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings", False): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("embed_out.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
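Aside on the padding idiom used here and in the other remapping functions: `F.pad` takes its pad widths starting from the last dimension, so `(0, 0, 0, n)` leaves the embedding dimension untouched and appends `n` zero rows along the vocabulary dimension. A small sketch with illustrative sizes (GPT-2's 50257-token vocabulary padded up to a multiple of 8):

```python
import torch
import torch.nn.functional as F

emb = torch.randn(50257, 768)                     # (vocab_size, n_embd), sizes for illustration
padded = F.pad(emb, (0, 0, 0, 50264 - emb.shape[0]))  # pad spec: (dim -1 left/right, dim -2 top/bottom)
assert padded.shape == (50264, 768)
assert torch.equal(padded[:50257], emb) and padded[50257:].eq(0).all()
```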
+ state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + # We don't store these biases + state_dict.pop(f"transformer.layers.{l}.attention.bias") + state_dict.pop(f"transformer.layers.{l}.attention.masked_bias") + # We don't store these + state_dict.pop(f"transformer.layers.{l}.attention.rotary_emb.inv_freq", None) + # GPT-NeoX stores Wqkv as ((nheads 3 headdim), hidden_dim) + # while we store Wqkv as ((3 nheads headdim), hidden_dim) + headdim = config.hidden_size // config.num_attention_heads + Wqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.weight") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = rearrange( + Wqkv, + "(nheads three headdim) ... -> (three nheads headdim) ...", + three=3, + headdim=headdim, + ) + bqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.bias") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = rearrange( + bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim + ) + + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).attention.dense.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def gpt_neox_config_to_gpt2_config(gpt_neox_config: GPTNeoXConfig) -> GPT2Config: + assert gpt_neox_config.rotary_emb_base == 10000 + return GPT2Config( + vocab_size=gpt_neox_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=gpt_neox_config.hidden_size, + n_layer=gpt_neox_config.num_hidden_layers, + n_head=gpt_neox_config.num_attention_heads, + n_inner=gpt_neox_config.intermediate_size, + activation_function=gpt_neox_config.hidden_act, + resid_pdrop=0.0, # No dropout + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=gpt_neox_config.layer_norm_eps, + initializer_range=gpt_neox_config.initializer_range, + bos_token_id=gpt_neox_config.bos_token_id, + eos_token_id=gpt_neox_config.eos_token_id, + # These are new arguments not in the original GPT2Config + prenorm=True, + parallel_block=gpt_neox_config.use_parallel_residual, + parallel_block_tied_norm=False, + rotary_emb_fraction=gpt_neox_config.rotary_pct, + tie_word_embeddings=gpt_neox_config.tie_word_embeddings, + ) diff --git a/gptj.py b/gptj.py new file mode 100644 index 0000000000000000000000000000000000000000..ca2330d79ce5b78a1229351956da20d88e356083 --- /dev/null +++ b/gptj.py @@ -0,0 +1,109 @@ +# Copyright (c) 2023, Tri Dao. 
+ +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from transformers import GPT2Config, GPTJConfig + + +def remap_state_dict_hf_gptj(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^transformer.h.", "transformer.layers.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + # Word embedding + def key_mapping_emb(key): + return re.sub(r"^transformer.wte.", "transformer.embeddings.word_embeddings.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + output_embeddings_bias = state_dict.pop("lm_head.bias") + state_dict["lm_head.bias"] = F.pad( + output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + return re.sub(r"^transformer.layers.(\d+).ln_1.", r"transformer.layers.\1.norm1.", key) + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc_in.", r"transformer.layers.\1.mlp.fc1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc_out.", r"transformer.layers.\1.mlp.fc2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"transformer.layers.{l}.attn.q_proj.weight") + Wk = state_dict.pop(f"transformer.layers.{l}.attn.k_proj.weight") + Wv = state_dict.pop(f"transformer.layers.{l}.attn.v_proj.weight") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + # We don't store these biases + state_dict.pop(f"transformer.layers.{l}.attn.bias") + state_dict.pop(f"transformer.layers.{l}.attn.masked_bias") + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).attn.out_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def gptj_config_to_gpt2_config(gptj_config: GPTJConfig) -> GPT2Config: + headdim = gptj_config.n_embd // gptj_config.n_head + return GPT2Config( + vocab_size=gptj_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=gptj_config.n_embd, + n_layer=gptj_config.n_layer, + n_head=gptj_config.n_head, + n_inner=gptj_config.n_inner, + activation_function=gptj_config.activation_function, + resid_pdrop=gptj_config.resid_pdrop, + embd_pdrop=gptj_config.embd_pdrop, + attn_pdrop=gptj_config.attn_pdrop, + layer_norm_epsilon=gptj_config.layer_norm_epsilon, + 
initializer_range=gptj_config.initializer_range, + bos_token_id=gptj_config.bos_token_id, + eos_token_id=gptj_config.eos_token_id, + # These are new arguments not in the original GPT2Config + prenorm=True, + parallel_block=True, + parallel_block_tied_norm=True, + rotary_emb_fraction=gptj_config.rotary_dim / headdim, + rotary_emb_interleaved=True, + tie_word_embeddings=False, + qkv_proj_bias=False, + out_proj_bias=False, + lm_head_bias=True, + ) diff --git a/gpu-monitor.yaml b/gpu-monitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6780f6d1c1bce7a4792fe15cc45eb57b5488d4bb --- /dev/null +++ b/gpu-monitor.yaml @@ -0,0 +1,11 @@ +defaults: + - default.yaml + +gpu_stats_monitor: + _target_: pytorch_lightning.callbacks.GPUStatsMonitor + # [2021-08-13] TD: I just want the intra_step_size but it'll error if I + # don't have memory_utilization and gpu_utilization. + # Maybe I should write a callback with just the intra_step_size. + memory_utilization: True + gpu_utilization: True + intra_step_time: True diff --git a/interface.cpp b/interface.cpp new file mode 100644 index 0000000000000000000000000000000000000000..41a783fd0fcc661276d34f2cc7a758b80a27d453 --- /dev/null +++ b/interface.cpp @@ -0,0 +1,59 @@ +#include + +// CUDA forward declarations +std::vector softmax_xentropy_cuda( + const at::Tensor &input, + const at::Tensor &labels, + const float smoothing, + const int total_classes); + +at::Tensor softmax_xentropy_backward_cuda( + const at::Tensor &grad_loss, + at::Tensor &logits, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + const bool inplace, + const int total_classes); + +// C++ interface + +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector softmax_xentropy_forward( + const at::Tensor &input, + const at::Tensor &labels, + const float smoothing, + const int total_classes=-1) { + // For tensor parallel cross entropy with smoothing, we want to pass in the total number + // of classes so that smoothing can be applied correctly. If total_classes=-1, use the + // last dimension of the input tensor. 
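For intuition on why `total_classes` is threaded through to the kernel: with label smoothing under tensor parallelism, each rank only holds `vocab_size / world_size` columns of the logits, but the uniform smoothing mass should still be divided by the global class count. Below is a plain-PyTorch reference for one common smoothing convention (a sketch only; the CUDA kernel's exact formulation and its handling of labels outside the local shard may differ):

```python
import torch
import torch.nn.functional as F

def smoothed_cross_entropy_reference(logits, labels, smoothing=0.1, total_classes=-1):
    # (1 - smoothing) of the target mass goes to the label class; `smoothing` is
    # spread uniformly over `total_classes` classes. If total_classes <= 0, fall
    # back to the local logit dimension, mirroring the total_classes=-1 default.
    if total_classes <= 0:
        total_classes = logits.shape[-1]
    log_probs = F.log_softmax(logits.float(), dim=-1)
    nll = -log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
    smooth = -log_probs.sum(dim=-1) / total_classes
    return (1.0 - smoothing) * nll + smoothing * smooth

logits = torch.randn(4, 50264)
labels = torch.randint(0, 50264, (4,))
loss = smoothed_cross_entropy_reference(logits, labels, smoothing=0.1)
```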
+ CHECK_INPUT(input); + CHECK_INPUT(labels); + + return softmax_xentropy_cuda(input, labels, smoothing, total_classes); +} + +at::Tensor softmax_xentropy_backward( + const at::Tensor &grad_loss, + at::Tensor &logits, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + const bool inplace, + const int total_classes=-1) { + CHECK_INPUT(grad_loss); + CHECK_INPUT(logits); + CHECK_INPUT(max_log_sum_exp); + CHECK_INPUT(labels); + + return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, + smoothing, inplace, total_classes); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)", py::arg("input"), py::arg("labels"), py::arg("smoothing"), py::arg("total_classes")=-1); + m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)", py::arg("grad_loss"), py::arg("logits"), py::arg("max_log_sum_exp"), py::arg("labels"), py::arg("smoothing"), py::arg("inplace"), py::arg("total_classes")=-1); +} diff --git a/invsqrt.yaml b/invsqrt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb16f3c15bfd4f3a337a3b082a12e37f14bafd1b --- /dev/null +++ b/invsqrt.yaml @@ -0,0 +1,3 @@ +# @package train.scheduler +_target_: src.optim.lr_scheduler.InvSqrt +num_warmup_steps: ??? diff --git a/k_activations.py b/k_activations.py new file mode 100644 index 0000000000000000000000000000000000000000..efb83c358eb4a85d069ee340a3c83f418f9a805b --- /dev/null +++ b/k_activations.py @@ -0,0 +1,162 @@ +# Adapted from https://github.com/facebookresearch/xformers/blob/main/xformers/triton/k_activations.py +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import math +from enum import Enum +from typing import Optional + +import triton +import triton.language as tl + +_sqrt2pi = math.sqrt(2.0 / math.pi) +_sqrt1_2 = math.sqrt(1.0 / 2) +_gaussian_pdf_normalization = 1.0 / math.sqrt(2 * math.pi) + + +class Activation(str, Enum): + SquaredReLU = "squared_relu" + GeLU = "gelu" + GeLUApprox = "gelu_approx" + LeakyReLU = "leaky_relu" + ReLU = "relu" + + +def get_triton_activation_kernel(activation: Optional[Activation]): + return ( + { + Activation.ReLU: relu, + Activation.LeakyReLU: leaky_relu, + Activation.GeLU: gelu, + Activation.GeLUApprox: gelu_approx, + Activation.SquaredReLU: squared_relu, + }[activation] + if activation + else None + ) + + +def get_triton_activation_bwd_kernel(activation: Optional[Activation]): + return ( + { + Activation.ReLU: relu_grad, + Activation.LeakyReLU: leaky_relu_grad, + Activation.GeLU: gelu_grad, + Activation.GeLUApprox: gelu_approx_grad, + Activation.SquaredReLU: squared_relu_grad, + }[activation] + if activation + else None + ) + + +@triton.jit +def tanh(x): + # Tanh is just a scaled sigmoid + return 2 * tl.sigmoid(2 * x) - 1 + + +@triton.jit +def cosh(x): + exp_x = tl.exp(x) + return (exp_x + 1.0 / exp_x) * 0.5 + + +# a Triton implementation of the most used activations +# See for instance http://arxiv.org/abs/1606.08415 for an overview + +# ReLU +@triton.jit +def relu(x): + """ + ReLU_ activation function + + .. 
_ReLU: https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html + """ + zero = 0.0 + return tl.where(x >= 0, x, zero.to(x.dtype)) + + +@triton.jit +def relu_grad(x): + # ReLU is different from other activations + # in that it does not require the input to retrospectively compute its gradient + # here the input is the downstream gradient, and we return the upstream gradient directly + zero = 0.0 + one = 1.0 + return tl.where(x >= 0, one.to(x.dtype), zero.to(x.dtype)) + + +@triton.jit +def squared_relu(x): + """ + Squared ReLU activation, as proposed in the Primer_ paper. + + .. _Primer: https://arxiv.org/abs/2109.08668 + """ + x_ = relu(x) + return (x_ * x_).to(x.dtype) + + +@triton.jit +def squared_relu_grad(x): + return tl.where(x >= 0, 2.0 * x, 0.0) + + +# Leaky ReLU +@triton.jit +def leaky_relu(x): + """ + LeakyReLU_ activation + + .. _LeakyReLU: https://pytorch.org/docs/stable/generated/torch.nn.LeakyReLU.html + """ + scale = 0.01 + 0.0 + scale = scale.to(x.dtype) + return tl.where(x >= 0, x, scale * x) + + +@triton.jit +def leaky_relu_grad(x): + min_grad = 0.01 + max_grad = 1 + + min_grad = min_grad.to(x.dtype) + max_grad = max_grad.to(x.dtype) + + return tl.where(x >= 0, max_grad, min_grad) + + +@triton.jit +def gelu(x): + """Gaussian Error Linear Unit (GELU)""" + return x * 0.5 * (1.0 + tl.libdevice.erf(x * _sqrt1_2)) + + +@triton.jit +def gelu_grad(x): + cdf = 0.5 * (1.0 + tl.libdevice.erf(x * _sqrt1_2)) + pdf = tl.exp(-0.5 * x * x) * _gaussian_pdf_normalization + return cdf + x * pdf + + +@triton.jit +def gelu_approx(x): + """ + GeLU_ activation - Gaussian error linear unit, with tanh approximation + + .. _GeLU: https://arxiv.org/pdf/1606.08415.pdf + """ + return 0.5 * x * (1.0 + tanh(_sqrt2pi * x * (1.0 + 0.044715 * x * x))) + + +@triton.jit +def gelu_approx_grad(x): + # CREDITS: Fast implementation proposed in + # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/fused_bias_gelu.py#L30 + tanh_out = tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + return 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) diff --git a/kernel_traits.h b/kernel_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..b75a8c3c14ceaf5f0fe09506d5d39eb6ff6052d7 --- /dev/null +++ b/kernel_traits.h @@ -0,0 +1,952 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cute/algorithm/copy.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/layout/layout.h" +#include "cutlass/numeric_types.h" +#include "cutlass/pipeline/pipeline.hpp" + +using namespace cute; + +template +struct SharedStorageQKVO { + cute::array_aligned> smem_q; + cute::array_aligned> smem_k; + union { + cute::array_aligned> smem_v; + cute::array_aligned> smem_o; + }; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterBarrier barrier_O; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + int tile_count_semaphore; + }; +}; + +template +struct SharedStorageQKVOVt { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + union { + cute::array_aligned> smem_v_out; + cute::array_aligned> smem_o; + }; + }; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterBarrier barrier_O; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + typename cutlass::PipelineAsync::SharedStorage pipeline_vt; + int tile_count_semaphore; + }; +}; + +// If Share_Q_K_smem is true, that forces Is_Q_in_regs to be true +template +struct Flash_fwd_kernel_traits { + using Element = elem_type; + using ElementAccum = float; + using OutputType = elem_type; + using index_t = int64_t; + + // The number of threads. + static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarp; + + static constexpr bool Is_Q_in_regs = Is_Q_in_regs_; + static_assert(kNWarps_ == 4 || kNWarps_ == 8 || kNWarps_ == 12 || kNWarps_ == 16); + static constexpr bool Is_WS = kNWarps_ >= 12; + static_assert(!(Is_WS && Is_Q_in_regs), "Warp-specialization does not support Q in registers"); + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterM = kClusterM_; + using ClusterShape_MNK = Shape, _1, _1>; + + static constexpr int kStages = kStages_; + + using AtomLayoutMNK = Layout, _1, _1>>; + using TiledMma0 = decltype(cute::make_tiled_mma( + std::conditional_t< + Is_Q_in_regs, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutMNK{})); + using TiledMma1 = decltype(cute::make_tiled_mma( + cute::GMMA::rs_op_selector(TileShape_MNK{})), + GMMA::Major::K, GMMA::Major::MN>(), + AtomLayoutMNK{})); + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = + decltype(tile_to_shape(SmemLayoutAtomK{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using SmemLayoutAtomV = 
decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = + decltype(tile_to_shape(SmemLayoutAtomV{}, + make_shape(get<1>(TileShape_MNK{}), get<2>(TileShape_MNK{}), Int{}))); + + // Note this is the transpose in terms of the view, not in terms of memory. + using SmemLayoutVt = + decltype(composition(SmemLayoutV{}, + make_ordered_layout( + make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{}), Int{}), + Step<_2, _1, _3>{}))); + + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + + using SmemCopyAtomQ = Copy_Atom; + + using SharedStorage = SharedStorageQKVO; + + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + using MainloopPipelineNoTMA = typename cutlass::PipelineAsync; + using PipelineState = typename cutlass::PipelineState; + // using BarrierType = typename MainloopPipeline::ProducerBarrierType; + +}; + +// Traits struct for fp8 kernel with in-kernel transpose +template +struct Flash_fwd_kernel_traits_fp8 { + using Element = elem_type; + static_assert(cutlass::sizeof_bits_v == 8); + using ElementAccum = float; + using OutputType = cutlass::half_t; + using index_t = int64_t; + + // The number of threads. + static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarpGroup; + + static constexpr bool Is_Q_in_regs = Is_Q_in_regs_; + static_assert(kNWarps_ == 12 || kNWarps_ == 16); + static constexpr bool Is_WS = true; + static_assert(!Is_Q_in_regs, "Warp-specialization does not support Q in registers"); + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterM = kClusterM_; + using ClusterShape_MNK = Shape, _1, _1>; + + static constexpr int kStages = kStages_; + static_assert(kStages > 1); + + using AtomLayoutMNK = Layout, _1, _1>>; + using TiledMma0 = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutMNK{})); + + using TiledMma1 = decltype(cute::make_tiled_mma( + cute::GMMA::rs_op_selector(TileShape_MNK{}))>(), + AtomLayoutMNK{})); + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = + decltype(tile_to_shape(SmemLayoutAtomK{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using TransposeShapeAtomV = Shape<_64, _64>; + using SmemLayoutAtomV = decltype(tile_to_shape(GMMA::Layout_K_SW64_Atom{}, TransposeShapeAtomV{})); + using SmemLayoutV = + decltype(tile_to_shape(SmemLayoutAtomV{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + // for fp8 in-kernel transpose -- src layout + using SmemLayoutDivideV = decltype(tiled_divide(SmemLayoutV{}, TransposeShapeAtomV{})); + using SmemShapeLDSM = Shape, Shape<_16, _4>>; + using 
FactoringShapeV = decltype(make_shape(SmemShapeLDSM{}, + shape<1>(SmemLayoutDivideV{}), shape<2>(SmemLayoutDivideV{}), shape<3>(SmemLayoutDivideV{}))); + using SmemLayoutTransposeV = decltype(composition(SmemLayoutDivideV{}, make_layout(FactoringShapeV{}))); + + // For fp8, this is the memory transpose. + using SmemLayoutAtomVt = decltype(tile_to_shape(GMMA::Layout_K_SW64_Atom{}, TransposeShapeAtomV{})); + using SmemLayoutVt = + decltype(tile_to_shape(SmemLayoutAtomVt{}, + make_shape(shape<2>(TileShape_MNK{}), shape<1>(TileShape_MNK{}), Int{}))); + + // for fp8 in-kernel transpose -- dst layout + using SmemLayoutVtTrans = + decltype(composition(SmemLayoutVt{}, + make_ordered_layout(product_each(shape(SmemLayoutV{})), Step<_2, _1, _3>{}))); + using SmemLayoutDivideVt = decltype(tiled_divide(SmemLayoutVtTrans{}, TransposeShapeAtomV{})); +#ifndef NO_FP8_COLUMN_PERMUTE + using SmemShapeSTSM = Shape, Shape<_8, _8>>; +#else + using SmemShapeSTSM = Shape, Shape<_16, _4>>; +#endif + using FactoringShapeVt = decltype(make_shape(SmemShapeSTSM{}, + shape<1>(SmemLayoutDivideVt{}), shape<2>(SmemLayoutDivideVt{}), shape<3>(SmemLayoutDivideVt{}))); + using SmemLayoutTransposeVt = decltype(composition(SmemLayoutDivideVt{}, make_layout(FactoringShapeVt{}))); + + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + + // used for rmem -> smem O copy in fp8 kernel to undo column permutation + using ThreadLayoutrO = Layout, _4, _1>, + Stride<_4, _32, _1, _0>>; + using ValueLayoutrO = Layout, Int>, + Stride<_0, _2, Stride<_4, _1>, _8>>; + using TiledCopyrO = decltype(make_tiled_copy(Copy_Atom, OutputType>{}, + ThreadLayoutrO{}, ValueLayoutrO{})); + + using TiledCopyShaperO = Shape<_8, Int, _16, Int>; + using SmemLayoutrO = decltype(composition(SmemLayoutO{}, Layout{})); + + using SmemCopyAtomQ = Copy_Atom; + + using SharedStorage = SharedStorageQKVOVt; + + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + using MainloopPipelineNoTMA = typename cutlass::PipelineAsync; + using PipelineState = typename cutlass::PipelineState; + // using BarrierType = typename MainloopPipeline::ProducerBarrierType; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SharedStorageQKVdOdKV; + +template +struct SharedStorageQKVdOdKV { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. 
+ cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKV { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + union { // Put smem_p in a union just so we can still refer to it in the struct, even if it's not used. + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKVWS; + +template +struct SharedStorageQKVdOdKVWS { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + cute::array_aligned> smem_dqacc; + cute::array_aligned smem_lse; + cute::array_aligned smem_dpsum; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKVWS { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + union { // Put smem_p in a union just so we can still refer to it in the struct, even if it's not used. + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + cute::array_aligned> smem_dqacc; + cute::array_aligned smem_lse; + cute::array_aligned smem_dpsum; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKVSeqqPar; + +template +struct SharedStorageQKVdOdKVSeqqPar { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + union { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + }; + struct { + cute::array_aligned> smem_dq; + }; + }; + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. 
+ cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterTransactionBarrier barrier_dO; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + }; +}; + +template +struct SharedStorageQKVdOdKVSeqqPar { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + union { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + }; + struct { + cute::array_aligned> smem_dq; + }; + }; + union { // Put smem_p in a union just so we can still refer to it in the struct, even if it's not used. + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterTransactionBarrier barrier_dO; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Flash_bwd_kernel_traits { + using Element = elem_type; + using ElementAccum = float; + using index_t = int64_t; + + // The number of threads. + static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr int kNThreadsNonWS = 8 * cutlass::NumThreadsPerWarp; + // static constexpr int kNThreadsdQ = cutlass::NumThreadsPerWarpGroup; + static constexpr int kNThreadsdQ = 2 * cutlass::NumThreadsPerWarpGroup; + + static_assert(kNWarps_ == 8 || kNWarps_ == 12); + + static constexpr bool Is_WS = kNWarps_ >= 12; + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterN = kClusterN_; + using ClusterShape_MNK = Shape<_1, Int, _1>; + + static constexpr int kStages = 2; + + static constexpr bool SdP_swapAB = SdP_swapAB_; + static constexpr bool dKV_swapAB = dKV_swapAB_; + static constexpr bool dQ_swapAB = dQ_swapAB_; + static_assert(!(SdP_swapAB && dKV_swapAB)); // If SdP_swapAB, then we don't swap for dKV + + static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == 2 && AtomLayoutMdQ == 2 && !SdP_swapAB && !dQ_swapAB; // If dQ_swapAB we can't use RS + + using TileShapeAtomSdP = std::conditional_t< + !SdP_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutSdP = std::conditional_t< + !SdP_swapAB, + Layout, Int<2 / AtomLayoutMSdP>, _1>>, + Layout, Int, _1>> + >; + using TiledMmaSdP = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutSdP{})); + + using TileShapeAtomdKV = std::conditional_t< + !dKV_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdKV = std::conditional_t< + !dKV_swapAB, + Layout, Int<2 / AtomLayoutNdKV>, _1>>, + Layout, Int, _1>> + >; + using TiledMmadKV = decltype(cute::make_tiled_mma( + std::conditional_t< + !SdP_swapAB, + decltype(cute::GMMA::ss_op_selector()), + decltype(cute::GMMA::rs_op_selector()) + >{}, + AtomLayoutdKV{})); + + using TileShapeAtomdQ = std::conditional_t< + !dQ_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + // Shape, Int, Int>, + // Shape, Int, Int> + >; + using AtomLayoutdQ = std::conditional_t< + !dQ_swapAB, + Layout, Int<2 / AtomLayoutMdQ>, _1>>, + Layout, Int, _1>> + // Layout, Int<1>, _1>>, + // Layout, Int<1>, _1>> 
+ >; + static constexpr GMMA::Major MmadQMajorA = !dQ_swapAB ? GMMA::Major::K : GMMA::Major::MN; + static constexpr GMMA::Major MmadQMajorB = !dQ_swapAB ? GMMA::Major::MN : GMMA::Major::K; + using TiledMmadQ = decltype(cute::make_tiled_mma( + std::conditional_t< + !dQ_swapAB, + std::conditional_t< + Mma_dQ_is_RS, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >, + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutdQ{})); + + using GmemTiledCopyQdO = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyKV = cute::SM90_TMA_LOAD; + using GmemTiledCopydKV = cute::SM90_TMA_STORE; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static constexpr bool Has_cp_async = true; +#else + static constexpr bool Has_cp_async = false; +#endif + // For the dot_do_o preprocessing kernel + using Gmem_copy_struct = std::conditional_t< + Has_cp_async, + SM80_CP_ASYNC_CACHEGLOBAL, + DefaultCopy + >; + static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32; + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); + // Using kBlockKSmem instead of kHeadDim here to avoid bank conflicts, but doesn't seem + // to affect speed in practice. + static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad; + static_assert(kNThreadsNonWS % kGmemThreadsPerRow == 0, "kNThreadsNonWS must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemLayoutAtomdQ = Layout, Int>, + Stride, _1>>; + using GmemTiledCopydO = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>{})); // Val layout, 8 vals per store + using GmemTiledCopydQ = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQ{}, + Layout>{})); // Val layout, 8 vals per store + using GmemLayoutAtomdQaccum = std::conditional_t< + kBlockKSmem == 32, + Layout, _8>, // Thread layout, 8 threads per row + Stride< _8, _1>>, + Layout, _16>, // Thread layout, 16 threads per row + Stride< _16, _1>> + >; + using GmemTiledCopydQaccum = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = + decltype(tile_to_shape(SmemLayoutAtomQ{}, + make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + using SmemLayoutdO = SmemLayoutQ; + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomV = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = decltype(tile_to_shape(SmemLayoutAtomV{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomP = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutP = decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{}))); + using SmemLayoutAtomdS = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), 
decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdS = decltype(tile_to_shape(SmemLayoutAtomdS{}, select<0, 1>(TileShape_MNK{}))); + + // using SmemLayoutAtomdQacc = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + // using SmemLayoutdQacc = decltype(tile_to_shape(SmemLayoutAtomdQacc{}, select<0, 2>(TileShape_MNK{}))); + + // Note this is the transpose in terms of the view, not in terms of memory. + using SmemLayoutQt = + decltype(cute::composition(SmemLayoutQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutdOt = + decltype(cute::composition(SmemLayoutdO{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutKt = + decltype(cute::composition(SmemLayoutK{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutPt = + decltype(cute::composition(SmemLayoutP{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdSt = + decltype(cute::composition(SmemLayoutdS{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + + // using SmemLayoutdQacct = + // decltype(cute::composition(SmemLayoutdQacc{}, + // make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + // make_stride(Int{}, _1{})))); + + using SmemLayoutdK = SmemLayoutK; + using SmemLayoutdV = SmemLayoutV; + using SmemLayoutdKt = SmemLayoutKt; + using SmemLayoutdVt = SmemLayoutKt; + + static constexpr int kSwizzle = kBlockKSmem == 32 ? 
2 : 3; + using SmemLayoutAtomdQ = decltype( + // composition(Swizzle{}, + composition(Swizzle<3, 3, 3>{}, + Layout, Int<32>>, + Stride, _1>>{})); + using SmemLayoutdQ = decltype(tile_to_shape( + SmemLayoutAtomdQ{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdQt = + decltype(cute::composition(SmemLayoutdQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + static constexpr int kSmemdQSize = size(SmemLayoutdQ{}) * sizeof(Element); + + using SmemLayoutAtomdQaccTMA = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdQaccTMA = decltype(tile_to_shape(SmemLayoutAtomdQaccTMA{}, select<0, 2>(TileShape_MNK{}))); + using SmemLayoutdQacc = SmemLayoutdQ; + using SmemLayoutdQacct = SmemLayoutdQt; + using SmemLayoutdQacc2 = decltype(tile_to_shape( + SmemLayoutAtomdQ{}, + make_shape(Int{}, Int{}, _2{}))); + // using SmemLayoutdQacc = decltype(tile_to_shape(SmemLayoutAtomdQacc{}, select<0, 2>(TileShape_MNK{}))); + // using SmemLayoutdQacct = + // decltype(cute::composition(SmemLayoutdQacc{}, + // make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + // make_stride(Int{}, _1{})))); + using RmemTiledCopydQacc = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + + // using SmemCopyAtomQ = Copy_Atom; + using SmemCopyAtomPdS = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdKV = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdQ = Copy_Atom< + std::conditional_t, + Element>; + + using SharedStorage = std::conditional_t< + !Is_WS, + SharedStorageQKVdOdKV, + SharedStorageQKVdOdKVWS + // SmemLayoutK, SmemLayoutV, SmemLayoutdS, SmemLayoutdQacc2, SmemLayoutdK, SmemLayoutdV> + >; + + // using MainloopPipeline = typename cutlass::PipelineTmaAsync; + // using PipelineState = typename cutlass::PipelineState; + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Flash_bwd_seqqpar_kernel_traits { + using Element = elem_type; + using ElementAccum = float; + using index_t = int64_t; + + // The number of threads. 
+ static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + + static_assert(kNWarps_ == 8); + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterN = kClusterN_; + using ClusterShape_MNK = Shape<_1, Int, _1>; + + static constexpr int kStages = 2; + + static constexpr bool SdP_swapAB = SdP_swapAB_; + static constexpr bool dKV_swapAB = dKV_swapAB_; + static constexpr bool dQ_swapAB = dQ_swapAB_; + static_assert(!(SdP_swapAB && dKV_swapAB)); // If SdP_swapAB, then we don't swap for dKV + + static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == 2 && AtomLayoutMdQ == 2 && !SdP_swapAB && !dQ_swapAB; // If dQ_swapAB we can't use RS + + using TileShapeAtomSdP = std::conditional_t< + !SdP_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutSdP = std::conditional_t< + !SdP_swapAB, + Layout, Int<2 / AtomLayoutMSdP>, _1>>, + Layout, Int, _1>> + >; + using TiledMmaSdP = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutSdP{})); + + using TileShapeAtomdKV = std::conditional_t< + !dKV_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdKV = std::conditional_t< + !dKV_swapAB, + Layout, Int<2 / AtomLayoutNdKV>, _1>>, + Layout, Int, _1>> + >; + using TiledMmadKV = decltype(cute::make_tiled_mma( + std::conditional_t< + !SdP_swapAB, + decltype(cute::GMMA::ss_op_selector()), + decltype(cute::GMMA::rs_op_selector()) + >{}, + AtomLayoutdKV{})); + + using TileShapeAtomdQ = std::conditional_t< + !dQ_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdQ = std::conditional_t< + !dQ_swapAB, + Layout, Int<2 / AtomLayoutMdQ>, _1>>, + Layout, Int, _1>> + >; + static constexpr GMMA::Major MmadQMajorA = !dQ_swapAB ? GMMA::Major::K : GMMA::Major::MN; + static constexpr GMMA::Major MmadQMajorB = !dQ_swapAB ? GMMA::Major::MN : GMMA::Major::K; + using TiledMmadQ = decltype(cute::make_tiled_mma( + std::conditional_t< + !dQ_swapAB, + std::conditional_t< + Mma_dQ_is_RS, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >, + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutdQ{})); + + using GmemTiledCopyQdO = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyKV = cute::SM90_TMA_LOAD; + using GmemTiledCopydKV = cute::SM90_TMA_STORE; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static constexpr bool Has_cp_async = true; +#else + static constexpr bool Has_cp_async = false; +#endif + // For the dot_do_o preprocessing kernel + using Gmem_copy_struct = std::conditional_t< + Has_cp_async, + SM80_CP_ASYNC_CACHEGLOBAL, + DefaultCopy + >; + static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32; + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); + // Using kBlockKSmem instead of kHeadDim here to avoid bank conflicts, but doesn't seem + // to affect speed in practice. 
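    // Example (illustrative): for a 16-bit Element, kGmemElemsPerLoad = 16 / 2 = 8 elements
    // per 128-bit load, so with kBlockKSmem = 64 the layout below uses
    // kGmemThreadsPerRow = 64 / 8 = 8 threads per row.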
+ static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad; + static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopydO = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>{})); // Val layout, 8 vals per store + using GmemTiledCopydQ = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>{})); // Val layout, 8 vals per store + using GmemLayoutAtomdQaccum = std::conditional_t< + kBlockKSmem == 32, + Layout, // Thread layout, 8 threads per row + Stride< _8, _1>>, + Layout, // Thread layout, 16 threads per row + Stride< _16, _1>> + >; + using GmemTiledCopydQaccum = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + using SmemLayoutdO = SmemLayoutQ; + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = decltype(tile_to_shape(SmemLayoutAtomK{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using SmemLayoutAtomV = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = decltype(tile_to_shape(SmemLayoutAtomV{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using SmemLayoutAtomP = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutP = decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{}))); + using SmemLayoutAtomdS = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdS = decltype(tile_to_shape(SmemLayoutAtomdS{}, select<0, 1>(TileShape_MNK{}))); + + // Note this is the transpose in terms of the view, not in terms of memory. 
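    // i.e. SmemLayoutQt/SmemLayoutKt below reindex the same shared-memory buffers with the
    // head-dim mode placed first; no data is moved, only the stride order seen by the MMA
    // changes. (Contrast with the fp8 forward path above, which performs an actual in-smem
    // transpose of V.)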
+ using SmemLayoutQt = + decltype(cute::composition(SmemLayoutQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdOt = + decltype(cute::composition(SmemLayoutdO{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutKt = + decltype(cute::composition(SmemLayoutK{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutPt = + decltype(cute::composition(SmemLayoutP{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdSt = + decltype(cute::composition(SmemLayoutdS{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + + using SmemLayoutdK = decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{}))); + using SmemLayoutdV = SmemLayoutdK; + using SmemLayoutdKt = SmemLayoutKt; + using SmemLayoutdVt = SmemLayoutKt; + using SmemLayoutdQTMA = decltype(tile_to_shape(SmemLayoutAtomK{}, select<0, 2>(TileShape_MNK{}))); + + static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3; + using SmemLayoutAtomdQ = decltype( + composition(Swizzle{}, + Layout>, + Stride, _1>>{})); + using SmemLayoutdQ = decltype(tile_to_shape( + SmemLayoutAtomdQ{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdQt = + decltype(cute::composition(SmemLayoutdQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + static constexpr int kSmemdQSize = size(SmemLayoutdQ{}) * sizeof(Element); + + using SmemLayoutAtomdKV = decltype( + composition(Swizzle{}, + Layout>, + Stride, _1>>{})); + using SmemLayoutdKV = decltype(tile_to_shape( + SmemLayoutAtomdKV{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdKVt = + decltype(cute::composition(SmemLayoutdKV{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + static constexpr int kSmemdKVSize = size(SmemLayoutdKV{}) * sizeof(Element) * 2; + + // using SmemCopyAtomQ = Copy_Atom; + using SmemCopyAtomPdS = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdKV = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdQ = Copy_Atom< + std::conditional_t, + Element>; + + using SharedStorage = SharedStorageQKVdOdKVSeqqPar; + + // using MainloopPipeline = typename cutlass::PipelineTmaAsync; + // using PipelineState = typename cutlass::PipelineState; + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/layer_norm.py b/layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..6fcf50e1786503a6ab253d13114ce2b56bae1eff --- /dev/null +++ b/layer_norm.py @@ -0,0 +1,1086 @@ +# Copyright (c) 2024, Tri Dao. +# Implement dropout + residual + layer_norm / rms_norm. + +# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html +# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate. +# This is faster for dimensions up to 8k, but after that it's much slower due to register spilling. +# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. 
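For orientation, a minimal usage sketch of the fused API defined later in this file; it assumes the file is importable as `layer_norm` (hypothetical path) and a CUDA device with fp16 inputs:

    import torch
    from layer_norm import layer_norm_fn, rms_norm_fn  # hypothetical import path

    # Fused dropout + residual-add + LayerNorm over the last dimension.
    batch, seqlen, hidden = 2, 1024, 4096
    x = torch.randn(batch, seqlen, hidden, device="cuda", dtype=torch.float16)
    residual = torch.randn(batch, seqlen, hidden, device="cuda", dtype=torch.float32)
    weight = torch.ones(hidden, device="cuda", dtype=torch.float16)
    bias = torch.zeros(hidden, device="cuda", dtype=torch.float16)

    # prenorm=True also returns the updated residual stream
    # (dropout(x) + residual, before normalization).
    out, new_residual = layer_norm_fn(
        x, weight, bias, residual=residual, dropout_p=0.1, prenorm=True
    )

    # RMSNorm variant of the same fused op; the bias argument is optional.
    rms_out = rms_norm_fn(x, weight, None, eps=1e-5)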
+ +import math + +import torch +import torch.nn.functional as F +from torch.cuda.amp import custom_fwd, custom_bwd + +import triton +import triton.language as tl + + +def layer_norm_ref( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + dropout_mask=None, + dropout_mask1=None, + upcast=False, +): + dtype = x.dtype + if upcast: + x = x.float() + weight = weight.float() + bias = bias.float() if bias is not None else None + residual = residual.float() if residual is not None else residual + x1 = x1.float() if x1 is not None else None + weight1 = weight1.float() if weight1 is not None else None + bias1 = bias1.float() if bias1 is not None else None + if x1 is not None: + assert rowscale is None, "rowscale is not supported with parallel LayerNorm" + if rowscale is not None: + x = x * rowscale[..., None] + if dropout_p > 0.0: + if dropout_mask is not None: + x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p) + else: + x = F.dropout(x, p=dropout_p) + if x1 is not None: + if dropout_mask1 is not None: + x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p) + else: + x1 = F.dropout(x1, p=dropout_p) + if x1 is not None: + x = x + x1 + if residual is not None: + x = (x + residual).to(x.dtype) + out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to( + dtype + ) + if weight1 is None: + return out if not prenorm else (out, x) + else: + out1 = F.layer_norm( + x.to(weight1.dtype), x.shape[-1:], weight=weight1, bias=bias1, eps=eps + ).to(dtype) + return (out, out1) if not prenorm else (out, out1, x) + + +def rms_norm_ref( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + dropout_mask=None, + dropout_mask1=None, + upcast=False, +): + dtype = x.dtype + if upcast: + x = x.float() + weight = weight.float() + bias = bias.float() if bias is not None else None + residual = residual.float() if residual is not None else residual + x1 = x1.float() if x1 is not None else None + weight1 = weight1.float() if weight1 is not None else None + bias1 = bias1.float() if bias1 is not None else None + if x1 is not None: + assert rowscale is None, "rowscale is not supported with parallel LayerNorm" + if rowscale is not None: + x = x * rowscale[..., None] + if dropout_p > 0.0: + if dropout_mask is not None: + x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p) + else: + x = F.dropout(x, p=dropout_p) + if x1 is not None: + if dropout_mask1 is not None: + x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p) + else: + x1 = F.dropout(x1, p=dropout_p) + if x1 is not None: + x = x + x1 + if residual is not None: + x = (x + residual).to(x.dtype) + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(dtype) + if weight1 is None: + return out if not prenorm else (out, x) + else: + out1 = ((x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)).to( + dtype + ) + return (out, out1) if not prenorm else (out, out1, x) + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"], +) +# 
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None}) +# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None}) +@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None}) +@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None}) +@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None}) +@triton.jit +def _layer_norm_fwd_1pass_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + RESIDUAL, # pointer to the residual + X1, + W1, + B1, + Y1, + RESIDUAL_OUT, # pointer to the residual + ROWSCALE, + SEEDS, # Dropout seeds for each row + DROPOUT_MASK, + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_res_row, + stride_res_out_row, + stride_x1_row, + stride_y1_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + dropout_p, # Dropout probability + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_RESIDUAL: tl.constexpr, + STORE_RESIDUAL_OUT: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_DROPOUT: tl.constexpr, + STORE_DROPOUT_MASK: tl.constexpr, + HAS_ROWSCALE: tl.constexpr, + HAS_X1: tl.constexpr, + HAS_W1: tl.constexpr, + HAS_B1: tl.constexpr, +): + # Map the program id to the row of X and Y it should compute. + row = tl.program_id(0) + X += row * stride_x_row + Y += row * stride_y_row + if HAS_RESIDUAL: + RESIDUAL += row * stride_res_row + if STORE_RESIDUAL_OUT: + RESIDUAL_OUT += row * stride_res_out_row + if HAS_X1: + X1 += row * stride_x1_row + if HAS_W1: + Y1 += row * stride_y1_row + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32) + if HAS_ROWSCALE: + rowscale = tl.load(ROWSCALE + row).to(tl.float32) + x *= rowscale + if HAS_DROPOUT: + # Compute dropout mask + # 7 rounds is good enough, and reduces register pressure + keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0) + if STORE_DROPOUT_MASK: + tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N) + if HAS_X1: + x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32) + if HAS_ROWSCALE: + rowscale = tl.load(ROWSCALE + M + row).to(tl.float32) + x1 *= rowscale + if HAS_DROPOUT: + # Compute dropout mask + # 7 rounds is good enough, and reduces register pressure + keep_mask = ( + tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + ) + x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0) + if STORE_DROPOUT_MASK: + tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N) + x += x1 + if HAS_RESIDUAL: + residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32) + x += residual + if STORE_RESIDUAL_OUT: + tl.store(RESIDUAL_OUT + cols, x, mask=cols < N) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * 
w + b if HAS_BIAS else x_hat * w + # Write output + tl.store(Y + cols, y, mask=mask) + if HAS_W1: + w1 = tl.load(W1 + cols, mask=mask).to(tl.float32) + if HAS_B1: + b1 = tl.load(B1 + cols, mask=mask).to(tl.float32) + y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1 + tl.store(Y1 + cols, y1, mask=mask) + + +def _layer_norm_fwd( + x, + weight, + bias, + eps, + residual=None, + x1=None, + weight1=None, + bias1=None, + dropout_p=0.0, + rowscale=None, + out_dtype=None, + residual_dtype=None, + is_rms_norm=False, + return_dropout_mask=False, +): + if residual is not None: + residual_dtype = residual.dtype + M, N = x.shape + assert x.stride(-1) == 1 + if residual is not None: + assert residual.stride(-1) == 1 + assert residual.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + if x1 is not None: + assert x1.shape == x.shape + assert rowscale is None + assert x1.stride(-1) == 1 + if weight1 is not None: + assert weight1.shape == (N,) + assert weight1.stride(-1) == 1 + if bias1 is not None: + assert bias1.shape == (N,) + assert bias1.stride(-1) == 1 + if rowscale is not None: + assert rowscale.is_contiguous() + assert rowscale.shape == (M,) + # allocate output + y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype) + assert y.stride(-1) == 1 + if weight1 is not None: + y1 = torch.empty_like(y) + assert y1.stride(-1) == 1 + else: + y1 = None + if ( + residual is not None + or (residual_dtype is not None and residual_dtype != x.dtype) + or dropout_p > 0.0 + or rowscale is not None + or x1 is not None + ): + residual_out = torch.empty( + M, N, device=x.device, dtype=residual_dtype if residual_dtype is not None else x.dtype + ) + assert residual_out.stride(-1) == 1 + else: + residual_out = None + mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None + rstd = torch.empty((M,), dtype=torch.float32, device=x.device) + if dropout_p > 0.0: + seeds = torch.randint( + 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64 + ) + else: + seeds = None + if return_dropout_mask and dropout_p > 0.0: + dropout_mask = torch.empty(M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool) + else: + dropout_mask = None + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + with torch.cuda.device(x.device.index): + _layer_norm_fwd_1pass_kernel[(M,)]( + x, + y, + weight, + bias, + residual, + x1, + weight1, + bias1, + y1, + residual_out, + rowscale, + seeds, + dropout_mask, + mean, + rstd, + x.stride(0), + y.stride(0), + residual.stride(0) if residual is not None else 0, + residual_out.stride(0) if residual_out is not None else 0, + x1.stride(0) if x1 is not None else 0, + y1.stride(0) if y1 is not None else 0, + M, + N, + eps, + dropout_p, + is_rms_norm, + BLOCK_N, + residual is not None, + residual_out is not None, + bias is not None, + dropout_p > 0.0, + dropout_mask is not None, + rowscale is not None, + ) + # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0 + if dropout_mask is not None and x1 is not None: + dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0) + else: + dropout_mask1 = None + return ( + y, + y1, + mean, + rstd, + residual_out if residual_out is not None 
else x, + seeds, + dropout_mask, + dropout_mask1, + ) + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"], +) +# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None}) +# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None}) +# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None}) +@triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None}) +@triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None}) +@triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None}) +@triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None}) +@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None}) +@triton.jit +def _layer_norm_bwd_kernel( + X, # pointer to the input + W, # pointer to the weights + B, # pointer to the biases + Y, # pointer to the output to be recomputed + DY, # pointer to the output gradient + DX, # pointer to the input gradient + DW, # pointer to the partial sum of weights gradient + DB, # pointer to the partial sum of biases gradient + DRESIDUAL, + W1, + DY1, + DX1, + DW1, + DB1, + DRESIDUAL_IN, + ROWSCALE, + SEEDS, + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_dy_row, + stride_dx_row, + stride_dres_row, + stride_dy1_row, + stride_dx1_row, + stride_dres_in_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + dropout_p, + rows_per_program, + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_DRESIDUAL: tl.constexpr, + STORE_DRESIDUAL: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_DROPOUT: tl.constexpr, + HAS_ROWSCALE: tl.constexpr, + HAS_DY1: tl.constexpr, + HAS_DX1: tl.constexpr, + HAS_B1: tl.constexpr, + RECOMPUTE_OUTPUT: tl.constexpr, +): + # Map the program id to the elements of X, DX, and DY it should compute. 
+ row_block_id = tl.program_id(0) + row_start = row_block_id * rows_per_program + # Do not early exit if row_start >= M, because we need to write DW and DB + cols = tl.arange(0, BLOCK_N) + mask = cols < N + X += row_start * stride_x_row + if HAS_DRESIDUAL: + DRESIDUAL += row_start * stride_dres_row + if STORE_DRESIDUAL: + DRESIDUAL_IN += row_start * stride_dres_in_row + DY += row_start * stride_dy_row + DX += row_start * stride_dx_row + if HAS_DY1: + DY1 += row_start * stride_dy1_row + if HAS_DX1: + DX1 += row_start * stride_dx1_row + if RECOMPUTE_OUTPUT: + Y += row_start * stride_y_row + w = tl.load(W + cols, mask=mask).to(tl.float32) + if RECOMPUTE_OUTPUT and HAS_BIAS: + b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32) + if HAS_DY1: + w1 = tl.load(W1 + cols, mask=mask).to(tl.float32) + dw = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_BIAS: + db = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_DY1: + dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_B1: + db1 = tl.zeros((BLOCK_N,), dtype=tl.float32) + row_end = min((row_block_id + 1) * rows_per_program, M) + for row in range(row_start, row_end): + # Load data to SRAM + x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) + if HAS_DY1: + dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32) + if not IS_RMS_NORM: + mean = tl.load(Mean + row) + rstd = tl.load(Rstd + row) + # Compute dx + xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + xhat = tl.where(mask, xhat, 0.0) + if RECOMPUTE_OUTPUT: + y = xhat * w + b if HAS_BIAS else xhat * w + tl.store(Y + cols, y, mask=mask) + wdy = w * dy + dw += dy * xhat + if HAS_BIAS: + db += dy + if HAS_DY1: + wdy += w1 * dy1 + dw1 += dy1 * xhat + if HAS_B1: + db1 += dy1 + if not IS_RMS_NORM: + c1 = tl.sum(xhat * wdy, axis=0) / N + c2 = tl.sum(wdy, axis=0) / N + dx = (wdy - (xhat * c1 + c2)) * rstd + else: + c1 = tl.sum(xhat * wdy, axis=0) / N + dx = (wdy - xhat * c1) * rstd + if HAS_DRESIDUAL: + dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32) + dx += dres + # Write dx + if STORE_DRESIDUAL: + tl.store(DRESIDUAL_IN + cols, dx, mask=mask) + if HAS_DX1: + if HAS_DROPOUT: + keep_mask = ( + tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + ) + dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0) + else: + dx1 = dx + tl.store(DX1 + cols, dx1, mask=mask) + if HAS_DROPOUT: + keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0) + if HAS_ROWSCALE: + rowscale = tl.load(ROWSCALE + row).to(tl.float32) + dx *= rowscale + tl.store(DX + cols, dx, mask=mask) + + X += stride_x_row + if HAS_DRESIDUAL: + DRESIDUAL += stride_dres_row + if STORE_DRESIDUAL: + DRESIDUAL_IN += stride_dres_in_row + if RECOMPUTE_OUTPUT: + Y += stride_y_row + DY += stride_dy_row + DX += stride_dx_row + if HAS_DY1: + DY1 += stride_dy1_row + if HAS_DX1: + DX1 += stride_dx1_row + tl.store(DW + row_block_id * N + cols, dw, mask=mask) + if HAS_BIAS: + tl.store(DB + row_block_id * N + cols, db, mask=mask) + if HAS_DY1: + tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask) + if HAS_B1: + tl.store(DB1 + row_block_id * N + cols, db1, mask=mask) + + +def _layer_norm_bwd( + dy, + x, + weight, + bias, + eps, + mean, + rstd, + dresidual=None, + dy1=None, + weight1=None, + bias1=None, + seeds=None, + dropout_p=0.0, + rowscale=None, + has_residual=False, + has_x1=False, + is_rms_norm=False, + x_dtype=None, + 
recompute_output=False, +): + M, N = x.shape + assert x.stride(-1) == 1 + assert dy.stride(-1) == 1 + assert dy.shape == (M, N) + if dresidual is not None: + assert dresidual.stride(-1) == 1 + assert dresidual.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + if dy1 is not None: + assert weight1 is not None + assert dy1.shape == dy.shape + assert dy1.stride(-1) == 1 + if weight1 is not None: + assert weight1.shape == (N,) + assert weight1.stride(-1) == 1 + if bias1 is not None: + assert bias1.shape == (N,) + assert bias1.stride(-1) == 1 + if seeds is not None: + assert seeds.is_contiguous() + assert seeds.shape == (M if not has_x1 else M * 2,) + if rowscale is not None: + assert rowscale.is_contiguous() + assert rowscale.shape == (M,) + # allocate output + dx = ( + torch.empty_like(x) + if x_dtype is None + else torch.empty(M, N, dtype=x_dtype, device=x.device) + ) + dresidual_in = ( + torch.empty_like(x) + if has_residual + and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1) + else None + ) + dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None + y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None + if recompute_output: + assert weight1 is None, "recompute_output is not supported with parallel LayerNorm" + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count + _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device) + _db = ( + torch.empty((sm_count, N), dtype=torch.float32, device=bias.device) + if bias is not None + else None + ) + _dw1 = torch.empty_like(_dw) if weight1 is not None else None + _db1 = torch.empty_like(_db) if bias1 is not None else None + rows_per_program = math.ceil(M / sm_count) + grid = (sm_count,) + with torch.cuda.device(x.device.index): + _layer_norm_bwd_kernel[grid]( + x, + weight, + bias, + y, + dy, + dx, + _dw, + _db, + dresidual, + weight1, + dy1, + dx1, + _dw1, + _db1, + dresidual_in, + rowscale, + seeds, + mean, + rstd, + x.stride(0), + 0 if not recompute_output else y.stride(0), + dy.stride(0), + dx.stride(0), + dresidual.stride(0) if dresidual is not None else 0, + dy1.stride(0) if dy1 is not None else 0, + dx1.stride(0) if dx1 is not None else 0, + dresidual_in.stride(0) if dresidual_in is not None else 0, + M, + N, + eps, + dropout_p, + rows_per_program, + is_rms_norm, + BLOCK_N, + dresidual is not None, + dresidual_in is not None, + bias is not None, + dropout_p > 0.0, + ) + dw = _dw.sum(0).to(weight.dtype) + db = _db.sum(0).to(bias.dtype) if bias is not None else None + dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None + db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None + # Don't need to compute dresidual_in separately in this case + if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None: + dresidual_in = dx + if has_x1 and dropout_p == 0.0: + dx1 = dx + return ( + (dx, dw, db, dresidual_in, dx1, dw1, db1) + if not recompute_output + else (dx, dw, db, dresidual_in, dx1, dw1, db1, y) + ) + + +class LayerNormFn(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x, + weight, + bias, + residual=None, + 
x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + return_dropout_mask=False, + ): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + if residual.stride(-1) != 1: + residual = residual.contiguous() + if x1 is not None: + assert x1.shape == x_shape_og + assert rowscale is None, "rowscale is not supported with parallel LayerNorm" + x1 = x1.reshape(-1, x1.shape[-1]) + if x1.stride(-1) != 1: + x1 = x1.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + if weight1 is not None: + weight1 = weight1.contiguous() + if bias1 is not None: + bias1 = bias1.contiguous() + if rowscale is not None: + rowscale = rowscale.reshape(-1).contiguous() + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float32 if residual_in_fp32 else None) + ) + y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd( + x, + weight, + bias, + eps, + residual, + x1, + weight1, + bias1, + dropout_p=dropout_p, + rowscale=rowscale, + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm, + return_dropout_mask=return_dropout_mask, + ) + ctx.save_for_backward( + residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd + ) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.dropout_p = dropout_p + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is not None + ctx.has_x1 = x1 is not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + y = y.reshape(x_shape_og) + y1 = y1.reshape(x_shape_og) if y1 is not None else None + residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None + dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None + dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None + if not return_dropout_mask: + if weight1 is None: + return y if not prenorm else (y, residual_out) + else: + return (y, y1) if not prenorm else (y, y1, residual_out) + else: + if weight1 is None: + return ( + (y, dropout_mask, dropout_mask1) + if not prenorm + else (y, residual_out, dropout_mask, dropout_mask1) + ) + else: + return ( + (y, y1, dropout_mask, dropout_mask1) + if not prenorm + else (y, y1, residual_out, dropout_mask, dropout_mask1) + ) + + @staticmethod + def backward(ctx, dy, *args): + x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors + dy = dy.reshape(-1, dy.shape[-1]) + if dy.stride(-1) != 1: + dy = dy.contiguous() + assert dy.shape == x.shape + if weight1 is not None: + dy1, args = args[0], args[1:] + dy1 = dy1.reshape(-1, dy1.shape[-1]) + if dy1.stride(-1) != 1: + dy1 = dy1.contiguous() + assert dy1.shape == x.shape + else: + dy1 = None + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + if dresidual.stride(-1) != 1: + dresidual = dresidual.contiguous() + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd( + dy, + x, + weight, + bias, + ctx.eps, + mean, + rstd, + dresidual, + dy1, + weight1, + bias1, + seeds, + ctx.dropout_p, + rowscale, + ctx.has_residual, + ctx.has_x1, + ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + ) + return ( + dx.reshape(ctx.x_shape_og), + dw, + 
db, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + dx1.reshape(ctx.x_shape_og) if dx1 is not None else None, + dw1, + db1, + None, + None, + None, + None, + None, + None, + None, + ) + + +def layer_norm_fn( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + return_dropout_mask=False, +): + return LayerNormFn.apply( + x, + weight, + bias, + residual, + x1, + weight1, + bias1, + eps, + dropout_p, + rowscale, + prenorm, + residual_in_fp32, + is_rms_norm, + return_dropout_mask, + ) + + +def rms_norm_fn( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + return LayerNormFn.apply( + x, + weight, + bias, + residual, + x1, + weight1, + bias1, + eps, + dropout_p, + rowscale, + prenorm, + residual_in_fp32, + True, + return_dropout_mask, + ) + + +class RMSNorm(torch.nn.Module): + + def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + if dropout_p > 0.0: + self.drop = torch.nn.Dropout(dropout_p) + else: + self.drop = None + self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + + def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False): + return rms_norm_fn( + x, + self.weight, + self.bias, + residual=residual, + eps=self.eps, + dropout_p=self.drop.p if self.drop is not None and self.training else 0.0, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + ) + + +class LayerNormLinearFn(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + ): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + if residual.stride(-1) != 1: + residual = residual.contiguous() + norm_weight = norm_weight.contiguous() + if norm_bias is not None: + norm_bias = norm_bias.contiguous() + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float32 if residual_in_fp32 else None) + ) + y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd( + x, + norm_weight, + norm_bias, + eps, + residual, + out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(), + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm, + ) + y = y.reshape(x_shape_og) + dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype + linear_weight = linear_weight.to(dtype) + linear_bias = linear_bias.to(dtype) if linear_bias is not None else None + out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias) + # We don't store y, will be recomputed in the backward pass to save memory + ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is 
not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + ctx.linear_bias_is_none = linear_bias is None + return out if not prenorm else (out, residual_out.reshape(x_shape_og)) + + @staticmethod + @custom_bwd + def backward(ctx, dout, *args): + x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors + dout = dout.reshape(-1, dout.shape[-1]) + dy = F.linear(dout, linear_weight.t()) + dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) + if dy.stride(-1) != 1: + dy = dy.contiguous() + assert dy.shape == x.shape + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + if dresidual.stride(-1) != 1: + dresidual = dresidual.contiguous() + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = _layer_norm_bwd( + dy, + x, + norm_weight, + norm_bias, + ctx.eps, + mean, + rstd, + dresidual=dresidual, + has_residual=ctx.has_residual, + is_rms_norm=ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + recompute_output=True, + ) + dlinear_weight = torch.einsum("bo,bi->oi", dout, y) + return ( + dx.reshape(ctx.x_shape_og), + dnorm_weight, + dnorm_bias, + dlinear_weight, + dlinear_bias, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + None, + None, + None, + None, + ) + + +def layer_norm_linear_fn( + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, +): + return LayerNormLinearFn.apply( + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual, + eps, + prenorm, + residual_in_fp32, + is_rms_norm, + ) diff --git a/linear-warmup.yaml b/linear-warmup.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb6a69896e8f042a9a2ca2b1102c853e863db936 --- /dev/null +++ b/linear-warmup.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: transformers.get_linear_schedule_with_warmup diff --git a/linear.py b/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..a8966dbc345ab0e593df0124451ee7be3dae131a --- /dev/null +++ b/linear.py @@ -0,0 +1,594 @@ +# Adapted from https://github.com/ELS-RD/kernl/blob/main/src/kernl/implementations/linear_layer.py +# and https://github.com/openai/triton/blob/master/python/triton/ops/matmul.py +from typing import Optional + +import torch +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +from flash_attn.ops.triton.k_activations import ( + gelu, + gelu_approx, + gelu_approx_grad, + gelu_grad, + squared_relu, + squared_relu_grad, +) + +# CREDITS: Initially inspired by the Triton tutorial on matrix multiplications + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config( + { + "BLOCK_M": block_m, + "BLOCK_N": block_n, + "BLOCK_K": block_k, + "SPLIT_K": 1, + }, + num_stages=num_stages, + num_warps=num_warps, + ) + ) + # split_k not used + # for split_k in [2, 4, 8, 16]: + # configs.append(triton.Config( + # {'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + # num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@triton.autotune( + configs=[ + triton.Config( + 
{"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + # good for int8 + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + ] + + get_configs_io_bound(), + key=["CACHE_KEY_M", "CACHE_KEY_N", "CACHE_KEY_K"], + prune_configs_by={ + "early_config_prune": early_config_prune, + "perf_model": estimate_matmul_time, + "top_k": 10, + }, +) +@triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, + } +) +@triton.jit +def kernel_fwd( + C, # Pointers to matrices + ACT_INPUT, + A, + B, + bias, + # Matrix dimensions + M, + N, + K, + CACHE_KEY_M, + CACHE_KEY_N, + CACHE_KEY_K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. 
stride_am is how much to increase a_ptr + # by to get the element one row down (A has M rows) + stride_cm, + # stride_cn, # Assume that stride_cn == 1 + stride_am, + stride_ak, + stride_bn, + stride_bk, + # Meta-parameters + BLOCK_M: tl.constexpr, + GROUP_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + # split k not used, not performant with activation, kept because early_config_prune is expecting it + SPLIT_K: tl.constexpr, + EVEN_K: tl.constexpr, + A_ROWMAJOR: tl.constexpr, + B_COLMAJOR: tl.constexpr, + BIAS: tl.constexpr, + SAVE_ACT_INPUT: tl.constexpr, + ACTIVATION: tl.constexpr, +): + + """ + Kernel for computing Out = activation(A x W + C) + - Input has shape (M, K) + - Weight has shape (K, N) + - Bias has shape (N,) + - Output has shape (M, N) + - ActInputs (optional) has shape (M, N) + 'ActInputs' optionally saves the A x W + C intermediate for backward computations + This kernel will consolidate over K + """ + + pid = tl.program_id(axis=0) + + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + # now compute the block that each program will go through + # rm (resp. rn) denotes a range of indices + # for rows (resp. col) of C + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + # trick to avoid masking on M and N axis + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + + if A_ROWMAJOR: + A = A + (ram[:, None] * stride_am + rk[None, :]) + else: + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + if B_COLMAJOR: + B = B + (rk[:, None] + rbn[None, :] * stride_bn) + else: + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.0) + b = tl.load(B, mask=rk[:, None] < k, other=0.0) + acc += tl.dot(a, b) + + if A_ROWMAJOR: + A += BLOCK_K + else: + A += BLOCK_K * stride_ak + if B_COLMAJOR: + B += BLOCK_K + else: + B += BLOCK_K * stride_bk + + # Putting bias after the matmul (instead of before) is faster, idk why + if BIAS: + bias = tl.load(bias + rn, mask=rn < N, other=0.0).to(tl.float32) + acc += bias[None, :] + + # optional: save the activation inputs + if SAVE_ACT_INPUT: + # act_in_ptrs = ACT_INPUT + ram[:, None] * stride_cm + rbn[None, :] * stride_cn + act_in_ptrs = ACT_INPUT + ram[:, None] * stride_cm + rbn[None, :] + tl.store(act_in_ptrs, acc) + + # optional: fused activation (while the data is in shared memory) + if ACTIVATION == "gelu": + acc = gelu(acc) + elif ACTIVATION == "gelu_approx": + acc = gelu_approx(acc) + elif ACTIVATION == "squared_relu": + acc = squared_relu(acc) + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + # write back result + # C = C + rm[:, None] * stride_cm + rn[None, :] * stride_cn + C = C + rm[:, None] * stride_cm + rn[None, :] + mask = (rm < M)[:, None] & (rn < N)[None, :] + tl.store(C, acc) + + +def triton_linear_act( + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + 
activation: str = "id", + save_act_input: bool = False, +) -> torch.Tensor: + """ + Compute e = activation(x @ weight.T + bias). + This wrapper kicks the `kernel_fwd` Triton kernel + :param x: input tensor + :param weight: weight matrix + :param bias: an optional bias tensor + :param activation: Activation name. Needs to be a Triton kernel. + :param act_input: an optional tensor to save the activation inputs (for backward) + :return: result tensor + """ + # if torch.is_autocast_enabled(): + # dtype = torch.get_autocast_gpu_dtype() + # x, weight, bias = [a.to(dtype=dtype) for a in [x, weight, bias]] + + assert activation in ["id", "gelu", "gelu_approx", "squared_relu"] + + batch_shape, n = x.shape[:-1], x.shape[-1] + batch_dim = batch_shape.numel() + x_reshaped = x.reshape(batch_dim, n) + + if x_reshaped.stride(0) > 1 and x_reshaped.stride(1) > 1: + x_reshaped = x_reshaped.contiguous() + if weight.stride(0) > 1 and weight.stride(1) > 1: + weight = weight.contiguous() + bias = bias.contiguous() if bias is not None else None + + assert ( + x.dtype == weight.dtype + ), f"Input and weight must have the same dtype, got {x.dtype} and {weight.dtype}" + if bias is not None: + assert ( + x.dtype == bias.dtype + ), f"Input and bias must have the same dtype, got {x.dtype} and {bias.dtype}" + assert ( + x_reshaped.shape[1] == weight.shape[1] + ), f"Incompatible dimensions: {x_reshaped.shape} - {weight.shape}" + + assert ( + bias is None or bias.shape[0] == weight.shape[0] + ), "Incompatible dimensions in between weight and bias" + + M, K = x_reshaped.shape + N, K = weight.shape + + output = torch.empty((M, N), device=x.device, dtype=x.dtype) + act_input = torch.empty_like(output) if save_act_input else None + + # 1D launch kernel where each block gets its own program. 
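+    # e.g. for M = N = 4096 with an autotuned config of BLOCK_M = BLOCK_N = 128, this launches
+    # cdiv(4096, 128) * cdiv(4096, 128) = 32 * 32 = 1024 programs, each writing one 128 x 128 tile of `output`;
+    # GROUP_M only reorders program ids for better L2 reuse, it does not change the grid size.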
+ grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) # noqa + + kernel_fwd[grid]( + output, + act_input, + x_reshaped, + weight, # data ptrs + bias if bias is not None else x, # auto skip bias if not present + M, # shapes + N, + K, + M // 32, # key for triton cache (limit number of compilations) + N // 32, + K // 32, + stride_cm=output.stride(0), # strides + # stride_cn=output.stride(1), + stride_am=x_reshaped.stride(0), + stride_ak=x_reshaped.stride(1), + stride_bk=weight.stride(1), + stride_bn=weight.stride(0), + BIAS=bias is not None, # optional fused bias + SAVE_ACT_INPUT=save_act_input, # optional save activation inputs + ACTIVATION=activation, # optional fused activation + A_ROWMAJOR=x_reshaped.stride(1) == 1, + B_COLMAJOR=weight.stride(1) == 1, + GROUP_M=8, # speed optimization: group the programs + ) + + if not save_act_input: + return output.reshape(*batch_shape, output.shape[-1]) + else: + return ( + output.reshape(*batch_shape, output.shape[-1]), + act_input.reshape(*batch_shape, act_input.shape[-1]), + ) + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + # good for int8 + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + ] + + get_configs_io_bound(), + key=["CACHE_KEY_M", "CACHE_KEY_N", "CACHE_KEY_K"], + prune_configs_by={ + "early_config_prune": early_config_prune, + "perf_model": estimate_matmul_time, + "top_k": 10, + }, +) +@triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, + } +) +@triton.jit +def kernel_bwd( + C, # Pointers to matrices + 
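+    # pre-activation values saved by kernel_fwd (SAVE_ACT_INPUT=True); only read when ACTIVATION != "id"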
ACT_INPUT, + A, + B, + # Matrix dimensions + M, + N, + K, + CACHE_KEY_M, + CACHE_KEY_N, + CACHE_KEY_K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. stride_am is how much to increase a_ptr + # by to get the element one row down (A has M rows) + stride_cm, + # stride_cn, # Assume that stride_cn == 1 + stride_am, + stride_ak, + stride_bk, + stride_bn, + # Meta-parameters + BLOCK_M: tl.constexpr, + GROUP_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + # split k not used, not performant with activation, kept because early_config_prune is expecting it + SPLIT_K: tl.constexpr, + EVEN_K: tl.constexpr, + ACTIVATION: tl.constexpr, +): + + """ + Kernel for computing Out = activation(A x W + C) + - Input has shape (M, K) + - Weight has shape (K, N) + - Output has shape (M, N) + - ActInputs (optional) has shape (M, N) + 'ActInputs' optionally saves the A x W + C intermediate for backward computations + This kernel will consolidate over K + """ + + pid = tl.program_id(axis=0) + + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + # now compute the block that each program will go through + # rm (resp. rn) denotes a range of indices + # for rows (resp. col) of C + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + # trick to avoid masking on M and N axis + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.0) + b = tl.load(B, mask=rk[:, None] < k, other=0.0) + acc += tl.dot(a, b) + + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # optional: fused activation (while the data is in shared memory) + if ACTIVATION != "id": + act_in_ptrs = ACT_INPUT + ram[:, None] * stride_cm + rbn[None, :] + act_input = tl.load(act_in_ptrs).to(acc.dtype) + if ACTIVATION == "gelu": + acc *= gelu_grad(act_input) + elif ACTIVATION == "gelu_approx": + acc *= gelu_approx_grad(act_input) + elif ACTIVATION == "squared_relu": + acc *= squared_relu_grad(act_input) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + # write back result + C = C + rm[:, None] * stride_cm + rn[None, :] + mask = (rm < M)[:, None] & (rn < N)[None, :] + tl.store(C, acc, mask=mask) + + +def triton_dgrad_act( + grad_output: torch.Tensor, + weight: torch.Tensor, + activation: str = "id", + act_input: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + Compute e = activation(grad_output @ weight + bias). + This wrapper kicks the `kernel_fwd` Triton kernel + :param grad_output: input tensor + :param weight: weight matrix + :param activation: Activation name. Needs to be a Triton kernel. 
+ :param act_input: an optional tensor to save the activation inputs (for backward) + :return: result tensor + """ + assert activation in ["id", "gelu", "gelu_approx", "squared_relu"] + + batch_shape, n = grad_output.shape[:-1], grad_output.shape[-1] + batch_dim = batch_shape.numel() + grad_output_reshaped = grad_output.reshape(batch_dim, n) + + if grad_output_reshaped.stride(0) > 1 and grad_output_reshaped.stride(1) > 1: + grad_output_reshaped = grad_output_reshaped.contiguous() + if weight.stride(0) > 1 and weight.stride(1) > 1: + weight = weight.contiguous() + + assert ( + grad_output.dtype == weight.dtype + ), f"grad_output and weight must have the same dtype, got {grad_output.dtype} and {weight.dtype}" + assert ( + grad_output_reshaped.shape[1] == weight.shape[0] + ), f"Incompatible dimensions: {grad_output_reshaped.shape} - {weight.shape}" + if activation != "id": + assert act_input is not None, f"act_input is required for activation {activation}" + + # M, N, K in bwd are different from M, N, K in fwd + M, K = grad_output_reshaped.shape + K, N = weight.shape + + grad_input = torch.empty((M, N), device=grad_output.device, dtype=grad_output.dtype) + + # 1D launch kernel where each block gets its own program. + grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) # noqa + + kernel_bwd[grid]( + grad_input, + act_input, + grad_output_reshaped, + weight, # data ptrs + M, # shapes + N, + K, + M // 32, # key for triton cache (limit number of compilations) + N // 32, + K // 32, + stride_cm=grad_input.stride(0), # strides + # stride_cn=grad_input.stride(1), + stride_am=grad_output_reshaped.stride(0), + stride_ak=grad_output_reshaped.stride(1), + stride_bk=weight.stride(0), + stride_bn=weight.stride(1), + ACTIVATION=activation, # optional fused activation + GROUP_M=8, # speed optimization: group the programs + ) + + return grad_input.reshape(*batch_shape, grad_input.shape[-1]) diff --git a/llama.py b/llama.py new file mode 100644 index 0000000000000000000000000000000000000000..3bfb51d17e27c1eeb5f09293b773cda8f4d81233 --- /dev/null +++ b/llama.py @@ -0,0 +1,422 @@ +# Copyright (c) 2023, Tri Dao. + +import json +import math +import os +import re +from collections import OrderedDict +from pathlib import Path +from typing import Dict, List, Union + +import torch +import torch.nn.functional as F +from sentencepiece import SentencePieceProcessor +from transformers import GPT2Config, LlamaConfig + +from einops import rearrange + + +def remap_state_dict_meta_llama( + state_dict: Dict[str, torch.Tensor], config: GPT2Config +) -> Dict[str, torch.Tensor]: + """Convert the state_dict in Meta format to standard GPT format. + + This function modifies state_dict in place. + """ + + def key_mapping_layers(key): + return f"transformer.{key}" if not key.startswith("output.") else key + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + + # Word embedding + def key_mapping_emb(key): + return re.sub( + r"^transformer.tok_embeddings.", "transformer.embeddings.word_embeddings.", key + ) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
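+    # e.g. with pad_vocab_size_multiple=8, a 32000-row embedding stays at 32000 (already a multiple of 8),
+    # while a 32003-row one would be rounded up to math.ceil(32003 / 8) * 8 = 32008 and zero-padded below.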
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("output.weight") + # Need to recompute vocab_size since LLaMa shards the word embeddings and output embeddings + # differently. + vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).attention_norm.", + r"transformer.layers.\1.norm1.", + key, + ) + key = re.sub(r"^transformer.layers.(\d+).ffn_norm.", r"transformer.layers.\1.norm2.", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for l in range(config.n_layer): + w1 = state_dict.pop(f"transformer.layers.{l}.feed_forward.w1.weight") + w3 = state_dict.pop(f"transformer.layers.{l}.feed_forward.w3.weight") + # Our ordering is different + state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat([w3, w1], dim=0) + + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).feed_forward.w2.", + r"transformer.layers.\1.mlp.fc2.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"transformer.layers.{l}.attention.wq.weight") + Wk = state_dict.pop(f"transformer.layers.{l}.attention.wk.weight") + Wv = state_dict.pop(f"transformer.layers.{l}.attention.wv.weight") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + # We don't store these + state_dict.pop(f"transformer.layers.{l}.attention.inner_attention.rope.freqs", None) + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).attention.wo.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + state_dict.pop("transformer.rope.freqs", None) + + return state_dict + + +def remap_state_dict_hf_llama( + state_dict: Dict[str, torch.Tensor], config: GPT2Config +) -> Dict[str, torch.Tensor]: + """Convert the state_dict in Hugging Face format to standard GPT format. + + This function modifies state_dict in place. + """ + + # Embedding + def key_mapping_emb(key): + return re.sub(r"^model.embed_tokens.", "transformer.embeddings.word_embeddings.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
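+    # F.pad with (0, 0, 0, k) leaves the hidden dimension untouched and appends k zero rows
+    # at the bottom of the (vocab, hidden) embedding matrix.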
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + + # LM head + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # Need to recompute vocab_size since LLaMa shards the word embeddings and output embeddings + # differently. + vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # MLP + for l in range(config.n_layer): + # Fusing weights this way based on difference in the following: + # https://github.com/huggingface/transformers/blob/b42010bb1d3cbf262d27e0a328661885be46dfdb/src/transformers/models/llama/modeling_llama.py#L220 + # https://github.com/Dao-AILab/flash-attention/blob/c60851a8253257eb970e06a022c82517a8033e8c/flash_attn/modules/mlp.py#L115 + w1 = state_dict.pop(f"model.layers.{l}.mlp.gate_proj.weight") + w3 = state_dict.pop(f"model.layers.{l}.mlp.up_proj.weight") + state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat([w3, w1], dim=0) + + def key_mapping_mlp(key): + return re.sub( + r"^model.layers.(\d+).mlp.down_proj.", + r"transformer.layers.\1.mlp.fc2.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^model.norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^model.layers.(\d+).input_layernorm.", + r"transformer.layers.\1.norm1.", + key, + ) + key = re.sub( + r"^model.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + def inv_permute(w): + # Inverse of permute implemented in: + # https://github.com/huggingface/transformers/blob/b42010bb1d3cbf262d27e0a328661885be46dfdb/src/transformers/models/llama/convert_llama_weights_to_hf.py#L114 + return rearrange( + w, "(h two d) n -> (h d two) n", d=config.n_embd // config.n_head // 2, two=2 + ) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"model.layers.{l}.self_attn.q_proj.weight") + Wk = state_dict.pop(f"model.layers.{l}.self_attn.k_proj.weight") + Wv = state_dict.pop(f"model.layers.{l}.self_attn.v_proj.weight") + + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat( + [inv_permute(Wq), inv_permute(Wk), Wv], dim=0 + ) + # We don't store these + state_dict.pop(f"model.layers.{l}.self_attn.rotary_emb.inv_freq", None) + + def key_mapping_attn(key): + return re.sub( + r"^model.layers.(\d+).self_attn.o_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + return state_dict + + +def inv_remap_state_dict_hf_llama( + state_dict: Dict[str, torch.Tensor], config: GPT2Config +) -> Dict[str, torch.Tensor]: + """Convert the state_dict in standard GPT format to Hugging Face format. 
+ + This function is meant to be the inverse of remap_state_dict_hf_llama, up to a + multiplier pad in the embedding and lm_head. That is if the original embedding + isn't a multiple of pad_vocab_size_multiple, then + inv_remap_state_dict_hf_llama(remap_state_dict_hf_llama(state_dict)) != state_dict. + + This function modifies state_dict in place. + """ + + # Embedding + def key_mapping_emb(key): + return re.sub(r"^transformer.embeddings.word_embeddings.", "model.embed_tokens.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("model.embed_tokens.weight") + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["model.embed_tokens.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + + # LM head + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # MLP + for l in range(config.n_layer): + w3, w1 = torch.chunk( + state_dict.pop(f"transformer.layers.{l}.mlp.fc1.weight"), chunks=2, dim=0 + ) + state_dict[f"model.layers.{l}.mlp.gate_proj.weight"] = w1 + state_dict[f"model.layers.{l}.mlp.up_proj.weight"] = w3 + + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).mlp.fc2.", + r"model.layers.\1.mlp.down_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.", r"model.norm.", key) + key = re.sub( + r"^transformer.layers.(\d+).norm1.", + r"model.layers.\1.input_layernorm.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).norm2.", + r"model.layers.\1.post_attention_layernorm.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + def permute(w): + return rearrange( + w, "(h d two) n -> (h two d) n", d=config.n_embd // config.n_head // 2, two=2 + ) + + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", n_head) + + embed_dim = config.hidden_size + head_dim = embed_dim // n_head + + q_dim = n_head * head_dim + k_dim = v_dim = n_head_kv * head_dim + + # Attention + for l in range(config.n_layer): + Wqkv = state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight") + Wq = Wqkv[:q_dim] + Wk = Wqkv[q_dim : q_dim + k_dim] + Wv = Wqkv[q_dim + k_dim : q_dim + k_dim + v_dim] + state_dict[f"model.layers.{l}.self_attn.q_proj.weight"] = permute(Wq) + state_dict[f"model.layers.{l}.self_attn.k_proj.weight"] = permute(Wk) + state_dict[f"model.layers.{l}.self_attn.v_proj.weight"] = Wv + state_dict.pop(f"transformer.layers.{l}.attention.inner_attention.rope.freqs", None) + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).mixer.out_proj.", + r"model.layers.\1.self_attn.o_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + return state_dict + + +def config_from_meta_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str +) -> LlamaConfig: + """Load a LlamaConfig from a checkpoint 
path.""" + with open(Path(checkpoint_path) / model_name / "params.json") as f: + params = json.load(f) + config = LlamaConfig( + hidden_size=params["dim"], + intermediate_size=None, + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=params.get("n_kv_heads", None), + ) + multiple_of = params.get("multiple_of", 1) + ffn_dim_multiplier = params.get("ffn_dim_multiplier", None) + + # Compute the hidden dimension of the MLP + # https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/model.py#L224 + intermediate_size = 4 * config.hidden_size + # https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/model.py#L195-L199 + intermediate_size = int(2 * intermediate_size / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + intermediate_size = int(ffn_dim_multiplier * intermediate_size) + intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of) + + config.intermediate_size = intermediate_size + if "rope_theta" in params: + config.rotary_emb_base = params["rope_theta"] + config.vocab_size = 32000 + # some CodeLLaMa have vocab_size 32000, some 32016 + # Sadly it's not specified in the `params.json` file :( + tokenizer = Path(checkpoint_path) / model_name / "tokenizer.model" + if tokenizer.is_file(): + config.vocab_size = SentencePieceProcessor(str(tokenizer)).vocab_size() + return config + + +def config_from_hf_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str +) -> LlamaConfig: + return LlamaConfig.from_pretrained(Path(checkpoint_path) / f"{model_name}-hf" / "config.json") + + +def config_from_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str, checkpoint_format="meta" +) -> LlamaConfig: + if checkpoint_format == "meta": + return config_from_meta_checkpoint(checkpoint_path, model_name) + else: + return config_from_hf_checkpoint(checkpoint_path, model_name) + + +def state_dicts_from_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str +) -> List[dict]: + # Need to sort, otherwise we mess up the ordering and the weights are wrong + return [ + torch.load(path, map_location="cpu") + for path in sorted((Path(checkpoint_path) / model_name).glob("consolidated.*.pth")) + ] + + +def llama_config_to_gpt2_config(llama_config: LlamaConfig) -> GPT2Config: + return GPT2Config( + vocab_size=llama_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=llama_config.hidden_size, + n_layer=llama_config.num_hidden_layers, + n_head=llama_config.num_attention_heads, + n_inner=llama_config.intermediate_size, + activation_function="swiglu", # Hardcode since HF calls it 'silu' + # Llama doesn't have dropout, idk if it's because they only release the inference code + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=llama_config.rms_norm_eps, + initializer_range=llama_config.initializer_range, + bos_token_id=llama_config.bos_token_id, + eos_token_id=llama_config.eos_token_id, + # These are new arguments not in the original GPT2Config + pad_token_id=llama_config.pad_token_id, # Idk if this does anything + rms_norm=True, + rotary_emb_fraction=1.0, + rotary_emb_interleaved=True, + tie_word_embeddings=False, + qkv_proj_bias=False, + out_proj_bias=False, + mlp_fc1_bias=False, + mlp_fc2_bias=False, + rotary_emb_base=getattr(llama_config, "rotary_emb_base", 10000.0), + n_head_kv=llama_config.num_key_value_heads, + 
) diff --git a/ln.h b/ln.h new file mode 100644 index 0000000000000000000000000000000000000000..9830c092d0aca9f3466154a18d1d3c32d651716e --- /dev/null +++ b/ln.h @@ -0,0 +1,281 @@ +#pragma once + +#include +#include +#include + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + +namespace layer_norm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct LaunchParams{ + + size_t elts_per_thread; + size_t workspace_bytes; + size_t barrier_size; + + cudaDeviceProp * props; + + cudaStream_t stream; + + Params params; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct ParamsBase { + ParamsBase() + : ctas_per_col(0) + , rows(0) + , cols(0) + , x(nullptr) + , mu(nullptr) + , rs(nullptr) + , gamma(nullptr) + , gamma1(nullptr) + , rowscale(nullptr) + , colscale(nullptr) + , dropout_keep_p(1.f) + , dropout_scale(1.f) + , is_rms_norm(false) + , workspace(nullptr) + , barrier(nullptr) + { + } + + // For Multi-CTA, number of different CTA groups. Otherwise same as gridDim.x. + int ctas_per_col; + + // Input is interpreted as matrix. We normalize across columns. + int rows; + int cols; + + // Common data pointers. + void *x0; + void *x1; + void *residual; + void *x; + void *dmask; + void *dmask1; + void *mu; + void *rs; + void *gamma; + void *gamma1; + void *rowscale; + void *colscale; + void *x0_subset; + void *z_subset; + + float inverse_cols; + + float dropout_keep_p; + float dropout_scale; + float rowscale_const; + + bool is_rms_norm; + + // Multi-CTA workspace in gmem. + void *workspace; + + // Multi-CTA sync barriers in gmem. + int *barrier; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct FwdParams : public ParamsBase { + FwdParams() + : ParamsBase() + , z(nullptr) + , z1(nullptr) + , beta(nullptr) + , beta1(nullptr) + , epsilon(0.f) + { + } + + // Output of LN FWD. + void *z; + void *z1; + void *beta; + void *beta1; + float epsilon; + + // Random state. + at::PhiloxCudaState philox_args; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct BwdParams : public ParamsBase { + BwdParams() + : ParamsBase() + , dz(nullptr) + , dz1(nullptr) + , dx(nullptr) + , dbeta_part(nullptr) + , dgamma_part(nullptr) + , dbeta1_part(nullptr) + , dgamma1_part(nullptr) + , dcolscale_part(nullptr) + , dx0(nullptr) + , dx1(nullptr) + , dresidual(nullptr) + , dbeta(nullptr) + , dgamma(nullptr) + , dbeta1(nullptr) + , dgamma1(nullptr) + , dcolscale(nullptr) + { + } + + // Input: gradient wrt. LN FWD output. + void *dz; + void *dz1; + // Input: gradient wrt residual. + void *dx; + + // Workspace for Wgrad pre-reduction. + void *dbeta_part; + void *dgamma_part; + void *dbeta1_part; + void *dgamma1_part; + void *dcolscale_part; + + // Output: Dgrad. + void *dx0; + void *dx1; + void *dresidual; + // Output: Wgrad. 
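+    // ("Wgrad" = gradients of the affine parameters gamma/beta; "Dgrad" above = gradients
+    // flowing back to the inputs x0/x1/residual.)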
+ void *dbeta; + void *dgamma; + void *dbeta1; + void *dgamma1; + void *dcolscale; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using FwdFunction = std::function&, const bool)>; +using BwdFunction = std::function&, const bool)>; +using FunctionKey = uint64_t; +using FwdRegistry = std::unordered_map; +using BwdRegistry = std::unordered_map; + +extern FwdRegistry FWD_FUNCS, PARALLEL_FWD_FUNCS; +extern BwdRegistry BWD_FUNCS, PARALLEL_BWD_FUNCS; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using fp32 = float; +using fp16 = half; +using bf16 = nv_bfloat16; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct TypeId{}; + +template<> +struct TypeId{ + constexpr static uint32_t Value = 0; +}; + +template<> +struct TypeId{ + constexpr static uint32_t Value = 1; +}; + +template<> +struct TypeId{ + constexpr static uint32_t Value = 2; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Type2Key{ + constexpr static uint32_t Value = TypeId::Value << S; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct WeightType2Key : public Type2Key{}; + +template +struct InputType2Key : public Type2Key{}; + +template +struct ResidualType2Key : public Type2Key{}; + +template +struct OutputType2Key : public Type2Key{}; + +template +struct ComputeType2Key : public Type2Key{}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Types2Key{ + constexpr static uint32_t Value = WeightType2Key::Value | InputType2Key::Value | ResidualType2Key::Value | OutputType2Key::Value | ComputeType2Key::Value; + constexpr static inline uint64_t get(const uint64_t hidden_size){ + constexpr uint64_t type_key = Value; + return (type_key << 32) | hidden_size; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct FwdRegistrar{ + FwdRegistrar(FwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + FWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BwdRegistrar{ + BwdRegistrar(BwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + BWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct FwdParallelRegistrar{ + FwdParallelRegistrar(FwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + PARALLEL_FWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BwdParallelRegistrar{ + BwdParallelRegistrar(BwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + PARALLEL_BWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace layer_norm diff --git a/ln_api.cpp b/ln_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3981bbad58e56023c33ff66b89c130f4d1636a36 --- /dev/null +++ b/ln_api.cpp @@ -0,0 +1,850 @@ +#include +#include "ATen/cuda/CUDAContext.h" +#include + +#include "ln.h" + +/* + 
+Supported Type combinations: + +input residual compute weights output +============================================ +fp32 fp32 fp32 fp32 fp32 +fp16 fp32 fp32 fp32 fp16 +fp16 fp16 fp32 fp32 fp16 +bf16 fp32 fp32 fp32 bf16 +bf16 bf16 fp32 fp32 bf16 +fp16 fp16 fp32 fp16 fp16 +bf16 bf16 fp32 bf16 bf16 + +Remarks: +Output type = Input type +Compute always in FP32 + +*/ + +namespace layer_norm { + +// Create registries and provide runtime versions of config hash functions. + +FwdRegistry FWD_FUNCS, PARALLEL_FWD_FUNCS; +BwdRegistry BWD_FUNCS, PARALLEL_BWD_FUNCS; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +uint32_t get_type_id(torch::Dtype dtype){ + if( dtype == torch::kFloat16 ) { + return TypeId::Value; + } else if( dtype == torch::kBFloat16 ) { + return TypeId::Value; + } else if( dtype == torch::kFloat32 ) { + return TypeId::Value; + } else { + TORCH_CHECK(false, "Type not supported: ", dtype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +uint64_t get_key(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint64_t hidden_size) { + using namespace layer_norm; + uint64_t type_key = get_type_id(wtype) | (get_type_id(itype) << 2) | (get_type_id(rtype) << 4) | (get_type_id(otype) << 6) | (get_type_id(ctype) << 8); + uint64_t launcher_key = (type_key << 32) | hidden_size; + return launcher_key; +} + +} // namespace layer_norm + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::FwdFunction & get_fwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) { + auto iter = layer_norm::FWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::FWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "FWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::BwdFunction & get_bwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) { + auto iter = layer_norm::BWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::BWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "BWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::FwdFunction & get_parallel_fwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) { + auto iter = layer_norm::PARALLEL_FWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::PARALLEL_FWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "FWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::BwdFunction & get_parallel_bwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype 
ctype, uint32_t hidden_size) { + auto iter = layer_norm::PARALLEL_BWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::PARALLEL_BWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "BWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector dropout_add_ln_fwd(const at::Tensor &x0, // Input: BxSxhidden_size + c10::optional &residual_, // Residual: BxSxhidden_size + const at::Tensor &gamma, // hidden_size + c10::optional &beta_, // hidden_size + c10::optional &rowscale_, // BxS + c10::optional &colscale_, // hidden_size + c10::optional &x0_subset_, // BxS + c10::optional &z_subset_, // BxS + const float dropout_p, + const float epsilon, + const float rowscale_const, + const int64_t z_numrows, + c10::optional gen_, + bool residual_in_fp32=false, + bool is_rms_norm=false +) { + auto itype = x0.scalar_type(); + auto rtype = residual_.has_value() + ? residual_.value().scalar_type() + : (residual_in_fp32 ? torch::kFloat32 : x0.scalar_type()); + auto wtype = gamma.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + TORCH_CHECK(x0.is_cuda()); + TORCH_CHECK(gamma.is_cuda()); + + TORCH_CHECK(x0.is_contiguous()); + // c10::IntArrayRef does not own the storage, so we need to construct a vector. + // Otherwise just constructing IntArrayRef({blah}) will cause uninitialized memory because + // blah is then deallocated. + std::vector sizes_vec {!x0_subset_.has_value() ? x0.size(0) : x0_subset_.value().size(0), x0.size(1)}; + auto sizes = c10::IntArrayRef(sizes_vec); + TORCH_CHECK(x0.dim() == 2); + TORCH_CHECK(sizes.size() == 2); + + const int rows = sizes[0]; + const int cols = sizes[1]; + auto hidden_size = gamma.numel(); + TORCH_CHECK(hidden_size == cols); + + if (beta_.has_value()) { + auto beta = beta_.value(); + TORCH_CHECK(beta.dtype() == wtype); + TORCH_CHECK(beta.is_cuda()); + TORCH_CHECK(beta.is_contiguous()); + TORCH_CHECK(beta.sizes() == gamma.sizes()); + } + + if (residual_.has_value()) { + auto residual = residual_.value(); + TORCH_CHECK(residual.is_cuda()); + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(residual.sizes() == sizes); + } + + if (rowscale_.has_value()) { + auto rowscale = rowscale_.value(); + TORCH_CHECK(rowscale.is_cuda()); + TORCH_CHECK(rowscale.is_contiguous()); + TORCH_CHECK(rowscale.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(rowscale.dtype() == itype); + } + + if (colscale_.has_value()) { + auto colscale = colscale_.value(); + TORCH_CHECK(colscale.is_cuda()); + TORCH_CHECK(colscale.is_contiguous()); + TORCH_CHECK(colscale.sizes() == c10::IntArrayRef{cols}); + TORCH_CHECK(colscale.dtype() == wtype); + } + + if (x0_subset_.has_value()) { + auto x0_subset = x0_subset_.value(); + TORCH_CHECK(x0_subset.is_cuda()); + TORCH_CHECK(x0_subset.is_contiguous()); + TORCH_CHECK(x0_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(x0_subset.dtype() == torch::kInt32); + + TORCH_CHECK(z_subset_.has_value()); + auto z_subset = z_subset_.value(); + TORCH_CHECK(z_subset.is_cuda()); + TORCH_CHECK(z_subset.is_contiguous()); + TORCH_CHECK(z_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(z_subset.dtype() == torch::kInt32); + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + TORCH_CHECK(epsilon >= 0.f); + + // Otherwise the kernel will be launched from cuda:0 
device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x0.get_device()}; + + auto opts = x0.options(); + + bool save_x = residual_.has_value() || (dropout_p > 0.f) || rowscale_.has_value() || colscale_.has_value() || x0_subset_.has_value() || (itype != rtype); + at::Tensor x; + if (save_x) { x = torch::empty(sizes, opts.dtype(rtype)); } + at::Tensor dmask; + if (dropout_p > 0.f) { dmask = torch::empty(x0.sizes(), opts.dtype(mtype)); }; + auto z = torch::empty(z_subset_.has_value() ? c10::IntArrayRef{z_numrows, cols} : sizes, opts.dtype(otype)); + + auto mu = torch::empty({ rows }, opts.dtype(ctype)); + auto rsigma = torch::empty({ rows }, opts.dtype(ctype)); + + layer_norm::LaunchParams launch_params; + + launch_params.props = at::cuda::getCurrentDeviceProperties(); + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.residual = residual_.has_value() ? residual_.value().data_ptr() : nullptr; + launch_params.params.rowscale = rowscale_.has_value() ? rowscale_.value().data_ptr() : nullptr; + launch_params.params.colscale = colscale_.has_value() ? colscale_.value().data_ptr() : nullptr; + launch_params.params.x0_subset = x0_subset_.has_value() ? x0_subset_.value().data_ptr() : nullptr; + launch_params.params.z_subset = z_subset_.has_value() ? z_subset_.value().data_ptr() : nullptr; + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024); + // Request the kernel launcher. + auto launcher = get_fwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple)); + + // Set the kernel runtime parameters. + layer_norm::FwdParams ¶ms = launch_params.params; + params.rows = rows; + params.cols = cols; + params.x0 = x0.data_ptr(); + params.x = save_x ? x.data_ptr() : nullptr; + params.dmask = dropout_p > 0.f ? dmask.data_ptr() : nullptr; + params.mu = mu.data_ptr(); + params.rs = rsigma.data_ptr(); + params.gamma = gamma.data_ptr(); + params.beta = beta_.has_value() ? beta_.value().data_ptr() : nullptr; + params.z = z.data_ptr(); + params.epsilon = epsilon; + params.dropout_scale = 1.f / (1.f - dropout_p); + params.inverse_cols = 1.f / float(params.cols); + params.rowscale_const = rowscale_const; + params.is_rms_norm = is_rms_norm; + + // Query the kernel-specific launch parameters. + launcher(launch_params, true); + + at::Tensor workspace, barrier; + + if (dropout_p > 0.f) { + // number of times random will be generated per thread, to offset philox counter in thc random + // state + int64_t counter_offset = launch_params.elts_per_thread; + + // See Note [Acquire lock when using random generators] + { + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + } + } + + if( launch_params.barrier_size > 0 ) { + auto options = x0.options(); + barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32)); + workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar)); + params.workspace = workspace.data_ptr(); + params.barrier = barrier.data_ptr(); + } + + // Launch the kernel. 
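+    // The earlier launcher(launch_params, true) call only queried elts_per_thread, workspace_bytes and
+    // barrier_size; this call with `false` performs the actual kernel launch.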
+ launcher(launch_params, false); + + return { z, x, dmask, mu, rsigma }; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector dropout_add_ln_bwd(const at::Tensor &dz, // BxSxhidden_size + c10::optional &dx_, // BxSxhidden_size + const at::Tensor &x, // BxSxhidden_size + c10::optional &x0_, // BxSxhidden_size + c10::optional &dmask_, // BxSxhidden_size + const at::Tensor &mu, // BxS, FP32! + const at::Tensor &rsigma, // BxS, FP32! + const at::Tensor &gamma, // hidden_size + c10::optional &rowscale_, // BxS + c10::optional &colscale_, // hidden_size + c10::optional &x0_subset_, // BxS + c10::optional &z_subset_, // BxS + const float dropout_p, + const float rowscale_const, + const int64_t x0_numrows, + const bool has_residual, + bool is_rms_norm=false +) { + + auto itype = dz.scalar_type(); + auto rtype = x.scalar_type(); + auto wtype = gamma.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + if (dropout_p > 0.f) { TORCH_CHECK(dmask_.has_value()); } + + TORCH_CHECK(dz.dtype() == otype); + TORCH_CHECK(mu.dtype() == ctype); + TORCH_CHECK(rsigma.dtype() == ctype); + + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(dz.is_cuda()); + TORCH_CHECK(mu.is_cuda()); + TORCH_CHECK(rsigma.is_cuda()); + TORCH_CHECK(gamma.is_cuda()); + + TORCH_CHECK(x.is_contiguous()); + TORCH_CHECK(dz.is_contiguous()); + + auto sizes = x.sizes(); + TORCH_CHECK(sizes.size() == 2); + auto rows = sizes[0]; + auto cols = sizes[1]; + TORCH_CHECK(dz.dim() == 2); + TORCH_CHECK(dz.size(1) == cols); + auto hidden_size = gamma.numel(); + TORCH_CHECK(hidden_size == cols); + + // c10::IntArrayRef does not own the storage, so we need to construct a vector. + // Otherwise just constructing IntArrayRef({blah}) will cause uninitialized memory because + // blah is then deallocated. + std::vector x0_sizes_vec {!x0_subset_.has_value() ? 
rows : x0_numrows, cols}; + auto x0_sizes = c10::IntArrayRef(x0_sizes_vec); + + if (dx_.has_value()) { + auto dx = dx_.value(); + TORCH_CHECK(dx.dtype() == rtype); + TORCH_CHECK(dx.is_cuda()); + TORCH_CHECK(dx.is_contiguous()); + TORCH_CHECK(dx.sizes() == sizes); + } + + if (dmask_.has_value()) { + auto dmask = dmask_.value(); + TORCH_CHECK(dmask.dtype() == mtype); + TORCH_CHECK(dmask.is_cuda()); + TORCH_CHECK(dmask.is_contiguous()); + TORCH_CHECK(dmask.sizes() == x0_sizes); + } + + if (rowscale_.has_value()) { + auto rowscale = rowscale_.value(); + TORCH_CHECK(rowscale.is_cuda()); + TORCH_CHECK(rowscale.is_contiguous()); + TORCH_CHECK(rowscale.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(rowscale.dtype() == itype); + } + + if (colscale_.has_value()) { + auto colscale = colscale_.value(); + TORCH_CHECK(colscale.is_cuda()); + TORCH_CHECK(colscale.is_contiguous()); + TORCH_CHECK(colscale.sizes() == c10::IntArrayRef{cols}); + TORCH_CHECK(colscale.dtype() == wtype); + + TORCH_CHECK(x0_.has_value()); + auto x0 = x0_.value(); + TORCH_CHECK(x0.is_cuda()); + TORCH_CHECK(x0.is_contiguous()); + TORCH_CHECK(x0.sizes() == x0_sizes); + TORCH_CHECK(x0.dtype() == itype); + } + + if (x0_subset_.has_value()) { + auto x0_subset = x0_subset_.value(); + TORCH_CHECK(x0_subset.is_cuda()); + TORCH_CHECK(x0_subset.is_contiguous()); + TORCH_CHECK(x0_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(x0_subset.dtype() == torch::kInt32); + + TORCH_CHECK(z_subset_.has_value()); + auto z_subset = z_subset_.value(); + TORCH_CHECK(z_subset.is_cuda()); + TORCH_CHECK(z_subset.is_contiguous()); + TORCH_CHECK(z_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(z_subset.dtype() == torch::kInt32); + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + + TORCH_CHECK(mu.numel() == rows); + TORCH_CHECK(mu.sizes() == rsigma.sizes()); + + TORCH_CHECK(gamma.numel() == cols); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)dz.get_device()}; + + auto opts = x.options(); + + auto dx0 = torch::empty(x0_sizes, opts.dtype(itype)); + at::Tensor dresidual; + if (has_residual) { dresidual = torch::empty_like(x, opts.dtype(rtype)); } + auto dgamma = torch::empty_like(gamma); + auto dbeta = torch::empty_like(gamma); + at::Tensor dcolscale; + if (colscale_.has_value()) { + dcolscale = torch::empty_like(colscale_.value()); + } + + layer_norm::LaunchParams launch_params; + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + launch_params.props = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.dresidual = has_residual ? dresidual.data_ptr() : nullptr; + launch_params.params.rowscale = rowscale_.has_value() ? rowscale_.value().data_ptr() : nullptr; + launch_params.params.colscale = colscale_.has_value() ? colscale_.value().data_ptr() : nullptr; + launch_params.params.x0_subset = x0_subset_.has_value() ? x0_subset_.value().data_ptr() : nullptr; + launch_params.params.z_subset = z_subset_.has_value() ? z_subset_.value().data_ptr() : nullptr; + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 
512 : 1024);
+ auto launcher = get_bwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
+
+ launcher(launch_params, true);
+
+ auto dgamma_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ auto dbeta_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ at::Tensor dcolscale_part;
+ if (colscale_.has_value()) {
+ dcolscale_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ }
+ at::Tensor workspace, barrier;
+
+ layer_norm::BwdParams &params = launch_params.params;
+ params.rows = rows;
+ params.cols = cols;
+ params.x = x.data_ptr();
+ params.x0 = x0_.has_value() ? x0_.value().data_ptr() : nullptr;
+ params.dmask = dropout_p > 0.f ? dmask_.value().data_ptr() : nullptr;
+ params.mu = mu.data_ptr();
+ params.rs = rsigma.data_ptr();
+ params.gamma = gamma.data_ptr();
+ params.dz = dz.data_ptr();
+ params.dx = dx_.has_value() ? dx_.value().data_ptr() : nullptr;
+ params.dx0 = dx0.data_ptr();
+ params.dbeta = dbeta.data_ptr();
+ params.dgamma = dgamma.data_ptr();
+ params.dcolscale = colscale_.has_value() ? dcolscale.data_ptr() : nullptr;
+ params.dbeta_part = dbeta_part.data_ptr();
+ params.dgamma_part = dgamma_part.data_ptr();
+ params.dcolscale_part = colscale_.has_value() ? dcolscale_part.data_ptr() : nullptr;
+ params.dropout_scale = 1.f / (1.f - dropout_p);
+ params.inverse_cols = 1.f / float(params.cols);
+ params.rowscale_const = rowscale_const;
+ params.is_rms_norm = is_rms_norm;
+
+ if( launch_params.barrier_size > 0 ) {
+ // TODO Any way to avoid this?
+ barrier = torch::zeros(launch_params.barrier_size, opts.dtype(torch::kInt32));
+ workspace = torch::empty(launch_params.workspace_bytes, opts.dtype(torch::kChar));
+ params.workspace = workspace.data_ptr();
+ params.barrier = barrier.data_ptr();
+ }
+
+ launcher(launch_params, false);
+
+ std::vector<at::Tensor> result = { dx0, dresidual, dgamma, dbeta, dgamma_part, dbeta_part };
+ if (colscale_.has_value()) {
+ result.push_back(dcolscale);
+ result.push_back(dcolscale_part);
+ }
+ return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+std::vector<at::Tensor> dropout_add_ln_parallel_residual_fwd(
+ const at::Tensor &x0, // Input: BxSxhidden_size
+ c10::optional<at::Tensor> &x1_, // Input: BxSxhidden_size
+ c10::optional<at::Tensor> &residual_, // Residual: BxSxhidden_size
+ const at::Tensor &gamma0, // hidden_size
+ c10::optional<at::Tensor> &beta0_, // hidden_size
+ c10::optional<at::Tensor> &gamma1_, // hidden_size
+ c10::optional<at::Tensor> &beta1_, // hidden_size
+ const float dropout_p,
+ const float epsilon,
+ c10::optional<at::Generator> gen_,
+ bool residual_in_fp32=false,
+ bool is_rms_norm=false
+) {
+ auto itype = x0.scalar_type();
+ auto rtype = residual_.has_value()
+ ? residual_.value().scalar_type()
+ : (residual_in_fp32 ?
torch::kFloat32 : x0.scalar_type()); + auto wtype = gamma0.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + TORCH_CHECK(x0.is_cuda()); + TORCH_CHECK(gamma0.is_cuda()); + + TORCH_CHECK(x0.is_contiguous()); + const auto sizes = x0.sizes(); + TORCH_CHECK(x0.dim() == 2); + + const int rows = sizes[0]; + const int cols = sizes[1]; + auto hidden_size = gamma0.numel(); + TORCH_CHECK(hidden_size == cols); + + if (x1_.has_value()) { + auto x1 = x1_.value(); + TORCH_CHECK(x1.is_cuda()); + TORCH_CHECK(x1.is_contiguous()); + TORCH_CHECK(x1.sizes() == sizes); + } + + if (residual_.has_value()) { + auto residual = residual_.value(); + TORCH_CHECK(residual.is_cuda()); + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(residual.sizes() == sizes); + } + + if (beta0_.has_value()) { + auto beta0 = beta0_.value(); + TORCH_CHECK(beta0.dtype() == wtype); + TORCH_CHECK(beta0.is_cuda()); + TORCH_CHECK(beta0.is_contiguous()); + TORCH_CHECK(beta0.sizes() == gamma0.sizes()); + } + + if (gamma1_.has_value()) { + auto gamma1 = gamma1_.value(); + TORCH_CHECK(gamma1.dtype() == wtype); + TORCH_CHECK(gamma1.is_cuda()); + TORCH_CHECK(gamma1.is_contiguous()); + TORCH_CHECK(gamma1.sizes() == gamma0.sizes()); + } + + if (beta1_.has_value()) { + auto beta1 = beta1_.value(); + TORCH_CHECK(beta1.dtype() == wtype); + TORCH_CHECK(beta1.is_cuda()); + TORCH_CHECK(beta1.is_contiguous()); + TORCH_CHECK(beta1.sizes() == gamma0.sizes()); + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + TORCH_CHECK(epsilon >= 0.f); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x0.get_device()}; + + auto opts = x0.options(); + + bool save_x = residual_.has_value() || x1_.has_value() || (dropout_p > 0.f) || (itype != rtype); + at::Tensor x; + if (save_x) { x = torch::empty(sizes, opts.dtype(rtype)); } + at::Tensor dmask0, dmask1; + if (dropout_p > 0.f) { + dmask0 = torch::empty(x0.sizes(), opts.dtype(mtype)); + if (x1_.has_value()) { dmask1 = torch::empty(x0.sizes(), opts.dtype(mtype)); } + }; + auto z0 = torch::empty(sizes, opts.dtype(otype)); + at::Tensor z1; + if (gamma1_.has_value()) { z1 = torch::empty(sizes, opts.dtype(otype)); } + + auto mu = torch::empty({ rows }, opts.dtype(ctype)); + auto rsigma = torch::empty({ rows }, opts.dtype(ctype)); + + layer_norm::LaunchParams launch_params; + + launch_params.props = at::cuda::getCurrentDeviceProperties(); + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.residual = residual_.has_value() ? residual_.value().data_ptr() : nullptr; + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024); + // Request the kernel launcher. + auto launcher = get_parallel_fwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple)); + + // Set the kernel runtime parameters. + layer_norm::FwdParams ¶ms = launch_params.params; + params.rows = rows; + params.cols = cols; + params.x0 = x0.data_ptr(); + params.x1 = x1_.has_value() ? x1_.value().data_ptr() : nullptr; + params.x = save_x ? x.data_ptr() : nullptr; + params.dmask = dropout_p > 0.f ? 
dmask0.data_ptr() : nullptr; + params.dmask1 = (dropout_p > 0.f && x1_.has_value()) ? dmask1.data_ptr() : nullptr; + params.mu = mu.data_ptr(); + params.rs = rsigma.data_ptr(); + params.gamma = gamma0.data_ptr(); + params.gamma1 = gamma1_.has_value() ? gamma1_.value().data_ptr() : nullptr; + params.beta = beta0_.has_value() ? beta0_.value().data_ptr() : nullptr; + params.beta1 = beta1_.has_value() ? beta1_.value().data_ptr() : nullptr; + params.z = z0.data_ptr(); + params.z1 = gamma1_.has_value() ? z1.data_ptr() : nullptr; + params.epsilon = epsilon; + params.dropout_scale = 1.f / (1.f - dropout_p); + params.inverse_cols = 1.f / float(params.cols); + params.is_rms_norm = is_rms_norm; + + // Query the kernel-specific launch parameters. + launcher(launch_params, true); + + at::Tensor workspace, barrier; + + if (dropout_p > 0.f) { + // number of times random will be generated per thread, to offset philox counter in thc random + // state + int64_t counter_offset = 2 * launch_params.elts_per_thread; + + // See Note [Acquire lock when using random generators] + { + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + } + } + + if( launch_params.barrier_size > 0 ) { + auto options = x0.options(); + barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32)); + workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar)); + params.workspace = workspace.data_ptr(); + params.barrier = barrier.data_ptr(); + } + + // Launch the kernel. + launcher(launch_params, false); + + return { z0, z1, x, dmask0, dmask1, mu, rsigma }; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector dropout_add_ln_parallel_residual_bwd( + const at::Tensor &dz0, // BxSxhidden_size + c10::optional &dz1_, // BxSxhidden_size + c10::optional &dx_, // BxSxhidden_size + const at::Tensor &x, // BxSxhidden_size + c10::optional &dmask0_, // BxSxhidden_size + c10::optional &dmask1_, // BxSxhidden_size + const at::Tensor &mu, // BxS, FP32! + const at::Tensor &rsigma, // BxS, FP32! 
+ const at::Tensor &gamma0, // hidden_size + c10::optional &gamma1_, // hidden_size + const float dropout_p, + const bool has_x1, + const bool has_residual, + bool is_rms_norm=false +) { + + auto itype = dz0.scalar_type(); + auto rtype = x.scalar_type(); + auto wtype = gamma0.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + if (dropout_p > 0.f) { TORCH_CHECK(dmask0_.has_value()); } + + TORCH_CHECK(dz0.dtype() == otype); + TORCH_CHECK(dz0.dtype() == otype); + TORCH_CHECK(mu.dtype() == ctype); + TORCH_CHECK(rsigma.dtype() == ctype); + + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(dz0.is_cuda()); + TORCH_CHECK(mu.is_cuda()); + TORCH_CHECK(rsigma.is_cuda()); + TORCH_CHECK(gamma0.is_cuda()); + + TORCH_CHECK(x.is_contiguous()); + TORCH_CHECK(dz0.is_contiguous()); + + auto sizes = x.sizes(); + TORCH_CHECK(sizes.size() == 2); + auto rows = sizes[0]; + auto cols = sizes[1]; + TORCH_CHECK(dz0.dim() == 2); + TORCH_CHECK(dz0.size(1) == cols); + auto hidden_size = gamma0.numel(); + TORCH_CHECK(hidden_size == cols); + + if (dz1_.has_value()) { + auto dz1 = dz1_.value(); + TORCH_CHECK(dz1.dtype() == otype); + TORCH_CHECK(dz1.is_cuda()); + TORCH_CHECK(dz1.is_contiguous()); + TORCH_CHECK(dz1.sizes() == sizes); + + TORCH_CHECK(gamma1_.has_value()); + auto gamma1 = gamma1_.value(); + TORCH_CHECK(gamma1.dtype() == wtype); + TORCH_CHECK(gamma1.is_cuda()); + TORCH_CHECK(gamma1.is_contiguous()); + TORCH_CHECK(gamma1.sizes() == gamma0.sizes()); + } + + if (dx_.has_value()) { + auto dx = dx_.value(); + TORCH_CHECK(dx.dtype() == rtype); + TORCH_CHECK(dx.is_cuda()); + TORCH_CHECK(dx.is_contiguous()); + TORCH_CHECK(dx.sizes() == sizes); + } + + if (dmask0_.has_value()) { + auto dmask0 = dmask0_.value(); + TORCH_CHECK(dmask0.dtype() == mtype); + TORCH_CHECK(dmask0.is_cuda()); + TORCH_CHECK(dmask0.is_contiguous()); + TORCH_CHECK(dmask0.sizes() == sizes); + + if (has_x1) { + TORCH_CHECK(dmask1_.has_value()); + auto dmask1 = dmask1_.value(); + TORCH_CHECK(dmask1.dtype() == mtype); + TORCH_CHECK(dmask1.is_cuda()); + TORCH_CHECK(dmask1.is_contiguous()); + TORCH_CHECK(dmask1.sizes() == sizes); + } + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + + TORCH_CHECK(mu.numel() == rows); + TORCH_CHECK(mu.sizes() == rsigma.sizes()); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)dz0.get_device()}; + + auto opts = x.options(); + + auto dx0 = torch::empty(sizes, opts.dtype(itype)); + at::Tensor dx1; + if (has_x1) { dx1 = torch::empty(sizes, opts.dtype(itype)); } + at::Tensor dresidual; + if (has_residual) { dresidual = torch::empty_like(x, opts.dtype(rtype)); } + auto dgamma0 = torch::empty_like(gamma0); + auto dbeta0 = torch::empty_like(gamma0); + at::Tensor dgamma1, dbeta1; + if (gamma1_.has_value()) { + dgamma1 = torch::empty_like(gamma0); + dbeta1 = torch::empty_like(gamma0); + } + + layer_norm::LaunchParams launch_params; + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + launch_params.props = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.dresidual = has_residual ? dresidual.data_ptr() : nullptr; + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 
512 : 1024);
+ auto launcher = get_parallel_bwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
+
+ launcher(launch_params, true);
+
+ auto dgamma0_part = torch::zeros({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ auto dbeta0_part = torch::zeros({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ at::Tensor dgamma1_part, dbeta1_part;
+ if (gamma1_.has_value()) {
+ dgamma1_part = torch::zeros_like(dgamma0_part);
+ dbeta1_part = torch::zeros_like(dbeta0_part);
+ }
+ at::Tensor workspace, barrier;
+
+ layer_norm::BwdParams &params = launch_params.params;
+ params.rows = rows;
+ params.cols = cols;
+ params.x = x.data_ptr();
+ params.dmask = dropout_p > 0.f ? dmask0_.value().data_ptr() : nullptr;
+ params.dmask1 = (dropout_p > 0.f && has_x1) ? dmask1_.value().data_ptr() : nullptr;
+ params.mu = mu.data_ptr();
+ params.rs = rsigma.data_ptr();
+ params.gamma = gamma0.data_ptr();
+ params.gamma1 = gamma1_.has_value() ? gamma1_.value().data_ptr() : nullptr;
+ params.dz = dz0.data_ptr();
+ params.dz1 = dz1_.has_value() ? dz1_.value().data_ptr() : nullptr;
+ params.dx = dx_.has_value() ? dx_.value().data_ptr() : nullptr;
+ params.dx0 = dx0.data_ptr();
+ params.dx1 = has_x1 ? dx1.data_ptr() : nullptr;
+ params.dbeta = dbeta0.data_ptr();
+ params.dgamma = dgamma0.data_ptr();
+ params.dbeta1 = gamma1_.has_value() ? dbeta1.data_ptr() : nullptr;
+ params.dgamma1 = gamma1_.has_value() ? dgamma1.data_ptr() : nullptr;
+ params.dbeta_part = dbeta0_part.data_ptr();
+ params.dgamma_part = dgamma0_part.data_ptr();
+ params.dbeta1_part = gamma1_.has_value() ? dbeta1_part.data_ptr() : nullptr;
+ params.dgamma1_part = gamma1_.has_value() ? dgamma1_part.data_ptr() : nullptr;
+ params.dropout_scale = 1.f / (1.f - dropout_p);
+ params.inverse_cols = 1.f / float(params.cols);
+ params.is_rms_norm = is_rms_norm;
+
+ if( launch_params.barrier_size > 0 ) {
+ // TODO Any way to avoid this?
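+ // Note (inferred from the launcher code later in this file, which sets barrier_size
+ // only when CTAS_PER_ROW > 1): the barrier buffer appears to serve as an inter-CTA
+ // synchronization counter when a single row is split across several CTAs, so it has to
+ // start from a known all-zero state on every call. One possible answer to the TODO
+ // above would be to cache a pre-zeroed barrier between calls; that is only a sketch of
+ // an alternative, not something implemented here.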
+ barrier = torch::zeros(launch_params.barrier_size, opts.dtype(torch::kInt32)); + workspace = torch::empty(launch_params.workspace_bytes, opts.dtype(torch::kChar)); + params.workspace = workspace.data_ptr(); + params.barrier = barrier.data_ptr(); + } + + launcher(launch_params, false); + + std::vector result = { dx0, dx1, dresidual, dgamma0, dbeta0, dgamma1, dbeta1, dgamma0_part, dbeta0_part, dgamma1_part, dbeta1_part }; + return result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "CUDA DropoutAddLayerNorm"; + m.def("dropout_add_ln_fwd", &dropout_add_ln_fwd, "Run Dropout + Add + LayerNorm forward kernel", + py::arg("x0"), py::arg("residual"), py::arg("gamma"), py::arg("beta_"), + py::arg("rowscale_"), py::arg("colscale_"), py::arg("x0_subset_"), py::arg("z_subset_"), + py::arg("dropout_p"), py::arg("epsilon"), py::arg("rowscale_const"), py::arg("z_numrows"), + py::arg("gen_"), py::arg("residual_in_fp32")=false, py::arg("is_rms_norm")=false); + m.def("dropout_add_ln_bwd", &dropout_add_ln_bwd, "Run Dropout + Add + LayerNorm backward kernel", + py::arg("dz"), py::arg("dx_"), py::arg("x"), py::arg("x0_"), py::arg("dmask_"), py::arg("mu"), + py::arg("rsigma"), py::arg("gamma"), py::arg("rowscale_"), py::arg("colscale_"), + py::arg("x0_subset_"), py::arg("z_subset_"), py::arg("dropout_p"), py::arg("rowscale_const"), + py::arg("x0_numrows"), py::arg("has_residual"), py::arg("is_rms_norm")=false); + m.def("dropout_add_ln_parallel_residual_fwd", &dropout_add_ln_parallel_residual_fwd, "Run Dropout + Add + LayerNorm parallel residual forward kernel", + py::arg("x0"), py::arg("x1_"), py::arg("residual"), py::arg("gamma0"), py::arg("beta0_"), + py::arg("gamma1_"), py::arg("beta1_"), py::arg("dropout_p"), py::arg("epsilon"), + py::arg("gen_"), py::arg("residual_in_fp32")=false, py::arg("is_rms_norm")=false); + m.def("dropout_add_ln_parallel_residual_bwd", &dropout_add_ln_parallel_residual_bwd, "Run Dropout + Add + LayerNorm parallel residual backward kernel", + py::arg("dz0"), py::arg("dz1_"), py::arg("dx_"), py::arg("x"), py::arg("dmask0_"), + py::arg("dmask1_"), py::arg("mu"), py::arg("rsigma"), py::arg("gamma0"), py::arg("gamma1_"), + py::arg("dropout_p"), py::arg("has_x1"), py::arg("has_residual"), py::arg("is_rms_norm")=false); +} diff --git a/ln_bwd_1024.cu b/ln_bwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7101f6450fcdb8baa4ff4e79379d913048696b6 --- /dev/null +++ b/ln_bwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_1280.cu b/ln_bwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..a80a5762a178bd1fd1cd2ef4d0fb2010c1eea22e --- /dev/null +++ b/ln_bwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_1536.cu b/ln_bwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..0c25c088494d52f3b68251235d29c23a46ffc430 --- /dev/null +++ b/ln_bwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_bwd_2048.cu b/ln_bwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..06c0e608a3e48ec7fad2081bc6ff82425ea1c56a --- /dev/null +++ b/ln_bwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_256.cu b/ln_bwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..20945432b8e97be21d80ada73aa0b3e709733a5b --- /dev/null +++ b/ln_bwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_2560.cu b/ln_bwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..309184c37b93e1f90bc1020a47973dae84f0f0c8 --- /dev/null +++ b/ln_bwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_bwd_3072.cu b/ln_bwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..e156b11cd92f450a6ce8e0c432487bd36d6f9847 --- /dev/null +++ b/ln_bwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_4096.cu b/ln_bwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..b715b0efe48c4111ae4301365018d19f537c7a81 --- /dev/null +++ b/ln_bwd_4096.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_512.cu b/ln_bwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b472118f0a0025917edc4c706492ca5dc8fa205 --- /dev/null +++ b/ln_bwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_5120.cu b/ln_bwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..38f3fbd406db8989f4a9806e64075bf52444c529 --- /dev/null +++ b/ln_bwd_5120.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_6144.cu b/ln_bwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..469ed4b6c7691c581bbd1db5b8587de860afcb16 --- /dev/null +++ b/ln_bwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_bwd_7168.cu b/ln_bwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..549eab11aa3c770bea97bda727495f3e141ec24b --- /dev/null +++ b/ln_bwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); \ No newline at end of file diff --git a/ln_bwd_768.cu b/ln_bwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..5db64d3d7b184f6ffb01ae0e1a26e0acec3bbe3d --- /dev/null +++ b/ln_bwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_8192.cu b/ln_bwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..e6514e613fe9cbf444ad4919a5acf9579b216c9e --- /dev/null +++ b/ln_bwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_bwd_kernels.cuh b/ln_bwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c7261d218442acbcf60b61ce2e8803556193d8cd --- /dev/null +++ b/ln_bwd_kernels.cuh @@ -0,0 +1,534 @@ +#pragma once + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_bwd_kernel(layer_norm::BwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { COLS = Ktraits::COLS }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Ktraits::THREADS_PER_WARP }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using compute_t = typename Ktraits::compute_t; + using index_t = typename Ktraits::index_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + using Reducer = typename Ktraits::Reducer; + using reduce_t = typename 
Reducer::Type; + + extern __shared__ char smem_[]; + + const bool has_residual = params.dresidual != nullptr; + const bool prenorm = params.dx != nullptr; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / Ktraits::WARPS_N; + const index_t warp_n = warp % Ktraits::WARPS_N; + const index_t tid_r = warp_n * THREADS_PER_WARP + lane; + + const index_t r = bidm * Ktraits::ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW); + + const input_t *rowscale = static_cast(params.rowscale); + const index_t *x0_subset = static_cast(params.x0_subset); + const index_t *z_subset = static_cast(params.z_subset); + + Cvec dzy_sum[LDGS]; + Cvec dz_sum[LDGS]; + Cvec dcolscale_sum[LDGS]; + + memset(dzy_sum, 0, sizeof(dzy_sum)); + memset(dz_sum, 0, sizeof(dz_sum)); + if (Has_colscale) { memset(dcolscale_sum, 0, sizeof(dcolscale_sum)); } + + compute_t * smem_wgrad = reinterpret_cast(smem_); + char *smem_dgrad = smem_ + Ktraits::SMEM_BYTES_WGRAD; + + Reducer reducer(params, bidm, bidn, warp_m, warp_n, lane, smem_dgrad); + + Sum sum; + + const index_t num_valid_ldgs = + ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + Ktraits::VEC_COLS_PER_LDG) / Ktraits::VEC_COLS_PER_LDG; + + Wvec gamma[LDGS]; + Wvec colscale[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma[it].load_from(params.gamma, idx); + if (Has_colscale) { colscale[it].load_from(params.colscale, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + // TODO if ROWS_PER_CTA does not divide rows, we might get divergence in the + // last blocks with syncthreads! + // grid stride over rows + #pragma unroll 1 + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + const compute_t mu_r = static_cast(params.mu)[row]; + const compute_t rs_r = static_cast(params.rs)[row]; + const compute_t rowscale_val = !Has_subset ? (params.rowscale == nullptr ? 1.0f : compute_t(rowscale[row])) : params.rowscale_const; + const int row_z = !Has_subset ? row + 1 : z_subset[row]; + const int row_x0 = !Has_subset ? row + 1 : x0_subset[row]; + const bool load_dz = !Has_subset || row_z > 0; + const bool save_dx0 = !Has_subset || row_x0 > 0; + Mvec dmask[LDGS]; + Rvec dx[LDGS]; + compute_t dy[LDGS * NUM_ELTS]; + compute_t y[LDGS * NUM_ELTS]; + compute_t mdy_local = 0.f; + compute_t mdyy_local = 0.f; + // If dz is not loaded, then dy should be 0 and we don't care about the value of y. + if (load_dz) { + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_z = !Has_subset ? idx_x : (load_dz ? (row_z - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Rvec x; + Ovec dz; + dz.load_from(params.dz, !Has_subset ? idx_x : idx_z); + if (prenorm) { dx[it].load_from(params.dx, idx_x); } + x.load_from(params.x, idx_x); + if (Is_dropout) { dmask[it].load_from(params.dmask, !Has_subset ? 
idx_x : idx_x0); } + idx_x += Ktraits::VEC_COLS_PER_LDG; + idx_z += Ktraits::VEC_COLS_PER_LDG; + idx_x0 += Ktraits::VEC_COLS_PER_LDG; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t x_tmp = x.data.elt[jt]; + compute_t y_tmp = rs_r * (x_tmp - (!params.is_rms_norm ? mu_r : 0.f)); + compute_t dy_tmp = compute_t(gamma[it].data.elt[jt]) * compute_t(dz.data.elt[jt]); + compute_t dz_tmp = dz.data.elt[jt]; + + mdy_local += dy_tmp; + mdyy_local += dy_tmp * y_tmp; + + dy[it * NUM_ELTS + jt] = dy_tmp; + y[it * NUM_ELTS + jt] = y_tmp; + + dzy_sum[it].data.elt[jt] += dz_tmp * y_tmp; + dz_sum[it].data.elt[jt] += dz_tmp; + } + } + } + } else { + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + if (prenorm) { dx[it].load_from(params.dx, idx_x); } + if (Is_dropout) { dmask[it].load_from(params.dmask, !Has_subset ? idx_x : idx_x0); } + idx_x += Ktraits::VEC_COLS_PER_LDG; + idx_x0 += Ktraits::VEC_COLS_PER_LDG; + } + } + } + + reduce_t result = reducer.allreduce({mdy_local, mdyy_local}, sum); + mdy_local = layer_norm::Get<0>::of(result) * params.inverse_cols; + mdyy_local = layer_norm::Get<1>::of(result) * params.inverse_cols; + + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec dx0; + Rvec dresidual; + Ivec x0; + if (Has_colscale && save_dx0) { x0.load_from(params.x0, !Has_subset ? idx_x : idx_x0); } + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t dx_tmp_res; + if (load_dz) { + compute_t dy_tmp = dy[it * NUM_ELTS + jt]; + compute_t y_tmp = y[it * NUM_ELTS + jt]; + compute_t dx_tmp = rs_r * (dy_tmp - (mdyy_local * y_tmp + (!params.is_rms_norm ? mdy_local : 0.f))); + dx_tmp_res = prenorm ? dx_tmp + compute_t(dx[it].data.elt[jt]) : dx_tmp; + } else { + dx_tmp_res = prenorm ? compute_t(dx[it].data.elt[jt]) : 0.f; + } + if (has_residual) { dresidual.data.elt[jt] = dx_tmp_res; } + if (save_dx0) { + compute_t dx0_tmp_res = dx_tmp_res * rowscale_val; + if (Is_dropout) { + dx0_tmp_res *= params.dropout_scale; + if (Has_colscale) { + dcolscale_sum[it].data.elt[jt] += dmask[it].data.elt[jt] ? dx0_tmp_res * compute_t(x0.data.elt[jt]) : 0.f; + dx0.data.elt[jt] = dmask[it].data.elt[jt] ? dx0_tmp_res * compute_t(colscale[it].data.elt[jt]) : 0.f; + } else { + dx0.data.elt[jt] = dmask[it].data.elt[jt] ? dx0_tmp_res : 0.f; + } + } else { + if (Has_colscale) { + dcolscale_sum[it].data.elt[jt] += dx0_tmp_res * compute_t(x0.data.elt[jt]); + dx0.data.elt[jt] = dx0_tmp_res * compute_t(colscale[it].data.elt[jt]); + } else { + dx0.data.elt[jt] = dx0_tmp_res; + } + } + } + } + if (has_residual) { dresidual.store_to(params.dresidual, idx_x); } + if (save_dx0) { dx0.store_to(params.dx0, !Has_subset ? 
idx_x : idx_x0); } + idx_x += Ktraits::VEC_COLS_PER_LDG; + idx_x0 += Ktraits::VEC_COLS_PER_LDG; + } + } + + } // end: grid stride loop + + if( WARPS_M == 1 ) { + idx = r * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + dz_sum[it].store_to(params.dbeta_part, idx); + dzy_sum[it].store_to(params.dgamma_part, idx); + if (Has_colscale) { dcolscale_sum[it].store_to(params.dcolscale_part, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + } else { + static_assert(WARPS_M == 1 || Ktraits::CTAS_PER_ROW == 1, "Multiple rows per CTA not supported for Multi-CTA."); + // Finalize reduction of part dgamma and dbeta for this CTA + // by reducing over the rows held across the WARPS_M warps + + // Assumption: blockSize divides hidden size. + enum { NUM_RES = COLS / Ktraits::THREADS_PER_CTA }; + static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, ""); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dz_sum[NUM_RES]; + memset(cta_dz_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + __syncthreads(); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dzy_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dzy_sum[NUM_RES]; + memset(cta_dzy_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dzy_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + + compute_t cta_dcolscale_sum[NUM_RES]; + if (Has_colscale) { + __syncthreads(); + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dcolscale_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + memset(cta_dcolscale_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dcolscale_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + } + + const index_t num_valid_writes + = (params.cols - 1 - tidx + Ktraits::THREADS_PER_CTA) / Ktraits::THREADS_PER_CTA; + compute_t *dgamma_part = static_cast(params.dgamma_part) + bidm * params.cols + tidx; + compute_t *dbeta_part = static_cast(params.dbeta_part) + bidm * params.cols + tidx; + compute_t *dcolscale_part = Has_colscale ? 
static_cast(params.dcolscale_part) + bidm * params.cols + tidx : nullptr; + for( int jt = 0; jt < NUM_RES; jt++ ) { + if (Is_even_cols || (jt < num_valid_writes)) { + *dgamma_part = cta_dzy_sum[jt]; + dgamma_part += Ktraits::THREADS_PER_CTA; + *dbeta_part = cta_dz_sum[jt]; + dbeta_part += Ktraits::THREADS_PER_CTA; + if (Has_colscale) { + *dcolscale_part = cta_dcolscale_sum[jt]; + dcolscale_part += Ktraits::THREADS_PER_CTA; + } + } + } + + } +} + +template +__global__ __launch_bounds__(Kernel_traits::THREADS_PER_CTA) +void ln_bwd_finalize_kernel(BwdParams params) +{ + + using compute_t = typename Kernel_traits::compute_t; + using weight_t = typename Kernel_traits::weight_t; + using index_t = typename Kernel_traits::index_t; + using Reducer = typename Kernel_traits::Reducer; + using reduce_t = typename Reducer::Type; + + Sum sum; + enum { NUM_ELT = Kernel_traits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Kernel_traits::THREADS_PER_WARP }; + + __shared__ char smem_[Kernel_traits::SMEM_BYTES_PER_CTA]; + + constexpr uint32_t bidm = 0; + + const uint32_t bidn = blockIdx.x; + const uint32_t tidx = threadIdx.x; + const uint32_t warp = tidx / THREADS_PER_WARP; + const uint32_t lane = tidx % THREADS_PER_WARP; + + Reducer reducer(params, bidm, bidn, 0, 0, lane, smem_); + + const uint32_t c = bidn * THREADS_PER_WARP + lane; + const uint32_t c_out = bidn * THREADS_PER_WARP / 2 + lane; + constexpr uint32_t COL_STRIDE = Kernel_traits::CTAS * THREADS_PER_WARP; + for( uint32_t col = c, col_out = c_out; col < Kernel_traits::COLS; col += COL_STRIDE, col_out += COL_STRIDE / 2 ) { + // Each thread sums over NUM_ELT columns. + Vec dbeta_local, dgamma_local, dcolscale_local; + memset(&dgamma_local, 0, sizeof(dgamma_local)); + memset(&dbeta_local, 0, sizeof(dbeta_local)); + if (Has_colscale) { memset(&dcolscale_local, 0, sizeof(dcolscale_local)); } + if (Is_even_cols || col < params.cols) { + for( uint32_t row = warp; row < params.ctas_per_col; row += Kernel_traits::ROWS_PER_CTA ) { + index_t idx = row * params.cols + col; + + Vec dbeta_part, dgamma_part, dcolscale_part; + dbeta_part.load_from(params.dbeta_part, idx); + dgamma_part.load_from(params.dgamma_part, idx); + if (Has_colscale) { dcolscale_part.load_from(params.dcolscale_part, idx); } + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma_local.data.elt[it] += dgamma_part.data.elt[it]; + dbeta_local.data.elt[it] += dbeta_part.data.elt[it]; + if (Has_colscale) { dcolscale_local.data.elt[it] += dcolscale_part.data.elt[it]; } + } + } + } + void * smem_gamma = smem_; + void * smem_beta = &smem_[Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_colscale = &smem_[2 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + + const int write_row = warp; + const int write_col = lane ^ write_row; + const int write_idx = write_row * THREADS_PER_WARP + write_col; + + dgamma_local.store_to(smem_gamma, write_idx); + dbeta_local.store_to(smem_beta, write_idx); + if (Has_colscale) { dcolscale_local.store_to(smem_colscale, write_idx); } + + __syncthreads(); + + // It would be probably safe to reuse the first row of smem_beta and smem_gamma + void * smem_gamma_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_beta_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE + Kernel_traits::SMEM_BYTES_OUTPUT]; + void * smem_colscale_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE + 2 * Kernel_traits::SMEM_BYTES_OUTPUT]; + + + // More than one iter iff ROWS_PER_CTA < 32. 
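+ // Illustrative example (configuration values assumed, not taken from this file): with
+ // ROWS_PER_CTA = 8, each warp w handles transposed columns w, w+8, w+16, w+24 of the
+ // 32-wide tile, i.e. four iterations of the loop below; with ROWS_PER_CTA = 32 the
+ // body runs exactly once per warp.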
+ for( int w = warp; w < THREADS_PER_WARP; w += Kernel_traits::ROWS_PER_CTA ) { + const int read_row = lane; + const int read_col = w ^ read_row; + const int read_idx = read_row * THREADS_PER_WARP + read_col; + + memset(&dbeta_local, 0, sizeof(dbeta_local)); + memset(&dgamma_local, 0, sizeof(dgamma_local)); + if (Has_colscale) { memset(&dcolscale_local, 0, sizeof(dcolscale_local)); } + + // Load beta and gamma transposed + if(read_row < Kernel_traits::ROWS_PER_CTA){ + dbeta_local.load_from(smem_beta, read_idx); + dgamma_local.load_from(smem_gamma, read_idx); + if (Has_colscale) { dcolscale_local.load_from(smem_colscale, read_idx); } + } + + // Call reducer on the loaded value(s) and convert. + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + compute_t b_i = dbeta_local.data.elt[it]; + compute_t g_i = dgamma_local.data.elt[it]; + b_i = reducer.allreduce(b_i, sum); + g_i = reducer.allreduce(g_i, sum); + + dgamma_local.data.elt[it] = g_i; + dbeta_local.data.elt[it] = b_i; + if (Has_colscale) { + compute_t cs_i = dcolscale_local.data.elt[it]; + cs_i = reducer.allreduce(cs_i, sum); + dcolscale_local.data.elt[it] = cs_i; + } + } + + // Leader stores the result at the current column. + if(lane == 0){ + dgamma_local.store_to(smem_gamma_out, w); + dbeta_local.store_to(smem_beta_out, w); + if (Has_colscale) { dcolscale_local.store_to(smem_colscale_out, w); } + } + + } + + // All writes done. + __syncthreads(); + + // Pack and store: 2-wide stores with half the threads. + if (Is_even_cols || col_out * 2 < params.cols) { + if( warp == Kernel_traits::ROWS_PER_CTA - 1 && lane < THREADS_PER_WARP / 2 ) { + + using src_t = typename TypeToVec2::Type; + using dst_t = typename TypeToVec2::Type; + Vec dbeta_vec2, dgamma_vec2, dcolscale_vec2; + Vec dbeta_out2, dgamma_out2, dcolscale_out2; + + dgamma_vec2.load_from(smem_gamma_out, lane); + dbeta_vec2.load_from(smem_beta_out, lane); + if (Has_colscale) { dcolscale_vec2.load_from(smem_colscale_out, lane); } + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma_out2.data.elt[it] = Converter::convert(dgamma_vec2.data.elt[it]); + dbeta_out2.data.elt[it] = Converter::convert(dbeta_vec2.data.elt[it]); + if (Has_colscale) { dcolscale_out2.data.elt[it] = Converter::convert(dcolscale_vec2.data.elt[it]); } + } + dgamma_out2.store_to(params.dgamma, col_out); + dbeta_out2.store_to(params.dbeta, col_out); + if (Has_colscale) { dcolscale_out2.store_to(params.dcolscale, col_out); } + } + } + } +} +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG_MAIN, + int BYTES_PER_LDG_FINAL +> +void launch_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool is_dropout = launch_params.params.dropout_keep_p < 1.f; + bool has_colscale = launch_params.params.colscale != nullptr; + bool has_subset = launch_params.params.x0_subset != nullptr; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + BOOL_SWITCH(is_dropout, IsDropoutConst, [&] { + BOOL_SWITCH(has_colscale, HasColscaleConst, [&] { + BOOL_SWITCH(has_subset, HasSubsetConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_bwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, 
Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::reduce_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, Kernel_traits::SMEM_BYTES, stream); + } + + using Kernel_traits_f = layer_norm::Kernel_traits_finalize; + + auto kernel_f = &layer_norm::ln_bwd_finalize_kernel; + kernel_f<<>>(launch_params.params); + }); + }); + }); + }); +} diff --git a/ln_fwd_1024.cu b/ln_fwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..824d86e9fd05920d3e557b42356feec86c904f68 --- /dev/null +++ b/ln_fwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_1280.cu b/ln_fwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ff58cbc2889a2c06c51df560d2b35ca4e079201 --- /dev/null +++ b/ln_fwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_1536.cu b/ln_fwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..a8e19d4dba97d91cd246e62ba80a2936ac05755c --- /dev/null +++ b/ln_fwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_2048.cu b/ln_fwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f9794c1e77f91a333d64cc6e461560622b87e12 --- /dev/null +++ b/ln_fwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_256.cu b/ln_fwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..f3a541c6dbf20cd94bb56607bbb23e6a81059bdc --- /dev/null +++ b/ln_fwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
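//
// Note on the registration comments: the type columns map onto the launcher template
// parameters shown in launch_() above:
//   WTYPE -> weight_t (gamma/beta), ITYPE -> input_t (x0), RTYPE -> residual_t,
//   OTYPE -> output_t (z), CTYPE -> compute_t (accumulation type, fp32 in every
//   registration in this diff).
// "RYTPE" in several of the forward headers is a typo for RTYPE (residual type); the
// argument order is unaffected. The remaining columns are tiling knobs:
//   CTAS_PER_ROW (CTAs cooperating on one row), WARPS_M (= rows per CTA),
//   WARPS_N (warps across one row), BYTES_PER_LDG (vector width of each global load).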
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_2560.cu b/ln_fwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..1650671e059ec358f8109c1d592694458e77d489 --- /dev/null +++ b/ln_fwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_3072.cu b/ln_fwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..25bb8691dc9f6a95297301efbd91567a5c22d1c2 --- /dev/null +++ b/ln_fwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_4096.cu b/ln_fwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2bffb5831bf1b6eb18cd1e2cd2c4636a06f5736 --- /dev/null +++ b/ln_fwd_4096.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
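//
// Illustrative sketch (not part of this diff): the finalize kernel in ln_bwd_kernels.cuh
// above combines per-warp partial dgamma/dbeta/dcolscale values with reducer.allreduce().
// A common way to build such a warp-wide allreduce is a shuffle butterfly, as sketched
// below; warp_allreduce_sum is hypothetical and is not the Reducer type used here.

// Butterfly allreduce over the 32 lanes of a warp: every lane ends up holding the full sum.
__device__ __forceinline__ float warp_allreduce_sum(float x) {
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        x += __shfl_xor_sync(0xffffffffu, x, offset);
    }
    return x;
}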
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_512.cu b/ln_fwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..a08fe34c55d61eecdbc74caa41dfbec10b3a8126 --- /dev/null +++ b/ln_fwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_5120.cu b/ln_fwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..bebbd69f05b38a5e3c0dae5d248de467118ef8c5 --- /dev/null +++ b/ln_fwd_5120.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_6144.cu b/ln_fwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..4df01ead2f292e255221e6fb0b48e63941a22cab --- /dev/null +++ b/ln_fwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_fwd_7168.cu b/ln_fwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..8343666d10c2788cb2c19ba4f448eef2ccf2b956 --- /dev/null +++ b/ln_fwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_768.cu b/ln_fwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..06d5a3b09cdd4941764885f5107bbbfa6b264eef --- /dev/null +++ b/ln_fwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_8192.cu b/ln_fwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..bf7cb40252baf820c88dff1337c81dffd934087a --- /dev/null +++ b/ln_fwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_fwd_kernels.cuh b/ln_fwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f6bccb8c28a2b3d967dddc3d8b21e1888ed2e29c --- /dev/null +++ b/ln_fwd_kernels.cuh @@ -0,0 +1,272 @@ +#pragma once + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + +#include // For at::cuda::philox::unpack +#include + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_fwd_kernel(FwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { VEC_COLS_PER_LDG = Ktraits::VEC_COLS_PER_LDG }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::NUM_ELTS }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using residual_t = typename Ktraits::residual_t; + using output_t = typename Ktraits::output_t; + using index_t = typename Ktraits::index_t; + using compute_t = typename Ktraits::compute_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + + using Stats = typename Ktraits::Stats; + using stats_t = typename Stats::stats_t; + + const bool has_residual = params.residual != nullptr; + const bool save_x = has_residual || Is_dropout || Has_colscale || (params.rowscale != nullptr) || Has_subset || !(std::is_same::value); + + extern __shared__ char smem_[]; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / WARPS_N; + const index_t warp_n = warp % WARPS_N; + + const index_t r = bidm * ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + Stats stats(params, bidm, bidn, warp_m, warp_n, lane, smem_); + + compute_t *mu_ptr = static_cast(params.mu); + compute_t *rs_ptr = static_cast(params.rs); + + const input_t *rowscale = static_cast(params.rowscale); + const index_t *x0_subset = static_cast(params.x0_subset); + const index_t *z_subset = static_cast(params.z_subset); + + // 
https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Dropout.cu + curandStatePhilox4_32_10_t state; + if (Is_dropout) { + auto seeds = at::cuda::philox::unpack(params.philox_args); + const index_t tidx_global = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(std::get<0>(seeds), tidx_global, std::get<1>(seeds), &state); + } + + const index_t num_valid_ldgs = ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + VEC_COLS_PER_LDG) / VEC_COLS_PER_LDG; + + Wvec gamma[LDGS]; + Wvec beta[LDGS]; + Wvec colscale[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma[it].load_from(params.gamma, idx); + if (params.beta != nullptr) { + beta[it].load_from(params.beta, idx); + } else { + beta[it].zero_(); + } + if (Has_colscale) { colscale[it].load_from(params.colscale, idx); } + idx += VEC_COLS_PER_LDG; + } + } + + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + const compute_t rowscale_val = !Has_subset ? (params.rowscale == nullptr ? 1.0f : compute_t(rowscale[row])) : params.rowscale_const; + const int row_x0 = !Has_subset ? row + 1 : x0_subset[row]; + const int row_z = !Has_subset ? row + 1 : z_subset[row]; + const bool load_x0 = !Has_subset || row_x0 > 0; + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_x0 = !Has_subset ? idx_x : (load_x0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + compute_t xf[LDGS * NUM_ELTS]; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec x0; + Rvec residual; + Rvec x; + Mvec dmask; + if (load_x0) { x0.load_from(params.x0, !Has_subset ? idx_x : idx_x0); } + if (has_residual) { residual.load_from(params.residual, idx_x); } + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + // TD [2022-04-22]: We're memory bound, not compute bound, so we don't need to use + // the more efficient curand_uniform4. + compute_t x_ij; + if (load_x0) { + mask_t keep = !Is_dropout ? true : curand_uniform(&state) <= params.dropout_keep_p; + if (Is_dropout) { dmask.data.elt[jt] = keep; } + compute_t x0_ij = compute_t(x0.data.elt[jt]) * rowscale_val; + x0_ij = keep ? (Is_dropout ? x0_ij * params.dropout_scale : x0_ij) : 0.0f; + if (Has_colscale) { x0_ij *= compute_t(colscale[it].data.elt[jt]); } + x_ij = has_residual ? x0_ij + compute_t(residual.data.elt[jt]) : x0_ij; + } else { + x_ij = has_residual ? compute_t(residual.data.elt[jt]) : 0.f; + } + if (save_x) { x.data.elt[jt] = x_ij; } + xf[it * NUM_ELTS + jt] = x_ij; + } + if (save_x) { x.store_to(params.x, idx_x); } + if (Is_dropout && load_x0) { dmask.store_to(params.dmask, !Has_subset ? idx_x : idx_x0); } + idx_x += VEC_COLS_PER_LDG; + idx_x0 += VEC_COLS_PER_LDG; + } + } + + static_assert(CTAS_PER_ROW == 1, "Don't support multiple CTAs per row for now"); + const index_t num_vecs = params.cols / Ktraits::ELTS_PER_LDG; + const index_t num_full_ldgs = num_vecs / Ktraits::VEC_COLS_PER_LDG; + const index_t remaining_vecs = num_vecs % Ktraits::VEC_COLS_PER_LDG; + auto valid_elts_in_warp_fn = [num_full_ldgs, remaining_vecs] (int warp_n) -> int { + // Need to convert to int, otherwise the subtraction will wrap around. 
+ const index_t valid_partial_vecs_in_warp = + std::min(std::max(int(remaining_vecs) - int(warp_n * THREADS_PER_WARP), int(0)), + int(THREADS_PER_WARP)); + return (num_full_ldgs * THREADS_PER_WARP + valid_partial_vecs_in_warp) * NUM_ELTS; + }; + stats_t s = stats.template compute( + xf, params.inverse_cols, valid_elts_in_warp_fn, num_valid_ldgs * NUM_ELTS + ); + + compute_t mu = layer_norm::Get<0>::of(s); + compute_t m2 = layer_norm::Get<1>::of(s); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + mu_ptr[row] = mu; + } + + compute_t rs = rsqrtf(m2 * params.inverse_cols + params.epsilon + (!params.is_rms_norm ? 0.f : mu * mu)); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + rs_ptr[row] = rs; + } + + const bool save_z = !Has_subset || row_z > 0; + if (save_z) { + index_t idx_z = (!Has_subset ? row : (row_z - 1)) * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ovec z; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t y_ij = compute_t(rs * (xf[it * NUM_ELTS + jt] - (!params.is_rms_norm ? mu : 0.f))); + compute_t g_ij = gamma[it].data.elt[jt]; + compute_t b_ij = beta[it].data.elt[jt]; + z.data.elt[jt] = output_t(g_ij * y_ij + b_ij); + } + z.store_to(params.z, idx_z); + idx_z += VEC_COLS_PER_LDG; + } + } + } + + } +} + +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG +> +void launch_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool has_colscale = launch_params.params.colscale != nullptr; + bool has_subset = launch_params.params.x0_subset != nullptr; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + BOOL_SWITCH(launch_params.params.dropout_keep_p < 1.f, IsDropoutConst, [&] { + BOOL_SWITCH(has_colscale, HasColscaleConst, [&] { + BOOL_SWITCH(has_subset, HasSubsetConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_fwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + const size_t rows_per_loop = launch_params.params.ctas_per_col * Kernel_traits::ROWS_PER_CTA; + launch_params.elts_per_thread = (launch_params.params.rows + rows_per_loop - 1) / rows_per_loop * Kernel_traits::LDGS * Kernel_traits::NUM_ELTS; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::Stats::stats_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 
grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, Kernel_traits::SMEM_BYTES_FWD, stream); + } + }); + }); + }); + }); +} diff --git a/ln_kernel_traits.h b/ln_kernel_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..77de6bf9af60c9ae70427097db26cf4ed130b359 --- /dev/null +++ b/ln_kernel_traits.h @@ -0,0 +1,172 @@ +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace layer_norm { +template< + uint32_t HIDDEN_SIZE_, + typename weight_t_, + typename input_t_, + typename residual_t_, + typename output_t_, + typename compute_t_, + typename index_t_, + uint32_t THREADS_PER_CTA_ +> +struct Kernel_traits_base { + + using weight_t = weight_t_; + using input_t = input_t_; + using residual_t = residual_t_; + using output_t = output_t_; + using compute_t = compute_t_; + using index_t = index_t_; + + enum { HIDDEN_SIZE = HIDDEN_SIZE_ }; + enum { THREADS_PER_CTA = THREADS_PER_CTA_ }; + enum { THREADS_PER_WARP = 32 }; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template< + uint32_t HIDDEN_SIZE_, + typename weight_t_, + typename input_t_, + typename residual_t_, + typename output_t_, + typename compute_t_, + typename index_t_, + bool Has_colscale, + uint32_t THREADS_PER_CTA_, + uint32_t BYTES_PER_LDG_, + typename Base = Kernel_traits_base +> +struct Kernel_traits_finalize : public Base { + enum { ROWS_PER_CTA = Base::THREADS_PER_CTA / Base::THREADS_PER_WARP }; + static_assert((int) ROWS_PER_CTA <= (int) Base::THREADS_PER_WARP); + // Bytes per global load from the input. + enum { BYTES_PER_LDG = BYTES_PER_LDG_ }; + // Number of elements fetched by a global load. + enum { ELTS_PER_LDG = BYTES_PER_LDG / sizeof(compute_t_) }; + // Bytes per global store of the weights. + enum { BYTES_PER_STG = ELTS_PER_LDG * sizeof(weight_t_) }; + static_assert(sizeof(BYTES_PER_LDG) == 4, "Conflict-free smem transpose only implemented for 4B compute type!"); + static_assert(Base::THREADS_PER_CTA == ROWS_PER_CTA * Base::THREADS_PER_WARP, "We assume one warp per row!"); + // The total number of BYTES_PER_LDG-wide words in a hidden vector. + enum { COLS = HIDDEN_SIZE_ * sizeof(compute_t_) / BYTES_PER_LDG }; + static_assert(COLS * BYTES_PER_LDG == HIDDEN_SIZE_ * sizeof(compute_t_)); + + // Shared memory size to transpose the CTA result. + enum { SMEM_BYTES_TRANSPOSE = Base::THREADS_PER_CTA * BYTES_PER_LDG }; + // Shared memory size to coalsece the CTA result. + enum { SMEM_BYTES_OUTPUT = Base::THREADS_PER_WARP * BYTES_PER_LDG }; + // Shared memory requirement per CTA. + static constexpr int NUM_FACTORS = Has_colscale ? 3 : 2; + enum { SMEM_BYTES_PER_CTA = NUM_FACTORS * SMEM_BYTES_TRANSPOSE + NUM_FACTORS * SMEM_BYTES_OUTPUT }; + + // The type of the reducer. + using Reducer = layer_norm::Reducer; + + // Condition for the whole CTA to participate in syncthreads. 
+ static_assert(COLS % Base::THREADS_PER_WARP == 0); + enum { CTAS = COLS / Base::THREADS_PER_WARP }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +template< + typename weight_t_, + typename input_t_, + typename residual_t_, + typename output_t_, + typename compute_t_, + typename index_t_, + uint32_t HIDDEN_SIZE_, + uint32_t CTAS_PER_ROW_, + uint32_t WARPS_M_, + uint32_t WARPS_N_, + uint32_t BYTES_PER_LDG_ = 16, + typename Base = Kernel_traits_base< + HIDDEN_SIZE_, + weight_t_, + input_t_, + residual_t_, + output_t_, + compute_t_, + index_t_, + WARPS_M_*WARPS_N_*THREADS_PER_WARP + > +> +struct Kernel_traits : public Base { + + using input_t = typename Base::input_t; + using residual_t = typename Base::residual_t; + using weight_t = typename Base::weight_t; + using compute_t = typename Base::compute_t; + using output_t = typename Base::output_t; + using index_t = typename Base::index_t; + // using mask_t = unsigned char; + using mask_t = bool; + + enum { CTAS_PER_ROW = CTAS_PER_ROW_ }; + enum { WARPS_M = WARPS_M_ }; + enum { WARPS_N = WARPS_N_ }; + enum { COLS = HIDDEN_SIZE_ }; + enum { HIDDEN_SIZE = HIDDEN_SIZE_ }; + enum { BYTES_PER_LDG = BYTES_PER_LDG_ }; + enum { NUM_ELTS = BYTES_PER_LDG / sizeof(input_t) }; + + enum { THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP }; + enum { THREADS_PER_CTA = WARPS_M * THREADS_PER_ROW }; + enum { ROWS_PER_CTA = WARPS_M }; + + enum { BYTES_PER_ROW = COLS * sizeof(input_t) }; + enum { BYTES_PER_ROW_PER_CTA = THREADS_PER_ROW * BYTES_PER_LDG }; + // Multi-row per CTA not supported for multi-CTA => no smem for WGRAD needed + enum { SMEM_BYTES_WGRAD = CTAS_PER_ROW > 1 ? 0 : ROWS_PER_CTA * COLS * sizeof(compute_t) }; + static_assert(WARPS_M == 1 || CTAS_PER_ROW == 1); + + using reduce_t = typename layer_norm::TypeToVec2::Type; + using Reducer = layer_norm::Reducer; + + enum { SMEM_BYTES_DGRAD = Reducer::SMEM_BYTES }; + enum { SMEM_BYTES = SMEM_BYTES_DGRAD + SMEM_BYTES_WGRAD }; + + using Ivec = layer_norm::Vec; + using Rvec = layer_norm::Vec; + using Ovec = layer_norm::Vec; + using Wvec = layer_norm::Vec; + using Cvec = layer_norm::Vec; + using Mvec = layer_norm::Vec; + enum { ELTS_PER_LDG = BYTES_PER_LDG / sizeof(input_t) }; + + // Assume that each thread can handle the same number of elements in the output and weights as in the input. + static_assert(sizeof(input_t) == sizeof(output_t)); + static_assert(sizeof(input_t) <= sizeof(residual_t)); + // The number of columns fetched per load from input: one per thread. + enum { VEC_COLS_PER_LDG = CTAS_PER_ROW * THREADS_PER_ROW }; + // The total number of vectorized loads/stores per hidden vector. + enum { VEC_COLS = COLS / ELTS_PER_LDG }; + // The number of loads per thread for the input. + enum { LDGS = VEC_COLS / VEC_COLS_PER_LDG }; + static_assert(LDGS * VEC_COLS_PER_LDG == VEC_COLS); + //static_assert(LDGS * BYTES_PER_ROW_PER_CTA * CTAS_PER_ROW == BYTES_PER_ROW, ""); + + using Stats = layer_norm::Stats; + enum { SMEM_BYTES_FWD = Stats::SMEM_BYTES }; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace layer_norm diff --git a/ln_parallel_bwd_1024.cu b/ln_parallel_bwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f4e77466c6c6d5a00275d54f4e68da062a5fc1a --- /dev/null +++ b/ln_parallel_bwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
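//
// Worked example (local to this note, not part of the diff): for the fp16-input
// registrations at HIDDEN_SIZE = 1024 above (CTAS_PER_ROW = 1, WARPS_M = 4, WARPS_N = 1,
// BYTES_PER_LDG = 16), the Kernel_traits arithmetic in ln_kernel_traits.h resolves as
// follows, assuming sizeof(fp16) == 2.

constexpr int kThreadsPerWarp = 32;
constexpr int kHidden         = 1024;                         // COLS
constexpr int kBytesPerLdg    = 16;
constexpr int kWarpsM = 4, kWarpsN = 1, kCtasPerRow = 1;
constexpr int kNumElts        = kBytesPerLdg / 2;             // 8 fp16 elements per 16B load
constexpr int kThreadsPerRow  = kWarpsN * kThreadsPerWarp;    // 32
constexpr int kThreadsPerCta  = kWarpsM * kThreadsPerRow;     // 128, i.e. ROWS_PER_CTA = 4
constexpr int kVecCols        = kHidden / kNumElts;           // 128 vectorized columns per row
constexpr int kVecColsPerLdg  = kCtasPerRow * kThreadsPerRow; // 32 columns covered per pass
constexpr int kLdgs           = kVecCols / kVecColsPerLdg;    // 4 vector loads per thread per row
static_assert(kThreadsPerCta == 128 && kLdgs * kVecColsPerLdg * kNumElts == kHidden,
              "4 loads x 32 threads x 8 halves covers each 1024-wide row exactly once");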
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_1280.cu b/ln_parallel_bwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..2dba3bebf26e99b853e7ef4b9b56421cf483e0bd --- /dev/null +++ b/ln_parallel_bwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_1536.cu b/ln_parallel_bwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..c2ac4b1b0998ca412dea02466f0d8fbe69f48216 --- /dev/null +++ b/ln_parallel_bwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
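//
// Illustrative sketch (not part of this diff): ln_fwd_kernel above seeds one Philox state
// per thread from (seed, global thread id, offset) unpacked out of params.philox_args,
// then draws curand_uniform() per element to build the dropout mask and rescales kept
// values. The standalone kernel below shows just that pattern; dropout_sketch and its
// arguments are hypothetical.

#include <curand_kernel.h>

__global__ void dropout_sketch(const float *x, float *y, unsigned char *mask, int n,
                               float keep_p, unsigned long long seed,
                               unsigned long long offset) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) { return; }
    // One Philox state per thread: (seed, subsequence = global thread id, offset).
    curandStatePhilox4_32_10_t state;
    curand_init(seed, i, offset, &state);
    // Keep with probability keep_p and rescale by 1/keep_p so the expectation is unchanged.
    const bool keep = curand_uniform(&state) <= keep_p;
    mask[i] = keep;
    y[i] = keep ? x[i] / keep_p : 0.f;
}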
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_parallel_bwd_2048.cu b/ln_parallel_bwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7f959e2fa785a4df3b6a32506f527e1723d83cc --- /dev/null +++ b/ln_parallel_bwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_256.cu b/ln_parallel_bwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..fa613cf45e1045d046cefc4afd55ded754bc20a4 --- /dev/null +++ b/ln_parallel_bwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
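//
// Reference sketch (not part of this diff): per row, ln_fwd_kernel above computes
// mu = mean(x), m2 = sum((x - mu)^2), then rs = rsqrt(m2/cols + eps) for LayerNorm and
// rs = rsqrt(m2/cols + mu*mu + eps) = rsqrt(E[x^2] + eps) when is_rms_norm is set, and
// finally z = gamma * (rs * (x - mu)) + beta, with RMSNorm skipping the mu subtraction.
// A plain host-side reference of the same formula; layernorm_row_ref is hypothetical.

#include <cmath>
#include <vector>

void layernorm_row_ref(const std::vector<float> &x, const std::vector<float> &gamma,
                       const std::vector<float> &beta, std::vector<float> &z,
                       float eps, bool is_rms_norm) {
    const int cols = static_cast<int>(x.size());
    float mu = 0.f, m2 = 0.f;
    for (float v : x) { mu += v; }
    mu /= cols;
    for (float v : x) { m2 += (v - mu) * (v - mu); }
    // Matches the kernel's rs: for RMSNorm, m2/cols + mu*mu equals E[x^2] up to rounding.
    const float rs = 1.f / std::sqrt(m2 / cols + eps + (is_rms_norm ? mu * mu : 0.f));
    for (int c = 0; c < cols; ++c) {
        const float y = rs * (x[c] - (is_rms_norm ? 0.f : mu));
        z[c] = gamma[c] * y + beta[c];
    }
}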
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_2560.cu b/ln_parallel_bwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..5f5707612df09149885d7883728672dc3a2b751f --- /dev/null +++ b/ln_parallel_bwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_parallel_bwd_3072.cu b/ln_parallel_bwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..8fdcb8ffb4d0f0e0fcae6aee930808bd0349ede5 --- /dev/null +++ b/ln_parallel_bwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
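//
// Assumed sketch (not part of this diff): the launchers above nest four BOOL_SWITCH calls
// to turn the runtime flags (dropout, colscale, subset, even-cols) into compile-time
// template parameters of ln_fwd_kernel / ln_bwd_kernel. static_switch.h is included by
// the kernel headers but not shown in this hunk; a typical implementation of such a macro
// looks like the following, which is an assumption rather than the exact file contents.

#define BOOL_SWITCH_SKETCH(COND, CONST_NAME, ...)     \
    [&] {                                             \
        if (COND) {                                   \
            constexpr bool CONST_NAME = true;         \
            return __VA_ARGS__();                     \
        } else {                                      \
            constexpr bool CONST_NAME = false;        \
            return __VA_ARGS__();                     \
        }                                             \
    }()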
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_4096.cu b/ln_parallel_bwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..8decfb085ac8ace1e3694a491bb66a83209027b8 --- /dev/null +++ b/ln_parallel_bwd_4096.cu @@ -0,0 +1,17 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +// Use 8 warps otherwise there's a lot of register spilling + +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_512.cu b/ln_parallel_bwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..178453d3045bfefd95018320d357ea8662018782 --- /dev/null +++ b/ln_parallel_bwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
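//
// Illustrative sketch (not part of this diff): when CTAS_PER_ROW > 1, the launchers above
// switch from an ordinary <<<...>>> launch to cudaLaunchCooperativeKernel, because the
// CTAs sharing one row must grid-synchronize during their inter-CTA reduction. Minimal
// usage of that API and of cooperative_groups grid sync; coop_kernel and launch_coop are
// hypothetical, and the device must report cooperativeLaunch support.

#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

__global__ void coop_kernel(float *buf) {
    cg::grid_group grid = cg::this_grid();
    if (threadIdx.x == 0) { buf[blockIdx.x] = static_cast<float>(blockIdx.x); }  // per-CTA partial
    grid.sync();  // whole-grid barrier; only legal under a cooperative launch
    if (blockIdx.x == 0 && threadIdx.x == 0) {  // one CTA combines all partials
        float s = 0.f;
        for (unsigned i = 0; i < gridDim.x; ++i) { s += buf[i]; }
        buf[0] = s;
    }
}

void launch_coop(float *d_buf, int n_ctas, cudaStream_t stream) {
    dim3 grid(n_ctas), block(32);
    void *args[] = { (void *)&d_buf };  // same &params-style argument packing as above
    cudaLaunchCooperativeKernel((void *)coop_kernel, grid, block, args, /*smem=*/0, stream);
}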
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_5120.cu b/ln_parallel_bwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..815521973da7266534c7e8b167fa0b8baa47fa2c --- /dev/null +++ b/ln_parallel_bwd_5120.cu @@ -0,0 +1,17 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +// Use 8 warps otherwise there's a lot of register spilling + +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_6144.cu b/ln_parallel_bwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..eb8668d8a229d2ec24e5eac57db00f9d650615eb --- /dev/null +++ b/ln_parallel_bwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_7168.cu b/ln_parallel_bwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..0c12dc476678ce7b24c5fcd0b9408eb686bd6825 --- /dev/null +++ b/ln_parallel_bwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_768.cu b/ln_parallel_bwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..8beece8ab19cea2baefedd118f5d15c90a646526 --- /dev/null +++ b/ln_parallel_bwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_8192.cu b/ln_parallel_bwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ad47c94fdff599dde62574d1c535c4bbacae551 --- /dev/null +++ b/ln_parallel_bwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_parallel_fwd_1024.cu b/ln_parallel_fwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..3c64e169302eea0f94ff65641728c35689d7c4ba --- /dev/null +++ b/ln_parallel_fwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_1280.cu b/ln_parallel_fwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..9bbfce5bc6c5e0303d70552bb36cf380601dcd38 --- /dev/null +++ b/ln_parallel_fwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_1536.cu b/ln_parallel_fwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..b57f5edce8eb7b6779475f6eadb8aabba299c802 --- /dev/null +++ b/ln_parallel_fwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_2048.cu b/ln_parallel_fwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..6fa322d96b4e11aacf5722985672e141f929299b --- /dev/null +++ b/ln_parallel_fwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_256.cu b/ln_parallel_fwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..27445a6bc50c98935c7a5093ee5ffdddf52e2494 --- /dev/null +++ b/ln_parallel_fwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); \ No newline at end of file diff --git a/ln_parallel_fwd_2560.cu b/ln_parallel_fwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..fdde470c267302adca3d63f2c6b736b67af7ee86 --- /dev/null +++ b/ln_parallel_fwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_3072.cu b/ln_parallel_fwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..992f71037607066fb4e4d0f1624669f21c2f53b1 --- /dev/null +++ b/ln_parallel_fwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_4096.cu b/ln_parallel_fwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..381837e60874e44aa5e0efccb8749b2ff41ac3fa --- /dev/null +++ b/ln_parallel_fwd_4096.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_512.cu b/ln_parallel_fwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..4ba478b01fbdbc2ff5aab0a15fb698eba369f61a --- /dev/null +++ b/ln_parallel_fwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_5120.cu b/ln_parallel_fwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ada35228cb603ddd26b06e186989746a86926a8 --- /dev/null +++ b/ln_parallel_fwd_5120.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_6144.cu b/ln_parallel_fwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f531c881f7f53651c56e3afd1f0f53c580815ec --- /dev/null +++ b/ln_parallel_fwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
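// Note on the last tuning parameter: BYTES_PER_LDG = 16 means each global load/store moves
// 128 bits per thread, i.e. 4 fp32 or 8 fp16/bf16 elements, and the kernel traits derive the
// per-thread load count from it (the kernels below assert
// COLS == THREADS_PER_ROW * LDGS * ELTS_PER_LDG * CTAS_PER_ROW).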
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_parallel_fwd_7168.cu b/ln_parallel_fwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..c99e752cd484a99e97f8bf7a92e433a817c54d64 --- /dev/null +++ b/ln_parallel_fwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_768.cu b/ln_parallel_fwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..f33f519c7fb2934b3b5aabf36a2d9046c4b51ee3 --- /dev/null +++ b/ln_parallel_fwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
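Functionally, every launcher registered in these files instantiates the same fused kernel, ln_parallel_residual_fwd_kernel from the header included at the top of each file: dropout on one or two input branches, an optional residual add, and LayerNorm (or RMSNorm) with either one shared ("tied") or two separate sets of affine weights. Across the instantiations in these files, hidden sizes up to 2560 use WARPS_M=4, WARPS_N=1 (each warp owns a whole row), while 3072 and larger split a row across 4 or 8 warps; BYTES_PER_LDG stays at 16 throughout. The sketch below is a single-threaded fp32 reference for one row, with illustrative names and the dropout masks passed in explicitly; it elides the pre-norm x, mu and rs values that the kernel also saves for the backward pass.

#include <cmath>
#include <vector>

// Single-row fp32 reference for the parallel-residual forward path (sketch only).
// mask0/mask1 stand in for the dropout masks the kernel draws with curand and saves as dmask0/dmask1.
void parallel_residual_ln_row_ref(
        const std::vector<float> &x0, const std::vector<float> &x1,        // x1 may be empty
        const std::vector<float> &residual,                                // may be empty
        const std::vector<char> &mask0, const std::vector<char> &mask1,
        const std::vector<float> &gamma0, const std::vector<float> &beta0,
        const std::vector<float> &gamma1, const std::vector<float> &beta1, // gamma1 empty => tied norm
        std::vector<float> &z0, std::vector<float> &z1,
        float eps, float dropout_keep_p, bool is_rms_norm) {
    const int cols = (int)x0.size();
    const float scale = 1.f / dropout_keep_p;                 // dropout_scale in the kernel
    std::vector<float> x(cols);
    float mu = 0.f;
    for (int j = 0; j < cols; ++j) {
        float v = mask0[j] ? x0[j] * scale : 0.f;             // dropout on branch 0
        if (!x1.empty())       v += mask1[j] ? x1[j] * scale : 0.f;  // optional second branch
        if (!residual.empty()) v += residual[j];                     // optional skip input
        x[j] = v;                                             // the "x" saved for the backward pass
        mu += v;
    }
    mu /= cols;
    float m2 = 0.f;                                           // sum of squared deviations
    for (int j = 0; j < cols; ++j) { const float d = x[j] - mu; m2 += d * d; }
    // LayerNorm: rs = 1/sqrt(var + eps); RMSNorm: rs = 1/sqrt(E[x^2] + eps) = 1/sqrt(var + mu^2 + eps)
    const float rs = 1.f / std::sqrt(m2 / cols + eps + (is_rms_norm ? mu * mu : 0.f));
    for (int j = 0; j < cols; ++j) {
        const float y = rs * (x[j] - (is_rms_norm ? 0.f : mu));
        z0[j] = gamma0[j] * y + beta0[j];
        if (!gamma1.empty()) z1[j] = gamma1[j] * y + beta1[j];   // second ("untied") affine pair
    }
}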
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_8192.cu b/ln_parallel_fwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..360e6d4471062cd40bf245ecff22b579f56d4020 --- /dev/null +++ b/ln_parallel_fwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_parallel_residual_bwd_kernels.cuh b/ln_parallel_residual_bwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..521495724400fde6eaecb27e255154a51d8ddbb0 --- /dev/null +++ b/ln_parallel_residual_bwd_kernels.cuh @@ -0,0 +1,540 @@ +#pragma once + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" +#include "ln_bwd_kernels.cuh" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_parallel_residual_bwd_kernel(layer_norm::BwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { COLS = Ktraits::COLS }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Ktraits::THREADS_PER_WARP }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using compute_t = typename Ktraits::compute_t; + using index_t = typename Ktraits::index_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename 
Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + using Reducer = typename Ktraits::Reducer; + using reduce_t = typename Reducer::Type; + + extern __shared__ char smem_[]; + + const bool has_residual = params.dresidual != nullptr; + const bool has_x1 = params.dx1 != nullptr; + const bool prenorm = params.dx != nullptr; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / Ktraits::WARPS_N; + const index_t warp_n = warp % Ktraits::WARPS_N; + const index_t tid_r = warp_n * THREADS_PER_WARP + lane; + + const index_t r = bidm * Ktraits::ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW); + + Cvec dz0y_sum[LDGS]; + Cvec dz0_sum[LDGS]; + Cvec dz1y_sum[LDGS]; + Cvec dz1_sum[LDGS]; + + memset(dz0y_sum, 0, sizeof(dz0y_sum)); + memset(dz0_sum, 0, sizeof(dz0_sum)); + if (!Tied_norm) { + memset(dz1y_sum, 0, sizeof(dz1y_sum)); + memset(dz1_sum, 0, sizeof(dz1_sum)); + } + + compute_t * smem_wgrad = reinterpret_cast(smem_); + char *smem_dgrad = smem_ + Ktraits::SMEM_BYTES_WGRAD; + + Reducer reducer(params, bidm, bidn, warp_m, warp_n, lane, smem_dgrad); + + Sum sum; + + const index_t num_valid_ldgs = + ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + Ktraits::VEC_COLS_PER_LDG) / Ktraits::VEC_COLS_PER_LDG; + + Wvec gamma0[LDGS]; + Wvec gamma1[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma0[it].load_from(params.gamma, idx); + if (!Tied_norm) { gamma1[it].load_from(params.gamma1, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + // TODO if ROWS_PER_CTA does not divide rows, we might get divergence in the + // last blocks with syncthreads! + // grid stride over rows + #pragma unroll 1 + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + const compute_t mu_r = static_cast(params.mu)[row]; + const compute_t rs_r = static_cast(params.rs)[row]; + Mvec dmask0[LDGS], dmask1[LDGS]; + Rvec dx[LDGS]; + compute_t dy[LDGS * NUM_ELTS]; + compute_t y[LDGS * NUM_ELTS]; + compute_t mdy_local = 0.f; + compute_t mdyy_local = 0.f; + index_t idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Rvec x; + Ovec dz0, dz1; + dz0.load_from(params.dz, idx); + if (!Tied_norm) { dz1.load_from(params.dz1, idx); } + if (prenorm) { dx[it].load_from(params.dx, idx); } + x.load_from(params.x, idx); + if (Is_dropout) { + dmask0[it].load_from(params.dmask, idx); + if (has_x1) { dmask1[it].load_from(params.dmask1, idx); } + } + idx += Ktraits::VEC_COLS_PER_LDG; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t x_tmp = x.data.elt[jt]; + compute_t y_tmp = rs_r * (x_tmp - (!params.is_rms_norm ? 
mu_r : 0.f)); + compute_t dy_tmp = compute_t(gamma0[it].data.elt[jt]) * compute_t(dz0.data.elt[jt]); + if (!Tied_norm) { + dy_tmp += compute_t(gamma1[it].data.elt[jt]) * compute_t(dz1.data.elt[jt]); + } + compute_t dz0_tmp = dz0.data.elt[jt]; + compute_t dz1_tmp; + if (!Tied_norm) { dz1_tmp = dz1.data.elt[jt]; } + + mdy_local += dy_tmp; + mdyy_local += dy_tmp * y_tmp; + + dy[it * NUM_ELTS + jt] = dy_tmp; + y[it * NUM_ELTS + jt] = y_tmp; + + dz0y_sum[it].data.elt[jt] += dz0_tmp * y_tmp; + dz0_sum[it].data.elt[jt] += dz0_tmp; + if (!Tied_norm) { + dz1y_sum[it].data.elt[jt] += dz1_tmp * y_tmp; + dz1_sum[it].data.elt[jt] += dz1_tmp; + } + } + } + } + + reduce_t result = reducer.allreduce({mdy_local, mdyy_local}, sum); + mdy_local = layer_norm::Get<0>::of(result) * params.inverse_cols; + mdyy_local = layer_norm::Get<1>::of(result) * params.inverse_cols; + + idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec dx0, dx1; + Rvec dresidual; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t dx_tmp_res; + compute_t dy_tmp = dy[it * NUM_ELTS + jt]; + compute_t y_tmp = y[it * NUM_ELTS + jt]; + compute_t dx_tmp = rs_r * (dy_tmp - (mdyy_local * y_tmp + (!params.is_rms_norm ? mdy_local : 0.f))); + dx_tmp_res = prenorm ? dx_tmp + compute_t(dx[it].data.elt[jt]) : dx_tmp; + if (has_residual) { dresidual.data.elt[jt] = dx_tmp_res; } + if (Is_dropout) { + dx0.data.elt[jt] = dmask0[it].data.elt[jt] ? dx_tmp_res * params.dropout_scale : 0.f; + if (has_x1) { dx1.data.elt[jt] = dmask1[it].data.elt[jt] ? dx_tmp_res * params.dropout_scale : 0.f; } + } else { + dx0.data.elt[jt] = dx_tmp_res; + if (has_x1) { dx1.data.elt[jt] = dx_tmp_res; } + } + } + if (has_residual) { dresidual.store_to(params.dresidual, idx); } + dx0.store_to(params.dx0, idx); + if (has_x1) { dx1.store_to(params.dx1, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + + } // end: grid stride loop + + if( WARPS_M == 1 ) { + idx = r * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + dz0_sum[it].store_to(params.dbeta_part, idx); + dz0y_sum[it].store_to(params.dgamma_part, idx); + if (!Tied_norm) { + dz1_sum[it].store_to(params.dbeta1_part, idx); + dz1y_sum[it].store_to(params.dgamma1_part, idx); + } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + } else { + static_assert(WARPS_M == 1 || Ktraits::CTAS_PER_ROW == 1, "Multiple rows per CTA not supported for Multi-CTA."); + // Finalize reduction of part dgamma and dbeta for this CTA + // by reducing over the rows held across the WARPS_M warps + + // Assumption: blockSize divides hidden size. 
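 // In outline: each warp has been accumulating per-thread dbeta/dgamma partials in registers
 // (dz0_sum / dz0y_sum and, for the untied case, dz1_sum / dz1y_sum). They are staged into
 // shared memory, effectively a [ROWS_PER_CTA][COLS] array of compute_t, and every thread then
 // sums its NUM_RES columns over the ROWS_PER_CTA rows, i.e.
 //   cta_dz0_sum[jt] += smem_wgrad[it * COLS + tidx + jt * THREADS_PER_CTA]   for it in [0, ROWS_PER_CTA)
 // The per-CTA sums are written to dgamma_part / dbeta_part (one partial row per bidm) and are
 // reduced across CTAs by the separate finalize kernel defined below.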
+ enum { NUM_RES = COLS / Ktraits::THREADS_PER_CTA }; + static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, ""); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz0_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dz0_sum[NUM_RES]; + memset(cta_dz0_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz0_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + __syncthreads(); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz0y_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dz0y_sum[NUM_RES]; + memset(cta_dz0y_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz0y_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + + compute_t cta_dz1_sum[NUM_RES], cta_dz1y_sum[NUM_RES]; + if (!Tied_norm) { + __syncthreads(); + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz1_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + memset(cta_dz1_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz1_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + __syncthreads(); + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz1y_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + memset(cta_dz1y_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz1y_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + } + + const index_t num_valid_writes + = (params.cols - 1 - tidx + Ktraits::THREADS_PER_CTA) / Ktraits::THREADS_PER_CTA; + compute_t *dgamma0_part = static_cast(params.dgamma_part) + bidm * params.cols + tidx; + compute_t *dbeta0_part = static_cast(params.dbeta_part) + bidm * params.cols + tidx; + compute_t *dgamma1_part = !Tied_norm ? static_cast(params.dgamma1_part) + bidm * params.cols + tidx : nullptr; + compute_t *dbeta1_part = !Tied_norm ? 
static_cast(params.dbeta1_part) + bidm * params.cols + tidx : nullptr; + for( int jt = 0; jt < NUM_RES; jt++ ) { + if (Is_even_cols || (jt < num_valid_writes)) { + *dgamma0_part = cta_dz0y_sum[jt]; + dgamma0_part += Ktraits::THREADS_PER_CTA; + *dbeta0_part = cta_dz0_sum[jt]; + dbeta0_part += Ktraits::THREADS_PER_CTA; + if (!Tied_norm) { + *dgamma1_part = cta_dz1y_sum[jt]; + dgamma1_part += Ktraits::THREADS_PER_CTA; + *dbeta1_part = cta_dz1_sum[jt]; + dbeta1_part += Ktraits::THREADS_PER_CTA; + } + } + } + + } +} + +template +__global__ __launch_bounds__(Kernel_traits::THREADS_PER_CTA) +void ln_parallel_residual_bwd_finalize_kernel(BwdParams params) +{ + + using compute_t = typename Kernel_traits::compute_t; + using weight_t = typename Kernel_traits::weight_t; + using index_t = typename Kernel_traits::index_t; + using Reducer = typename Kernel_traits::Reducer; + using reduce_t = typename Reducer::Type; + + Sum sum; + enum { NUM_ELT = Kernel_traits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Kernel_traits::THREADS_PER_WARP }; + + // Multiplying by 2 since we have both gamma0 and gamma1 + __shared__ char smem_[2 * Kernel_traits::SMEM_BYTES_PER_CTA]; + + constexpr uint32_t bidm = 0; + + const uint32_t bidn = blockIdx.x; + const uint32_t tidx = threadIdx.x; + const uint32_t warp = tidx / THREADS_PER_WARP; + const uint32_t lane = tidx % THREADS_PER_WARP; + + Reducer reducer(params, bidm, bidn, 0, 0, lane, smem_); + + const uint32_t c = bidn * THREADS_PER_WARP + lane; + const uint32_t c_out = bidn * THREADS_PER_WARP / 2 + lane; + constexpr uint32_t COL_STRIDE = Kernel_traits::CTAS * THREADS_PER_WARP; + for( uint32_t col = c, col_out = c_out; col < Kernel_traits::COLS; col += COL_STRIDE, col_out += COL_STRIDE / 2 ) { + // Each thread sums over NUM_ELT columns. 
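 // dgamma_part / dbeta_part hold one partial row per main-kernel CTA (ctas_per_col rows of
 // params.cols columns). Each warp strides over those rows for its columns and accumulates in
 // compute_t; the per-warp sums are then transposed through shared memory, reduced across the
 // warps of this CTA, converted to weight_t, and written out as the final dgamma/dbeta (and
 // dgamma1/dbeta1) by the packed 2-wide stores at the bottom of the loop.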
+ Vec dbeta0_local, dgamma0_local, dbeta1_local, dgamma1_local; + memset(&dgamma0_local, 0, sizeof(dgamma0_local)); + memset(&dbeta0_local, 0, sizeof(dbeta0_local)); + memset(&dgamma1_local, 0, sizeof(dgamma1_local)); + memset(&dbeta1_local, 0, sizeof(dbeta1_local)); + if (Is_even_cols || col < params.cols) { + for( uint32_t row = warp; row < params.ctas_per_col; row += Kernel_traits::ROWS_PER_CTA ) { + index_t idx = row * params.cols + col; + + Vec dbeta0_part, dgamma0_part, dbeta1_part, dgamma1_part; + dbeta0_part.load_from(params.dbeta_part, idx); + dgamma0_part.load_from(params.dgamma_part, idx); + dbeta1_part.load_from(params.dbeta1_part, idx); + dgamma1_part.load_from(params.dgamma1_part, idx); + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma0_local.data.elt[it] += dgamma0_part.data.elt[it]; + dbeta0_local.data.elt[it] += dbeta0_part.data.elt[it]; + dgamma1_local.data.elt[it] += dgamma1_part.data.elt[it]; + dbeta1_local.data.elt[it] += dbeta1_part.data.elt[it]; + } + } + } + void * smem_gamma0 = smem_; + void * smem_beta0 = &smem_[Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_gamma1 = &smem_[2 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_beta1 = &smem_[3 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + + const int write_row = warp; + const int write_col = lane ^ write_row; + const int write_idx = write_row * THREADS_PER_WARP + write_col; + + dgamma0_local.store_to(smem_gamma0, write_idx); + dbeta0_local.store_to(smem_beta0, write_idx); + dgamma1_local.store_to(smem_gamma1, write_idx); + dbeta1_local.store_to(smem_beta1, write_idx); + + __syncthreads(); + + // It would be probably safe to reuse the first row of smem_beta0 and smem_gamma0 + void * smem_gamma0_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_beta0_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE + Kernel_traits::SMEM_BYTES_OUTPUT]; + void * smem_gamma1_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE + 2 * Kernel_traits::SMEM_BYTES_OUTPUT]; + void * smem_beta1_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE + 3 * Kernel_traits::SMEM_BYTES_OUTPUT]; + + // More than one iter iff ROWS_PER_CTA < 32. + for( int w = warp; w < THREADS_PER_WARP; w += Kernel_traits::ROWS_PER_CTA ) { + const int read_row = lane; + const int read_col = w ^ read_row; + const int read_idx = read_row * THREADS_PER_WARP + read_col; + + memset(&dbeta0_local, 0, sizeof(dbeta0_local)); + memset(&dgamma0_local, 0, sizeof(dgamma0_local)); + memset(&dbeta1_local, 0, sizeof(dbeta1_local)); + memset(&dgamma1_local, 0, sizeof(dgamma1_local)); + + // Load beta and gamma transposed + if(read_row < Kernel_traits::ROWS_PER_CTA){ + dbeta0_local.load_from(smem_beta0, read_idx); + dgamma0_local.load_from(smem_gamma0, read_idx); + dbeta1_local.load_from(smem_beta1, read_idx); + dgamma1_local.load_from(smem_gamma1, read_idx); + } + + // Call reducer on the loaded value(s) and convert. + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + compute_t b0_i = dbeta0_local.data.elt[it]; + compute_t g0_i = dgamma0_local.data.elt[it]; + compute_t b1_i = dbeta1_local.data.elt[it]; + compute_t g1_i = dgamma1_local.data.elt[it]; + b0_i = reducer.allreduce(b0_i, sum); + g0_i = reducer.allreduce(g0_i, sum); + b1_i = reducer.allreduce(b1_i, sum); + g1_i = reducer.allreduce(g1_i, sum); + + dgamma0_local.data.elt[it] = g0_i; + dbeta0_local.data.elt[it] = b0_i; + dgamma1_local.data.elt[it] = g1_i; + dbeta1_local.data.elt[it] = b1_i; + } + + // Leader stores the result at the current column. 
+ if(lane == 0){ + dgamma0_local.store_to(smem_gamma0_out, w); + dbeta0_local.store_to(smem_beta0_out, w); + dgamma1_local.store_to(smem_gamma1_out, w); + dbeta1_local.store_to(smem_beta1_out, w); + } + + } + + // All writes done. + __syncthreads(); + + // Pack and store: 2-wide stores with half the threads. + if (Is_even_cols || col_out * 2 < params.cols) { + if( warp == Kernel_traits::ROWS_PER_CTA - 1 && lane < THREADS_PER_WARP / 2 ) { + + using src_t = typename TypeToVec2::Type; + using dst_t = typename TypeToVec2::Type; + Vec dbeta0_vec2, dgamma0_vec2, dbeta1_vec2, dgamma1_vec2; + Vec dbeta0_out2, dgamma0_out2, dbeta1_out2, dgamma1_out2; + + dgamma0_vec2.load_from(smem_gamma0_out, lane); + dbeta0_vec2.load_from(smem_beta0_out, lane); + dgamma1_vec2.load_from(smem_gamma1_out, lane); + dbeta1_vec2.load_from(smem_beta1_out, lane); + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma0_out2.data.elt[it] = Converter::convert(dgamma0_vec2.data.elt[it]); + dbeta0_out2.data.elt[it] = Converter::convert(dbeta0_vec2.data.elt[it]); + dgamma1_out2.data.elt[it] = Converter::convert(dgamma1_vec2.data.elt[it]); + dbeta1_out2.data.elt[it] = Converter::convert(dbeta1_vec2.data.elt[it]); + } + dgamma0_out2.store_to(params.dgamma, col_out); + dbeta0_out2.store_to(params.dbeta, col_out); + dgamma1_out2.store_to(params.dgamma1, col_out); + dbeta1_out2.store_to(params.dbeta1, col_out); + } + } + } +} + +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG_MAIN, + int BYTES_PER_LDG_FINAL +> +void launch_parallel_residual_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool is_dropout = launch_params.params.dropout_keep_p < 1.f; + bool tied_norm = launch_params.params.gamma1 == nullptr; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + BOOL_SWITCH(is_dropout, IsDropoutConst, [&] { + BOOL_SWITCH(tied_norm, TiedNormConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_parallel_residual_bwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::reduce_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, 
Kernel_traits::SMEM_BYTES, stream); + } + + using Kernel_traits_f = layer_norm::Kernel_traits_finalize; + + auto kernel_f = !TiedNormConst + ? &layer_norm::ln_parallel_residual_bwd_finalize_kernel + : &layer_norm::ln_bwd_finalize_kernel; + kernel_f<<>>(launch_params.params); + + }); + }); + }); +} diff --git a/ln_parallel_residual_fwd_kernels.cuh b/ln_parallel_residual_fwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..0e55cb4038b4dbe30d9eb47609df3afea4c4f5fb --- /dev/null +++ b/ln_parallel_residual_fwd_kernels.cuh @@ -0,0 +1,281 @@ +#pragma once + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + +#include // For at::cuda::philox::unpack +#include + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_parallel_residual_fwd_kernel(FwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { VEC_COLS_PER_LDG = Ktraits::VEC_COLS_PER_LDG }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::NUM_ELTS }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using residual_t = typename Ktraits::residual_t; + using output_t = typename Ktraits::output_t; + using index_t = typename Ktraits::index_t; + using compute_t = typename Ktraits::compute_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + + using Stats = typename Ktraits::Stats; + using stats_t = typename Stats::stats_t; + + const bool has_residual = params.residual != nullptr; + const bool has_x1 = params.x1 != nullptr; + const bool save_x = has_residual || has_x1 || Is_dropout || !(std::is_same::value); + + extern __shared__ char smem_[]; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / WARPS_N; + const index_t warp_n = warp % WARPS_N; + + const index_t r = bidm * ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + Stats stats(params, bidm, bidn, warp_m, warp_n, lane, smem_); + + compute_t *mu_ptr = static_cast(params.mu); + compute_t *rs_ptr = static_cast(params.rs); + + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Dropout.cu + curandStatePhilox4_32_10_t state; + if (Is_dropout) { + auto seeds = at::cuda::philox::unpack(params.philox_args); + const index_t tidx_global = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(std::get<0>(seeds), tidx_global, std::get<1>(seeds), &state); + } + + const index_t num_valid_ldgs = ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + VEC_COLS_PER_LDG) / VEC_COLS_PER_LDG; + + Wvec gamma0[LDGS]; + Wvec beta0[LDGS]; + Wvec gamma1[LDGS]; + Wvec beta1[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma0[it].load_from(params.gamma, idx); + if (params.beta 
!= nullptr) { + beta0[it].load_from(params.beta, idx); + } else { + beta0[it].zero_(); + } + if (!Tied_norm) { + gamma1[it].load_from(params.gamma1, idx); + if (params.beta1 != nullptr) { + beta1[it].load_from(params.beta1, idx); + } else { + beta1[it].zero_(); + } + } + idx += VEC_COLS_PER_LDG; + } + } + + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + index_t idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + compute_t xf[LDGS * NUM_ELTS]; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec x0; + Ivec x1; + Rvec residual; + Rvec x; + Mvec dmask0; + Mvec dmask1; + x0.load_from(params.x0, idx); + if (has_x1) { x1.load_from(params.x1, idx); } + if (has_residual) { residual.load_from(params.residual, idx); } + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + // TD [2022-04-22]: We're memory bound, not compute bound, so we don't need to use + // the more efficient curand_uniform4. + compute_t x_ij; + mask_t keep0 = !Is_dropout ? true : curand_uniform(&state) <= params.dropout_keep_p; + if (Is_dropout) { dmask0.data.elt[jt] = keep0; } + compute_t x0_ij = compute_t(x0.data.elt[jt]); + x0_ij = keep0 ? (Is_dropout ? x0_ij * params.dropout_scale : x0_ij) : 0.0f; + if (has_x1) { + mask_t keep1 = !Is_dropout ? true : curand_uniform(&state) <= params.dropout_keep_p; + if (Is_dropout) { dmask1.data.elt[jt] = keep1; } + compute_t x1_ij = compute_t(x1.data.elt[jt]); + x1_ij = keep1 ? (Is_dropout ? x1_ij * params.dropout_scale : x1_ij) : 0.0f; + x_ij = has_residual ? x0_ij + x1_ij + compute_t(residual.data.elt[jt]) : x0_ij + x1_ij; + } else { + x_ij = has_residual ? x0_ij + compute_t(residual.data.elt[jt]) : x0_ij; + } + if (save_x) { x.data.elt[jt] = x_ij; } + xf[it * NUM_ELTS + jt] = x_ij; + } + if (save_x) { x.store_to(params.x, idx); } + if (Is_dropout) { + dmask0.store_to(params.dmask, idx); + if (has_x1) { dmask1.store_to(params.dmask1, idx); } + } + idx += VEC_COLS_PER_LDG; + } + } + + static_assert(CTAS_PER_ROW == 1, "Don't support multiple CTAs per row for now"); + const index_t num_vecs = params.cols / Ktraits::ELTS_PER_LDG; + const index_t num_full_ldgs = num_vecs / Ktraits::VEC_COLS_PER_LDG; + const index_t remaining_vecs = num_vecs % Ktraits::VEC_COLS_PER_LDG; + auto valid_elts_in_warp_fn = [num_full_ldgs, remaining_vecs] (int warp_n) -> int { + // Need to convert to int, otherwise the subtraction will wrap around. + const index_t valid_partial_vecs_in_warp = + std::min(std::max(int(remaining_vecs) - int(warp_n * THREADS_PER_WARP), int(0)), + int(THREADS_PER_WARP)); + return (num_full_ldgs * THREADS_PER_WARP + valid_partial_vecs_in_warp) * NUM_ELTS; + }; + stats_t s = stats.template compute( + xf, params.inverse_cols, valid_elts_in_warp_fn, num_valid_ldgs * NUM_ELTS + ); + + compute_t mu = layer_norm::Get<0>::of(s); + compute_t m2 = layer_norm::Get<1>::of(s); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + mu_ptr[row] = mu; + } + + compute_t rs = rsqrtf(m2 * params.inverse_cols + params.epsilon + (!params.is_rms_norm ? 0.f : mu * mu)); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + rs_ptr[row] = rs; + } + + idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ovec z0; + Ovec z1; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t y_ij = compute_t(rs * (xf[it * NUM_ELTS + jt] - (!params.is_rms_norm ? 
mu : 0.f))); + compute_t g0_ij = gamma0[it].data.elt[jt]; + compute_t b0_ij = beta0[it].data.elt[jt]; + z0.data.elt[jt] = output_t(g0_ij * y_ij + b0_ij); + if (!Tied_norm) { + compute_t g1_ij = gamma1[it].data.elt[jt]; + compute_t b1_ij = beta1[it].data.elt[jt]; + z1.data.elt[jt] = output_t(g1_ij * y_ij + b1_ij); + } + } + z0.store_to(params.z, idx); + if (!Tied_norm) { z1.store_to(params.z1, idx); } + idx += VEC_COLS_PER_LDG; + } + } + + } +} + +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG +> +void launch_parallel_residual_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + bool tied_norm = launch_params.params.gamma1 == nullptr; + BOOL_SWITCH(launch_params.params.dropout_keep_p < 1.f, IsDropoutConst, [&] { + BOOL_SWITCH(tied_norm, TiedNormConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_parallel_residual_fwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + const size_t rows_per_loop = launch_params.params.ctas_per_col * Kernel_traits::ROWS_PER_CTA; + launch_params.elts_per_thread = (launch_params.params.rows + rows_per_loop - 1) / rows_per_loop * Kernel_traits::LDGS * Kernel_traits::NUM_ELTS; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::Stats::stats_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, Kernel_traits::SMEM_BYTES_FWD, stream); + } + }); + }); + }); +} diff --git a/ln_utils.cuh b/ln_utils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..178d6fda895b478ac76e2a77a2b1b35115fcc279 --- /dev/null +++ b/ln_utils.cuh @@ -0,0 +1,783 @@ +#pragma once + +#include + +#include +#include + +#include "ln.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +constexpr uint32_t THREADS_PER_WARP = 32; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline void check_cuda_(cudaError_t status, const char *file, int line) { + if( status != cudaSuccess ) { + fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(status), file, line); + 
exit(status); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CHECK_CUDA(ans) \ + { check_cuda_((ans), __FILE__, __LINE__); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define DIVUP(x, y) (((x) + ((y)-1)) / (y)) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_FWD_LAUNCHER(HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG) \ + void ln_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_( \ + launch_params, configure_params); \ + } \ + static FwdRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_BWD_LAUNCHER( \ + HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINALIZE) \ + void ln_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_(launch_params, configure_params); \ + } \ + static BwdRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_PARALLEL_FWD_LAUNCHER(HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG) \ + void ln_parallel_residual_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_parallel_residual_( \ + launch_params, configure_params); \ + } \ + static FwdParallelRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_parallel_residual_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_PARALLEL_BWD_LAUNCHER( \ + HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINALIZE) \ + void ln_parallel_residual_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_parallel_residual_(launch_params, configure_params); \ + } \ + static BwdParallelRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_parallel_residual_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 operator+(const float2 & a, const float2 & b){ + return {a.x + b.x, a.y + b.y}; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void operator+=(float2 & a, const float2 & b){ + a.x += b.x; + a.y += b.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Sum { + inline __device__ Sum(){} + inline __device__ T 
operator()(const T &a, const T &b){ + return a + b; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ T warp_shuffle_xor(const T & x, uint32_t idx){ + return __shfl_xor_sync(uint32_t(-1), x, idx); +} + +template<> +inline __device__ float2 warp_shuffle_xor(const float2 & x, uint32_t idx){ + return { warp_shuffle_xor(x.x, idx), warp_shuffle_xor(x.y, idx) }; +} + +template +inline __device__ T warp_shuffle_down(const T & x, uint32_t idx){ + return __shfl_down_sync(uint32_t(-1), x, idx); +} + +template<> +inline __device__ float2 warp_shuffle_down(const float2 & x, uint32_t idx){ + return { warp_shuffle_down(x.x, idx), warp_shuffle_down(x.y, idx) }; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace layer_norm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct uint16 { + uint4 u; + uint4 v; + uint4 s; + uint4 t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct uint8 { + uint4 u; + uint4 v; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BytesToType {}; + +template<> +struct BytesToType<64> { + using Type = uint16; + static_assert(sizeof(Type) == 64); +}; + +template<> +struct BytesToType<32> { + using Type = uint8; + static_assert(sizeof(Type) == 32); +}; + +template<> +struct BytesToType<16> { + using Type = uint4; + static_assert(sizeof(Type) == 16); +}; + +template<> +struct BytesToType<8> { + using Type = uint64_t; + static_assert(sizeof(Type) == 8); +}; + +template<> +struct BytesToType<4> { + using Type = uint32_t; + static_assert(sizeof(Type) == 4); +}; + +template<> +struct BytesToType<2> { + using Type = uint16_t; + static_assert(sizeof(Type) == 2); +}; + +template<> +struct BytesToType<1> { + using Type = uint8_t; + static_assert(sizeof(Type) == 1); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct TypeToVec2 {}; + +template<> +struct TypeToVec2 { + using Type = float2; +}; + +template<> +struct TypeToVec2 { + using Type = half2; +}; + +template<> +struct TypeToVec2 { + using Type = nv_bfloat162; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Get { + template + static inline __device__ R of(const T &vec); +}; + +template<> +template +inline __device__ R Get<0>::of(const T &vec) { + return vec.x; +} + +template<> +template +inline __device__ R Get<1>::of(const T &vec) { + return vec.y; +} + +template<> +template +inline __device__ R Get<2>::of(const T &vec) { + return vec.z; +} + +template<> +template +inline __device__ R Get<3>::of(const T &vec) { + return vec.w; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Converter{ + static inline __device__ Dst convert(const Src &from) { + return Dst(from); + } +}; + +template<> +struct Converter{ + static inline __device__ half2 convert(const float2 &x) { + return __float22half2_rn(x); + } +}; + +template<> +struct Converter{ + static inline __device__ nv_bfloat162 convert(const float2 &x) { +#if __CUDA_ARCH__ >= 800 + return __float22bfloat162_rn(x); +#else + union { + nv_bfloat162 raw; + nv_bfloat16 x; + nv_bfloat16 y; + } tmp; + tmp.x = 
__float2bfloat16_rn(x.x); + tmp.y = __float2bfloat16_rn(x.y); + return tmp.raw; +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Zeros{ + static inline __device__ T get() { + return T(0.f); + } +}; + +template<> +struct Zeros{ + static inline __device__ float2 get() { + return make_float2(0.f, 0.f); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Vec { + + enum { BYTES = NUM_ELT * sizeof(Elt_type) }; + + using Vec_type = typename BytesToType::Type; + + using Alias_type = union { + Vec_type vec; + Elt_type elt[NUM_ELT]; + }; + + Alias_type data; + + template + inline __device__ void to(Vec &other) { + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + other.data.elt[it] = S(this->data.elt[it]); + } + } + + template + inline __device__ void assign(const Op &op) { + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + this->data.elt[it] = op(it); + } + } + + inline __device__ void zero_() { + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + this->data.elt[it] = Elt_type(0.f); + } + } + + inline __device__ void load_from(const void *base_ptr, const size_t idx) { + this->data.vec = static_cast(base_ptr)[idx]; + } + + inline __device__ void store_to(void *base_ptr, const size_t idx) { + static_cast(base_ptr)[idx] = this->data.vec; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct InterCTASync { + + template + inline __device__ InterCTASync(Params & params, uint32_t bidm, uint32_t bidn) + : phase_counter_(0) + , b0_(params.barrier + bidm) // The barrier for this group of CTAs. + , b1_(params.barrier + bidm + params.ctas_per_col) // The barrier for this group of CTAs. + { + // BARRIERS ARE ASSUMED TO BE INITIALIZED TO 0! + } + + inline __device__ void spin_wait_(int *barrier, int step, int expected) { + asm volatile("red.release.gpu.global.add.s32 [%0], %1;" ::"l"(barrier), "r"(step)); + for( int found = -1; found != expected; ) { + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];" : "=r"(found) : "l"(barrier)); + } + } + + inline __device__ void sync(){ + // ALL THREADS MUST ENTER! + + // We switch barrier every iteration. + int *barrier = phase_counter_ & 0x1 ? b1_ : b0_; + // We decrement every other iteration. + bool dec = phase_counter_ & 0x2; + int step = dec ? -1 : 1; + int expected = dec ? 0 : CTAS_PER_ROW; + // There are only 4 phases: up/down for b0/b1. 
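 // Concretely: phase 0 counts b0 up to CTAS_PER_ROW, phase 1 counts b1 up, phase 2 counts b0
 // back down to 0, phase 3 counts b1 back down. Alternating the barrier and the direction means
 // a barrier is already at the right value for its next use and never has to be re-initialized.
 // spin_wait_ adds `step` with release semantics and then polls with acquire loads until the
 // expected value appears; only thread 0 spins, and __syncthreads() then releases the whole CTA.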
+ phase_counter_ = (phase_counter_ + 1) & 0x3; + + if( threadIdx.x == 0 ) { + spin_wait_(barrier, step, expected); + } + // CTA waits for thread 0 + __syncthreads(); + } + + int phase_counter_; + int * b0_; + int * b1_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Reducer : public Reducer { + + using InterCTASync = InterCTASync; + using Base = Reducer; + using Type = typename Base::Type; + + enum { SMEM_BYTES = Base::SMEM_BYTES }; + + enum { WS_BARRIER_BYTES = 2 * sizeof(int) }; + enum { WS_DATA_BYTES = WARPS_M * CTAS_PER_ROW * sizeof(T) }; + + // size of the barriers + temporary result per CTA (multiply with CTAS_PER_ROW to get total) + enum { WORKSPACE_BYTES_PER_GROUP = Base::WORKSPACE_BYTES_PER_GROUP + WS_BARRIER_BYTES + WS_DATA_BYTES }; + + template + inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : Base(params, bidm, bidn, warp_m, warp_n, lane, smem) + , inter_cta_(params, bidm, bidn) + , bidn_(bidn) // CTA id within the group. + , w0_(static_cast(params.workspace) + (bidm * WARPS_M + warp_m) * CTAS_PER_ROW) + , w1_(w0_ + params.ctas_per_col * WARPS_M * CTAS_PER_ROW) + { + } + + template + inline __device__ T allreduce(T data, Op &op) { + data = Base::reduce(data, op); + // We switch workspace every iteration. + T *workspace = inter_cta_.phase_counter_ & 0x1 ? w1_ : w0_; + + // Warp leaders 0 hold the CTA-local results. + if( this->warp_n_ == 0 && this->lane_ == 0 ) { + workspace[bidn_] = data; + } + inter_cta_.sync(); + static_assert(CTAS_PER_ROW <= 32); + T total = Zeros::get(); + if(this->lane_ < CTAS_PER_ROW){ + total = workspace[this->lane_]; + } + total = Reducer::allreduce_(total, op); + + return total; + } + + InterCTASync inter_cta_; + + T *w0_; + T *w1_; + int bidn_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Reducer { + + using Type = T; + enum { SMEM_BYTES = 0 }; + enum { WORKSPACE_BYTES_PER_GROUP = 0 }; + + enum { THREADS_PER_WARP = 32 }; + + template + inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : warp_n_(warp_n) + , lane_(lane) + { + } + + template + static inline __device__ T allreduce_(T data, Op &op) { + #pragma unroll + for( int it = 1; it < THREADS_PER_WARP; it *= 2 ) { + data = op(data, warp_shuffle_xor(data, it)); + } + return data; + } + + template + inline __device__ T allreduce(T data, Op &op) { + return allreduce_(data, op); + } + + template + inline __device__ T reduce(T data, Op &op){ + // only lane 0 holds the result! 
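 // The loop below is a shuffle-down tree: after log2(THREADS_PER_WARP) = 5 steps lane 0 holds
 // the warp-wide sum while the other lanes hold partial values that must be ignored.
 // allreduce_() above uses XOR (butterfly) shuffles instead, which takes the same number of
 // steps but leaves every lane with the full result.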
+ #pragma unroll + for( int it = THREADS_PER_WARP / 2; it > 0; it /= 2 ) { + data = op(data, warp_shuffle_down(data, it)); + } + return data; + } + int warp_n_; + int lane_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Reducer : public Reducer { + + using Base = Reducer; + + using Type = T; + + enum { SMEM_BYTES = Base::SMEM_BYTES + WARPS_M * WARPS_N * sizeof(T) * 2 }; + enum { WORKSPACE_BYTES_PER_GROUP = 0 }; + + enum { THREADS_PER_WARP = 32 }; + + template + inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : Base(params, bidm, bidn, warp_m, warp_n, lane, smem) + , use0_(true) + { + smem0_ = &static_cast(smem)[warp_m * WARPS_N]; + smem1_ = smem0_ + WARPS_M * WARPS_N; + } + + template + inline __device__ T allreduce(T data, Op & op) { + T * smem = use0_ ? smem0_ : smem1_; + use0_ = !use0_; + data = Base::reduce(data, op); + if( this->lane_ == 0 ) { + smem[this->warp_n_] = data; + } + __syncthreads(); + T out = Zeros::get(); + #pragma unroll + for( int it = 0; it < WARPS_N; it++ ) { + out = op(out, smem[it]); + } + return out; + } + + template + inline __device__ T reduce(T data, Op &op) { + T * smem = use0_ ? smem0_ : smem1_; + use0_ = !use0_; + // only intra-CTA group leader holds the result! + data = Base::reduce(data, op); + if( this->lane_ == 0 ) { + smem[this->warp_n_] = data; + } + __syncthreads(); + T out = Zeros::get(); + if( this->warp_n_ == 0 && this->lane_ == 0 ) { + #pragma unroll + for( int it = 0; it < WARPS_N; it++ ) { + out = op(out, smem[it]); + } + } + return out; + } + + T * smem0_; + T * smem1_; + bool use0_; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void warp_chan_upd_dynamic(T &m_a, T &m2_a, int_t &n_a, int num_active){ + //Assume at least leftmost is valid and init: step = next_pow2(num_active) / 2 (might get NaN otherwise) + const int highest_bit_set = (8 * sizeof(num_active)) - __clz(num_active - 1); + + #pragma unroll + for( int step = (1 << (highest_bit_set - 1)); step > 0; step /= 2 ) { + // Exchange + int_t n_b = warp_shuffle_down(n_a, step); + T m_b = warp_shuffle_down(m_a, step); + T m2_b = warp_shuffle_down(m2_a, step); + + // Update + const int_t n_ab = n_a + n_b; // We can handle one of them being 0, not both. + const T rn_ab = 1.f / n_ab; // Might have different n per thread, otherwise this would simplify :( + const T delta = m_a - m_b; + const float m2_ab = m2_a + m2_b + delta * delta * n_a * n_b * rn_ab; + const float m_ab = (n_a * m_a + n_b * m_b) * rn_ab; + + n_a = n_ab; + m_a = m_ab; + m2_a = m2_ab; + } + // Intra-warp broadcast (only lane 0 has valid stats). + m_a = __shfl_sync(uint32_t(-1), m_a, 0); + m2_a = __shfl_sync(uint32_t(-1), m2_a, 0); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Stats { + // This could be done generically with the Reducer. But then we would have to exchange 3 instead of 2 fields. 
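 // For reference, the pairwise update that warp_chan_upd_dynamic (above) applies at every
 // shuffle step is the standard parallel mean/variance combination: merging partitions
 // (n_a, m_a, m2_a) and (n_b, m_b, m2_b) gives
 //   n  = n_a + n_b
 //   m  = (n_a * m_a + n_b * m_b) / n
 //   m2 = m2_a + m2_b + (m_a - m_b)^2 * n_a * n_b / n
 // where m is the running mean and m2 the running sum of squared deviations, so the row
 // variance consumed by the kernels is m2 / cols.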
+ + using InterCTASync = InterCTASync; + using BlockStats = Stats; + using stats_t = typename BlockStats::stats_t; + + enum { SMEM_BYTES = BlockStats::SMEM_BYTES }; + + template + inline __device__ Stats(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : inter_cta_(params, bidm, bidn) + , block_stats_(params, bidm, bidn, warp_m, warp_n, lane, smem) + , bidn_(bidn) // CTA id within the group. + , w0_(static_cast(params.workspace) + (bidm * WARPS_M + warp_m) * CTAS_PER_ROW) + , w1_(w0_ + params.ctas_per_col * WARPS_M * CTAS_PER_ROW) + , warp_n_(warp_n) + , lane_(lane) + { + } + + template + inline __device__ stats_t compute(const T (&elts)[N], const T rn) { + constexpr T ELTS_PER_ROW_PER_CTA = N * WARPS_N * THREADS_PER_WARP; + // TODO rn is not really needed here.. + constexpr T block_rn = 1.f / T(ELTS_PER_ROW_PER_CTA); + stats_t block_stats = block_stats_.compute(elts, block_rn); + + stats_t *workspace = inter_cta_.phase_counter_ & 0x1 ? w1_ : w0_; + + if( warp_n_ == 0 && lane_ == 0 ) { + workspace[bidn_] = block_stats; + } + + // Wait for all CTAS_PER_ROW CTAS in the group to have written their result. + inter_cta_.sync(); + + T n = Zeros::get(); + T m = Zeros::get(); + T m2 = Zeros::get(); + + // Assume CTA group size in N less than 32, such that we can finalize with a single warp. + static_assert(CTAS_PER_ROW <= 32); + + // Every warp does the final reduction locally. + if( lane_ < CTAS_PER_ROW ) { + stats_t result = workspace[lane_]; + n = ELTS_PER_ROW_PER_CTA; + m = layer_norm::Get<0>::of(result); + m2 = layer_norm::Get<1>::of(result); + } + + warp_chan_upd_dynamic(m, m2, n, CTAS_PER_ROW); + + return { m, m2 }; + } + + InterCTASync inter_cta_; + BlockStats block_stats_; + + stats_t *w0_; + stats_t *w1_; + int bidn_; + int warp_n_; + int lane_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Stats { + + using WarpStats = Stats; + using stats_t = typename WarpStats::stats_t; + + enum { SMEM_BYTES = WARPS_M * WARPS_N * sizeof(stats_t) * 2 }; + + template + inline __device__ Stats(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : warp_stats_(params, bidm, bidn, warp_m, warp_n, lane, smem) + , use0_(true) + { + smem0_ = static_cast(smem) + warp_m * WARPS_N; + smem1_ = smem0_ + WARPS_M * WARPS_N; + } + + template + inline __device__ stats_t compute(const T (&elts)[N], const T row_norm_factor, + function_t valid_elts_in_warp_fn, const int num_valid_elts = N) { + stats_t * smem = use0_ ? smem0_ : smem1_; + use0_ = !use0_; + // Compute warp local for all WARPS_N + const auto warp_n = warp_stats_.reducer_.warp_n_; + const T warp_norm_factor = 1.f / T(Is_even_cols ? N * THREADS_PER_WARP : valid_elts_in_warp_fn(warp_n)); + stats_t warp_stats = warp_stats_.template compute( + elts, warp_norm_factor, valid_elts_in_warp_fn, num_valid_elts + ); + + //Each warp warp leader stores its stats + const auto lane = warp_stats_.reducer_.lane_; + if( lane == 0 ) { + smem[warp_n] = warp_stats; + } + __syncthreads(); + + int n = 0;; + T m = Zeros::get(); + T m2 = Zeros::get(); + + // Assume that there are less than 32 warps, such that we can finalize with a single warp + static_assert(WARPS_N <= 32); + if(lane < WARPS_N){ + stats_t result = smem[lane]; + n = Is_even_cols ? 
N * THREADS_PER_WARP : valid_elts_in_warp_fn(lane); + m = layer_norm::Get<0>::of(result); + m2 = layer_norm::Get<1>::of(result); + } + + warp_chan_upd_dynamic(m, m2, n, WARPS_N); + + return { m, m2 }; + } + WarpStats warp_stats_; + stats_t * smem0_; + stats_t * smem1_; + bool use0_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Stats { + + using stats_t = typename TypeToVec2::Type; + // The simple Warp reducer. + using Reducer = Reducer; + + enum { SMEM_BYTES = 0 }; + + template + inline __device__ Stats(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : reducer_(params, bidm, bidn, warp_m, warp_n, lane, smem) + { + } + + template + inline __device__ stats_t compute(const T (&elts)[N], const T row_norm_factor, + // const int valid_elts_in_warp_ignored_, const int num_valid_elts = N) { + function_t valid_elts_in_warp_fn, const int num_valid_elts = N) { + + auto sum = Sum(); + + T m = Zeros::get(); + #pragma unroll + for( int it = 0; it < N; it++ ) { + if (Is_even_cols || (it < num_valid_elts)) { + m += elts[it]; + } + } + m = reducer_.allreduce(m, sum) * row_norm_factor; + + T m2 = Zeros::get(); + #pragma unroll + for( int it = 0; it < N; it++ ) { + if (Is_even_cols || (it < num_valid_elts)) { + T diff = (elts[it] - m); + m2 += diff * diff; + } + } + m2 = reducer_.allreduce(m2, sum); + + return {m, m2}; + } + + Reducer reducer_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace layer_norm diff --git a/mainloop_bwd_sm90_tma_gmma_ws.hpp b/mainloop_bwd_sm90_tma_gmma_ws.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7483f4efdb5c704115ac154ecd0aa7b00ea6157f --- /dev/null +++ b/mainloop_bwd_sm90_tma_gmma_ws.hpp @@ -0,0 +1,841 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "softmax.h" +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +struct CollectiveMainloopBwd { + + static constexpr int kStages = Stages; + using ClusterShape = ClusterShape_; + using TileShape_MNK = TileShape_MNK_; + using Element = Element_; + using ElementAccum = ElementAccum_; + using ArchTag = ArchTag_; + static constexpr bool Is_causal = Is_causal_; + static constexpr bool Varlen = Varlen_; + static constexpr bool SdP_swapAB = true; + static constexpr bool dKV_swapAB = dKV_swapAB_; + static constexpr bool dQ_swapAB = dQ_swapAB_; + static_assert(!(SdP_swapAB && dKV_swapAB)); // If SdP_swapAB, then we don't swap for dKV + + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + static constexpr int kHeadDim = get<2>(TileShape_MNK{}); + + static constexpr int NumdQWarpGroups = 2; + static constexpr int kNThreadsdQ = NumdQWarpGroups * cutlass::NumThreadsPerWarpGroup; + + static_assert(ArchTag::kMinComputeCapability >= 90); + static_assert(get<0>(ClusterShape{}) == 1 && get<2>(ClusterShape{}) == 1); + + static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == 2 && AtomLayoutMdQ == 2 && !SdP_swapAB && !dQ_swapAB; // If dQ_swapAB we can't use RS + using TileShapeAtomSdP = std::conditional_t< + !SdP_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutSdP = std::conditional_t< + !SdP_swapAB, + Layout, Int<2 / AtomLayoutMSdP>, _1>>, + Layout, Int, _1>> + >; + using TiledMmaSdP = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutSdP{})); + + using TileShapeAtomdKV = std::conditional_t< + !dKV_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdKV = std::conditional_t< + !dKV_swapAB, + Layout, Int<2 / AtomLayoutNdKV>, _1>>, + Layout, Int, _1>> + >; + using TiledMmadKV = decltype(cute::make_tiled_mma( + std::conditional_t< + !SdP_swapAB, + decltype(cute::GMMA::ss_op_selector()), + decltype(cute::GMMA::rs_op_selector()) + >{}, + AtomLayoutdKV{})); + + using TileShapeAtomdQ = std::conditional_t< + !dQ_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdQ = std::conditional_t< + !dQ_swapAB, + Layout, Int, _1>>, + Layout, Int, _1>> + >; + static constexpr GMMA::Major MmadQMajorA = !dQ_swapAB ? GMMA::Major::K : GMMA::Major::MN; + static constexpr GMMA::Major MmadQMajorB = !dQ_swapAB ? 
GMMA::Major::MN : GMMA::Major::K; + using TiledMmadQ = decltype(cute::make_tiled_mma( + std::conditional_t< + !dQ_swapAB, + std::conditional_t< + Mma_dQ_is_RS, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >, + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutdQ{})); + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector, Int>()); + using SmemLayoutQ = + decltype(tile_to_shape(SmemLayoutAtomQ{}, + make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + using SmemLayoutdO = SmemLayoutQ; + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector, Int>()); + using SmemLayoutK = decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomV = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = decltype(tile_to_shape(SmemLayoutAtomV{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomP = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutP = decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{}))); + using SmemLayoutAtomdS = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdS = decltype(tile_to_shape(SmemLayoutAtomdS{}, make_shape(Int{}, Int{}, Int{}))); + + // Need stride to be multiple of 32, otherwise we get error (misaligned address) when doing TMA if e.g. kBlockM=80 + using SmemLayoutLSE = cute::Layout, Int>, cute::Stride<_1, Int>>; + using SmemLayoutLSEMma = cute::Layout, Int, Int>, cute::Stride<_0, _1, Int>>; + + // Note this is the transpose in terms of the view, not in terms of memory. + using SmemLayoutQt = + decltype(cute::composition(SmemLayoutQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutdOt = + decltype(cute::composition(SmemLayoutdO{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutKt = + decltype(cute::composition(SmemLayoutK{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutPt = + decltype(cute::composition(SmemLayoutP{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdSt = + decltype(cute::composition(SmemLayoutdS{}, + make_layout(make_shape(Int{}, Int{}, Int{}), + make_stride(Int{}, _1{}, Int{})))); + + // Thread layout, 256 threads per row + using R2SLayoutAtomdQaccum = Layout>, Stride<_1>>; + using R2STiledCopydQaccum = decltype(make_tiled_copy(Copy_Atom{}, R2SLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + using SmemLayoutdQaccum = Layout>, Stride<_1>>; + // We want dQaccum smem to have last dimension 32, so that we only need to do 1 TMA instruction. + // The layout Layout_K_SW128_Atom has 32 elements per row. + // // TMA limit is that each dimension in smem must be <= 256. + // static constexpr int ElemsPerRowTMA = (kBlockM * kHeadDim) / 32 <= 256 ? 32 : 64; + static constexpr int ElemsPerRowTMA = 32; // If we change this, we'll also need to change the dQ shape in host. 
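// [Illustrative aside, not part of this patch.] SmemLayoutQt / SmemLayoutdOt / SmemLayoutKt
// above are, as the comment says, transposes "in terms of the view, not in terms of memory":
// cute::composition re-indexes the same smem bytes with swapped shape and stride, so no data
// is moved. A tiny host-side model of that idea with plain strides (names hypothetical):
#include <cassert>
#include <cstdio>

int main() {
    constexpr int M = 4, K = 8;
    float buf[M * K];
    for (int i = 0; i < M * K; ++i) buf[i] = float(i);

    // Original view: shape (M, K), strides (K, 1) -> buf[m * K + k].
    auto orig = [&](int m, int k) { return buf[m * K + k]; };
    // Transposed view: shape (K, M), strides (1, K) -> same memory, indices swapped.
    auto xposed = [&](int k, int m) { return buf[k * 1 + m * K]; };

    for (int m = 0; m < M; ++m)
        for (int k = 0; k < K; ++k)
            assert(orig(m, k) == xposed(k, m));
    std::printf("transposed view matches the original; no copy was performed\n");
    return 0;
}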
+ static_assert((kBlockM * kHeadDim) % ElemsPerRowTMA == 0); + using TileShape_dQaccum = cute::Shape, Int>; + // using TileShape_dQaccum = cute::Shape, Int>; + using SmemLayoutdQaccumTMA = + decltype(tile_to_shape(GMMA::Layout_K_SW128_Atom{}, TileShape_dQaccum{})); + using SmemLayoutdQaccumTMANoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutdQaccumTMA{})); + + using SmemCopyAtomPdS = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdKV = Copy_Atom< + std::conditional_t, + Element>; + + using GmemTiledCopyQdO = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape{}))); + using GmemTiledCopyKV = cute::SM90_TMA_LOAD; + using GmemTiledCopydQaccum = cute::SM90_TMA_REDUCE_ADD; + using GmemTiledCopyLSE = cute::SM90_TMA_LOAD; + + using ShapeQKV = cute::Shape; // (seqlen, d, head, batch) + using StrideQKV = cute::Stride; + using ShapeLSE = cute::Shape; // (seqlen, head, batch) + using StrideLSE = cute::Stride<_1, int64_t, int64_t>; // (seqlen, head, batch) + + using TMA_QdO = decltype(make_tma_copy( + GmemTiledCopyQdO{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + take<0, 2>(SmemLayoutQ{}), + select<0, 2>(TileShape_MNK{}), + size<1>(ClusterShape{}))); // mcast along N mode for this M load, if any + + using TMA_K = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + SmemLayoutK{}, + select<1, 2>(TileShape_MNK{}), + _1{})); // no mcast for KV + + using TMA_V = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + SmemLayoutV{}, + select<1, 2>(TileShape_MNK{}), + _1{})); // no mcast for KV + + using TMA_add_dQ = decltype(make_tma_copy( + GmemTiledCopydQaccum{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + SmemLayoutdQaccumTMA{}, + TileShape_dQaccum{}, + _1{})); // no mcast for dQ + + using TMA_LSE = decltype(make_tma_copy( + GmemTiledCopyLSE{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeLSE{}, StrideLSE{}), + select<0>(SmemLayoutLSE{}), + select<0>(TileShape_MNK{}), + _1{})); // no mcast for LSE + + static constexpr int NumMmaThreads = size(TiledMmaSdP{}); + + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + using PipelineState = typename MainloopPipeline::PipelineState; + + // Set the bytes transferred in this TMA transaction (may involve multiple issues) + static constexpr uint32_t TmaTransactionBytesQ = static_cast(size(take<0, 2>(SmemLayoutQ{})) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesK = static_cast(size(SmemLayoutK{}) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesV = static_cast(size(SmemLayoutV{}) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesLSE = static_cast(size(select<0>(SmemLayoutLSE{})) * cutlass::sizeof_bits_v / 8); + + struct TensorStorage : cute::aligned_struct<1024> { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + // It's important that smem_dqacc is aligned to 1024 bytes for the TMA, so that the 1st row + // has no swizzle. + // If the address is only 128 bytes aligned, it's possible that the 1st row has swizzle + // and when we read it back in the postprocess kernel, the swizzle will not match. 
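// [Illustrative aside, not part of this patch.] The TmaTransactionBytes* constants above are
// simply "elements in the smem tile times bytes per element"; the producer later arms the K/V
// barrier with the K + V sum (arrive_and_expect_tx) before issuing the two TMA copies, and the
// barrier flips only once the TMA engine has delivered exactly that many bytes. A quick
// host-side version of the arithmetic for a hypothetical 128 x 128 bf16 K/V tile:
#include <cstdint>
#include <cstdio>

int main() {
    constexpr int kBlockN = 128, kHeadDim = 128;   // hypothetical tile shape
    constexpr int bits_per_elem = 16;              // bf16 / fp16
    constexpr uint32_t bytes_K = uint32_t(kBlockN) * kHeadDim * bits_per_elem / 8;
    constexpr uint32_t bytes_V = bytes_K;          // same tile shape and element type
    std::printf("expect_tx for the K/V barrier: %u + %u = %u bytes\n",
                bytes_K, bytes_V, bytes_K + bytes_V);
    return 0;
}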
+ cute::array_aligned, 1024> smem_dqacc; + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + cute::array_aligned> smem_ds; + cute::array_aligned, 128> smem_lse; + cute::array_aligned, 128> smem_dpsum; + }; + + static constexpr int SharedStorageQdOSize = sizeof(decltype((TensorStorage{}).smem_q)) + sizeof(decltype((TensorStorage{}).smem_do)) + sizeof(decltype((TensorStorage{}).smem_ds)) + sizeof(decltype((TensorStorage{}).smem_dqacc)); + + // Host side kernel arguments + struct Arguments { + Element const* ptr_Q; + ShapeQKV const shape_Q; + StrideQKV const stride_Q; + Element const* ptr_K; + ShapeQKV const shape_K; + StrideQKV const stride_K; + Element const* ptr_V; + StrideQKV const stride_V; + Element const* ptr_dO; + StrideQKV const stride_dO; + ElementAccum* ptr_dQaccum; + ShapeQKV const shape_dQaccum; + StrideQKV const stride_dQaccum; + float const* ptr_LSE_log2; + ShapeLSE const shape_LSE; + StrideLSE const stride_LSE_log2; + float const* ptr_dPsum; + StrideLSE const stride_dPsum; + float const softmax_scale; + int num_batch; + int* dq_semaphore; + int const* cu_seqlens_q = nullptr; + int const* cu_seqlens_k = nullptr; + }; + + // Device side kernel params + struct Params { + ShapeQKV const shape_Q; + ShapeQKV const shape_K; + ShapeQKV const shape_dQaccum; + cutlass::FastDivmod qhead_per_khead_divmod; + TMA_QdO tma_load_Q, tma_load_dO; + TMA_K tma_load_K; + TMA_V tma_load_V; + TMA_add_dQ tma_add_dQ; + TMA_LSE tma_load_LSE, tma_load_dPsum; + float const* ptr_LSE_log2; + ShapeLSE const shape_LSE; + StrideLSE const stride_LSE_log2; + float const* ptr_dPsum; + StrideLSE const stride_dPsum; + float const softmax_scale; + float const softmax_scale_log2; + int num_batch; + int* dq_semaphore; + int const* cu_seqlens_q = nullptr; + int const* cu_seqlens_k = nullptr; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + Tensor mQ = make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q); + TMA_QdO tma_load_Q = make_tma_copy( + GmemTiledCopyQdO{}, + mQ, + SmemLayoutQ{}(_, _, _0{}), + select<0, 2>(TileShape_MNK{}), + size<1>(ClusterShape{})); // mcast along N mode for this M load, if any + Tensor mdO = make_tensor(make_gmem_ptr(args.ptr_dO), args.shape_Q, args.stride_dO); + TMA_QdO tma_load_dO = make_tma_copy( + GmemTiledCopyQdO{}, + mdO, + SmemLayoutdO{}(_, _, _0{}), + select<0, 2>(TileShape_MNK{}), + size<1>(ClusterShape{})); // mcast along N mode for this M load, if any + Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K); + TMA_K tma_load_K = make_tma_copy( + GmemTiledCopyKV{}, + mK, + SmemLayoutK{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for KV + Tensor mV = make_tensor(make_gmem_ptr(args.ptr_V), args.shape_K, args.stride_V); + TMA_V tma_load_V = make_tma_copy( + GmemTiledCopyKV{}, + mV, + SmemLayoutV{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for KV + Tensor mdQaccum = make_tensor(make_gmem_ptr(args.ptr_dQaccum), args.shape_dQaccum, args.stride_dQaccum); + TMA_add_dQ tma_add_dQ = make_tma_copy( + GmemTiledCopydQaccum{}, + mdQaccum, + SmemLayoutdQaccumTMA{}, + TileShape_dQaccum{}, + _1{}); // no mcast for dQaccum + Tensor mLSE = make_tensor(make_gmem_ptr(args.ptr_LSE_log2), args.shape_LSE, args.stride_LSE_log2); + TMA_LSE tma_load_LSE = make_tma_copy( + GmemTiledCopyLSE{}, + mLSE, + select<0>(SmemLayoutLSE{}), + select<0>(TileShape_MNK{}), + _1{}); // no mcast for LSE + Tensor mdPsum = make_tensor(make_gmem_ptr(args.ptr_dPsum), args.shape_LSE, args.stride_dPsum); + TMA_LSE tma_load_dPsum 
= make_tma_copy( + GmemTiledCopyLSE{}, + mdPsum, + select<0>(SmemLayoutLSE{}), + select<0>(TileShape_MNK{}), + _1{}); // no mcast for dPsum + if constexpr (Deterministic) { assert(args.dq_semaphore != nullptr); } + return {args.shape_Q, args.shape_K, args.shape_dQaccum, + cutlass::FastDivmod(cute::ceil_div(get<2>(args.shape_Q), get<2>(args.shape_K))), + tma_load_Q, tma_load_dO, tma_load_K, tma_load_V, tma_add_dQ, tma_load_LSE, tma_load_dPsum, + args.ptr_LSE_log2, args.shape_LSE, args.stride_LSE_log2, args.ptr_dPsum, args.stride_dPsum, + args.softmax_scale, float(args.softmax_scale * M_LOG2E), + args.num_batch, args.dq_semaphore, args.cu_seqlens_q, args.cu_seqlens_k}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& params) { + cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_dO.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_LSE.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_dPsum.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_add_dQ.get_tma_descriptor()); + } + + CUTLASS_DEVICE + int get_seqlen_q(Params const& params, int bidb) { + if constexpr (!Varlen) { + return get<0>(params.shape_Q); + } else { + return params.cu_seqlens_q == nullptr + ? get<0>(params.shape_Q) + : params.cu_seqlens_q[bidb + 1] - params.cu_seqlens_q[bidb]; + } + } + + CUTLASS_DEVICE + int get_seqlen_k(Params const& params, int bidb) { + if constexpr (!Varlen) { + return get<0>(params.shape_K); + } else { + return params.cu_seqlens_k == nullptr + ? 
get<0>(params.shape_K) + : params.cu_seqlens_k[bidb + 1] - params.cu_seqlens_k[bidb]; + } + } + + CUTLASS_DEVICE + int get_m_block_min(Params const& params, int n_block, int bidb) { + if constexpr (Is_causal) { + int const seqlen_q = get_seqlen_q(params, bidb); + int const seqlen_k = get_seqlen_k(params, bidb); + return std::max(0, (n_block * kBlockN + seqlen_q - seqlen_k) / kBlockM); + } else { + return 0; + } + } + + template + CUTLASS_DEVICE void + load(Params const& params, + MainloopPipeline pipeline_q, + MainloopPipeline pipeline_do, + PipelineState& smem_pipe_write, + SharedStorage &shared_storage, + SchedulerPrefetch const& scheduler_prefetch, + cute::tuple block_coord, + int work_idx + ) { + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_q.data()), SmemLayoutQ{}); + Tensor sdO = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_do.data()), SmemLayoutdO{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_v.data()), SmemLayoutV{}); + Tensor sLSE = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_lse.data()), SmemLayoutLSE{}); + Tensor sdPsum = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dpsum.data()), SmemLayoutLSE{}); + + auto [n_block, bidh, bidb] = block_coord; + int bidh_kv = params.qhead_per_khead_divmod.divide(bidh); + + // Prepare the TMA loads + uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + bool const is_varlen_q = Varlen && params.cu_seqlens_q != nullptr; + bool const is_varlen_k = Varlen && params.cu_seqlens_k != nullptr; + Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(_, _, bidh, !is_varlen_q ? bidb : 0); + Tensor mdO = params.tma_load_dO.get_tma_tensor(params.shape_Q)(_, _, bidh, !is_varlen_q ? bidb : 0); + Tensor mK = params.tma_load_K.get_tma_tensor(params.shape_K)(_, _, bidh_kv, !is_varlen_k ? bidb : 0); + Tensor mV = params.tma_load_V.get_tma_tensor(params.shape_K)(_, _, bidh_kv, !is_varlen_k ? bidb : 0); + Tensor mLSE = params.tma_load_LSE.get_tma_tensor(params.shape_LSE)(_, bidh, !is_varlen_q ? bidb : 0); + Tensor mdPsum = params.tma_load_dPsum.get_tma_tensor(params.shape_LSE)(_, bidh, !is_varlen_q ? bidb : 0); + + int const offset_q = !is_varlen_q ? 0 : params.cu_seqlens_q[bidb]; + int const offset_k = !is_varlen_k ? 0 : params.cu_seqlens_k[bidb]; + int const offset_padded = !is_varlen_q ? 
0 : (params.cu_seqlens_q[bidb] + bidb * 128) / 128 * 128; + Tensor gQ = local_tile(domain_offset(make_coord(offset_q, _0{}), mQ), select<0, 2>(TileShape_MNK{}), make_coord(_, _0{})); // (M, K, _) + Tensor gdO = local_tile(domain_offset(make_coord(offset_q, _0{}), mdO), select<0, 2>(TileShape_MNK{}), make_coord(_, _0{})); // (M, K, _) + Tensor gK = local_tile(domain_offset(make_coord(offset_k, _0{}), mK), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (N, K) + Tensor gV = local_tile(domain_offset(make_coord(offset_k, _0{}), mV), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (N, K) + Tensor gLSE = local_tile(domain_offset(make_coord(offset_padded), mLSE), select<0>(TileShape_MNK{}), make_coord(_)); // (M, _) + Tensor gdPsum = local_tile(domain_offset(make_coord(offset_padded), mdPsum), select<0>(TileShape_MNK{}), make_coord(_)); // (M, _) + + Tensor sK_x = make_tensor(sK.data(), make_layout(sK.layout(), Layout<_1>{})); + Tensor gK_x = make_tensor(gK.data(), make_layout(gK.layout(), Layout<_1>{})); + Tensor sV_x = make_tensor(sV.data(), make_layout(sV.layout(), Layout<_1>{})); + Tensor gV_x = make_tensor(gV.data(), make_layout(gV.layout(), Layout<_1>{})); + auto [tQgQ, tQsQ] = tma_partition(params.tma_load_Q, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sQ), group_modes<0, 2>(gQ)); // (TMA, k), (TMA, PIPE) + auto [tdOgdO, tdOsdO] = tma_partition(params.tma_load_dO, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sdO), group_modes<0, 2>(gdO)); // (TMA, k), (TMA, PIPE) + auto [tKgK, tKsK] = tma_partition(params.tma_load_K, _0{}, Layout<_1>{}, + group_modes<0, 2>(sK_x), group_modes<0, 2>(gK_x)); // (TMA), (TMA) + auto [tVgV, tVsV] = tma_partition(params.tma_load_V, _0{}, Layout<_1>{}, + group_modes<0, 2>(sV_x), group_modes<0, 2>(gV_x)); // (TMA), (TMA) + auto [tLSEgLSE, tLSEsLSE] = tma_partition(params.tma_load_LSE, _0{}, Layout<_1>{}, + sLSE, gLSE); // (TMA, k), (TMA, PIPE) + auto [tLSEgdPsum, tLSEsdPsum] = tma_partition(params.tma_load_dPsum, _0{}, Layout<_1>{}, + sdPsum, gdPsum); // (TMA, k), (TMA, PIPE) + + uint16_t mcast_mask_qdo = 0; + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_qdo |= (uint16_t(1) << block_layout(n, cluster_local_block_id.x, _0{})); + } + } + + int m_block_max = cute::ceil_div(get_seqlen_q(params, bidb), get<0>(TileShape_MNK{})); + int m_block_min = get_m_block_min(params, n_block, bidb); + int m_block = m_block_min; + + int lane_predicate = cute::elect_one_sync(); + + // // Wait for the MMA warpgroups to say that smem_q is ready + // cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::QueryEmpty) /*id*/); + + if (lane_predicate) { + // Copy K tile and V tile from GMEM to SMEM. 
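// [Illustrative aside, not part of this patch.] For varlen batches, offset_padded computed
// above maps each batch's starting position (taken from cu_seqlens_q) to a 128-aligned row in
// the padded LSE / dPsum buffers, so per-batch regions stay disjoint and aligned. A small
// host-side sketch of that index math (the cumulative lengths are hypothetical; the constant
// 128 is the one that appears in the expression above):
#include <cstdio>

int main() {
    const int cu_seqlens_q[] = {0, 70, 325, 400};              // hypothetical prefix sums
    for (int bidb = 0; bidb < 3; ++bidb) {
        int offset_q      = cu_seqlens_q[bidb];
        int offset_padded = (cu_seqlens_q[bidb] + bidb * 128) / 128 * 128;
        std::printf("batch %d: token offset %3d -> padded row offset %3d\n",
                    bidb, offset_q, offset_padded);
    }
    // With these lengths this prints 0 -> 0, 70 -> 128, 325 -> 512.
    return 0;
}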
+ shared_storage.barrier_KV.arrive_and_expect_tx(TmaTransactionBytesK + TmaTransactionBytesV); + copy(params.tma_load_K.with(reinterpret_cast(shared_storage.barrier_KV), 0 /*mcast_mask*/), tKgK, tKsK); + copy(params.tma_load_V.with(reinterpret_cast(shared_storage.barrier_KV), 0 /*mcast_mask*/), tVgV, tVsV); + + pipeline_q.producer_acquire(smem_pipe_write); + copy(params.tma_load_Q.with(*pipeline_q.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tQgQ(_, m_block), tQsQ(_, smem_pipe_write.index())); + copy(params.tma_load_LSE.with(*pipeline_q.producer_get_barrier(smem_pipe_write), 0), tLSEgLSE(_, m_block), tLSEsLSE(_, smem_pipe_write.index())); + #pragma unroll 2 + for (; m_block < m_block_max - 1; ++m_block) { + pipeline_do.producer_acquire(smem_pipe_write); + copy(params.tma_load_dO.with(*pipeline_do.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tdOgdO(_, m_block), tdOsdO(_, smem_pipe_write.index())); + copy(params.tma_load_dPsum.with(*pipeline_do.producer_get_barrier(smem_pipe_write), 0), tLSEgdPsum(_, m_block), tLSEsdPsum(_, smem_pipe_write.index())); + ++smem_pipe_write; + pipeline_q.producer_acquire(smem_pipe_write); + copy(params.tma_load_Q.with(*pipeline_q.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tQgQ(_, m_block + 1), tQsQ(_, smem_pipe_write.index())); + copy(params.tma_load_LSE.with(*pipeline_q.producer_get_barrier(smem_pipe_write), 0), tLSEgLSE(_, m_block + 1), tLSEsLSE(_, smem_pipe_write.index())); + } + } + scheduler_prefetch(); + if (lane_predicate) { + pipeline_do.producer_acquire(smem_pipe_write); + copy(params.tma_load_dO.with(*pipeline_do.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tdOgdO(_, m_block), tdOsdO(_, smem_pipe_write.index())); + copy(params.tma_load_dPsum.with(*pipeline_do.producer_get_barrier(smem_pipe_write), 0), tLSEgdPsum(_, m_block), tLSEsdPsum(_, smem_pipe_write.index())); + ++smem_pipe_write; + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline_q, MainloopPipeline pipeline_do, + PipelineState& smem_pipe_write) { + // Need to copy since pipeline_q.producer_tail(smem_pipe_write) will increment smem_pipe_write + PipelineState smem_pipe_write_do = smem_pipe_write; + int lane_predicate = cute::elect_one_sync(); + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was still inverted from make_producer_start_state + */ + pipeline_q.producer_tail(smem_pipe_write); + pipeline_do.producer_tail(smem_pipe_write_do); + } + } + + template + CUTLASS_DEVICE void + store_dq(Params const& params, + SharedStorage &shared_storage, + cute::tuple block_coord + ) { + + Tensor sdQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dqacc.data()), SmemLayoutdQaccumTMA{}); + Tensor sdQnoswizzle = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dqacc.data()), SmemLayoutdQaccumTMANoSwizzle{}); + auto [n_block, bidh, bidb] = block_coord; + + bool const is_varlen_q = Varlen && params.cu_seqlens_q != nullptr; + // We reshaped dQaccum to have last dimension 32, so the offset needs to be multiplied by kHeadDim / 32 + int const offset_padded = !is_varlen_q ? 
0 : ((params.cu_seqlens_q[bidb] + bidb * 128) / 128 * 128) * (kHeadDim / ElemsPerRowTMA); + // Prepare the TMA loads + Tensor mdQaccum = params.tma_add_dQ.get_tma_tensor(params.shape_dQaccum)(_, _, bidh, !is_varlen_q ? bidb : 0); + Tensor gdQaccum = local_tile(domain_offset(make_coord(offset_padded, _0{}), mdQaccum), TileShape_dQaccum{}, make_coord(_, _0{})); // (M, K, _) + auto block_tma_dQ = params.tma_add_dQ.get_slice(_0{}); + Tensor tdQgdQ = block_tma_dQ.partition_D(gdQaccum); // (TMA, TMA_M, TMA_K) + Tensor tdQsdQ = block_tma_dQ.partition_S(sdQ); // (TMA, TMA_M, TMA_K) + + int m_block_max = cute::ceil_div(get_seqlen_q(params, bidb), get<0>(TileShape_MNK{})); + int m_block_min = get_m_block_min(params, n_block, bidb); + int m_block = m_block_min; + int const num_batch = params.num_batch; + int const num_head = get<2>(params.shape_Q); + int *lock_ptr = !Deterministic ? nullptr : params.dq_semaphore + bidb * num_head + bidh; + using Barrier = cutlass::GenericBarrier; + int lane_predicate = cute::elect_one_sync(); + #pragma unroll 2 + for (; m_block < m_block_max; ++m_block) { + if constexpr (Deterministic) { + Barrier::wait_eq(lock_ptr, threadIdx.x % cutlass::NumThreadsPerWarp, m_block * num_batch * num_head, n_block); + } + cutlass::arch::NamedBarrier::sync(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQFull) /*id*/); // sdQ full, to be written to gmem + if (lane_predicate) { + cute::copy(params.tma_add_dQ, tdQsdQ, tdQgdQ(_, _, _, m_block)); + tma_store_arrive(); + } + tma_store_wait<0>(); + if constexpr (Deterministic) { + Barrier::arrive_inc(lock_ptr, threadIdx.x % cutlass::NumThreadsPerWarp, m_block * num_batch * num_head); + } + cutlass::arch::NamedBarrier::arrive(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQEmpty) /*id*/); // sdQ empty, ready to be written to + } + } + + CUTLASS_DEVICE void + mma_init() { + // // Tell producer (warp 0) that smem_q is ready + // cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::QueryEmpty) /*id*/); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (cutlass::canonical_warp_group_idx() == 1 && warp_idx_in_warpgroup == 0) { + cutlass::arch::NamedBarrier::arrive(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQEmpty) /*id*/); // sdQ empty, ready to be written to + } + } + + template + CUTLASS_DEVICE void + mma(Params const& params, + MainloopPipeline pipeline_q, + MainloopPipeline pipeline_do, + PipelineState& smem_pipe_read, + FrgTensordKV& tdKrdK, + FrgTensordKV& tdVrdV, + int thread_idx, + int work_idx, + cute::tuple block_coord, + SharedStorage& shared_storage + ) { + static_assert(is_rmem::value, "dK and dV tensor must be rmem resident."); + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_q.data()), SmemLayoutQ{}); + Tensor sdO = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_do.data()), SmemLayoutdO{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_v.data()), SmemLayoutV{}); + Tensor sQt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_q.data()), SmemLayoutQt{}); + Tensor sdOt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_do.data()), SmemLayoutdOt{}); + Tensor sKt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_k.data()), SmemLayoutKt{}); + Tensor sdS = 
make_tensor(make_smem_ptr(shared_storage.mainloop.smem_ds.data()), SmemLayoutdS{}); + Tensor sdSt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_ds.data()), SmemLayoutdSt{}); + Tensor sdQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dqacc.data()), SmemLayoutdQaccum{}); + Tensor sLSEMma = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_lse.data()), SmemLayoutLSEMma{}); + Tensor sdPsumMma = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dpsum.data()), SmemLayoutLSEMma{}); + + static_assert(stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and + stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and + size<0>(typename TiledMmaSdP::ALayout{}) == cutlass::NumThreadsPerWarpGroup and + size<0>(typename TiledMmaSdP::BLayout{}) == cutlass::NumThreadsPerWarpGroup, + "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); + constexpr int MmaWarpGroups = NumMmaThreads / cutlass::NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(make_shape(Int{}), + make_stride(Int{})); + Layout warp_group_thread_layout_dq = make_layout(make_shape(Int{}), + make_stride(Int{})); + + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0); + TiledMmaSdP tiled_mma_SdP; + TiledMmadKV tiled_mma_dKV; + TiledMmadQ tiled_mma_dQ; + static_assert(!dKV_swapAB); + + auto wg_mma_SdP = tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx)); + auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx); + auto wg_mma_dKV = tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx)); + auto wg_mma_dQ = tiled_mma_dQ.get_slice(!Varlen ? warp_group_thread_layout_dq(NumdQWarpGroups == 2 ? warp_group_idx : 0) : thread_idx); + // auto wg_mma_dQ = tiled_mma_dQ.get_thread_slice(thread_idx); + + auto smem_tiled_copy_PdS = make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP); + auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx); + Tensor tdSsdS = smem_thr_copy_PdS.partition_D(sdSt); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + R2STiledCopydQaccum r2s_tiled_copy_dQaccum; + // auto r2s_thr_copy_dQaccum = r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx); + auto r2s_thr_copy_dQaccum = r2s_tiled_copy_dQaccum.get_thread_slice(NumdQWarpGroups == 2 ? thread_idx : thread_idx % cutlass::NumThreadsPerWarpGroup); + Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ); + + // Allocate "fragments/descriptors" + Tensor tSrQ = wg_mma_SdP.partition_fragment_B(sQ); + Tensor tSrK = wg_mma_SdP.partition_fragment_A(sK); + Tensor tdPrdO = wg_mma_SdP.partition_fragment_B(sdO); + Tensor tdPrV = wg_mma_SdP.partition_fragment_A(sV); + Tensor tdVrdO = wg_mma_dKV.partition_fragment_B(sdOt); + Tensor tdKrQ = wg_mma_dKV.partition_fragment_B(sQt); + + int n_block = get<0>(block_coord); + int bidh = get<1>(block_coord); + int bidb = get<2>(block_coord); + int const seqlen_q = get_seqlen_q(params, bidb); + int const seqlen_k = get_seqlen_k(params, bidb); + + int m_block_max = cute::ceil_div(get_seqlen_q(params, bidb), get<0>(TileShape_MNK{})); + int m_block_min = get_m_block_min(params, n_block, bidb); + int m_block = m_block_min; + + // thread_mma_SdP.partition_C(sLSEMma) has shape ((2, 2, V), MMA_M, MMA_N, PIPE), we only take the row indices. 
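// [Illustrative aside, not part of this patch.] In the causal backward pass a K/V block only
// receives gradient from query rows at or after its diagonal, which is why the m_block loop
// starts at max(0, (n_block * kBlockN + seqlen_q - seqlen_k) / kBlockM) (get_m_block_min above)
// rather than at 0. A host-side sketch of those loop bounds (tile sizes hypothetical):
#include <algorithm>
#include <cstdio>

int main() {
    constexpr int kBlockM = 64, kBlockN = 128;       // hypothetical tile sizes
    const int seqlen_q = 512, seqlen_k = 512;
    const int m_block_max = (seqlen_q + kBlockM - 1) / kBlockM;
    const int n_blocks    = (seqlen_k + kBlockN - 1) / kBlockN;
    for (int n_block = 0; n_block < n_blocks; ++n_block) {
        int m_block_min = std::max(0, (n_block * kBlockN + seqlen_q - seqlen_k) / kBlockM);
        std::printf("n_block %d: visits m_blocks [%d, %d)\n", n_block, m_block_min, m_block_max);
    }
    // With equal sequence lengths: n_block 0 -> [0, 8), 1 -> [2, 8), 2 -> [4, 8), 3 -> [6, 8).
    return 0;
}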
+ Tensor tLSEsLSE = thread_mma_SdP.partition_C(sLSEMma)(make_coord(_, _0{}, _), _0{}, _0{}, _); // (2, V, PIPE) + Tensor tLSEsdPsum = thread_mma_SdP.partition_C(sdPsumMma)(make_coord(_, _0{}, _), _0{}, _0{}, _); + + + clear(tdKrdK); + clear(tdVrdV); + // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero; + + cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_KV.try_wait(work_idx % 2)); + if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_KV.wait(work_idx % 2); } + + auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) { + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + }; + + auto compute_dQ = [&]() { + static_assert(!Mma_dQ_is_RS); + // SMEM fence to make sure sP is written before it's read by WGMMA + cutlass::arch::fence_view_async_shared(); + cutlass::arch::NamedBarrier::sync(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQEmpty) /*id*/); // sdQ empty, ready to be written to + Tensor tdQrdQ = partition_fragment_C(tiled_mma_dQ, select(TileShape_MNK{})); + if constexpr (!dQ_swapAB) { + Tensor tdQrdS = wg_mma_dQ.partition_fragment_A(sdS); + Tensor tdQrK = wg_mma_dQ.partition_fragment_B(sKt); + flash::gemm(tiled_mma_dQ, tdQrdS(_, _, _, smem_pipe_read.index()), tdQrK, tdQrdQ); + } else { + Tensor tdQrdS = wg_mma_dQ.partition_fragment_B(sdS); + Tensor tdQrK = wg_mma_dQ.partition_fragment_A(sKt); + flash::gemm(tiled_mma_dQ, tdQrK, tdQrdS(_, _, _, smem_pipe_read.index()), tdQrdQ); + } + pipeline_q.consumer_release(smem_pipe_read); // release Q + warpgroup_wait<0>(); + Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ); // ((Atom,AtomNum), MMA_M, MMA_N) + cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum); + cutlass::arch::fence_view_async_shared(); + cutlass::arch::NamedBarrier::arrive(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQFull) /*id*/); // sdQ full, to be written to gmem + }; + + // We have separate iterations with causal masking. Not necessary for hdim 128 but for hdim 64 + // this helps quite a bit to not have to do causal masking for most of the iterations. 
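// [Illustrative aside, not part of this patch.] Only the first ceil_div(kBlockN, kBlockM) + 1
// query blocks per key block can intersect the causal diagonal; every later m_block lies
// entirely below it, so the main loop below can drop the per-element mask. A host-side check
// of that count for equal query/key lengths (tile sizes hypothetical):
#include <cstdio>

int main() {
    constexpr int kBlockM = 64, kBlockN = 128;
    constexpr int n_masking_steps = (kBlockN + kBlockM - 1) / kBlockM + 1;   // 3 here
    const int n_block = 3;
    const int m_block_min = n_block * kBlockN / kBlockM;                     // 6 here
    for (int m_block = m_block_min; m_block < m_block_min + n_masking_steps + 2; ++m_block) {
        // A block contains masked entries iff its first query row is above the
        // block's last key index, i.e. query_global < key_global somewhere in the tile.
        bool has_masked = m_block * kBlockM < n_block * kBlockN + kBlockN - 1;
        std::printf("m_block %d: masked entries? %d (counted as masking step? %d)\n",
                    m_block, int(has_masked), int(m_block - m_block_min < n_masking_steps));
    }
    // Blocks past the first n_masking_steps never report masked entries.
    return 0;
}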
+ if constexpr (Is_causal) { + static constexpr int n_masking_steps = cute::ceil_div(kBlockN, kBlockM) + 1; + CUTLASS_PRAGMA_NO_UNROLL + for (; m_block < std::min(m_block_max, m_block_min + n_masking_steps); ++m_block) { + Tensor tSrS = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_q.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tSrK, tSrQ(_, _, _, smem_pipe_read.index()), tSrS); + Tensor tLSErLSE = make_fragment_like(tLSEsLSE(_, _, _0{})); + cute::copy(tLSEsLSE(_, _, smem_pipe_read.index()), tLSErLSE); + + Tensor tdPrdP = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_do.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tdPrV, tdPrdO(_, _, _, smem_pipe_read.index()), tdPrdP); + warpgroup_wait<1>(); + Tensor cS = cute::make_identity_tensor(select<1, 0>(TileShape_MNK{})); + Tensor taccScS = thread_mma_SdP.partition_C(cS); + int causal_row_offset = 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM; + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<0>(taccScS(i))) >= std::min(int(get<1>(taccScS(i))) + causal_row_offset, + seqlen_k - n_block * kBlockN)) { + tSrS(i) = -INFINITY; + } + } + // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor scores = make_tensor(tSrS.data(), flash::convert_layout_acc_transposed_rowcol(tSrS.layout())); + flash::scale_apply_exp2(scores, group_modes<0, 2>(tLSErLSE), params.softmax_scale_log2); + + Tensor tLSErdPsum = make_fragment_like(tLSEsdPsum(_, _, _0{})); + cute::copy(tLSEsdPsum(_, _, smem_pipe_read.index()), tLSErdPsum); + + // Convert scores from fp32 to fp16/bf16 + Tensor rP = flash::convert_type(tSrS); + + warpgroup_wait<0>(); + // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor dS = make_tensor(tdPrdP.data(), scores.layout()); + for (int mi = 0; mi < size<0>(dS); ++mi) { + #pragma unroll + for (int ni = 0; ni < size<1>(dS); ++ni) { dS(mi, ni) = scores(mi, ni) * (dS(mi, ni) - tLSErdPsum(mi)); } + } + Tensor rdS = flash::convert_type(tdPrdP); + + // Because of double buffering on dS, we don't need to sync here. + // Otherwise we might have WG1 writing to dS before WG2 is done reading from it during MmadQ. + // But because both WGs have to sync at the end of the loop and double buffering, this race condition + // is not possible. 
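// [Illustrative aside, not part of this patch.] The update dS(mi, ni) = scores(mi, ni) *
// (dS(mi, ni) - tLSErdPsum(mi)) above is the row-wise softmax backward: with P = softmax(S)
// and dP = dO * V^T, the gradient is dS = P o (dP - rowsum(P o dP)), and rowsum(P o dP)
// equals rowsum(dO o O), which is the quantity the precomputed dPsum holds. A tiny numerical
// check of that identity for one row (values hypothetical):
#include <cmath>
#include <cstdio>

int main() {
    const int N = 4;
    double S[N]  = {0.3, -1.2, 0.7, 0.1};
    double dP[N] = {0.5, 0.2, -0.4, 0.9};

    double P[N], denom = 0.0;
    for (int i = 0; i < N; ++i) denom += std::exp(S[i]);
    for (int i = 0; i < N; ++i) P[i] = std::exp(S[i]) / denom;

    double dsum = 0.0;                             // rowsum(P o dP), i.e. the dPsum term
    for (int i = 0; i < N; ++i) dsum += P[i] * dP[i];

    // Closed form used by the kernel vs. the full softmax Jacobian-vector product.
    for (int i = 0; i < N; ++i) {
        double dS_kernel = P[i] * (dP[i] - dsum);
        double dS_jvp = 0.0;
        for (int j = 0; j < N; ++j)
            dS_jvp += (double(i == j) * P[i] - P[i] * P[j]) * dP[j];
        std::printf("i=%d: kernel %.6f  jacobian %.6f\n", i, dS_kernel, dS_jvp);
    }
    return 0;
}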
+ Tensor tdSadS = smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N) + cute::copy(smem_tiled_copy_PdS, tdSadS, tdSsdS(_, _, _, smem_pipe_read.index())); + + Tensor tdVrP = make_tensor(rP.data(), convert_layout_acc_Aregs(tSrS.layout())); + flash::gemm(tiled_mma_dKV, tdVrP, tdVrdO(_, _, _, smem_pipe_read.index()), tdVrdV); + + Tensor tdKrdS = make_tensor(rdS.data(), convert_layout_acc_Aregs(tdPrdP.layout())); + flash::gemm(tiled_mma_dKV, tdKrdS, tdKrQ(_, _, _, smem_pipe_read.index()), tdKrdK); + pipeline_do.consumer_release(smem_pipe_read); // release dO + + compute_dQ(); + ++smem_pipe_read; + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (; m_block < m_block_max; ++m_block) { + Tensor tSrS = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_q.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tSrK, tSrQ(_, _, _, smem_pipe_read.index()), tSrS); + Tensor tLSErLSE = make_fragment_like(tLSEsLSE(_, _, _0{})); + cute::copy(tLSEsLSE(_, _, smem_pipe_read.index()), tLSErLSE); + + Tensor tdPrdP = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_do.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tdPrV, tdPrdO(_, _, _, smem_pipe_read.index()), tdPrdP); + warpgroup_wait<1>(); + Tensor cS = cute::make_identity_tensor(select<1, 0>(TileShape_MNK{})); + Tensor taccScS = thread_mma_SdP.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<0>(taccScS(i))) >= int(seqlen_k - n_block * kBlockN)) { tSrS(i) = -INFINITY; } + } + // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor scores = make_tensor(tSrS.data(), flash::convert_layout_acc_transposed_rowcol(tSrS.layout())); + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tLSErLSE); } + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(scores); } + flash::scale_apply_exp2(scores, group_modes<0, 2>(tLSErLSE), params.softmax_scale_log2); + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(scores); } + + Tensor tLSErdPsum = make_fragment_like(tLSEsdPsum(_, _, _0{})); + cute::copy(tLSEsdPsum(_, _, smem_pipe_read.index()), tLSErdPsum); + + // Convert scores from fp32 to fp16/bf16 + Tensor rP = flash::convert_type(tSrS); + + warpgroup_wait<0>(); + // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor dS = make_tensor(tdPrdP.data(), scores.layout()); + #pragma unroll + for (int mi = 0; mi < size<0>(dS); ++mi) { + #pragma unroll + for (int ni = 0; ni < size<1>(dS); ++ni) { dS(mi, ni) = scores(mi, ni) * (dS(mi, ni) - tLSErdPsum(mi)); } + } + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(dS); } + Tensor rdS = flash::convert_type(tdPrdP); + + Tensor tdSadS = smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N) + cute::copy(smem_tiled_copy_PdS, tdSadS, tdSsdS(_, _, _, smem_pipe_read.index())); + + Tensor tdVrP = make_tensor(rP.data(), convert_layout_acc_Aregs(tSrS.layout())); + flash::gemm(tiled_mma_dKV, tdVrP, tdVrdO(_, _, _, smem_pipe_read.index()), tdVrdV); + + Tensor tdKrdS = make_tensor(rdS.data(), convert_layout_acc_Aregs(tdPrdP.layout())); + flash::gemm(tiled_mma_dKV, tdKrdS, tdKrQ(_, _, _, smem_pipe_read.index()), tdKrdK); + pipeline_do.consumer_release(smem_pipe_read); // release dO + + compute_dQ(); + ++smem_pipe_read; + } + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); } + #pragma unroll + for (int i = 0; i < size(tdKrdK); ++i) { tdKrdK(i) *= params.softmax_scale; } + 
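// [Illustrative aside, not part of this patch.] scale_apply_exp2 works in base 2:
// exp(s * scale - lse) == exp2(s * (scale * log2(e)) - lse * log2(e)). That is why the params
// carry softmax_scale_log2 = softmax_scale * M_LOG2E and why the LSE pointer is named
// ptr_LSE_log2 (already stored in base-2 units); exp2 maps straight onto the GPU's EX2
// instruction, and the plain softmax_scale is then applied to dK only once, in the loop-exit
// multiply above. A quick numerical check of the identity (values hypothetical):
#include <cmath>
#include <cstdio>

int main() {
    const double log2e = 1.4426950408889634;       // log2(e), i.e. M_LOG2E
    const double s = 1.7, softmax_scale = 0.125, lse = 2.3;
    const double p_exp  = std::exp(s * softmax_scale - lse);
    const double p_exp2 = std::exp2(s * softmax_scale * log2e - lse * log2e);
    std::printf("exp form: %.12f   exp2 form: %.12f\n", p_exp, p_exp2);   // identical up to rounding
    return 0;
}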
} + +}; + +} // namespace flash + diff --git a/mainloop_fwd_sm90_tma_gmma_ws.hpp b/mainloop_fwd_sm90_tma_gmma_ws.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7c83f72a5aa38cfa00732b6f0b93ea65485ed5cd --- /dev/null +++ b/mainloop_fwd_sm90_tma_gmma_ws.hpp @@ -0,0 +1,1025 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "utils.h" + +namespace flash { + +using namespace cute; + +// 4 warps +struct SmemTransposeFp8_64x64 { + + using Element = cutlass::float_e4m3_t; + + using ldsm_thread_shape = Shape<_4, _1, _8, _4>; + using ldsm_value_shape = Shape<_2, _8, _2, _1>; + using ldsm_value_stride = Stride<_2, _4, _1, _0>; + using TiledCopyLDSM = decltype(make_tiled_copy( + Copy_Atom{}, Layout{}, + Layout{})); + TiledCopyLDSM tiled_copy_ldsm; + + using stsm_thread_shape = Shape<_4, _1, _8, _4>; + // using stsm_thread_stride = Stride<_1, _0, _4, _32>; +#ifndef NO_FP8_COLUMN_PERMUTE + using stsm_value_shape = Shape<_4, _4, _1, _2>; + using stsm_value_stride = Stride<_1, _8, _0, _4>; +#else + using stsm_value_shape = Shape<_4, _4, _2, _1>; + using stsm_value_stride = Stride<_1, _8, _4, _0>; +#endif + + using TiledCopySTSM = + decltype(make_tiled_copy(Copy_Atom{}, + Layout{}, + Layout{})); + TiledCopySTSM tiled_copy_stsm; + + template + CUTLASS_DEVICE void operator()(SmemTensor &&s_in, SmemTensorOut &&s_out) { + using namespace cute; + + auto tid = threadIdx.x; + auto thr_copy_ldsm = tiled_copy_ldsm.get_thread_slice(tid); + auto thr_copy_stsm = tiled_copy_stsm.get_thread_slice(tid); + + auto tXsX = thr_copy_ldsm.partition_S(s_in); + auto tXrX = make_tensor(shape(tXsX)); + auto tXsX_out = thr_copy_stsm.partition_D(s_out); + + cute::copy(tiled_copy_ldsm, tXsX, tXrX); + + auto data = tXrX.data(); + // size(tXrX) == 32 + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size(tXrX); n += 8) { + uint32_t *data_32bit = reinterpret_cast(&data[n]); + auto upper = data_32bit[0]; + auto lower = data_32bit[1]; + data_32bit[0] = __byte_perm(upper, lower, 0x6420); + data_32bit[1] = __byte_perm(upper, lower, 0x7531); + } + + cute::copy(tiled_copy_stsm, tXrX, tXsX_out); + } +}; + +template +struct CollectiveMainloopFwd { + + using Element = typename Ktraits::Element; + using TileShape_MNK = typename Ktraits::TileShape_MNK; + using ClusterShape = typename Ktraits::ClusterShape_MNK; + + static constexpr int kStages = Ktraits::kStages; + static constexpr int kHeadDim = Ktraits::kHeadDim; + + using GmemTiledCopyQ = cute::SM90_TMA_LOAD; + using GmemTiledCopyKV = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape{}))); + + using SmemLayoutQ = typename Ktraits::SmemLayoutQ; + using SmemLayoutK = typename Ktraits::SmemLayoutK; + using SmemLayoutV = typename Ktraits::SmemLayoutV; + using SmemLayoutVt = typename Ktraits::SmemLayoutVt; + + using TMA_Q = decltype(make_tma_copy( + GmemTiledCopyQ{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), + typename Seqlen_traits::StrideT{} + ), + SmemLayoutQ{}, + select<0, 2>(TileShape_MNK{}), + _1{})); // no 
mcast for Q + + using TMA_K = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), + typename Seqlen_traits::StrideT{} + ), + take<0, 2>(SmemLayoutK{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any + + // TMA_V may differ from TMA_K for fp8 kernel (e.g. swizzling mode) + using TMA_V = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), + typename Seqlen_traits::StrideT{} + ), + take<0, 2>(SmemLayoutV{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any + + static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); + using MainloopPipeline = typename Ktraits::MainloopPipeline; + using MainloopPipelineNoTMA = typename Ktraits::MainloopPipelineNoTMA; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + // Set the bytes transferred in this TMA transaction (may involve multiple issues) + static constexpr uint32_t TmaTransactionBytesQ = static_cast(size(SmemLayoutQ{}) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesK = static_cast(size(take<0, 2>(SmemLayoutK{})) * cutlass::sizeof_bits_v / 8); + + // static constexpr bool UseSchedulerBarrier = kHeadDim <= 128; + static constexpr bool UseSchedulerBarrier = + cutlass::sizeof_bits_v == 8 ? kHeadDim >= 128 + : kHeadDim <= 128; + + // Host side kernel arguments + struct Arguments { + Element const* ptr_Q; + typename Seqlen_traits::LayoutT layout_Q; + Element const* ptr_K; + typename Seqlen_traits::LayoutT layout_K; + Element const* ptr_V; + typename Seqlen_traits::LayoutT layout_V; + float const softmax_scale_log2; + }; + + // Device side kernel params + struct Params { + typename Seqlen_traits::LayoutT layout_Q; + typename Seqlen_traits::LayoutT layout_K; + typename Seqlen_traits::LayoutT layout_V; + cutlass::FastDivmod qhead_per_khead_divmod; + TMA_Q tma_load_Q; + TMA_K tma_load_K; + TMA_V tma_load_V; + float const softmax_scale_log2; + }; + + + static Params + to_underlying_arguments(Arguments const& args) { + Tensor mQ = make_tensor(make_gmem_ptr(args.ptr_Q), args.layout_Q); + TMA_Q tma_load_Q = make_tma_copy( + GmemTiledCopyQ{}, + mQ, + SmemLayoutQ{}, + select<0, 2>(TileShape_MNK{}), + _1{}); // no mcast for Q + Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.layout_K); + TMA_K tma_load_K = make_tma_copy( + GmemTiledCopyKV{}, + mK, + SmemLayoutK{}(_, _, _0{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{})); // mcast along M mode for this N load, if any + Tensor mV = make_tensor(make_gmem_ptr(args.ptr_V), args.layout_V); + TMA_V tma_load_V = make_tma_copy( + GmemTiledCopyKV{}, + mV, + SmemLayoutV{}(_, _, _0{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{})); // mcast along M mode for this N load, if any + return {args.layout_Q, args.layout_K, args.layout_V, + cutlass::FastDivmod(cute::ceil_div(get<2>(args.layout_Q.shape()), get<2>(args.layout_K.shape()))), + tma_load_Q, tma_load_K, tma_load_V, + args.softmax_scale_log2}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) { + 
cute::prefetch_tma_descriptor(mainloop_params.tma_load_Q.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_K.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_V.get_tma_descriptor()); + } + + CUTLASS_DEVICE + int get_n_block_max( + Params const& mainloop_params, int m_block, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + int const seqlen_q = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_q.actual_seq_len : shape<0>(mainloop_params.layout_Q); + int const seqlen_k = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_k.actual_seq_len : shape<0>(mainloop_params.layout_K); + int n_block_max = cute::ceil_div(seqlen_k, kBlockN); + if constexpr (Is_causal) { + n_block_max = std::min(n_block_max, + cute::ceil_div((m_block + 1) * kBlockM + seqlen_k - seqlen_q, kBlockN)); + } + return n_block_max; + } + + template + CUTLASS_DEVICE void + load(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipeline pipeline_v, + PipelineState& smem_pipe_write_k, + PipelineState& smem_pipe_write_v, + SharedStorage &shared_storage, + Scheduler& scheduler, + typename Scheduler::Params const& scheduler_params, + typename Scheduler::WorkTileInfo& work_tile_info, + cute::tuple block_coord, + int work_idx, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{}); + + Tensor mQ = mainloop_params.tma_load_Q.get_tma_tensor(mainloop_params.layout_Q.shape()); + Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape()); + Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape()); + + auto [m_block, bidh, bidb] = block_coord; + int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); + + // Prepare the TMA loads + uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + Tensor gQ = seqlen_traits_q.get_local_tile_tensor( + mQ, select<0, 2>(TileShape_MNK{}), bidh, bidb)(_, _, m_block); // (M, K) + Tensor gK = seqlen_traits_k.get_local_tile_tensor( + mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + Tensor gV = seqlen_traits_k.get_local_tile_tensor( + mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + + Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{})); + Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{})); + auto [tQgQ, tQsQ] = tma_partition(mainloop_params.tma_load_Q, _0{}, Layout<_1>{}, + group_modes<0, 2>(sQ_x), group_modes<0, 2>(gQ_x)); // (TMA), (TMA) + auto [tKgK, tKsK] = tma_partition(mainloop_params.tma_load_K, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sK), group_modes<0, 2>(gK)); // (TMA, k), (TMA, PIPE) + auto [tVgV, tVsV] = tma_partition(mainloop_params.tma_load_V, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sV), group_modes<0, 2>(gV)); // (TMA, k), (TMA, PIPE) + + uint16_t mcast_mask_kv = 0; + if constexpr (cute::is_same_v) { + auto block_layout = 
Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_kv |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, _0{})); + } + } + + int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + int n_block = n_block_max - 1; + + int lane_predicate = cute::elect_one_sync(); + if (lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write_k); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write_k), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write_k.index())); + ++smem_pipe_write_k; + } + + // Wait for the MMA warpgroups to say that smem_q is ready + cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + + if (lane_predicate) { + shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); + copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + } + + // Wait for warp 1 to signal that smem_v are ready and V can be copied from gmem + // Need ClusterBarrier, not just NamedBarrier. Otherwise we might have CTA 0 finishing the + // TMA store on O first, call TMA multicast load on V, before CTA 1 can finishing TMA store on O. + shared_storage.barrier_O.wait((work_idx + 1) % 2); + + if (lane_predicate) { + // CUTLASS_PRAGMA_NO_UNROLL + #pragma unroll 2 + for (; n_block > 0; --n_block) { + pipeline_k.producer_acquire(smem_pipe_write_k); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write_k), mcast_mask_kv), + tKgK(_, n_block - 1), tKsK(_, smem_pipe_write_k.index())); + ++smem_pipe_write_k; + pipeline_v.producer_acquire(smem_pipe_write_v); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write_v), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write_v.index())); + ++smem_pipe_write_v; + } + } + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + if (lane_predicate) { + pipeline_v.producer_acquire(smem_pipe_write_v); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write_v), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write_v.index())); + ++smem_pipe_write_v; + } + scheduler.broadcast_next_work(work_tile_info); + } + + template + CUTLASS_DEVICE void + load_fp8(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipeline pipeline_v, + MainloopPipelineNoTMA pipeline_vt, + PipelineState& smem_pipe_write, + PipelineState& smem_pipe_read, + SharedStorage &shared_storage, + Scheduler& scheduler, + typename Scheduler::Params const& scheduler_params, + typename Scheduler::WorkTileInfo& work_tile_info, + cute::tuple block_coord, + int work_idx, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + + using SmemLayoutTransposeV = typename Ktraits::SmemLayoutTransposeV; + using SmemLayoutTransposeVt = typename Ktraits::SmemLayoutTransposeVt; + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{}); + + Tensor sV_divide = as_position_independent_swizzle_tensor(make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutTransposeV{})); + Tensor sVt_divide = 
as_position_independent_swizzle_tensor(make_tensor(make_smem_ptr(shared_storage.smem_v_out.data()), SmemLayoutTransposeVt{})); + + auto smem_transpose_V = SmemTransposeFp8_64x64(); + auto do_transpose_V = [&](int stage) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < shape<2>(SmemLayoutTransposeV{}); ++j) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < shape<1>(SmemLayoutTransposeV{}); ++i) { + smem_transpose_V(flatten(sV_divide(_, i, j, stage)), + flatten(sVt_divide(_, i, j, stage))); + } + } + }; + + Tensor mQ = mainloop_params.tma_load_Q.get_tma_tensor(mainloop_params.layout_Q.shape()); + Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape()); + Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape()); + + auto [m_block, bidh, bidb] = block_coord; + int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); + + // Prepare the TMA loads + uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + Tensor gQ = seqlen_traits_q.get_local_tile_tensor( + mQ, select<0, 2>(TileShape_MNK{}), bidh, bidb)(_, _, m_block); // (M, K) + Tensor gK = seqlen_traits_k.get_local_tile_tensor( + mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + Tensor gV = seqlen_traits_k.get_local_tile_tensor( + mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + + Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{})); + Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{})); + auto [tQgQ, tQsQ] = tma_partition(mainloop_params.tma_load_Q, _0{}, Layout<_1>{}, + group_modes<0, 2>(sQ_x), group_modes<0, 2>(gQ_x)); // (TMA), (TMA) + auto [tKgK, tKsK] = tma_partition(mainloop_params.tma_load_K, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sK), group_modes<0, 2>(gK)); // (TMA, k), (TMA, PIPE) + auto [tVgV, tVsV] = tma_partition(mainloop_params.tma_load_V, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sV), group_modes<0, 2>(gV)); // (TMA, k), (TMA, PIPE) + + uint16_t mcast_mask_kv = 0; + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_kv |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, _0{})); + } + } + + int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + int n_block = n_block_max - 1; + + int lane_predicate = cute::elect_one_sync(); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); + } + + // Wait for the MMA warpgroups to say that smem_q is ready + // for fp8, change from NumThreadsPerWarp to NumThreadsPerWarpGroup + cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarpGroup, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + + if constexpr(Is_causal) { + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); + copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + 
pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + + shared_storage.barrier_O.wait((work_idx + 1) % 2); + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kStages && n_block > 0; ++iter, --n_block) { + pipeline_v.consumer_wait(smem_pipe_read); + // pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block-1), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block-1), tVsV(_, smem_pipe_write.index())); + } + } + + #pragma unroll 2 + for (; n_block > 0; --n_block) { + pipeline_v.consumer_wait(smem_pipe_read); + pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block-1), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block-1), tVsV(_, smem_pipe_write.index())); + } + } + + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + + pipeline_v.consumer_wait(smem_pipe_read); + if (n_block_max > kStages) + pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + } else { + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); + copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + // With fp8 kernel, smem_o is in union with smem_v_out, + // so could use NamedBarrier instead of ClusterBarrier. + // But, this doesn't appear to have any benefit. 
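// barrier_O is reused across work tiles, so the phase waited on alternates with work_idx:
// the producer waits on phase (work_idx + 1) % 2 before overwriting V's shared memory, and
// the MMA/epilogue side arrives on the matching phase only after tma_store_wait<0>() has
// confirmed that the O tile has drained out of smem (see the arrive loop in mma()/mma_fp8()).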
+ shared_storage.barrier_O.wait((work_idx + 1) % 2); + + pipeline_v.consumer_wait(smem_pipe_read); + // pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + --n_block; + + constexpr int extra_iterations = kStages - 1; + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter) { + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + + pipeline_v.consumer_wait(smem_pipe_read); + // pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + --n_block; + } + + // CUTLASS_PRAGMA_NO_UNROLL + #pragma unroll 2 + for (; n_block >= 0; --n_block) { + + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + + pipeline_v.consumer_wait(smem_pipe_read); + pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + } + // scheduler.prefetch_next_work(scheduler_params, work_tile_info); + // scheduler.broadcast_next_work(work_tile_info); + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline_k, MainloopPipeline pipeline_v, + PipelineState& smem_pipe_write_k, PipelineState& smem_pipe_write_v) { + int lane_predicate = cute::elect_one_sync(); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + // Issue the epilogue waits + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was still inverted from make_producer_start_state + */ + pipeline_k.producer_tail(smem_pipe_write_k); + pipeline_v.producer_tail(smem_pipe_write_v); + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail_one_write(MainloopPipeline pipeline_k, MainloopPipeline pipeline_v, + PipelineState& smem_pipe_write) { + int lane_predicate = cute::elect_one_sync(); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + // Issue the epilogue waits + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for 
all stages to either be released (all Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was still inverted from make_producer_start_state + */ + pipeline_k.producer_tail(smem_pipe_write); + pipeline_v.producer_tail(smem_pipe_write); + } + } + + CUTLASS_DEVICE void + warp_scheduler_barrier_sync() { + if constexpr (UseSchedulerBarrier) { + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + cutlass::canonical_warp_group_idx() /*id*/); + } + } + + CUTLASS_DEVICE void + warp_scheduler_barrier_arrive() { + if constexpr (!UseSchedulerBarrier) { return; } + static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup); + if constexpr (NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (3 - cutlass::canonical_warp_group_idx()) /*id*/); + } else { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 2 ? cutlass::canonical_warp_group_idx() + 1 : cutlass::canonical_warp_group_idx() + 1 - 3) /*id*/); + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 1 ? cutlass::canonical_warp_group_idx() + 2 : cutlass::canonical_warp_group_idx() + 2 - 3) /*id*/); + } + } + + CUTLASS_DEVICE void + mma_init() { + // Tell producer (warp 0) that smem_q is ready + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + Ktraits::NumProducerThreads, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + if constexpr (!UseSchedulerBarrier) { return; } + static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup); + if (cutlass::canonical_warp_group_idx() > 1) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 1 /*id*/); + } + if constexpr (NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup) { + if (cutlass::canonical_warp_group_idx() > 2) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 2 /*id*/); + } + } + + } + + template + CUTLASS_DEVICE void + mma(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipeline pipeline_v, + PipelineState& smem_pipe_read_k, + PipelineState& smem_pipe_read_v, + FrgTensorO& tOrO, + Softmax& softmax, + int n_block_count, + int thread_idx, + int work_idx, + int m_block, + SharedStorage& shared_storage, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + static_assert(is_rmem::value, "O tensor must be rmem resident."); + + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sVt = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutVt{}); + + typename Ktraits::TiledMma0 tiled_mma0; + typename Ktraits::TiledMma1 tiled_mma1; + auto threadMma0 = tiled_mma0.get_thread_slice(thread_idx); + auto threadMma1 = tiled_mma1.get_thread_slice(thread_idx); + + // Allocate "fragments/descriptors" for first matmul. 
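// (On Hopper these "fragments" are typically GMMA descriptors into shared memory rather than
// register copies: partition_fragment_A/B builds per-thread views that the warpgroup MMA
// reads directly from smem.)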
+ Tensor tSrQ = threadMma0.partition_fragment_A(sQ); + Tensor tSrK = threadMma0.partition_fragment_B(sK); + // Allocate "fragments/descriptors" for second matmul. + // Note: S becomes P. + Tensor tOrV = threadMma1.partition_fragment_B(sVt); + + auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) { + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + }; + + tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero; + int const seqlen_q = seqlen_traits_q.actual_seq_len; + int const seqlen_k = seqlen_traits_k.actual_seq_len; + int n_block = n_block_count - 1; + + cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_Q.try_wait(work_idx % 2)); + if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_Q.wait(work_idx % 2); } + + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read_k); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); + warp_scheduler_barrier_arrive(); + + if (work_idx != 0) { + int lane_predicate = cute::elect_one_sync(); + if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { + tma_store_wait<0>(); + #pragma unroll + for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { + shared_storage.barrier_O.arrive(cta_id, lane_predicate); + } + } + } + warpgroup_wait<0>(); + pipeline_k.consumer_release(smem_pipe_read_k); + ++smem_pipe_read_k; + + auto col_limit_causal = [&](int row, int n_block) { + return row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM; + }; + { + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if constexpr (!Is_causal) { // Just masking based on col + if (int(get<1>(tScS(i))) >= int(seqlen_k - n_block * kBlockN)) { tSrS(i) = -INFINITY; } + } else { // mask based on both row and col + // using std::min is faster than doing col >= limit0 or col >= limit1 + // Need to cast get<1>(tScS(i)) to (signed) int since by default it's unsigned, and the + // right hand side can be negative and might be converted to a very large unsigned integer. + if (int(get<1>(tScS(i))) >= std::min(seqlen_k - n_block * kBlockN, + col_limit_causal(int(get<0>(tScS(i))), n_block))) { + tSrS(i) = -INFINITY; + } + } + } + } + + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs(tSrS.layout())); + Tensor scores_scale = make_fragment_like(softmax.row_max); + clear(scores_scale); + + constexpr int n_masking_steps = !Is_causal ? 
1 : cute::ceil_div(kBlockM, kBlockN) + 1; + // Only go through these if Is_causal, since n_masking_steps = 1 when !Is_causal + #pragma unroll + for (int masking_step = 0; masking_step < n_masking_steps - 1 && n_block > 0; ++masking_step, --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read_k); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); + if (masking_step > 0) { softmax.rescale_o(tOrO, scores_scale); } + consumer_wait(pipeline_v, smem_pipe_read_v); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); + warp_scheduler_barrier_arrive(); + warpgroup_wait<1>(); + pipeline_k.consumer_release(smem_pipe_read_k); // release K + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<1>(tScS(i))) >= col_limit_causal(int(get<0>(tScS(i))), n_block - 1)) { + tSrS(i) = -INFINITY; + } + } + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + warpgroup_wait<0>(); + pipeline_v.consumer_release(smem_pipe_read_v); // release V + ++smem_pipe_read_k; + ++smem_pipe_read_v; + cute::copy(make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs(tSrS.layout())), tOrP); + } + + #pragma unroll 1 + for (; n_block > 0; --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read_k); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); + softmax.rescale_o(tOrO, scores_scale); + consumer_wait(pipeline_v, smem_pipe_read_v); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); + warp_scheduler_barrier_arrive(); + warpgroup_wait<1>(); + pipeline_k.consumer_release(smem_pipe_read_k); // release K + // auto scores_scale = softmax.template max(tSrS); + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + warpgroup_wait<0>(); + pipeline_v.consumer_release(smem_pipe_read_v); // release V + ++smem_pipe_read_k; + ++smem_pipe_read_v; + // softmax.rescale_o(tOrO, scores_scale); + cute::copy(make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs(tSrS.layout())), tOrP); + } + // Tell warp 0 that smem_q is ready + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + softmax.rescale_o(tOrO, scores_scale); + consumer_wait(pipeline_v, smem_pipe_read_v); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); + cute::copy(softmax.template finalize(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + warpgroup_wait<0>(); + pipeline_v.consumer_release(smem_pipe_read_v); // release V, otherwise producers will hang + ++smem_pipe_read_v; + + softmax.rescale_o(tOrO, scores_scale); + return; + } + + template + CUTLASS_DEVICE void + mma_fp8(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipelineNoTMA pipeline_vt, + PipelineState& smem_pipe_read, + PipelineState& smem_pipe_release, + FrgTensorO& tOrO, + Softmax& softmax, + int n_block_count, + int thread_idx, + int work_idx, + int m_block, + 
SharedStorage& shared_storage, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + static_assert(is_rmem::value, "O tensor must be rmem resident."); + + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sVt = make_tensor(make_smem_ptr(shared_storage.smem_v_out.data()), SmemLayoutVt{}); + + typename Ktraits::TiledMma0 tiled_mma0; + typename Ktraits::TiledMma1 tiled_mma1; + auto threadMma0 = tiled_mma0.get_thread_slice(thread_idx); + auto threadMma1 = tiled_mma1.get_thread_slice(thread_idx); + + // Allocate "fragments/descriptors" for first matmul. + Tensor tSrQ = threadMma0.partition_fragment_A(sQ); + Tensor tSrK = threadMma0.partition_fragment_B(sK); + // Allocate "fragments/descriptors" for second matmul. + Tensor tOrV = threadMma1.partition_fragment_B(sVt); + + auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) { + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + }; + + tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero; + // workaround for fp8 only perf regression pending change to seqlen traits class + int const seqlen_q = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_q.actual_seq_len : shape<0>(mainloop_params.layout_Q); + int const seqlen_k = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_k.actual_seq_len : shape<0>(mainloop_params.layout_K); + int n_block = n_block_count - 1; + + cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_Q.try_wait(work_idx % 2)); + if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_Q.wait(work_idx % 2); } + + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + + consumer_wait(pipeline_k, smem_pipe_read); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + if (work_idx != 0) { + int lane_predicate = cute::elect_one_sync(); + if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { + tma_store_wait<0>(); + #pragma unroll + for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { + shared_storage.barrier_O.arrive(cta_id, lane_predicate); + } + } + } + warpgroup_wait<0>(); + warp_scheduler_barrier_arrive(); + pipeline_k.consumer_release(smem_pipe_read); + + auto col_limit_causal = [&](int row, int n_block) { + return row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM; + }; + { + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if constexpr (!Is_causal) { // Just masking based on col + if (int(get<1>(tScS(i))) >= int(seqlen_k - n_block * kBlockN)) { tSrS(i) = -INFINITY; } + } else { // mask based on both row and col + if (int(get<1>(tScS(i))) >= std::min(seqlen_k - n_block * kBlockN, + col_limit_causal(int(get<0>(tScS(i))), n_block))) { + tSrS(i) = -INFINITY; + } + } + } + } + + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + Tensor scores_scale = make_fragment_like(softmax.row_max); + clear(scores_scale); + + 
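// scores_scale carries the online-softmax rescaling factor between iterations. Roughly:
//   m_new = max(m_old, rowmax(S));   P = exp2(S * scale_log2 - m_new * scale_log2)
//   O    <- O * exp2((m_old - m_new) * scale_log2) + P @ V
// softmax.max() returns that exp2 correction factor and rescale_o() applies it to the
// O accumulator before the next P @ V GEMM below.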
consumer_wait(pipeline_vt, smem_pipe_read); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } + + ++smem_pipe_read; + --n_block; + constexpr int extra_iterations = !Is_causal ? kStages - 1 : cute::ceil_div(kBlockM, kBlockN); + + if constexpr(Is_causal) { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter, --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<1>(tScS(i))) >= col_limit_causal(int(get<0>(tScS(i))), n_block)) { + tSrS(i) = -INFINITY; + } + } + + warp_scheduler_barrier_arrive(); + pipeline_k.consumer_release(smem_pipe_read); + consumer_wait(pipeline_vt, smem_pipe_read); + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + if constexpr(Delay_V_release) { + pipeline_vt.consumer_release(smem_pipe_release); + ++smem_pipe_release; + } + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } + ++smem_pipe_read; + } + } else { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter, --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + if constexpr(Delay_V_release) { + pipeline_vt.consumer_release(smem_pipe_release); + ++smem_pipe_release; + } + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + warp_scheduler_barrier_arrive(); + if constexpr(!Delay_V_release) { pipeline_k.consumer_release(smem_pipe_read); } + else { consumer_wait(pipeline_vt, smem_pipe_read); } + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + if constexpr (Delay_V_release) { pipeline_k.consumer_release(smem_pipe_read); } + else { consumer_wait(pipeline_vt, smem_pipe_read); } + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } + ++smem_pipe_read; + } + } + + if constexpr(Delay_V_release) { + warp_scheduler_barrier_sync(); + CUTLASS_PRAGMA_NO_UNROLL + for (; n_block >= 0; --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + pipeline_vt.consumer_release(smem_pipe_release); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + warp_scheduler_barrier_arrive(); + warpgroup_wait<0>(); + consumer_wait(pipeline_vt, 
smem_pipe_read); + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + pipeline_k.consumer_release(smem_pipe_read); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + warp_scheduler_barrier_sync(); + warpgroup_wait<0>(); + ++smem_pipe_read; + ++smem_pipe_release; + } + warp_scheduler_barrier_arrive(); + pipeline_vt.consumer_release(smem_pipe_release); + ++smem_pipe_release; + } else { + if constexpr (kHeadDim == 128) { warp_scheduler_barrier_sync(); } + CUTLASS_PRAGMA_NO_UNROLL + for (; n_block >= 0; --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + if constexpr (kHeadDim == 256) { warp_scheduler_barrier_sync(); } + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + warp_scheduler_barrier_arrive(); + pipeline_k.consumer_release(smem_pipe_read); + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + consumer_wait(pipeline_vt, smem_pipe_read); + if constexpr (kHeadDim == 128) { warp_scheduler_barrier_sync(); } + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + pipeline_vt.consumer_release(smem_pipe_read); + ++smem_pipe_read; + } + if constexpr (kHeadDim == 128) { warp_scheduler_barrier_arrive(); } + } + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarpGroup, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + + cute::copy(softmax.template finalize(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + return; + } + +}; + +} // namespace flash diff --git a/many_loggers.yaml b/many_loggers.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7bc3d6762674a8220d771e36ca10791ae6447e06 --- /dev/null +++ b/many_loggers.yaml @@ -0,0 +1,9 @@ +# train with many loggers at once + +defaults: + # - comet.yaml + - csv.yaml + # - mlflow.yaml + # - neptune.yaml + # - tensorboard.yaml + - wandb.yaml diff --git a/mask.h b/mask.h new file mode 100644 index 0000000000000000000000000000000000000000..7ba435a37bb65fffca9d3f227113a657919bde07 --- /dev/null +++ b/mask.h @@ -0,0 +1,213 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include + +namespace flash { + +using namespace cute; + +template +__forceinline__ __device__ void apply_mask(Tensor &tensor, const int max_seqlen_k, + const int col_idx_offset_ = 0) { + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout::rank == 2, "Only support 2D Tensor"); + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + if (col_idx >= max_seqlen_k) { + // Without the "make_coord" we get wrong results + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + tensor(mi, make_coord(j, nj)) = -INFINITY; + } + } + } + } +} + +template +__forceinline__ __device__ void apply_mask_local(Tensor &tensor, const int col_idx_offset_, + const int max_seqlen_k, const int row_idx_offset, + const int max_seqlen_q, const int warp_row_stride, + const int window_size_left, const int window_size_right) { + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout::rank == 2, "Only support 2D Tensor"); + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + #pragma unroll + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; + #pragma unroll + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + const int col_idx_limit_left = std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_size_left); + const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_size_right); + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + } + // if (cute::thread0()) { + // printf("mi = %d, i = %d, row_idx = %d, max_seqlen_k = %d\n", mi, i, row_idx, max_seqlen_k); + // print(tensor(make_coord(i, mi), _)); + // // print(tensor(_, j + nj * size<1, 0>(tensor))); + // } + } + } +} + +template +__forceinline__ __device__ void apply_mask_causal(Tensor &tensor, const int col_idx_offset_, + const int max_seqlen_k, const int row_idx_offset, + const int max_seqlen_q, const int warp_row_stride) { + // Causal masking is equivalent to local masking with window_size_left = infinity and window_size_right = 0 + apply_mask_local(tensor, col_idx_offset_, max_seqlen_k, row_idx_offset, + max_seqlen_q, warp_row_stride, -1, 0); +} + +template +__forceinline__ __device__ void apply_mask_causal_w_idx( + Tensor &tensor, Tensor const &idx_rowcol, + const int col_idx_offset_, const int max_seqlen_k, const int row_idx_offset) +{ + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); + static_assert(Layout1::rank == 2, "Only support 2D Tensor"); + CUTE_STATIC_ASSERT_V(size<0>(tensor) == size<0>(idx_rowcol)); + CUTE_STATIC_ASSERT_V(size<1>(tensor) == size<1>(idx_rowcol)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); 
++mi) { + const int col_idx_limit = std::min(max_seqlen_k, 1 + row_idx_offset + get<0>(idx_rowcol(mi, 0))); + #pragma unroll + for (int ni = 0; ni < size<1, 1>(tensor); ++ni) { + if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) { + tensor(mi, ni) = -INFINITY; + } + } + // if (cute::thread0()) { + // printf("ni = %d, j = %d, col_idx = %d, max_seqlen_k = %d\n", ni, j, col_idx, max_seqlen_k); + // print(tensor(_, make_coord(j, ni))); + // // print(tensor(_, j + ni * size<1, 0>(tensor))); + // } + } +} + +template +struct Mask { + + const int max_seqlen_k, max_seqlen_q; + const int window_size_left, window_size_right; + const float alibi_slope; + + __forceinline__ __device__ Mask(const int max_seqlen_k, const int max_seqlen_q, + const int window_size_left, const int window_size_right, + const float alibi_slope=0.f) + : max_seqlen_k(max_seqlen_k) + , max_seqlen_q(max_seqlen_q) + , window_size_left(window_size_left) + , window_size_right(window_size_right) + , alibi_slope(!Has_alibi ? 0.0 : alibi_slope) { + }; + + // Causal_mask: whether this particular iteration needs causal masking + template + __forceinline__ __device__ void apply_mask(Tensor &tensor_, + const int col_idx_offset_, + const int row_idx_offset, + const int warp_row_stride) { + static_assert(!(Causal_mask && Is_local), "Cannot be both causal and local"); + static_assert(Layout::rank == 3, "Only support 3D Tensor"); + static_assert(decltype(size<0>(tensor_))::value == 4, "First dimension must be 4"); + static constexpr bool Need_masking = Has_alibi || Causal_mask || Is_local || !Is_even_MN; + // if (cute::thread0()) { printf("Has_alibi = %d, Causal_mask=%d, Is_local=%d, Is_even_MN = %d, Need_masking = %d\n", Has_alibi, Causal_mask, Is_local, Is_even_MN, Need_masking); } + if constexpr (Need_masking) { + // Reshape tensor_ from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N)) + Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_rowcol(tensor_.layout())); + // Do we need both row and column indices, or just column incides? 
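// Column-only indexing suffices when the mask/bias depends only on the key position:
// plain seqlen_k (padding) masking, and ALiBi in the causal case where the bias reduces to
// alibi_slope * col. Causal or local masks, and ALiBi without causality, also need the row
// index, hence the second branch below.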
+ static constexpr bool Col_idx_only = !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask; + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + if constexpr (Col_idx_only) { + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + // No causal, no local + if constexpr (Has_alibi) { + tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; + } + if constexpr (!Is_even_MN) { + if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; } + } + } + } + } + } else { + #pragma unroll + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; + #pragma unroll + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + const int col_idx_limit_left = std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_size_left); + const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_size_right); + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + if constexpr (Has_alibi) { + if constexpr (Is_causal) { + tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope * col_idx; + } else { + tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + + } + } + if constexpr (Causal_mask) { + if (col_idx >= col_idx_limit_right) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + if constexpr (Is_local) { + if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + if constexpr (!Causal_mask && !Is_local && !Is_even_MN) { + // Causal and Local already handles MN masking + if (col_idx >= max_seqlen_k) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + } + } + } + } + } + } + }; + +}; + +} // namespace flash diff --git a/mha.py b/mha.py new file mode 100644 index 0000000000000000000000000000000000000000..77640c2b239ac729cad79ce3b2504e0eeacb5f73 --- /dev/null +++ b/mha.py @@ -0,0 +1,1020 @@ +# Copyright (c) 2023, Tri Dao. 
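# Illustrative usage of the MHA module defined in this file (a sketch, not authoritative;
# argument names are taken from MHA.__init__ below, shapes and import path assumed):
#
#   import torch
#   from flash_attn.modules.mha import MHA
#
#   mha = MHA(embed_dim=1024, num_heads=16, causal=True, use_flash_attn=True,
#             rotary_emb_dim=64, device="cuda", dtype=torch.bfloat16)
#   x = torch.randn(2, 512, 1024, device="cuda", dtype=torch.bfloat16)
#   out = mha(x)  # (batch, seqlen, embed_dim) -> (2, 512, 1024)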
+ +import math +from functools import partial + +import torch +import torch.nn as nn +from einops import rearrange, repeat + +from flash_attn.utils.distributed import get_dim_for_local_rank + +try: + from flash_attn import ( + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, + flash_attn_with_kvcache, + ) +except ImportError: + flash_attn_varlen_qkvpacked_func, flash_attn_varlen_kvpacked_func = None, None + flash_attn_qkvpacked_func, flash_attn_kvpacked_func = None, None + flash_attn_with_kvcache = None + +try: + from flash_attn.ops.fused_dense import ColumnParallelLinear, FusedDense, RowParallelLinear +except ImportError: + FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None + +try: + from flash_attn.layers.rotary import RotaryEmbedding +except ImportError: + RotaryEmbedding = None + + +# From https://github.com/ofirpress/attention_with_linear_biases/blob/4b92f28a005ead2567abe2359f633e73e08f3833/fairseq/models/transformer.py#L742 +def get_alibi_slopes(nheads): + def get_slopes_power_of_2(nheads): + start = 2 ** (-(2 ** -(math.log2(nheads) - 3))) + ratio = start + return [start * ratio**i for i in range(nheads)] + + if math.log2(nheads).is_integer(): + return get_slopes_power_of_2(nheads) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(nheads)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_alibi_slopes(2 * closest_power_of_2)[0::2][: nheads - closest_power_of_2] + ) + + +class FlashSelfAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + window_size=(-1, -1), + alibi_slopes=None, + deterministic=False, + ): + super().__init__() + assert flash_attn_varlen_qkvpacked_func is not None, "FlashAttention is not installed" + assert flash_attn_qkvpacked_func is not None, "FlashAttention is not installed" + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + self.window_size = window_size + self.deterministic = deterministic + + def forward(self, qkv, causal=None, cu_seqlens=None, max_seqlen=None): + """Implements the multihead softmax attention. + Arguments + --------- + qkv: The tensor containing the query, key, and value. + If cu_seqlens is None and max_seqlen is None, then qkv has shape (B, S, 3, H, D). + If cu_seqlens is not None and max_seqlen is not None, then qkv has shape + (total, 3, H, D), where total is the sum of the sequence lengths in the batch. + causal: if passed, will override self.causal + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into qkv. + max_seqlen: int. Maximum sequence length in the batch. + Returns: + -------- + out: (total, H, D) if cu_seqlens is not None and max_seqlen is not None, + else (B, S, H, D). 
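        Example (illustrative sketch; assumes a CUDA device and fp16/bf16 inputs):
            attn = FlashSelfAttention(causal=True)
            qkv = torch.randn(2, 1024, 3, 16, 64, device="cuda", dtype=torch.bfloat16)
            out = attn(qkv)  # (2, 1024, 16, 64)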
+ """ + assert qkv.dtype in [torch.float16, torch.bfloat16] + assert qkv.is_cuda + causal = self.causal if causal is None else causal + unpadded = cu_seqlens is not None + if self.alibi_slopes is not None: + self.alibi_slopes = self.alibi_slopes.to(torch.float32) + if unpadded: + assert cu_seqlens.dtype == torch.int32 + assert max_seqlen is not None + assert isinstance(max_seqlen, int) + return flash_attn_varlen_qkvpacked_func( + qkv, + cu_seqlens, + max_seqlen, + self.drop.p if self.training else 0.0, + softmax_scale=self.softmax_scale, + causal=causal, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + else: + return flash_attn_qkvpacked_func( + qkv, + self.drop.p if self.training else 0.0, + softmax_scale=self.softmax_scale, + causal=causal, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + + +class FlashCrossAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + alibi_slopes=None, + window_size=(-1, -1), + deterministic=False, + ): + super().__init__() + assert flash_attn_varlen_kvpacked_func is not None, "FlashAttention is not installed" + assert flash_attn_kvpacked_func is not None, "FlashAttention is not installed" + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + self.window_size = window_size + self.deterministic = deterministic + + def forward( + self, + q, + kv, + causal=None, + cu_seqlens=None, + max_seqlen=None, + cu_seqlens_k=None, + max_seqlen_k=None, + ): + """Implements the multihead softmax attention. + Arguments + --------- + q: The tensor containing the query. (B, Sq, H, D) + kv: The tensor containing the key and value. (B, Sk, 2, H_k, D) + causal: if passed, will override self.causal + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + max_seqlen: int. Maximum sequence length in the batch of q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_k: int. Maximum sequence length in the batch of k and v. 
+ """ + assert q.dtype in [torch.float16, torch.bfloat16] + assert q.is_cuda and kv.is_cuda + causal = self.causal if causal is None else causal + unpadded = cu_seqlens is not None + if self.alibi_slopes is not None: + self.alibi_slopes = self.alibi_slopes.to(torch.float32) + if unpadded: + assert cu_seqlens.dtype == torch.int32 + assert max_seqlen is not None + assert isinstance(max_seqlen, int) + assert cu_seqlens_k is not None + assert cu_seqlens_k.dtype == torch.int32 + assert max_seqlen_k is not None + assert isinstance(max_seqlen_k, int) + return flash_attn_varlen_kvpacked_func( + q, + kv, + cu_seqlens, + cu_seqlens_k, + max_seqlen, + max_seqlen_k, + self.drop.p if self.training else 0.0, + softmax_scale=self.softmax_scale, + causal=causal, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + else: + batch_size, seqlen_q = q.shape[0], q.shape[1] + seqlen_k = kv.shape[1] + assert kv.shape[0] == batch_size and kv.shape[4] == q.shape[3] + return flash_attn_kvpacked_func( + q, + kv, + self.drop.p if self.training else 0.0, + causal=causal, + softmax_scale=self.softmax_scale, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + + +class SelfAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0): + super().__init__() + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + + def forward(self, qkv, causal=None, key_padding_mask=None): + """Implements the multihead softmax attention. + Arguments + --------- + qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) + causal: if passed, will override self.causal + key_padding_mask: boolean mask to apply to the attention weights. True means to keep, + False means to mask out. (B, S) + """ + batch_size, seqlen = qkv.shape[0], qkv.shape[1] + causal = self.causal if causal is None else causal + q, k, v = qkv.unbind(dim=2) + softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1]) + scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale) + if key_padding_mask is not None: + padding_mask = torch.full( + (batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device + ) + padding_mask.masked_fill_(key_padding_mask, 0.0) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + rearrange(padding_mask, "b s -> b 1 1 s") + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu( + torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1 + ) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1, dtype=v.dtype) + attention_drop = self.drop(attention) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + return output + + +class CrossAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. 
+ (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0): + super().__init__() + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + + def forward(self, q, kv, causal=None, key_padding_mask=None): + """Implements the multihead softmax attention. + Arguments + --------- + q: The tensor containing the query. (B, Sq, H, D) + kv: The tensor containing the key and value. (B, Sk, 2, H_k, D) + causal: if passed, will override self.causal + key_padding_mask: boolean mask to apply to the attention weights. True means to keep, + False means to mask out. (B, Sk) + """ + batch_size, seqlen_q = q.shape[0], q.shape[1] + causal = self.causal if causal is None else causal + seqlen_k = kv.shape[1] + assert kv.shape[0] == batch_size and kv.shape[4] == q.shape[3] + if kv.shape[3] != q.shape[2]: # MQA/GQA + kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3]) + k, v = kv.unbind(dim=2) + softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1]) + scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale) + if key_padding_mask is not None: + padding_mask = torch.full( + (batch_size, seqlen_k), -10000.0, dtype=scores.dtype, device=scores.device + ) + padding_mask.masked_fill_(key_padding_mask, 0.0) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + rearrange(padding_mask, "b s -> b 1 1 s") + if causal: + # causal mask needs to take into account the difference between seqlen_q and seqlen_k + row_idx = rearrange( + torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1" + ) + col_idx = torch.arange(seqlen_k, device=kv.device, dtype=torch.long) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + causal_mask = col_idx > row_idx + sk - seqlen_q + scores = scores.masked_fill(causal_mask, -10000.0) + attention = torch.softmax(scores, dim=-1, dtype=v.dtype) + attention_drop = self.drop(attention) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + return output + + +class LinearResidual(nn.Linear): + """Wrap nn.Linear to return the residual as well. For compatibility with FusedDense.""" + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input), input + + +def _update_kv_cache(kv, inference_params, layer_idx): + """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)""" + # Pre-allocate memory for key-values for inference. + num_heads, head_dim = kv.shape[-2:] + if layer_idx not in inference_params.key_value_memory_dict: + kv_cache = torch.empty( + inference_params.max_batch_size, + inference_params.max_seqlen, + 2, + num_heads, + head_dim, + dtype=kv.dtype, + device=kv.device, + ) + inference_params.key_value_memory_dict[layer_idx] = kv_cache + else: + kv_cache = inference_params.key_value_memory_dict[layer_idx] + # Adjust key and value for inference + batch_start = inference_params.batch_size_offset + batch_end = batch_start + kv.shape[0] + sequence_start = inference_params.seqlen_offset + sequence_end = sequence_start + kv.shape[1] + assert batch_end <= kv_cache.shape[0] + assert sequence_end <= kv_cache.shape[1] + assert kv_cache is not None + kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] 
= kv + return kv_cache[batch_start:batch_end, :sequence_end, ...] + + +class MHA(nn.Module): + """Multi-head self-attention and cross-attention""" + + def __init__( + self, + embed_dim, + num_heads, + num_heads_kv=None, + cross_attn=False, + qkv_proj_bias=True, + out_proj_bias=True, + dropout=0.0, + softmax_scale=None, + causal=False, + layer_idx=None, + dwconv=False, + rotary_emb_dim=0, + rotary_emb_base=10000.0, + rotary_emb_scale_base=None, + rotary_emb_interleaved=False, + use_alibi=False, + window_size=(-1, -1), + fused_bias_fc=False, + use_flash_attn=False, + return_residual=False, + checkpointing=False, + device=None, + dtype=None, + ) -> None: + """ + num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads. + return_residual: whether to return the input x along with the output. This is for + performance reason: for post-norm architecture, returning the input allows us + to fuse the backward of nn.Linear with the residual connection. + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.cross_attn = cross_attn + self.causal = causal + self.layer_idx = layer_idx + self.dwconv = dwconv + self.rotary_emb_dim = rotary_emb_dim + self.use_flash_attn = use_flash_attn + self.return_residual = return_residual + self.checkpointing = checkpointing + if use_alibi: + assert use_flash_attn, "ALiBi code path requires flash_attn" + alibi_slopes = torch.tensor(get_alibi_slopes(num_heads), device=device) + else: + alibi_slopes = None + if window_size != (-1, -1): + assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn" + + self.num_heads = num_heads + self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads + assert ( + self.num_heads % self.num_heads_kv == 0 + ), "num_heads must be divisible by num_heads_kv" + assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads" + self.head_dim = self.embed_dim // num_heads + qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv) + kv_dim = 2 * self.head_dim * self.num_heads_kv + + if self.rotary_emb_dim > 0: + assert not cross_attn, "MHA with rotary embedding does not support cross-attention yet" + assert RotaryEmbedding is not None, "rotary_emb is not installed" + self.rotary_emb = RotaryEmbedding( + self.rotary_emb_dim, + base=rotary_emb_base, + scale_base=rotary_emb_scale_base, + interleaved=rotary_emb_interleaved, + device=device, + ) + + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + linear_resid_cls = ( + LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True) + ) + wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls + inner_attn_cls = ( + partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else SelfAttention + ) + inner_cross_attn_cls = ( + partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else CrossAttention + ) + if not self.cross_attn: + self.Wqkv = wqkv_cls(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs) + else: + self.Wq = linear_cls(embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs) + self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs) + if self.dwconv: + if self.num_heads_kv == self.num_heads: + self.dwconv_qkv = nn.Conv1d( + qkv_dim, qkv_dim, kernel_size=3, padding=2, groups=qkv_dim + 
) + else: + self.dwconv_q = nn.Conv1d( + embed_dim, embed_dim, kernel_size=3, padding=2, groups=embed_dim + ) + self.dwconv_kv = nn.Conv1d(kv_dim, kv_dim, kernel_size=3, padding=2, groups=kv_dim) + self.inner_attn = inner_attn_cls( + causal=causal, + softmax_scale=softmax_scale, + attention_dropout=dropout, + ) + self.inner_cross_attn = inner_cross_attn_cls( + causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout + ) + self.out_proj = linear_cls(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + dtype = self.out_proj.weight.dtype if dtype is None else dtype + device = self.out_proj.weight.device + return torch.empty( + batch_size, + max_seqlen, + 2, + self.num_heads_kv, + self.head_dim, + dtype=dtype, + device=device, + ) + + def _update_kv_cache(self, kv, inference_params): + """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)""" + assert not self.dwconv, "Generation does not support dwconv yet" + assert self.layer_idx is not None, "Generation requires layer_idx in the constructor" + return _update_kv_cache(kv, inference_params, self.layer_idx) + + def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params): + """ + Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention. + q: (batch_size, seqlen_q, nheads, head_dim) + kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim) + """ + assert inference_params is not None and inference_params.seqlen_offset > 0 + assert self.use_flash_attn + if self.rotary_emb_dim > 0: + assert self.rotary_emb.scale is None, "This code path does not support xPos" + self.rotary_emb._update_cos_sin_cache( + inference_params.max_seqlen, device=q.device, dtype=q.dtype + ) + rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached + else: + rotary_cos, rotary_sin = None, None + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + context = flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + rotary_cos=rotary_cos, + rotary_sin=rotary_sin, + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False, + alibi_slopes=alibi_slopes, + ) + return context + + def _update_kvcache_attention(self, q, kv, inference_params): + """Write kv to inference_params, then do attention""" + if ( + inference_params.seqlen_offset == 0 + or flash_attn_with_kvcache is None + or not self.use_flash_attn + ): + # TODO: this only uses seqlen_offset and not lengths_per_sample. 
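            # Fallback path (prefill, kernel unavailable, or not using flash-attn): write the
            # new keys/values into the cache with plain indexing, then run regular
            # cross-attention over the cached prefix. The else-branch is the decode fast path,
            # where flash_attn_with_kvcache appends to the cache and attends in one fused kernel.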
+ kv = self._update_kv_cache(kv, inference_params) + return self.inner_cross_attn(q, kv) + else: + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + return flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + alibi_slopes=alibi_slopes, + ) + + def forward( + self, + x, + x_kv=None, + key_padding_mask=None, + cu_seqlens=None, + max_seqlen=None, + mixer_subset=None, + inference_params=None, + **kwargs, + ): + """ + Arguments: + x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if + cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total + is the is the sum of the sequence lengths in the batch. + x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x. + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into x. Only applicable when using + FlashAttention. + max_seqlen: int. Maximum sequence length in the batch. + key_padding_mask: boolean mask, True means to keep, False means to mask out. + (batch, seqlen). Only applicable when not using FlashAttention. + mixer_subset: for cross-attention only. If not None, will take a subset of x + before applying the query projection. Useful for e.g., ViT where we only care + about the CLS token in the last layer. + inference_params: for generation. Adapted from Megatron-LM (and Apex) + https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470 + """ + if cu_seqlens is not None: + assert max_seqlen is not None + assert key_padding_mask is None + assert self.use_flash_attn + assert not self.dwconv + assert self.rotary_emb_dim == 0 + if key_padding_mask is not None: + assert cu_seqlens is None + assert max_seqlen is None + assert not self.use_flash_attn + if inference_params is not None: + assert key_padding_mask is None + assert cu_seqlens is None and max_seqlen is None + assert not self.dwconv + + kwargs = ( + {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen, **kwargs} + if self.use_flash_attn + else {"key_padding_mask": key_padding_mask, **kwargs} + ) + seqlen_offset = ( + 0 + if inference_params is None + else ( + inference_params.lengths_per_sample + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + ) + rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None + batch, seqlen = x.shape[:2] + if not self.cross_attn and self.num_heads_kv == self.num_heads: + assert x_kv is None and mixer_subset is None + if not self.return_residual: + qkv = self.Wqkv(x) + else: + qkv, x = self.Wqkv(x) + if self.dwconv: + qkv = rearrange( + self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2], "b d s -> b s d" + ).contiguous() + qkv = rearrange(qkv, "... (three h d) -> ... 
three h d", three=3, d=self.head_dim) + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + qkv = self.rotary_emb( + qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_attn(qkv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs) + else: + context = self._update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + context = self._apply_rotary_update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + if self.cross_attn: + if not self.return_residual: + q = self.Wq(x if mixer_subset is None else x[:, mixer_subset]) + kv = self.Wkv(x_kv if x_kv is not None else x) + else: + if x_kv is not None: + kv, x_kv = self.Wkv(x_kv) + else: + kv, x = self.Wkv(x) + q = self.Wq(x if mixer_subset is None else x[:, mixer_subset]) + else: + assert self.num_heads_kv != self.num_heads + if not self.return_residual: + qkv = self.Wqkv(x) + else: + qkv, x = self.Wqkv(x) + q = qkv[..., : self.num_heads * self.head_dim] + kv = qkv[..., self.num_heads * self.head_dim :] + q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim) + kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim) + if self.dwconv: + q = rearrange( + self.dwconv_q(rearrange(q, "b s d -> b d s"))[..., :-2], "b d s -> b s d" + ).contiguous() + kv = rearrange( + self.dwconv_kv(rearrange(kv, "b s d -> b d s"))[..., :-2], "b d s -> b s d" + ).contiguous() + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + q, kv = self.rotary_emb( + q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_cross_attn(q, kv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint( + self.inner_cross_attn, q, kv, **kwargs + ) + else: + context = self._update_kvcache_attention(q, kv, inference_params) + else: + context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params) + out = self.out_proj(rearrange(context, "... h d -> ... 
(h d)")) + return out if not self.return_residual else (out, x) + + +class ParallelMHA(nn.Module): + """Multi-head self-attention and cross-attention""" + + def __init__( + self, + embed_dim, + num_heads, + process_group, + num_heads_kv=None, + qkv_proj_bias=True, + out_proj_bias=True, + dropout=0.0, + softmax_scale=None, + causal=False, + layer_idx=None, + rotary_emb_dim=0, + rotary_emb_base=10000.0, + rotary_emb_scale_base=None, + rotary_emb_interleaved=False, + use_alibi=False, + window_size=(-1, -1), + use_flash_attn=False, + checkpointing=False, + sequence_parallel=True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.causal = causal + self.layer_idx = layer_idx + self.rotary_emb_dim = rotary_emb_dim + self.use_flash_attn = use_flash_attn + self.checkpointing = checkpointing + self.process_group = process_group + self.world_size = process_group.size() + self.local_rank = torch.distributed.get_rank(process_group) + + self.num_heads = num_heads + assert self.embed_dim % self.num_heads == 0, "embed_dim must be divisible by num_heads" + + self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads + assert ( + self.num_heads % self.num_heads_kv == 0 + ), "num_heads must be divisible by num_heads_kv" + + self.num_heads_per_rank = get_dim_for_local_rank( + self.num_heads, self.world_size, self.local_rank + ) + self.num_heads_kv_per_rank = get_dim_for_local_rank( + self.num_heads_kv, self.world_size, self.local_rank + ) + self.head_dim = self.embed_dim // num_heads + qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv) + + if use_alibi: + assert use_flash_attn, "ALiBi code path requires flash_attn" + num_heads_local = math.ceil(self.num_heads / self.world_size) + alibi_slopes = torch.tensor( + get_alibi_slopes(num_heads)[ + self.local_rank * num_heads_local : (self.local_rank + 1) * num_heads_local + ], + device=device, + ) + else: + alibi_slopes = None + if window_size != (-1, -1): + assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn" + + if self.rotary_emb_dim > 0: + assert RotaryEmbedding is not None, "rotary_emb is not installed" + self.rotary_emb = RotaryEmbedding( + self.rotary_emb_dim, + base=rotary_emb_base, + scale_base=rotary_emb_scale_base, + interleaved=rotary_emb_interleaved, + device=device, + ) + + if ColumnParallelLinear is None or RowParallelLinear is None: + raise ImportError("fused_dense is not installed") + self.Wqkv = ColumnParallelLinear( + embed_dim, + qkv_dim, + process_group, + bias=qkv_proj_bias, + sequence_parallel=sequence_parallel, + multiple_of=self.head_dim * (self.num_heads // self.num_heads_kv + 2), + **factory_kwargs, + ) + inner_attn_cls = ( + partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else SelfAttention + ) + inner_cross_attn_cls = ( + partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else CrossAttention + ) + self.inner_attn = inner_attn_cls( + causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout + ) + self.inner_cross_attn = inner_cross_attn_cls( + causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout + ) + self.out_proj = RowParallelLinear( + embed_dim, + embed_dim, + process_group, + bias=out_proj_bias, + sequence_parallel=sequence_parallel, + multiple_of=self.head_dim, + **factory_kwargs, + ) + + def allocate_inference_cache(self, 
batch_size, max_seqlen, dtype=None): + dtype = self.out_proj.weight.dtype if dtype is None else dtype + device = self.out_proj.weight.device + return torch.empty( + batch_size, + max_seqlen, + 2, + self.num_heads_kv_per_rank, + self.head_dim, + dtype=dtype, + device=device, + ) + + def _update_kv_cache(self, kv, inference_params): + """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)""" + assert self.layer_idx is not None, "Generation requires layer_idx in the constructor" + return _update_kv_cache(kv, inference_params, self.layer_idx) + + def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params): + """ + Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention. + q: (batch_size, seqlen_q, nheads, head_dim) + kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim) + """ + assert inference_params is not None and inference_params.seqlen_offset > 0 + assert self.use_flash_attn + if self.rotary_emb_dim > 0: + assert self.rotary_emb.scale is None, "This code path does not support xPos" + self.rotary_emb._update_cos_sin_cache( + inference_params.max_seqlen, device=q.device, dtype=q.dtype + ) + rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached + else: + rotary_cos, rotary_sin = None, None + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + context = flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + rotary_cos=rotary_cos, + rotary_sin=rotary_sin, + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False, + alibi_slopes=alibi_slopes, + ) + return context + + def _update_kvcache_attention(self, q, kv, inference_params): + """Write kv to inference_params, then do attention""" + if inference_params.seqlen_offset == 0 or not self.use_flash_attn: + # TODO: this only uses seqlen_offset and not lengths_per_sample. + kv = self._update_kv_cache(kv, inference_params) + return self.inner_cross_attn(q, kv) + else: + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + context = flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + alibi_slopes=alibi_slopes, + ) + return context + + def forward(self, x, seqlen=None, inference_params=None, **kwargs): + """ + Arguments: + x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None. + If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we + split x during sequence parallel, we split the batch * seqlen dimension + (in case batch is small). + """ + qkv = self.Wqkv(x) + if seqlen is not None: + qkv = rearrange(qkv, "(b s) ... 
-> b s ...", s=seqlen) + seqlen_offset = ( + 0 + if inference_params is None + else ( + inference_params.lengths_per_sample + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + ) + rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None + if self.num_heads_kv == self.num_heads: + qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, d=self.head_dim) + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + qkv = self.rotary_emb( + qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_attn(qkv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs) + else: + context = self._update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + context = self._apply_rotary_update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + q = rearrange( + qkv[..., : self.num_heads_per_rank * self.head_dim], + "... (h d) -> ... h d", + d=self.head_dim, + ) + kv = rearrange( + qkv[..., self.num_heads_per_rank * self.head_dim :], + "... (two hkv d) -> ... two hkv d", + two=2, + d=self.head_dim, + ) + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + q, kv = self.rotary_emb( + q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_cross_attn(q, kv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint( + self.inner_cross_attn, q, kv, **kwargs + ) + else: + context = self._update_kvcache_attention(q, kv, inference_params) + else: + context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params) + context = rearrange(context, "b s h d -> b s (h d)") + if seqlen is not None: + context = rearrange(context, "b s d -> (b s) d") + out = self.out_proj(context) + return out diff --git a/mha_bwd.cpp b/mha_bwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..884215adf0c53cdda06189a68d2a1cfca5250f75 --- /dev/null +++ b/mha_bwd.cpp @@ -0,0 +1,379 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_bwd.hpp" +#include "mask.hpp" + +fmha_bwd_traits get_ck_fmha_bwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool enable_alibi) +{ + return fmha_bwd_traits{head_size, + head_size, + dtype, + false, // is_group_mode + mask.type, + enable_alibi ? 
bias_enum::alibi : bias_enum::no_bias, + false, // has_dbias + has_dropout}; +} + +fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, + // sizes + const int b, + const int seqlen_q, + const int seqlen_k, + const int h, + const int h_k, + const int hdim, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + c10::optional &alibi_slopes_, + const at::Tensor out, + const at::Tensor softmax_lse, + const at::Tensor dout, + at::Tensor d, + at::Tensor dq, + at::Tensor dk, + at::Tensor dv, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (batch_size, seqlen_q, nheads, hdim) + // k: (batch_size, seqlen_k, nheads_k, hdim) + // v: (batch_size, seqlen_k, nheads_k, hdim) + // o: (batch_size, seqlen_q, nheads, hdim) + // dq: (batch_size, seqlen_q, nheads, hdim) + // dk_expanded: (batch_size, seqlen_k, nheads, hdim) + // dv_expanded: (batch_size, seqlen_k, nheads, hdim) + // do: (batch_size, seqlen_q, nheads, hdim) + + // alibi_slopes:(batch_size, nheads) or (nhead) + // lse: (batch_size, nheads, seqlen_q) + // d: (batch_size, nheads, seqlen_q) + + ck_tile::index_t stride_q = q.stride(1); + ck_tile::index_t stride_k = k.stride(1); + ck_tile::index_t stride_v = v.stride(1); + ck_tile::index_t stride_o = out.stride(1); + ck_tile::index_t stride_do = dout.stride(1); + ck_tile::index_t stride_dk = dk.stride(1); + ck_tile::index_t stride_dv = dv.stride(1); + + ck_tile::index_t nhead_stride_q = q.stride(2); + ck_tile::index_t nhead_stride_k = k.stride(2); + ck_tile::index_t nhead_stride_v = v.stride(2); + ck_tile::index_t nhead_stride_o = out.stride(2); + ck_tile::index_t nhead_stride_do = dout.stride(2); + ck_tile::index_t nhead_stride_lse = softmax_lse.stride(1); + + ck_tile::index_t batch_stride_q = q.stride(0); + ck_tile::index_t batch_stride_k = k.stride(0); + ck_tile::index_t batch_stride_v = v.stride(0); + ck_tile::index_t batch_stride_o = out.stride(0); + ck_tile::index_t batch_stride_do = dout.stride(0); + ck_tile::index_t batch_stride_lse = softmax_lse.stride(0); + ck_tile::index_t batch_stride_dk = dk.stride(0); + ck_tile::index_t batch_stride_dv = dv.stride(0); + + float p_undrop = 1.0 - p_dropout; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? 
alibi_slopes.stride(0) : 0; + } + + return fmha_bwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + out.data_ptr(), + softmax_lse.data_ptr(), + dout.data_ptr(), + d.data_ptr(), + nullptr, // rand_val + dq.data_ptr(), + dk.data_ptr(), + dv.data_ptr(), + nullptr, // dbias + nullptr, // seqstart_q + nullptr, // seqstart_k + nullptr, // seqlen_k_ptr + seqlen_q, + seqlen_k, + b, + seqlen_q, // max_seqlen_q + seqlen_k, // max_seqlen_k + hdim, // hdim_q + hdim, // hdim_v + h, // nhead + h_k, // nhead_k + softmax_scale, + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_o, + 0, // stride_randval + stride_do, + stride_dk, + stride_dv, + 0, // stride_dbias, FA without bias + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_o, + 0, // nhead_stride_randval + nhead_stride_do, + nhead_stride_lse, + 0, // nhead_stride_dbias, FA without dbias + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0 , // batch_stride_bias, FA without bias + batch_stride_o, + 0, // batch_stride_randval + batch_stride_do, + batch_stride_lse, + batch_stride_dk, + batch_stride_dv, + 0 , // batch_stride_dbias, FA without dbias + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + p_undrop, + false, // s_randval + {drop_seed, drop_offset}}; +} + +std::vector +mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool deterministic, + c10::optional gen_, + c10::optional &rng_state) +{ +#ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); +#endif + if (is_causal) { window_size_right = 0; } + + bool is_dropout = p_dropout > 0.0; + auto stream = at::cuda::getCurrentHIPStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? 
"fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + const int seqlen_q = sizes[1]; + const int num_heads = sizes[2]; + const int head_size_og = dout.size(3); // unpadded hdim + const int head_size_8x = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_8x % 8 == 0, "head_size_8x should be a multiple of 8"); + TORCH_CHECK(head_size_8x <= 128, "CK FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + TORCH_CHECK(head_size_8x == round_multiple(head_size_og, 8), "head_size_8x must be head_size_og rounded to a multiple of 8"); + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + mask_info mask; + if (is_causal) { + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", seqlen_q, seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
+ std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // local + } + + // q, k, v, out had been padded in mha_fwd + // dq_, dk_, dv_ are also padded tensor + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_8x); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_8x); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_8x); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_8x); + CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, batch_size, seqlen_q, num_heads, head_size_8x); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, batch_size, seqlen_k, num_heads_k, head_size_8x); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size_8x); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); + // TODO - CK does not support dq_accum + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size_8x}, opts); + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size_8x}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + + if (rng_state.has_value()) { + uint64_t* d = reinterpret_cast(rng_state.value().data_ptr()); + drop_seed = d[0]; + drop_offset = d[1]; + } else if(is_dropout) { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + if (seqlen_q > 0) { + ck_tile::stream_config stream_config{stream}; + dq.zero_(); // ck use atomic operation on dq + + auto traits = + get_ck_fmha_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_bwd_args( + mask, + batch_size, + seqlen_q, + seqlen_k, + num_heads, + num_heads_k, + head_size_8x, + q, + k, + v, + alibi_slopes_, + out, + softmax_lse, + dout_padded, + softmax_d, + dq, + dk_expanded, + dv_expanded, + softmax_scale, + p_dropout, + 
drop_seed, + drop_offset); + + fmha_bwd(traits, args, stream_config); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {3}); + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {3}); + } + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d }; +} \ No newline at end of file diff --git a/mha_fwd.cpp b/mha_fwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c1eeba5070885af479abb2e97e8c737afa91fba3 --- /dev/null +++ b/mha_fwd.cpp @@ -0,0 +1,348 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_fwd.hpp" +#include "mask.hpp" + +fmha_fwd_traits get_ck_fmha_fwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool has_lse, + bool enable_alibi) +{ + return fmha_fwd_traits{head_size, + head_size, + dtype, + false, // is_group_mode + true, // is_v_rowmajor + mask.type, + enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + has_lse, + has_dropout, + false}; // do_fp8_static_quant +} + +fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, + bool has_dropout_randval, + const mask_info &mask, + // sizes + const int b, + const int seqlen_q, + const int seqlen_k, + const int h, + const int h_k, + const int d, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + c10::optional &alibi_slopes_, + at::Tensor out, + at::Tensor softmax_lse, + at::Tensor dropout_randval, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (batch_size, seqlen_q, nheads, d) + // k: (batch_size, seqlen_k, nheads_k, d) + // v: (batch_size, seqlen_k, nheads_k, d) + // o: (batch_size, seqlen_q, nheads, d) + + // alibi_slopes:(batch_size, nheads) or (nhead) + // lse: (batch_size, nheads, seqlen_q) + // randval: (batch_size, nheads, seqlen_q, seqlen_k) + + ck_tile::index_t stride_q = q.stride(1); + ck_tile::index_t stride_k = k.stride(1); + ck_tile::index_t stride_v = v.stride(1); + ck_tile::index_t stride_o = out.stride(1); + ck_tile::index_t stride_randval = has_dropout_randval ? dropout_randval.stride(2) : 0; + + ck_tile::index_t nhead_stride_q = q.stride(2); + ck_tile::index_t nhead_stride_k = k.stride(2); + ck_tile::index_t nhead_stride_v = v.stride(2); + ck_tile::index_t nhead_stride_o = out.stride(2); + ck_tile::index_t nhead_stride_lse = has_lse ? softmax_lse.stride(1) : 0; + ck_tile::index_t nhead_stride_randval = has_dropout_randval ? dropout_randval.stride(1) : 0; + + ck_tile::index_t batch_stride_q = q.stride(0); + ck_tile::index_t batch_stride_k = k.stride(0); + ck_tile::index_t batch_stride_v = v.stride(0); + ck_tile::index_t batch_stride_o = out.stride(0); + + ck_tile::index_t batch_stride_lse = has_lse ? 
softmax_lse.stride(0) : 0; + ck_tile::index_t batch_stride_randval = has_dropout_randval ? dropout_randval.stride(0) : 0; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + } + + return fmha_fwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + has_dropout_randval ? dropout_randval.data_ptr() : nullptr, + nullptr, // lse_acc + nullptr, // o_acc + has_lse ? softmax_lse.data_ptr() : nullptr, + out.data_ptr(), + nullptr, // seqstart_q + nullptr, // seqstart_k + nullptr, + seqlen_q, + seqlen_k, + b, + seqlen_q, // max_seqlen_q + d, // hdim_q + d, // hdim_v + h, // nhead + h_k, // nhead_k + 1, // num_splits + softmax_scale, // scale_s + 1, // scale_p + 1, // scale_o + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_randval, + 0, // stride_o_acc, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_randval, + nhead_stride_lse, + 0, // nhead_stride_lse_acc + 0, // nhead_stride_o_acc + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0, // batch_stride_bias, FA without bias + batch_stride_randval, + batch_stride_lse, + 0, // batch_stride_lse_acc + 0, // batch_stride_o_acc + batch_stride_o, + 0, // split_stride_lse_acc + 0, // split_stride_o_acc + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + has_dropout_randval, + {drop_seed, drop_offset}}; +} + +std::vector +mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, + const float softmax_scale, + bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool return_dropout_randval, + c10::optional gen_) +{ + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? 
"fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + int seqlen_q = sizes[1]; + int num_heads = sizes[2]; + const int head_size_og = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "CK only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + // causal=true is the same as causal=false in this case + if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } + + mask_info mask; + if (is_causal) { + // Causal is the special case where window_size_right == 0 and window_size_left < 0. + window_size_right = 0; + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", seqlen_q, seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // local + } + + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_k; + if (seqlenq_ngroups_swapped) { + q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + seqlen_q = ngroups; + num_heads = num_heads_k; + } + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_og); + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } + else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], head_size_og); + if (seqlenq_ngroups_swapped) { + out = out.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + } + if (head_size_og % 8 != 0) { out = 
torch::empty_like(q_padded); } + } + else { + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_8x = round_multiple(head_size_og, 8); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + bool has_lse = true; + bool has_dropout = p_dropout > 0.0f; + + at::Tensor softmax_lse; + // TODO - check gradient, only training require lse + softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(torch::kFloat32)); + + at::Tensor p; + if (return_dropout_randval) { + TORCH_CHECK(has_dropout, "return_dropout_randval require p_dropout > 0"); + p = torch::empty({batch_size, num_heads, seqlen_q, seqlen_k}, opts.dtype(torch::kUInt8)); + } + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + + if (p_dropout > 0.0) { + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + rng_state[0] = *(reinterpret_cast(&drop_seed)); + rng_state[1] = *(reinterpret_cast(&drop_offset)); + + if (seqlen_k > 0) { + auto stream = at::cuda::getCurrentHIPStream().stream(); + ck_tile::stream_config stream_config{stream}; + + auto traits = + get_ck_fmha_fwd_traits(mask, q_dtype_str, head_size_8x, has_dropout, has_lse, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_fwd_args( + has_lse, + return_dropout_randval, + mask, + batch_size, + seqlen_q, + seqlen_k, + num_heads, + num_heads_k, + head_size_8x, + q_padded, + k_padded, + v_padded, + alibi_slopes_, + out, + softmax_lse, + p, + softmax_scale, + p_dropout, + drop_seed, + drop_offset); + + fmha_fwd(traits, args, stream_config); + } + else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. + out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + if (seqlenq_ngroups_swapped) { + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + } + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; +} diff --git a/mha_varlen_bwd.cpp b/mha_varlen_bwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8eabab154746c4b063df9aabba23b391705ff47 --- /dev/null +++ b/mha_varlen_bwd.cpp @@ -0,0 +1,406 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_bwd.hpp" +#include "mask.hpp" + +fmha_bwd_traits get_ck_fmha_varlen_bwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool enable_alibi) +{ + return fmha_bwd_traits{head_size, + head_size, + dtype, + true, // is_group_mode + mask.type, + enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + false, // has_dbias + has_dropout}; +} + +fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, + // sizes + const int b, + const int max_seqlen_q, + const int max_seqlen_k, + const int h, + const int h_k, + const int hdim, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + const at::Tensor seqlens_q, + const at::Tensor seqlens_k, + c10::optional &alibi_slopes_, + const at::Tensor out, + const at::Tensor softmax_lse, + const at::Tensor dout, + at::Tensor d, + at::Tensor dq, + at::Tensor dk, + at::Tensor dv, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (total_q, nheads, hdim) + // k: (total_k, nheads_k, hdim) + // v: (total_k, nheads_k, hdim) + // o: (total_q, nheads, hdim) + // dq: (total_q, nheads, hdim) + // dk_expanded: (total_k, nheads, hdim) + // dv_expanded: (total_k, nheads, hdim) + // do: (total_q, nheads, hdim) + + // alibi_slopes:(batch_size, nheads) or (nhead) + // lse: (batch_size, nheads, max_seqlen_q) + // d: (batch_size, nheads, max_seqlen_q) + + ck_tile::index_t total_q = q.size(0); + ck_tile::index_t total_k = k.size(0); + + ck_tile::index_t stride_q = q.stride(0); + ck_tile::index_t stride_k = k.stride(0); + ck_tile::index_t stride_v = v.stride(0); + ck_tile::index_t stride_o = out.stride(0); + ck_tile::index_t stride_do = dout.stride(0); + ck_tile::index_t stride_dk = dk.stride(0); + ck_tile::index_t stride_dv = dv.stride(0); + + ck_tile::index_t nhead_stride_q = q.stride(1); + ck_tile::index_t nhead_stride_k = k.stride(1); + ck_tile::index_t nhead_stride_v = v.stride(1); + ck_tile::index_t nhead_stride_o = out.stride(1); + ck_tile::index_t nhead_stride_do = dout.stride(1); + ck_tile::index_t nhead_stride_lse = softmax_lse.stride(1); + + ck_tile::index_t batch_stride_q = 0; + ck_tile::index_t batch_stride_k = 0; + ck_tile::index_t batch_stride_v = 0; + ck_tile::index_t batch_stride_o = 0; + ck_tile::index_t batch_stride_do = 0; + ck_tile::index_t batch_stride_lse = softmax_lse.stride(0);; + ck_tile::index_t batch_stride_dk = 0; + ck_tile::index_t batch_stride_dv = 0; + + float p_undrop = 1.0 - p_dropout; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? 
alibi_slopes.stride(0) : 0; + } + + return fmha_bwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + out.data_ptr(), + softmax_lse.data_ptr(), + dout.data_ptr(), + d.data_ptr(), + nullptr, // rand_val + dq.data_ptr(), + dk.data_ptr(), + dv.data_ptr(), + nullptr, // dbias + seqlens_q.data_ptr(), // seqstart_q + seqlens_k.data_ptr(), // seqstart_k + nullptr, // seqlen_k_ptr + total_q, + total_k, + b, + max_seqlen_q, // max_seqlen_q + max_seqlen_k, // max_seqlen_k + hdim, // hdim_q + hdim, // hdim_v + h, // nhead + h_k, // nhead_k + softmax_scale, + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_o, + 0, // stride_randval + stride_do, + stride_dk, + stride_dv, + 0, // stride_dbias, FA without bias + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_o, + 0, // nhead_stride_randval + nhead_stride_do, + nhead_stride_lse, + 0, // nhead_stride_dbias, FA without dbias + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0 , // batch_stride_bias, FA without bias + batch_stride_o, + 0, // batch_stride_randval + batch_stride_do, + batch_stride_lse, + batch_stride_dk, + batch_stride_dv, + 0 , // batch_stride_dbias, FA without dbias + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + p_undrop, + false, // s_randval + {drop_seed, drop_offset}}; +} + +std::vector +mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads x head_size + const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &out, // total_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x s softmax logsumexp + c10::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + c10::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + c10::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional &alibi_slopes_, // num_heads or b x num_heads + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool deterministic, + c10::optional gen_, + c10::optional &rng_state) +{ +#ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); +#endif + if (is_causal) { window_size_right = 0; } + + bool is_dropout = p_dropout > 0.0; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == 
torch::kInt32, "cu_seqlens_k must have dtype int32"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? "fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int total_q = sizes[0]; + const int batch_size = cu_seqlens_q.numel() - 1; + const int num_heads = sizes[1]; + const int head_size_og = dout.size(2); + const int head_size_8x = sizes[2]; + const int total_k = k.size(0); + const int num_heads_k = k.size(1); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_8x % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size_8x <= 128, "CK FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + TORCH_CHECK(head_size_8x == round_multiple(head_size_og, 8), "head_size_8x must be head_size_og rounded to a multiple of 8"); + + if (window_size_left >= max_seqlen_k) { window_size_left = -1; } + if (window_size_right >= max_seqlen_k) { window_size_right = -1; } + + mask_info mask; + if (is_causal) { + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", max_seqlen_q, max_seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
+ std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // local + } + + // q, k, v, out had been padded in mha_fwd + // dq_, dk_, dv_ are also padded tensor + CHECK_SHAPE(q, total_q, num_heads, head_size_8x); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_8x); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_8x); + CHECK_SHAPE(out, total_q, num_heads, head_size_8x); + CHECK_SHAPE(dout, total_q, num_heads, head_size_og); + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, total_q, num_heads, head_size_8x); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, total_k, num_heads_k, head_size_8x); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, total_k, num_heads_k, head_size_8x); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_d = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat)); + // TODO - CK does not support dq_accum + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({total_k, num_heads, head_size_8x}, opts); + dv_expanded = torch::empty({total_k, num_heads, head_size_8x}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + if(zero_tensors) { + dq.zero_(); + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + + if (rng_state.has_value()) { + uint64_t* d = reinterpret_cast(rng_state.value().data_ptr()); + drop_seed = d[0]; + drop_offset = d[1]; + } else if(is_dropout) { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + if (max_seqlen_q > 0) { + ck_tile::stream_config stream_config{stream}; + dq.zero_(); // ck use atomic operation on dq + + auto traits = + get_ck_fmha_varlen_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_varlen_bwd_args( + mask, + batch_size, + max_seqlen_q, + max_seqlen_k, + num_heads, + num_heads_k, + head_size_8x, + q, + k, + v, + cu_seqlens_q, + 
cu_seqlens_k, + alibi_slopes_, + out, + softmax_lse, + dout_padded, + softmax_d, + dq, + dk_expanded, + dv_expanded, + softmax_scale, + p_dropout, + drop_seed, + drop_offset); + + fmha_bwd(traits, args, stream_config); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {2}); + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {2}); + } + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d }; +} \ No newline at end of file diff --git a/mha_varlen_fwd.cpp b/mha_varlen_fwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2d2f4cfef1643bc796add7884d5a57a255d6218d --- /dev/null +++ b/mha_varlen_fwd.cpp @@ -0,0 +1,371 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_fwd.hpp" +#include "mask.hpp" + +fmha_fwd_traits get_ck_fmha_varlen_fwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool has_lse, + bool enable_alibi) +{ + return fmha_fwd_traits{head_size, + head_size, + dtype, + true, // is_group_mode + true, // is_v_rowmajor + mask.type, + enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + has_lse, + has_dropout, + false}; // do_fp8_static_quant +} + +fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse, + bool has_dropout_randval, + const mask_info &mask, + // sizes + const int b, + const int max_seqlen_q, + const int h, + const int h_k, + const int d, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + const at::Tensor seqlens_q, + const at::Tensor seqlens_k, + c10::optional &alibi_slopes_, + at::Tensor out, + at::Tensor softmax_lse, + at::Tensor dropout_randval, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (total_q, nheads, d) + // k: (total_k, nheads_k, d) + // v: (total_k, nheads_k, d) + // o: (total_q, nheads, d) + + // alibi_slopes:(batch, nheads) or (nhead) + // lse: (batch, nheads, max_seqlen_q) + // randval: (nheads, total_q, max_seqlen_k) + + ck_tile::index_t total_q = q.size(0); + ck_tile::index_t total_k = k.size(0); + + ck_tile::index_t stride_q = q.stride(0); + ck_tile::index_t stride_k = k.stride(0); + ck_tile::index_t stride_v = v.stride(0); + ck_tile::index_t stride_o = out.stride(0); + ck_tile::index_t stride_randval = has_dropout_randval ? dropout_randval.stride(1) : 0; + + ck_tile::index_t nhead_stride_q = q.stride(1); + ck_tile::index_t nhead_stride_k = k.stride(1); + ck_tile::index_t nhead_stride_v = v.stride(1); + ck_tile::index_t nhead_stride_o = out.stride(1); + ck_tile::index_t nhead_stride_lse = has_lse ? softmax_lse.stride(1) : 0; + ck_tile::index_t nhead_stride_randval = has_dropout_randval ? 
dropout_randval.stride(0) : 0; + + ck_tile::index_t batch_stride_q = 0; + ck_tile::index_t batch_stride_k = 0; + ck_tile::index_t batch_stride_v = 0; + ck_tile::index_t batch_stride_o = 0; + + ck_tile::index_t batch_stride_lse = has_lse ? softmax_lse.stride(0) : 0; + ck_tile::index_t batch_stride_randval = 0; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + } + + return fmha_fwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + has_dropout_randval ? dropout_randval.data_ptr() : nullptr, + nullptr, // lse_acc + nullptr, // o_acc + has_lse ? softmax_lse.data_ptr() : nullptr, + out.data_ptr(), + seqlens_q.data_ptr(), // seqstart_q + seqlens_k.data_ptr(), // seqstart_k + nullptr, // seqlen_kpads + total_q, + total_k, + b, + max_seqlen_q, + d, // hdim_q + d, // hdim_v + h, // nhead + h_k, // nhead_k + 1, // num_splits + softmax_scale, // scale_s + 1, // scale_p + 1, // scale_o + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_randval, + 0, // stride_o_acc, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_randval, + nhead_stride_lse, + 0, // nhead_stride_lse_acc + 0, // nhead_stride_o_acc + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0, // batch_stride_bias, FA without bias + batch_stride_randval, + batch_stride_lse, + 0, // batch_stride_lse_acc + 0, // batch_stride_o_acc + batch_stride_o, + 0, // split_stride_lse_acc + 0, // split_stride_o_acc + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + has_dropout_randval, + {drop_seed, drop_offset}}; +} + +std::vector +mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. 
+ c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional & /*seqused_k*/, + c10::optional &/*leftpad_k_*/, // batch_size + c10::optional &block_table_, // batch_size x max_num_blocks_per_seq + c10::optional &alibi_slopes_, // num_heads or b x num_heads + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool return_dropout_randval, + c10::optional gen_) +{ + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? "fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(cu_seqlens_q); + CHECK_DEVICE(cu_seqlens_k); + + // TODO - Support paged_KV + const bool paged_KV = block_table_.has_value(); + TORCH_CHECK(!paged_KV, "CK does not support paged_KV yet"); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int batch_size = cu_seqlens_q.numel() - 1; + int num_heads = sizes[1]; + const int head_size_og = sizes[2]; + const int num_heads_k = k.size(1); + + const int max_num_blocks_per_seq = 0; + const int num_blocks = 0; + + if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } // causal=true is the same as causal=false in this case + + // TODO + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + + const int total_q = q.size(0); + const int total_k = k.size(0); + + TORCH_CHECK(batch_size > 0, "batch size must be postive"); + TORCH_CHECK(head_size_og <= 256, "CK only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (window_size_left >= max_seqlen_k) { window_size_left = -1; } + if (window_size_right >= max_seqlen_k) { window_size_right = -1; } + + mask_info mask; + + if (is_causal) { + // Causal is the special case where window_size_right == 0 and window_size_left < 0. + window_size_right = 0; + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", max_seqlen_q, max_seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
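+        // mask_info::decode parses the "b:left,right" string into the mask type and window
+        // bounds (mask.left / mask.right) that are forwarded to the CK kernel arguments below.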
+ std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // local + } + + CHECK_SHAPE(q, total_q, num_heads, head_size_og); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } + else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, total_q, num_heads, head_size_og); + + if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + } + else { + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_8x = round_multiple(head_size_og, 8); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + bool has_lse = true; + bool has_dropout = p_dropout > 0.0f; + + at::Tensor softmax_lse; + // TODO - check gradient, only training require lse + softmax_lse = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(torch::kFloat32)); + + at::Tensor p; + if (return_dropout_randval) { + TORCH_CHECK(has_dropout, "return_dropout_randval require p_dropout > 0"); + p = torch::empty({num_heads, total_q, max_seqlen_k}, opts.dtype(torch::kUInt8)); + } + + if (zero_tensors) + { + out.zero_(); + softmax_lse.fill_(-std::numeric_limits::infinity()); + if (return_dropout_randval) {p.zero_();} + } + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + + if (p_dropout > 0.0) { + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + rng_state[0] = *(reinterpret_cast(&drop_seed)); + rng_state[1] = *(reinterpret_cast(&drop_offset)); + + if (max_seqlen_k > 0) { + auto stream = at::cuda::getCurrentHIPStream().stream(); + ck_tile::stream_config stream_config{stream}; + + auto traits = + get_ck_fmha_varlen_fwd_traits(mask, q_dtype_str, head_size_8x, has_dropout, has_lse, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_varlen_fwd_args( + has_lse, + return_dropout_randval, + mask, + batch_size, + max_seqlen_q, + num_heads, + num_heads_k, + head_size_8x, + q_padded, + k_padded, + v_padded, + cu_seqlens_q, + cu_seqlens_k, + alibi_slopes_, + 
out, + softmax_lse, + p, + softmax_scale, + p_dropout, + drop_seed, + drop_offset); + + fmha_fwd(traits, args, stream_config); + } + else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. + out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; +} diff --git a/mlflow.yaml b/mlflow.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfb3781b1591fc1d3190101da06dbce160396130 --- /dev/null +++ b/mlflow.yaml @@ -0,0 +1,10 @@ +# https://mlflow.org + +mlflow: + _target_: pytorch_lightning.loggers.mlflow.MLFlowLogger + experiment_name: ${name} + tracking_uri: null + tags: null + save_dir: ./mlruns + prefix: "" + artifact_location: null diff --git a/mlp.py b/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..b795310f1c8afc8203124597bb6ca70f1af7ed11 --- /dev/null +++ b/mlp.py @@ -0,0 +1,149 @@ +# The triton fused matmul + sqrelu is faster for fp16 but slower for bf16, compared +# to naive implementation. +import fused_dense_lib as fused_dense_cuda +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.cuda.amp import custom_bwd, custom_fwd + +from flash_attn.ops.activations import sqrelu_bwd, sqrelu_fwd +from flash_attn.ops.triton.linear import triton_dgrad_act, triton_linear_act + + +class FusedDenseSqreluDenseFunc(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward(ctx, x, weight1, bias1, weight2, bias2, checkpoint_lvl=0): + """checkpoint_lvl: + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute act_input and gelu_out in the bwd + """ + if torch.is_autocast_enabled(): + dtype = torch.get_autocast_gpu_dtype() + x, weight1, bias1, weight2, bias2 = [ + a.to(dtype=dtype) for a in [x, weight1, bias1, weight2, bias2] + ] + is_bf16 = x.dtype == torch.bfloat16 + assert checkpoint_lvl in [0, 1, 2] + x = x.contiguous() + weight1 = weight1.contiguous() + bias1 = bias1.contiguous() + weight2 = weight2.contiguous() + bias2 = bias2.contiguous() + batch_shape, n = x.shape[:-1], x.shape[-1] + batch_dim = batch_shape.numel() + if is_bf16: + act_input = fused_dense_cuda.linear_bias_forward( + x.reshape(batch_dim, n), weight1, bias1 + ) + output1 = sqrelu_fwd(act_input) + else: + save_act_input = checkpoint_lvl != 2 + result = triton_linear_act( + x.reshape(batch_dim, n), + weight1, + bias1, + activation="squared_relu", + save_act_input=save_act_input, + ) + if save_act_input: + output1, act_input = result + else: + output1 = result + output2 = fused_dense_cuda.linear_bias_forward(output1, weight2, bias2) + ctx.checkpoint_lvl = checkpoint_lvl + if checkpoint_lvl == 0: + ctx.save_for_backward(x, weight1, bias1, weight2, act_input, output1) + elif checkpoint_lvl == 1: + ctx.save_for_backward(x, weight1, bias1, weight2, act_input) + elif checkpoint_lvl == 2: + ctx.save_for_backward(x, weight1, bias1, weight2) + return output2.reshape(*batch_shape, output2.shape[-1]) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + grad_output = grad_output.contiguous() + checkpoint_lvl = ctx.checkpoint_lvl + x, weight1, bias1, weight2, *rest = ctx.saved_tensors + batch_shape, n = x.shape[:-1], x.shape[-1] + batch_dim = batch_shape.numel() + is_bf16 = 
x.dtype == torch.bfloat16 + if checkpoint_lvl == 0: + act_input, output1 = rest + elif checkpoint_lvl == 1: + (act_input,) = rest + output1 = sqrelu_fwd(act_input) + elif checkpoint_lvl == 2: + if is_bf16: + act_input = fused_dense_cuda.linear_bias_forward( + x.reshape(batch_dim, n), weight1, bias1 + ) + output1 = sqrelu_fwd(act_input) + else: + output1, act_input = triton_linear_act( + x.reshape(batch_dim, n), + weight1, + bias1, + activation="squared_relu", + save_act_input=True, + ) + + if is_bf16: + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad(output1, grad_output) + grad_output1 = grad_output @ weight2 + grad_act_input = sqrelu_bwd(grad_output1, act_input) + grad_input, grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_backward( + x.reshape(batch_dim, n), weight1, grad_act_input + ) + else: + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad(output1, grad_output) + grad_act_input = triton_dgrad_act( + grad_output, weight2, activation="squared_relu", act_input=act_input + ) + grad_input, grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_backward( + x.reshape(batch_dim, n), weight1, grad_act_input + ) + return grad_input.reshape_as(x), grad_weight1, grad_bias1, grad_weight2, grad_bias2, None + + +fused_dense_sqrelu_dense_function = FusedDenseSqreluDenseFunc.apply + + +class FusedDenseSqreluDense(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + bias1=True, + bias2=True, + checkpoint_lvl=0, + device=None, + dtype=None, + ): + """ + checkpoint_lvl (increasing lvl means slower but more memory saving): + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute gelu_in and gelu_out in the bwd + """ + assert checkpoint_lvl in [0, 1, 2] + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features * 4 + assert bias1 == True, "DenseSqreluDense module without bias is currently not supported" + assert bias2 == True, "DenseSqreluDense module without bias is currently not supported" + self.checkpoint_lvl = checkpoint_lvl + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs) + + def forward(self, x): + assert x.is_cuda + return fused_dense_sqrelu_dense_function( + x, self.fc1.weight, self.fc1.bias, self.fc2.weight, self.fc2.bias, self.checkpoint_lvl + ) diff --git a/model-summary.yaml b/model-summary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3dba049adc7820fe47b84d7eaeb660e5d0254cdb --- /dev/null +++ b/model-summary.yaml @@ -0,0 +1,2 @@ +model_summary: + _target_: pytorch_lightning.callbacks.RichModelSummary diff --git a/mse.yaml b/mse.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50b0484d9d0213bfa18b3e5b3e40047b990d0e02 --- /dev/null +++ b/mse.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +mse: + _target_: torchmetrics.MeanSquaredError diff --git a/multi-step.yaml b/multi-step.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42cd60716a4654f469aa7d6384098bc1068f381c --- /dev/null +++ b/multi-step.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: torch.optim.lr_scheduler.MultiStepLR diff --git a/named_barrier.hpp b/named_barrier.hpp new file 
mode 100644 index 0000000000000000000000000000000000000000..58c080f4ab151f6096fdb07f4b3d320d69545027 --- /dev/null +++ b/named_barrier.hpp @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cutlass/arch/barrier.h" + +namespace flash { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Enumerates the reserved named barriers to avoid potential conflicts + +enum class FwdNamedBarriers { + QueryEmpty = 0, + ValueEmpty = 1, + TileCountSmemEmpty = 2, + TileCountSmemFull = 3, + WarpSchedulerWG1 = 4, + WarpSchedulerWG2 = 5, + WarpSchedulerWG3 = 6, + ProducerWG = 7 +}; + +enum class BwdNamedBarriers { + QueryEmpty = 0, + KVEmpty = 1, + TileCountSmemEmpty = 2, + TileCountSmemFull = 3, + // WarpSchedulerWG1 = 4, + // WarpSchedulerWG2 = 5, + dQEmptyWG1 = 4, + dQEmptyWG2 = 5, + dSFull = 6, + // dSEmptyWG1 = 7, + // dSEmptyWG2 = 8, + dQEmpty = 7, + dQFull = 8, +}; + +} // flash diff --git a/neptune.yaml b/neptune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..117af9379319ff9b000311b863ddf42fe08b1b67 --- /dev/null +++ b/neptune.yaml @@ -0,0 +1,11 @@ +# https://neptune.ai + +neptune: + _target_: pytorch_lightning.loggers.neptune.NeptuneLogger + api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable + project_name: your_name/template-tests + close_after_fit: True + offline_mode: False + experiment_name: ${name} + experiment_id: null + prefix: "" diff --git a/none.yaml b/none.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/norm-monitor.yaml b/norm-monitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4c6e2ccb0c6418f083c387f887c01a0702b7fc5 --- /dev/null +++ b/norm-monitor.yaml @@ -0,0 +1,2 @@ +norm_monitor: + _target_: src.callbacks.norm_monitor.NormMonitor diff --git a/num-tokens.yaml b/num-tokens.yaml new file mode 100644 index 0000000000000000000000000000000000000000..047d423541e83ee8c8bc7debf3893507e89a4afb --- /dev/null +++ b/num-tokens.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +num-tokens: + _target_: src.metrics.num_tokens.NumTokens diff --git a/openwebtext.yaml b/openwebtext.yaml new file mode 100644 index 0000000000000000000000000000000000000000..327decbd86fff635e83ce45bfc7f2f088ab3c257 --- /dev/null +++ b/openwebtext.yaml @@ -0,0 +1,15 @@ +_target_: src.datamodules.language_modeling_hf.LMDataModule +dataset_name: openwebtext +dataset_config_name: null +tokenizer_name: gpt2 +cache_dir: ${oc.env:DATA_DIR,${data_dir}}/openwebtext/cache +max_length: 1024 +val_ratio: 0.0005 +val_split_seed: 2357 +add_eos: True +batch_size: 8 # per GPU +batch_size_eval: ${eval:${.batch_size} * 2} +num_workers: 32 # For preprocessing only +shuffle: True +pin_memory: True +__train_len: ${div_up:9035582198, ${.max_length}} diff --git a/opt.py b/opt.py new file mode 100644 index 0000000000000000000000000000000000000000..501f9eb6cf44be86aeb77a4e0f35048255850c30 --- /dev/null +++ b/opt.py @@ -0,0 +1,116 @@ +# Copyright (c) 2023, Tri Dao. 
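+#
+# Illustrative usage sketch (hypothetical example, not taken from this file): the two helpers
+# below are typically combined to convert a Hugging Face OPT checkpoint into the GPT-2-style
+# layout used here; "facebook/opt-125m" is only a placeholder model name.
+#
+#     from transformers import OPTConfig, OPTForCausalLM
+#
+#     opt_config = OPTConfig.from_pretrained("facebook/opt-125m")
+#     gpt2_config = opt_config_to_gpt2_config(opt_config)
+#     hf_state_dict = OPTForCausalLM.from_pretrained("facebook/opt-125m").state_dict()
+#     state_dict = remap_state_dict_hf_opt(hf_state_dict, gpt2_config)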
+ +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from transformers import GPT2Config, OPTConfig + + +def remap_state_dict_hf_opt(state_dict, config): + def key_mapping_model(key): + key = re.sub(r"^model.decoder.", "transformer.", key) + # The OPT-350m model uses '^decoder' instead of '^model.decoder' + key = re.sub(r"^decoder.", "transformer.", key) + return key + + state_dict = OrderedDict((key_mapping_model(k), v) for k, v in state_dict.items()) + # Word embedding and position embedding + def key_mapping_emb(key): + key = re.sub(r"^transformer.embed_tokens.", "transformer.embeddings.word_embeddings.", key) + # The OPT-350m model uses has project_in and project_out + key = re.sub(r"^transformer.project_in.", "transformer.embeddings.project_in.", key) + key = re.sub(r"^transformer.project_out.", "project_out.", key) + key = re.sub( + r"^transformer.embed_positions.", "transformer.embeddings.position_embeddings.", key + ) + return key + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + # OPT uses the first 2 indices of pos_emb for padding tokens + pos_embeddings = state_dict.pop("transformer.embeddings.position_embeddings.weight") + state_dict["transformer.embeddings.position_embeddings.weight"] = pos_embeddings[2:] + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key) + # The OPT-175B checkpoint calls this 'decoder.layer_norm' instead of 'decoder.final_layer_norm' + key = re.sub(r"^transformer.layer_norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).self_attn_layer_norm.", r"transformer.layers.\1.norm1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).final_layer_norm.", r"transformer.layers.\1.norm2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).fc(1|2).", r"transformer.layers.\1.mlp.fc\2.", key + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"transformer.layers.{l}.self_attn.q_proj.weight") + Wk = state_dict.pop(f"transformer.layers.{l}.self_attn.k_proj.weight") + Wv = state_dict.pop(f"transformer.layers.{l}.self_attn.v_proj.weight") + bq = state_dict.pop(f"transformer.layers.{l}.self_attn.q_proj.bias") + bk = state_dict.pop(f"transformer.layers.{l}.self_attn.k_proj.bias") + bv = state_dict.pop(f"transformer.layers.{l}.self_attn.v_proj.bias") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0) + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).self_attn.out_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + 
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def opt_config_to_gpt2_config(opt_config: OPTConfig) -> GPT2Config: + assert opt_config.layerdrop == 0.0 + assert opt_config.layer_norm_elementwise_affine + word_embed_proj_dim = ( + None + if opt_config.word_embed_proj_dim == opt_config.hidden_size + else opt_config.word_embed_proj_dim + ) + return GPT2Config( + vocab_size=opt_config.vocab_size, + n_positions=opt_config.max_position_embeddings, + n_embd=opt_config.hidden_size, + n_layer=opt_config.num_hidden_layers, + n_head=opt_config.num_attention_heads, + n_inner=opt_config.ffn_dim, + activation_function=opt_config.activation_function, + resid_pdrop=opt_config.dropout, + # HF's implementation of OPT doesn't seem to have embedding dropout + embd_pdrop=opt_config.dropout, + attn_pdrop=opt_config.attention_dropout, + initializer_range=opt_config.init_std, + bos_token_id=opt_config.bos_token_id, + eos_token_id=opt_config.eos_token_id, + # These are new arguments not in the original GPT2Config + prenorm=opt_config.do_layer_norm_before, + word_embed_proj_dim=word_embed_proj_dim, + ) diff --git a/params-log.yaml b/params-log.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2a49dd8d7ade6be883b52f9e335fb574180da30 --- /dev/null +++ b/params-log.yaml @@ -0,0 +1,5 @@ +params_log: + _target_: src.callbacks.params_log.ParamsLog + total_params_log: True + trainable_params_log: True + non_trainable_params_log: True diff --git a/patch_embed.py b/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..05562f8e8bcdb58e947c6f402a49eacd2d031871 --- /dev/null +++ b/patch_embed.py @@ -0,0 +1,67 @@ +# We use the same API as https://github.com/rwightman/pytorch-image-models/blob/v0.6.11/timm/models/layers/patch_embed.py +# But we use nn.Linear instead of Conv2d and it's about 8x faster. 
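+#
+# Minimal usage sketch (illustrative; shapes follow the default arguments below):
+#
+#     import torch
+#     patch_embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
+#     x = torch.randn(2, 3, 224, 224)
+#     tokens = patch_embed(x)  # (2, 196, 768): 224 / 16 = 14 patches per side, 14 * 14 = 196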
+ +from functools import partial + +import torch.nn as nn +from einops import rearrange +from torch import _assert +from torch.nn.modules.utils import _pair + +try: + from flash_attn.ops.fused_dense import FusedDense +except ImportError: + FusedDense = None + + +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + bias=True, + fused_bias_fc=False, + ): + super().__init__() + img_size = _pair(img_size) + patch_size = _pair(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + + linear_cls = nn.Linear if not fused_bias_fc or not bias else FusedDense + self.proj = linear_cls(in_chans * patch_size[0] * patch_size[1], embed_dim, bias=bias) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + _, _, H, W = x.shape + _assert( + H == self.img_size[0], + f"Input image height ({H}) doesn't match model ({self.img_size[0]}).", + ) + _assert( + W == self.img_size[1], + f"Input image width ({W}) doesn't match model ({self.img_size[1]}).", + ) + x = self.proj( + rearrange( + x, + "b c (h p1) (w p2) -> b h w (c p1 p2)", + p1=self.patch_size[0], + p2=self.patch_size[1], + ) + ) + if self.flatten: + x = rearrange(x, "b h w c -> b (h w) c") + x = self.norm(x) + return x diff --git a/perplexity.yaml b/perplexity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2edd2178828315901be9448710f30f145d874375 --- /dev/null +++ b/perplexity.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +ppl: + _target_: src.metrics.perplexity.Perplexity diff --git a/philox.cuh b/philox.cuh new file mode 100644 index 0000000000000000000000000000000000000000..cd7e4d2fae660d568546c4348bb26b6da7f6c297 --- /dev/null +++ b/philox.cuh @@ -0,0 +1,51 @@ +// Pytorch also has an implementation of Philox RNG: https://github.com/pytorch/pytorch/blob/8ca3c881db3e3510fcb7725389f6a0633c9b992c/torch/csrc/jit/tensorexpr/cuda_random.h +#pragma once +// Philox CUDA. 
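+// Counter-based RNG: philox(seed, subsequence, offset) below deterministically expands the
+// (seed, offset) pair saved by the host code in rng_state into four 32-bit random words,
+// which is what allows the dropout mask to be recomputed rather than stored.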
+ +namespace flash { + +struct ull2 { + unsigned long long x; + unsigned long long y; +}; + +__forceinline__ __device__ uint2 mulhilo32(const unsigned int a, const unsigned int b) { + uint2 *res; + unsigned long long tmp; + asm ("mul.wide.u32 %0, %1, %2;\n\t" + : "=l"(tmp) + : "r"(a), "r"(b)); + res = (uint2*)(&tmp); + return *res; +} + +__forceinline__ __device__ uint4 philox_single_round(const uint4 ctr, const uint2 key) { + constexpr unsigned long kPhiloxSA = 0xD2511F53; + constexpr unsigned long kPhiloxSB = 0xCD9E8D57; + uint2 res0 = mulhilo32(kPhiloxSA, ctr.x); + uint2 res1 = mulhilo32(kPhiloxSB, ctr.z); + uint4 ret = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x}; + return ret; +} + +__forceinline__ __device__ uint4 philox(unsigned long long seed, + unsigned long long subsequence, + unsigned long long offset) { + constexpr unsigned long kPhilox10A = 0x9E3779B9; + constexpr unsigned long kPhilox10B = 0xBB67AE85; + uint2 key = reinterpret_cast(seed); + uint4 counter; + ull2 *tmp = reinterpret_cast(&counter); + tmp->x = offset; + tmp->y = subsequence; + #pragma unroll + for (int i = 0; i < 6; i++) { + counter = philox_single_round(counter, key); + key.x += (kPhilox10A); + key.y += (kPhilox10B); + } + uint4 output = philox_single_round(counter, key); + return output; +} + +} // namespace flash diff --git a/plateau.yaml b/plateau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..436c264dc2acd534a333e0bc4b28f93139d8ec7c --- /dev/null +++ b/plateau.yaml @@ -0,0 +1,9 @@ +# @package _global_ +train: + scheduler_interval: epoch + scheduler_monitor: ??? + scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + factor: 0.2 # Decay factor when ReduceLROnPlateau is used + patience: 20 + min_lr: 0.0 # Minimum learning rate during annealing diff --git a/poly-warmup.yaml b/poly-warmup.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79808ea42c6c1d851ef94da812f7c2d448ec6ca0 --- /dev/null +++ b/poly-warmup.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: transformers.get_polynomial_decay_schedule_with_warmup diff --git a/pretrained.py b/pretrained.py new file mode 100644 index 0000000000000000000000000000000000000000..40e76bd2692335c7f474f6b6479be67eb95f8d20 --- /dev/null +++ b/pretrained.py @@ -0,0 +1,79 @@ +import os +from functools import partial + +import torch +from safetensors.torch import load_file as safe_load_file +from transformers.utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, +) +from transformers.utils.hub import cached_file, get_checkpoint_shard_files + + +def state_dict_from_pretrained(model_name, device=None, dtype=None): + # If not fp32, then we don't want to load directly to the GPU + mapped_device = "cpu" if dtype not in [torch.float32, None] else device + is_sharded = False + load_safe = False + resolved_archive_file = None + + weights_path = os.path.join(model_name, WEIGHTS_NAME) + weights_index_path = os.path.join(model_name, WEIGHTS_INDEX_NAME) + safe_weights_path = os.path.join(model_name, SAFE_WEIGHTS_NAME) + safe_weights_index_path = os.path.join(model_name, SAFE_WEIGHTS_INDEX_NAME) + + if os.path.isfile(weights_path): + resolved_archive_file = cached_file( + model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False + ) + elif os.path.isfile(weights_index_path): + resolved_archive_file = cached_file( + model_name, WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False + ) + is_sharded = True + elif 
os.path.isfile(safe_weights_path): + resolved_archive_file = cached_file( + model_name, SAFE_WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False + ) + load_safe = True + elif os.path.isfile(safe_weights_index_path): + resolved_archive_file = cached_file( + model_name, SAFE_WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False + ) + is_sharded = True + load_safe = True + else: # Try loading from HF hub instead of from local files + resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, + _raise_exceptions_for_missing_entries=False) + if resolved_archive_file is None: + resolved_archive_file = cached_file(model_name, WEIGHTS_INDEX_NAME, + _raise_exceptions_for_missing_entries=False) + if resolved_archive_file is not None: + is_sharded = True + + if resolved_archive_file is None: + raise EnvironmentError(f"Model name {model_name} was not found.") + + if load_safe: + loader = partial(safe_load_file, device=mapped_device) + else: + loader = partial(torch.load, map_location=mapped_device) + + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different + # checkpoint shards in this case. + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + model_name, resolved_archive_file + ) + state_dict = {} + for sharded_file in resolved_archive_file: + state_dict.update(loader(sharded_file)) + else: + state_dict = loader(resolved_archive_file) + # Convert dtype before moving to GPU to save memory + if dtype is not None: + state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()} + state_dict = {k: v.to(device=device) for k, v in state_dict.items()} + return state_dict diff --git a/profile.yaml b/profile.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6c547a43fc7932cb9aa657a6f1a8e492a09ebbe --- /dev/null +++ b/profile.yaml @@ -0,0 +1,31 @@ +# @package _global_ +# Run the Pytorch profiler + +trainer: + profiler: + _target_: pytorch_lightning.profilers.PyTorchProfiler + dirpath: ${hydra.run.dir} + schedule: + _target_: torch.profiler.schedule + wait: 5 + warmup: 5 + active: 5 + use_cuda: True + max_steps: 20 + +logger: + wandb: + mode: disabled + +callbacks: + model_checkpoint: null + model_checkpoint_progress: null + early_stopping: null + +hydra: + # sets output paths for all file logs to 'logs/profile/' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/profile/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/profile/multirun_${now:%Y-%m-%d_%H-%M-%S} + subdir: ${hydra.job.num} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..3201555763efa5ab9159a3e58d0dd43ff79daffb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.black] +line-length = 100 +target-version = ['py38'] \ No newline at end of file diff --git a/rms_norm.py b/rms_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..068348d61290e3839dd082b540d898578ba1e8e2 --- /dev/null +++ b/rms_norm.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022, Tri Dao. 
+# Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py + +import torch +from torch.nn import init + +from flash_attn.ops.layer_norm import ( + DropoutAddLayerNormFn, + DropoutAddLayerNormParallelResidualFn, + DropoutAddLayerNormSubsetFn, +) + + +def rms_norm(x, weight, epsilon): + return DropoutAddLayerNormFn.apply( + x, None, weight, None, None, None, 0.0, epsilon, False, False, True + ) + + +def dropout_add_rms_norm( + x0, + residual, + weight, + bias, + dropout_p, + epsilon, + rowscale=None, + layerscale=None, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + """residual_in_fp32 only has an effect if residual is None. + Otherwise residual dtype is residual.dtype. + """ + return DropoutAddLayerNormFn.apply( + x0, + residual, + weight, + bias, + rowscale, + layerscale, + dropout_p, + epsilon, + residual_in_fp32, + prenorm, + True, + return_dropout_mask, + ) + + +def dropout_add_rms_norm_subset( + x0, + residual, + weight, + bias, + dropout_p, + epsilon, + layerscale=None, + x0_subset=None, + out_subset=None, + rowscale_const=1.0, + out_numrows=0, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + """residual_in_fp32 only has an effect if residual is None. + Otherwise residual dtype is residual.dtype. + """ + return DropoutAddLayerNormSubsetFn.apply( + x0, + residual, + weight, + bias, + layerscale, + x0_subset, + out_subset, + dropout_p, + epsilon, + rowscale_const, + out_numrows, + residual_in_fp32, + prenorm, + True, + return_dropout_mask, + ) + + +def dropout_add_rms_norm_parallel_residual( + x0, + x1, + residual, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + """residual_in_fp32 only has an effect if residual is None. + Otherwise residual dtype is residual.dtype. 
+ """ + return DropoutAddLayerNormParallelResidualFn.apply( + x0, + x1, + residual, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + residual_in_fp32, + prenorm, + True, + return_dropout_mask, + ) + + +class RMSNorm(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + init.ones_(self.weight) + + def forward(self, x): + return rms_norm(x, self.weight, self.eps) + + +class DropoutAddRMSNorm(torch.nn.Module): + def __init__( + self, + hidden_size, + prenorm=False, + p=0.0, + eps=1e-5, + residual_in_fp32=False, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.prenorm = prenorm + self.p = p + self.eps = eps + self.residual_in_fp32 = residual_in_fp32 + self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + init.ones_(self.weight) + + def forward(self, x0, residual=None): + return dropout_add_rms_norm( + x0, + residual, + self.weight, + None, + self.p if self.training else 0.0, + self.eps, + prenorm=self.prenorm, + residual_in_fp32=self.residual_in_fp32, + ) diff --git a/rotary.cpp b/rotary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2a3cf0f75010dd2988f3f173b4be6379c166277 --- /dev/null +++ b/rotary.cpp @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#include +#include + +#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) 
TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2, + const torch::Tensor cos, const torch::Tensor sin, + torch::Tensor out1, torch::Tensor out2, + const bool conj); + +void apply_rotary(const torch::Tensor x1, const torch::Tensor x2, + const torch::Tensor cos, const torch::Tensor sin, + torch::Tensor out1, torch::Tensor out2, + const bool conj) { + CHECK_DEVICE(x1); CHECK_DEVICE(x2); + CHECK_DEVICE(cos); CHECK_DEVICE(sin); + CHECK_DEVICE(out1); CHECK_DEVICE(out1); + TORCH_CHECK(x1.dtype() == x2.dtype()); + TORCH_CHECK(cos.dtype() == sin.dtype()); + TORCH_CHECK(out1.dtype() == out2.dtype()); + TORCH_CHECK(x1.dtype() == cos.dtype()); + TORCH_CHECK(x1.dtype() == out1.dtype()); + TORCH_CHECK(x1.sizes() == x2.sizes()); + TORCH_CHECK(cos.sizes() == sin.sizes()); + TORCH_CHECK(out1.sizes() == out2.sizes()); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x1.get_device()}; + + apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("apply_rotary", &apply_rotary, "Apply rotary embedding"); +} diff --git a/rotary.h b/rotary.h new file mode 100644 index 0000000000000000000000000000000000000000..7f1614ad24872f3edc1798a008cc94cb14dff5a0 --- /dev/null +++ b/rotary.h @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include + +#include "utils.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void copy_rotary_interleaved(Tensor const &S, + Tensor &D, + Tensor const &Cos, + Tensor const &Sin, + Tensor const &identity_MN, + const int max_MN, const int min_MN, + const int dim, const int rotary_dim) { + CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{}); + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Cos)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Cos)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Sin)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Sin)); // MMA_K + CUTE_STATIC_ASSERT_V(size<0>(Cos) == size<0>(Sin)); // MMA_K + static_assert(decltype(size<0>(S))::value == decltype(size<0>(Cos))::value * 2); + static_assert(decltype(size<0>(Cos))::value % 2 == 0); // Since we do fast conversion from fp16/bf16 to fp32 + Tensor rCos = make_fragment_like(Cos); + Tensor rSin = make_fragment_like(Sin); + Tensor rS = make_fragment_like(S); + #pragma unroll + for (int m = 0; m < size<1>(S); ++m) { + if (get<0>(identity_MN(0, m, 0)) >= min_MN && get<0>(identity_MN(0, m, 0)) < max_MN) { + #pragma unroll + for (int k = 0; k < size<2>(S); ++k) { + if (Is_even_K || get<1>(identity_MN(0, 0, k)) < dim) { + cute::copy(S(_, m, k), rS(_, m, k)); + if (get<1>(identity_MN(0, 0, k)) < rotary_dim) { + cute::copy(Cos(_, m, k), rCos(_, m, k)); + cute::copy(Sin(_, m, k), rSin(_, m, 
k)); + Tensor S_fp32 = convert_type(rS(_, m, k)); + Tensor cos_fp32 = convert_type(rCos(_, m, k)); + Tensor sin_fp32 = convert_type(rSin(_, m, k)); + #pragma unroll + for (int i = 0; i < size<0>(rS) / 2; ++i) { + float real = S_fp32(2 * i) * cos_fp32(i) - S_fp32(2 * i + 1) * sin_fp32(i); + float imag = S_fp32(2 * i) * sin_fp32(i) + S_fp32(2 * i + 1) * cos_fp32(i); + S_fp32(2 * i) = real; + S_fp32(2 * i + 1) = imag; + } + // Idk but I need to copy for the convert_type to work + Tensor S_fp32_copy = make_fragment_like(S_fp32); + cute::copy(S_fp32, S_fp32_copy); + using T = typename Engine0::value_type; + Tensor S_og_type = convert_type(S_fp32_copy); + cute::copy(S_og_type, rS(_, m, k)); + } + cute::copy(rS(_, m, k), D(_, m, k)); + } else if (Clear_OOB_K) { + cute::clear(D(_, m, k)); + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void copy_rotary_contiguous(Tensor const &S, + Tensor &D, + Tensor const &Cos, + Tensor const &Sin, + Tensor const &identity_MN, + const int max_MN, const int min_MN, + const int dim, const int rotary_dim) { + CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{}); + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Cos)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Cos)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Sin)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Sin)); // MMA_K + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(Cos)); // MMA + CUTE_STATIC_ASSERT_V(size<0>(Cos) == size<0>(Sin)); + static_assert(decltype(size<0>(Cos))::value % 2 == 0); // Since we do fast conversion from fp16/bf16 to fp32 + Tensor rCos = make_fragment_like(Cos); + Tensor rSin = make_fragment_like(Sin); + Tensor rS = make_fragment_like(S); + Tensor rS_other = make_fragment_like(rS(_, 0, 0)); + #pragma unroll + for (int m = 0; m < size<1>(S); ++m) { + if (get<0>(identity_MN(0, m, 0)) >= min_MN && get<0>(identity_MN(0, m, 0)) < max_MN) { + #pragma unroll + for (int k = 0; k < size<2>(S); ++k) { + if (Is_even_K || get<1>(identity_MN(0, 0, k)) < dim) { + cute::copy(S(_, m, k), rS(_, m, k)); + if (get<1>(identity_MN(0, 0, k)) < rotary_dim) { + const bool is_left = get<1>(identity_MN(0, 0, k)) < rotary_dim / 2; + Tensor gS_other = make_tensor(S(_, m, k).data() + (is_left ? rotary_dim / 2 : -rotary_dim / 2), S(_, m, k).layout()); + cute::copy(gS_other, rS_other); + // if (cute::thread0()) { print_tensor(rS(_, m, k)); print_tensor(rS_other); } + Tensor gCos = make_tensor(Cos(_, m, k).data() + (is_left ? 0 : -rotary_dim / 2), Cos(_, m, k).layout()); + Tensor gSin = make_tensor(Sin(_, m, k).data() + (is_left ? 0 : -rotary_dim / 2), Sin(_, m, k).layout()); + cute::copy(gCos, rCos(_, m, k)); + cute::copy(gSin, rSin(_, m, k)); + // if (cute::thread0()) { print_tensor(rCos(_, m, k)); print_tensor(rSin(_, m, k)); } + Tensor S_fp32 = convert_type(rS(_, m, k)); + Tensor S_other_fp32 = convert_type(rS_other); + Tensor cos_fp32 = convert_type(rCos(_, m, k)); + Tensor sin_fp32 = convert_type(rSin(_, m, k)); + #pragma unroll + for (int i = 0; i < size<0>(rS); ++i) { + S_fp32(i) = S_fp32(i) * cos_fp32(i) + S_other_fp32(i) * (is_left ? 
-sin_fp32(i) : sin_fp32(i)); + } + // Idk but I need to copy for the convert_type to work + Tensor S_fp32_copy = make_fragment_like(S_fp32); + cute::copy(S_fp32, S_fp32_copy); + using T = typename Engine0::value_type; + Tensor S_og_type = convert_type(S_fp32_copy); + cute::copy(S_og_type, rS(_, m, k)); + // if (cute::thread0()) { print_tensor(rS(_, m, k)); } + } + cute::copy(rS(_, m, k), D(_, m, k)); + } else if (Clear_OOB_K) { + cute::clear(D(_, m, k)); + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/rotary.py b/rotary.py new file mode 100644 index 0000000000000000000000000000000000000000..6c04a523ede814ea075e6773572cb56cac8bff64 --- /dev/null +++ b/rotary.py @@ -0,0 +1,227 @@ +# Copyright (c) 2023, Tri Dao. + +from typing import Optional, Union + +import torch + +import triton +import triton.language as tl + + +@triton.jit +def rotary_kernel( + OUT, # Pointers to matrices + X, + COS, + SIN, + CU_SEQLENS, + SEQLEN_OFFSETS, # this could be int or a pointer + # Matrix dimensions + seqlen, + rotary_dim, + seqlen_ro, + # strides + stride_out_batch, + stride_out_seqlen, + stride_out_nheads, + stride_out_headdim, + stride_x_batch, + stride_x_seqlen, + stride_x_nheads, + stride_x_headdim, + # Meta-parameters + BLOCK_K: tl.constexpr, + IS_SEQLEN_OFFSETS_TENSOR: tl.constexpr, + IS_VARLEN: tl.constexpr, + INTERLEAVED: tl.constexpr, + CONJUGATE: tl.constexpr, + BLOCK_M: tl.constexpr, +): + pid_m = tl.program_id(axis=0) + pid_batch = tl.program_id(axis=1) + pid_head = tl.program_id(axis=2) + rotary_dim_half = rotary_dim // 2 + + if not IS_VARLEN: + X = X + pid_batch * stride_x_batch + pid_head * stride_x_nheads + OUT = OUT + pid_batch * stride_out_batch + pid_head * stride_out_nheads + else: + start_idx = tl.load(CU_SEQLENS + pid_batch) + seqlen = tl.load(CU_SEQLENS + pid_batch + 1) - start_idx + X = X + start_idx * stride_x_seqlen + pid_head * stride_x_nheads + OUT = OUT + start_idx * stride_out_seqlen + pid_head * stride_out_nheads + + if pid_m * BLOCK_M >= seqlen: + return + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + if not IS_SEQLEN_OFFSETS_TENSOR: + rm_cs = rm + SEQLEN_OFFSETS + else: + rm_cs = rm + tl.load(SEQLEN_OFFSETS + pid_batch) + rk = tl.arange(0, BLOCK_K) + rk_half = tl.arange(0, BLOCK_K // 2) + + if not INTERLEAVED: + # Load the 1st and 2nd halves of X, do calculation, then store to 1st and 2nd halves of OUT + X = X + (rm[:, None] * stride_x_seqlen + rk_half[None, :] * stride_x_headdim) + COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :]) + SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :]) + cos = tl.load( + COS, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=1.0 + ).to(tl.float32) + sin = tl.load( + SIN, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=0.0 + ).to(tl.float32) + x0 = tl.load( + X, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), other=0.0 + ).to(tl.float32) + x1 = tl.load( + X + rotary_dim_half * stride_x_headdim, + mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), + other=0.0, + ).to(tl.float32) + if CONJUGATE: + sin = -sin + o0 = x0 * cos - x1 * sin + o1 = x0 * sin + x1 * cos + # write back result + OUT = OUT + (rm[:, None] * stride_out_seqlen + rk_half[None, :] * stride_out_headdim) + tl.store(OUT, o0, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half)) + tl.store( + OUT + rotary_dim_half * 
stride_out_headdim, + o1, + mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), + ) + else: + # We don't want to load X[0, 2, 4, ...] and X[1, 3, 5, ...] separately since both are slow. + # Instead, we load x0 = X[0, 1, 2, 3, ...] and x1 = X[1, 0, 3, 2, ...]. + # Loading x0 will be fast but x1 will be slow. + # Then we load cos = COS[0, 0, 1, 1, ...] and sin = SIN[0, 0, 1, 1, ...]. + # Then we do the calculation and use tl.where to pick put the right outputs for the even + # and for the odd indices. + rk_swap = rk + ((rk + 1) % 2) * 2 - 1 # 1, 0, 3, 2, 5, 4, ... + rk_repeat = tl.arange(0, BLOCK_K) // 2 + X0 = X + (rm[:, None] * stride_x_seqlen + rk[None, :] * stride_x_headdim) + X1 = X + (rm[:, None] * stride_x_seqlen + rk_swap[None, :] * stride_x_headdim) + COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :]) + SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :]) + cos = tl.load( + COS, + mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half), + other=1.0, + ).to(tl.float32) + sin = tl.load( + SIN, + mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half), + other=0.0, + ).to(tl.float32) + x0 = tl.load(X0, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim), other=0.0).to( + tl.float32 + ) + x1 = tl.load( + X1, mask=(rm[:, None] < seqlen) & (rk_swap[None, :] < rotary_dim), other=0.0 + ).to(tl.float32) + if CONJUGATE: + sin = -sin + x0_cos = x0 * cos + x1_sin = x1 * sin + out = tl.where(rk[None, :] % 2 == 0, x0_cos - x1_sin, x0_cos + x1_sin) + OUT = OUT + (rm[:, None] * stride_out_seqlen + rk[None, :] * stride_out_headdim) + tl.store(OUT, out, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim)) + + +def apply_rotary( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + seqlen_offsets: Union[int, torch.Tensor] = 0, + cu_seqlens: Optional[torch.Tensor] = None, + max_seqlen: Optional[int] = None, + interleaved=False, + inplace=False, + conjugate=False, +) -> torch.Tensor: + """ + Arguments: + x: (batch, seqlen, nheads, headdim) if cu_seqlens is None + else (total_seqlen, nheads, headdim). 
+ cos: (seqlen_ro, rotary_dim / 2) + sin: (seqlen_ro, rotary_dim / 2) + seqlen_offsets: integer or integer tensor of size (batch,) + cu_seqlens: (batch + 1,) or None + max_seqlen: int + Returns: + y: (batch, seqlen, nheads, headdim) + """ + is_varlen = cu_seqlens is not None + if not is_varlen: + batch, seqlen, nheads, headdim = x.shape + else: + assert max_seqlen is not None, "If cu_seqlens is passed in, then max_seqlen must be passed" + total_seqlen, nheads, headdim = x.shape + batch_p_1 = cu_seqlens.shape[0] + batch = batch_p_1 - 1 + seqlen = max_seqlen + seqlen_ro, rotary_dim = cos.shape + assert sin.shape == cos.shape + rotary_dim *= 2 + assert rotary_dim <= headdim, "rotary_dim must be <= headdim" + assert headdim <= 256, "Only support headdim <= 256" + assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen" + + assert ( + cos.dtype == sin.dtype + ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}" + assert ( + x.dtype == cos.dtype + ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}" + + cos, sin = cos.contiguous(), sin.contiguous() + if isinstance(seqlen_offsets, torch.Tensor): + assert seqlen_offsets.shape == (batch,) + assert seqlen_offsets.dtype in [torch.int32, torch.int64] + seqlen_offsets = seqlen_offsets.contiguous() + else: + assert seqlen_offsets + seqlen <= seqlen_ro + + output = torch.empty_like(x) if not inplace else x + if rotary_dim < headdim and not inplace: + output[..., rotary_dim:].copy_(x[..., rotary_dim:]) + + BLOCK_K = ( + 32 + if rotary_dim <= 32 + else (64 if rotary_dim <= 64 else (128 if rotary_dim <= 128 else 256)) + ) + grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads) # noqa + BLOCK_M = 4 if interleaved else (8 if rotary_dim <= 64 else 4) + + # Need this, otherwise Triton tries to launch from cuda:0 and we get + # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?) + with torch.cuda.device(x.device.index): + rotary_kernel[grid]( + output, # data ptrs + x, + cos, + sin, + cu_seqlens, + seqlen_offsets, + seqlen, # shapes + rotary_dim, + seqlen_ro, + output.stride(0) if not is_varlen else 0, # batch_strides if not varlen else 0 + output.stride(-3), # seqlen_stride or total_seqlen_stride + output.stride(-2), # nheads_stride + output.stride(-1), # headdim_stride + x.stride(0) if not is_varlen else 0, # batch_strides if not varlen else 0 + x.stride(-3), # seqlen stride or total_seqlen_stride + x.stride(-2), # nheads stride + x.stride(-1), # headdim stride + BLOCK_K, + isinstance(seqlen_offsets, torch.Tensor), + is_varlen, + interleaved, + conjugate, + BLOCK_M, + ) + return output diff --git a/rotary_cuda.cu b/rotary_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..2dd0ff3f6e248183a03e4988f7032a0fac90b1dd --- /dev/null +++ b/rotary_cuda.cu @@ -0,0 +1,45 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. 
+ ******************************************************************************/ + +#include +#include +#include + +void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2, + const torch::Tensor cos, const torch::Tensor sin, + torch::Tensor out1, torch::Tensor out2, + const bool conj) { + auto iter = at::TensorIteratorConfig() + .add_output(out1) + .add_output(out2) + .add_input(x1) + .add_input(x2) + .add_input(cos) + .add_input(sin) + .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(false) + .build(); + + if (!conj) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] { + at::native::gpu_kernel_multiple_outputs( + iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos, + scalar_t sin) -> thrust::tuple { + scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin); + scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos); + return {out1, out2}; + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] { + at::native::gpu_kernel_multiple_outputs( + iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos, + scalar_t sin) -> thrust::tuple { + scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin); + scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos); + return {out1, out2}; + }); + }); + } +} \ No newline at end of file diff --git a/run.py b/run.py new file mode 100644 index 0000000000000000000000000000000000000000..2b22d8e2cc560e8446063e406fd6540fc688292c --- /dev/null +++ b/run.py @@ -0,0 +1,68 @@ +from typing import Callable + +import dotenv +import hydra +from omegaconf import OmegaConf, DictConfig + +# load environment variables from `.env` file if it exists +# recursively searches for `.env` in all folders starting from work dir +dotenv.load_dotenv(override=True) + +OmegaConf.register_new_resolver('eval', eval) +OmegaConf.register_new_resolver('div_up', lambda x, y: (x + y - 1) // y) +# Delay the evaluation until we have the datamodule +# So we want the resolver to yield the same string. +OmegaConf.register_new_resolver('datamodule', lambda attr: '${datamodule:' + str(attr) + '}') + +# Turn on TensorFloat32 +import torch.backends +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + + +def dictconfig_filter_key(d: DictConfig, fn: Callable) -> DictConfig: + """Only keep keys where fn(key) is True. Support nested DictConfig. + """ + # Using d.items_ex(resolve=False) instead of d.items() since we want to keep the + # ${datamodule:foo} unresolved for now. + return DictConfig({k: dictconfig_filter_key(v, fn) if isinstance(v, DictConfig) else v + # for k, v in d.items_ex(resolve=False) if fn(k)}) + for k, v in d.items() if fn(k)}) + + +@hydra.main(config_path="configs/", config_name="config.yaml") +def main(config: DictConfig): + + # Remove config keys that start with '__'. These are meant to be used only in computing + # other entries in the config. 
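+    # For example, the openwebtext datamodule config defines `__train_len` purely so that other
+    # entries can interpolate it; keys like that are filtered out before instantiation:
+    #     dictconfig_filter_key(DictConfig({"lr": 1e-3, "__train_len": 8}),
+    #                           lambda k: not k.startswith("__"))   # -> {"lr": 1e-3}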
+ config = dictconfig_filter_key(config, lambda k: not k.startswith('__')) + + # Imports should be nested inside @hydra.main to optimize tab completion + # Read more here: https://github.com/facebookresearch/hydra/issues/934 + from src.train import train + from src.eval import evaluate + from src.utils import utils + + # A couple of optional utilities: + # - disabling python warnings + # - forcing debug-friendly configuration + # - verifying experiment name is set when running in experiment mode + # You can safely get rid of this line if you don't want those + utils.extras(config) + + # Pretty print config using Rich library + if config.get("print_config"): + utils.print_config(config, resolve=True) + + # Train model + mode = config.get('mode', 'train') + if mode not in ['train', 'eval']: + raise NotImplementedError(f'mode {mode} not supported') + if mode == 'train': + return train(config) + elif mode == 'eval': + return evaluate(config) + + +if __name__ == "__main__": + main() diff --git a/scaled_masked_softmax.h b/scaled_masked_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..14b9f6e4242cf027aebf4d14637aecd1a0c17901 --- /dev/null +++ b/scaled_masked_softmax.h @@ -0,0 +1,528 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? 
b : a; + } +}; + +template +__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Explicit masking + */ +template +__global__ void scaled_masked_softmax_warp_forward( + output_t *dst, + const input_t *src, + const uint8_t *mask, + const acc_t scale, + int micro_batch_size, + int element_count, + int pad_batches) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; + int pad_first_batch = 0; + if (pad_batches != 1) { // bert style + pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH; + } else { // gpt2 style + pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + } + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + mask += pad_first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; + uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 
0 : element_count; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + int itr_idx = i*element_count+it*WARP_SIZE; + copy_vector(temp_data, src + itr_idx); + copy_vector(temp_mask, mask + itr_idx); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (temp_mask[element] != 1) { + elements[i][it + element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -10000.0; + } + } + } else { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + // compute scale value to account for full mask + acc_t scale_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + scale_value[i] = (max_value[i] == -10000.0) ? 0.0 : 1.0; + } + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = elements[i][it + element] * scale_value[i]/ sum[i]; + } + copy_vector(dst + i * element_count + it * WARP_SIZE, out); + } else { + break; + } + } + } +} + +template +__global__ void scaled_masked_softmax_warp_backward( + output_t *gradInput, + input_t *grad, + const input_t *output, + acc_t scale, + int micro_batch_size, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. 
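To make the forward kernel's row-wise math concrete: masked positions are pushed to -10000 before the scaled scores are exponentiated, the row maximum is subtracted for numerical stability, and a row whose maximum is still -10000 (i.e. fully masked) is zeroed through scale_value rather than producing 0/0. The following is a minimal single-row CPU sketch of that computation; the function name and container types are our own illustration, not part of this header.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Single-row CPU reference for the fused forward pass (illustration only):
// out = softmax(scale * x with masked positions set to -10000); fully masked rows -> 0.
std::vector<float> scaled_masked_softmax_row_ref(const std::vector<float>& x,
                                                 const std::vector<uint8_t>& mask,  // 1 means "masked out"
                                                 float scale) {
    const size_t n = x.size();  // assumes a non-empty row
    std::vector<float> e(n);
    for (size_t j = 0; j < n; ++j) {
        e[j] = (mask[j] != 1) ? x[j] * scale : -10000.0f;
    }
    float max_val = e[0];
    for (size_t j = 1; j < n; ++j) {
        max_val = std::max(max_val, e[j]);
    }
    // Mirror of scale_value in the kernel: if the whole row was masked, output zeros rather than NaNs.
    const float scale_value = (max_val == -10000.0f) ? 0.0f : 1.0f;
    float sum = 0.0f;
    for (size_t j = 0; j < n; ++j) {
        e[j] = std::exp(e[j] - max_val);  // subtract the row max before exponentiating
        sum += e[j];
    }
    for (size_t j = 0; j < n; ++j) {
        e[j] = e[j] * scale_value / sum;
    }
    return e;
}
```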
compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector(temp_grad, grad + i * element_count + it * WARP_SIZE); + copy_vector(temp_output, output + i * element_count + it * WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + + acc_t sum[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + sum[i] += grad_reg[i][it]; + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); + } + copy_vector(gradInput + i * element_count + it * WARP_SIZE, out); + } + } + } +} +} // end of anonymous namespace + +int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads){ + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + constexpr int threads_per_block = 128; + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + + return batches_per_block; +} + +template +void dispatch_scaled_masked_softmax_forward( + output_t *dst, + const input_t *src, + const uint8_t *mask, + const input_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads, + int pad_batches) +{ + TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 8192 ); + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(query_seq_len%batches_per_block == 0); + dim3 blocks(query_seq_len/batches_per_block, attn_heads, batches); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 1: // 2 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 2: // 4 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 3: // 8 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 4: // 16 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 5: // 32 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 6: // 64 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 7: // 128 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 8: // 256 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 9: // 512 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 10: // 1024 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 11: // 2048 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 12: // 4096 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 13: // 8192 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + default: + break; + } + } +} + +template +void dispatch_scaled_masked_softmax_backward( + output_t *grad_input, + input_t *grad, + const input_t *output, + const acc_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads) +{ + TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 8192 ); + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = batch_count/batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 1: // 2 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 2: // 4 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 3: // 8 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 4: // 16 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 5: // 32 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 6: // 64 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 7: // 128 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 8: // 256 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 9: // 512 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 10: // 1024 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 11: // 2048 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 12: // 4096 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 13: // 8192 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + default: + break; + } + } +} diff --git a/scaled_masked_softmax_cuda.cu b/scaled_masked_softmax_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..a08e752699c239fc6867ebdac434d3687dabbad8 --- /dev/null +++ b/scaled_masked_softmax_cuda.cu @@ -0,0 +1,121 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
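Both dispatch functions derive their launch shape the same way: key_seq_len is rounded up to a power of two, a warp covers at most 32 of those elements, short rows (at most 128 elements) pack two batches per warp, and a fixed 128-thread block then yields warps_per_block and batches_per_block (the same arithmetic get_batch_per_block exposes). A small standalone sketch of that sizing, with hypothetical names, compiled as ordinary host code:

```cpp
#include <cstdio>

// Hypothetical helper mirroring how the dispatch functions size their launch.
struct LaunchShape {
    int warp_size;
    int batches_per_warp;
    int warps_per_block;
    int batches_per_block;
};

LaunchShape launch_shape_for(int key_seq_len, int threads_per_block = 128) {
    int log2_elements = 0;
    while ((1 << log2_elements) < key_seq_len) ++log2_elements;    // log2_ceil
    const int next_power_of_two = 1 << log2_elements;
    LaunchShape s;
    s.warp_size = next_power_of_two < 32 ? next_power_of_two : 32; // C10_WARP_SIZE is 32 on NVIDIA GPUs
    s.batches_per_warp = next_power_of_two <= 128 ? 2 : 1;
    s.warps_per_block = threads_per_block / s.warp_size;
    s.batches_per_block = s.warps_per_block * s.batches_per_warp;
    return s;
}

int main() {
    const int lens[] = {16, 128, 1000, 8192};
    for (int len : lens) {
        LaunchShape s = launch_shape_for(len);
        printf("key_seq_len=%4d -> warp_size=%2d batches/warp=%d warps/block=%d batches/block=%d\n",
               len, s.warp_size, s.batches_per_warp, s.warps_per_block, s.batches_per_block);
    }
    return 0;
}
```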
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "scaled_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ + return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); +} + + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor) +{ + // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = input.size(0); + const int pad_batches = mask.size(0); + const int attn_heads = input.size(1); + const int query_seq_len = input.size(2); + const int key_seq_len = input.size(3); + TORCH_INTERNAL_ASSERT(key_seq_len <= 8192); + TORCH_INTERNAL_ASSERT(query_seq_len > 1); + TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); + TORCH_INTERNAL_ASSERT(mask.size(1) == 1); + TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); + TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = + torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* mask_ptr = static_cast(mask.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT( + input.scalar_type(), + "dispatch_scaled_masked_softmax_forward", + dispatch_scaled_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + reinterpret_cast(mask_ptr), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads, + pad_batches + ); + ); + return softmax_results; +} + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) { + + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = output_grads.size(0); + const int attn_heads = output_grads.size(1); + const int query_seq_len = output_grads.size(2); + const int key_seq_len = output_grads.size(3); + + auto act_options = output_grads.options().requires_grad(false); + torch::Tensor input_grads = + torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); + void* input_grads_ptr = static_cast(input_grads.data_ptr()); + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + //Softmax Grad + DISPATCH_HALF_AND_BFLOAT( + output_grads_.scalar_type(), + "dispatch_scaled_masked_softmax_backward", + dispatch_scaled_masked_softmax_backward( + reinterpret_cast(input_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads + ); + ); + return input_grads; +} +} +} +} diff --git a/scaled_upper_triang_masked_softmax.h b/scaled_upper_triang_masked_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..21e93fb313a00d2bc51ecc09f4a87f02ece5a4f6 --- /dev/null +++ b/scaled_upper_triang_masked_softmax.h @@ -0,0 +1,529 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
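The backward kernels and their CUDA wrappers above compute the standard softmax Jacobian-vector product: with y = softmax(scale * x) and upstream gradient dy, the result is dx = scale * y * (dy - sum_k y_k * dy_k). A minimal per-row CPU sketch of that formula (a helper of our own, not part of the source) looks like this:

```cpp
#include <vector>

// Per-row reference for the fused softmax backward (illustration only):
// dx[j] = scale * y[j] * (dy[j] - sum_k(y[k] * dy[k]))
std::vector<float> scaled_softmax_backward_row_ref(const std::vector<float>& y,   // forward softmax output
                                                   const std::vector<float>& dy,  // upstream gradient
                                                   float scale) {
    float dot = 0.0f;
    for (size_t k = 0; k < y.size(); ++k) {
        dot += y[k] * dy[k];  // the per-row sum that warp_reduce(sum) produces in the kernel
    }
    std::vector<float> dx(y.size());
    for (size_t j = 0; j < y.size(); ++j) {
        dx[j] = scale * y[j] * (dy[j] - dot);
    }
    return dx;
}
```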
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } + +template +__device__ __inline__ void copy_zero_vector(Datatype *dst); + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template +__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Implicit time (diagonal masking) + */ +template +__global__ void scaled_upper_triang_masked_softmax_warp_forward( + output_t *dst, + const input_t *src, + const acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. 
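warp_reduce above performs a butterfly (XOR-shuffle) reduction: each of log2(WARP_SIZE) rounds halves the shuffle stride, and after the last round every lane holds the reduced value, with no shared memory or extra synchronization. A self-contained kernel illustrating the same pattern for a 32-lane max reduction is sketched below; the kernel and names are our own and it is compiled with nvcc, it is not part of this header.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative butterfly max-reduction over one warp, mirroring warp_reduce with the Max functor.
__global__ void warp_max_demo(const float* in, float* out) {
    float v = in[threadIdx.x];
    // XOR shuffle: after log2(32) = 5 rounds every lane holds the warp-wide max.
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        float other = __shfl_xor_sync(0xffffffffu, v, offset);
        v = v < other ? other : v;  // same comparison as the Max functor
    }
    if (threadIdx.x == 0) *out = v;
}

int main() {
    float h_in[32], h_out, *d_in, *d_out;
    for (int i = 0; i < 32; ++i) h_in[i] = static_cast<float>((i * 7) % 13);
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    warp_max_demo<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp max = %f\n", h_out);  // expect 12
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```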
+ constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : local_seq; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + copy_vector(temp_data, src + i*element_count*stride + it*WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if ((element_index + element) < batch_element_count) { + elements[i][it+element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } else { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? 
max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (it < warp_iteration_limit) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < local_seq) { + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < local_seq) { + out[element] = elements[i][it + element] / sum[i]; + } else { + out[element] = 0; + } + } + copy_vector(dst + i * element_count * stride + it * WARP_SIZE, out); + } else if (element_index < element_count) { + copy_zero_vector(dst + i * element_count * stride + it * WARP_SIZE); + } else { + break; + } + } + } +} + +template +__global__ void scaled_upper_triang_masked_softmax_warp_backward( + output_t *gradInput, + input_t *grad, + const input_t *output, + acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 
0 : local_seq; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector(temp_grad, grad + i * element_count * stride + it * WARP_SIZE); + copy_vector(temp_output, output + i * element_count * stride + it * WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } + } + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + } + + acc_t sum[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + sum[i] += grad_reg[i][it]; + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); + } + copy_vector(gradInput + i * element_count * stride + it * WARP_SIZE, out); + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_scaled_upper_triang_masked_softmax_forward( + output_t *dst, + const input_t *src, + const input_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + default: + break; + } + } +} + +template +void dispatch_scaled_upper_triang_masked_softmax_backward( + output_t *grad_input, + input_t *grad, + const input_t *output, + const acc_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + default: + break; + } + } +} diff --git a/scaled_upper_triang_masked_softmax_cuda.cu b/scaled_upper_triang_masked_softmax_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..79ec30be364317580742e0297ea86145789a38e7 --- /dev/null +++ b/scaled_upper_triang_masked_softmax_cuda.cu @@ -0,0 +1,98 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "scaled_upper_triang_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + float scale_factor) +{ + // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = input.size(0); + const int seq_len = input.size(1); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = + torch::empty({attn_batches, seq_len, seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT( + input.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_forward", + dispatch_scaled_upper_triang_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + scale_factor, + seq_len, + seq_len, + attn_batches); + ); + return softmax_results; +} + + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) { + + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = output_grads.size(0); + const int seq_len = output_grads.size(1); + TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); + + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + //Softmax Grad + DISPATCH_HALF_AND_BFLOAT( + output_grads_.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_backward", + dispatch_scaled_upper_triang_masked_softmax_backward( + reinterpret_cast(output_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + seq_len, + seq_len, + attn_batches); + ); + + //backward pass is completely in-place + return output_grads; +} +} +} +} diff --git a/seq_len.h b/seq_len.h new file mode 100644 index 0000000000000000000000000000000000000000..76c4d08a317c6288321ec72a67eef4dda959a21a --- /dev/null +++ b/seq_len.h @@ -0,0 +1,168 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include + +namespace flash { + +static constexpr int kMaxTileSize = 128; + +template class SeqLenTraits { +public: + // Total number of queries / keys. Unpadded. + int sum_s = 0; + // seq len offsets. + int *cu_seq_len = nullptr; + // actual seq len array. 
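SeqLenTraits supports both fixed and variable sequence lengths; in the variable case the per-batch length is recovered either from an explicit seq_used array or from consecutive differences of the cumulative offsets in cu_seq_len, as the var-seq-len specializations further down implement. A small host-side sketch of that bookkeeping, with hypothetical names, is:

```cpp
#include <cstdio>
#include <vector>

// Host-side illustration of var-seq-len bookkeeping (not part of seq_len.h):
// cu_seq_len holds cumulative token offsets, so batch b spans
// [cu_seq_len[b], cu_seq_len[b + 1]) and its length is the difference.
int actual_seq_len_of(const std::vector<int>& cu_seq_len,
                      const int* seq_used,  // optional per-batch override, may be nullptr
                      int bidb) {
    return seq_used ? seq_used[bidb]
                    : cu_seq_len[bidb + 1] - cu_seq_len[bidb];
}

int main() {
    // Three sequences of lengths 5, 3 and 9 packed back to back (total sum_s = 17).
    std::vector<int> cu_seq_len = {0, 5, 8, 17};
    for (int b = 0; b < 3; ++b) {
        printf("batch %d: seq len %d\n", b, actual_seq_len_of(cu_seq_len, nullptr, b));
    }
    return 0;
}
```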
+ int *seq_used = nullptr; + // seq len of the current batch. + int actual_seq_len = -1; + + // Whether this is for fixed-seq-len or var-seq-len. + static constexpr bool kUseVarSeqLen = UseVarSeqLen; + + using ShapeT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using StrideT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using LayoutT = cute::Layout; + + using ShapeLseT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using StrideLseT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using LayoutLseT = cute::Layout; + + CUTLASS_HOST SeqLenTraits() {} + + CUTLASS_HOST SeqLenTraits( + int sum_s, int max_seq_len, int *cu_seq_len = nullptr, int *seq_used = nullptr): + sum_s(sum_s), cu_seq_len(cu_seq_len), seq_used(seq_used), actual_seq_len(max_seq_len) {} + + // Returns the layout of a tensor in MKHB format in global memory. + // padded: only useful for var-seq-len for dq_accum and softmax_d. + CUTLASS_HOST_DEVICE auto get_gmem_layout( + int m, int k, int h, int b, + int64_t m_stride, int64_t h_stride, int64_t b_stride, + bool padded = false) const { + static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen."); + return make_layout(make_shape(m, k, h, b), + make_stride(m_stride, cute::_1{}, h_stride, b_stride)); + } + + // Returns the layout of a tensor in MKHB format in global memory. + // padded: only useful for var-seq-len for dq_accum and softmax_d. + CUTLASS_HOST_DEVICE auto get_lse_gmem_layout( + int m, int h, int b, bool padded = false) const { + static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen."); + return make_layout(make_shape(b, h, m), + make_stride(int64_t(h * m), int64_t(m), cute::_1())); + } + + CUTLASS_DEVICE void init(int bidb) {} + + template + CUTLASS_DEVICE auto get_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded = false) const { + auto g_tensor = local_tile( + m_tensor(_, _, bidh, bidb), tile_shape, make_coord(_, _0{})); + return g_tensor; + } + + template + CUTLASS_DEVICE auto get_lse_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded = false) const { + auto g_tensor = local_tile(m_tensor(bidb, bidh, _), tile_shape, make_coord(_)); + return g_tensor; + } +}; + +using FixedSeqLenTraits = SeqLenTraits; + +using VarSeqLenTraits = SeqLenTraits; + +// Returns the static layout of a var-seq-len tensor in global memory based on +// max_seq_len and max_batch_size. +// padded: only useful for var-seq-len for dq_accum and softmax_d. +// When padded is True, use B_M + kMaxTileSize * B as the total B_M. +template <> +CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_gmem_layout( + int m, int k, int h, int b, + int64_t m_stride, int64_t h_stride, int64_t b_stride, + bool padded) const { + return make_layout( + make_shape(sum_s + (padded ? kMaxTileSize * b : 0), k, h), + make_stride(m_stride, cute::_1{}, h_stride)); +} + +// padded: only useful for var-seq-len for dq_accum and softmax_d. +// When padded is True, use B_M + kMaxTileSize * B as the total B_M. +template <> +CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_lse_gmem_layout( + int m, int h, int b, bool padded) const { + return make_layout( + make_shape(h, sum_s + (padded ? kMaxTileSize * b : 0)), + make_stride(int64_t(sum_s + (padded ? kMaxTileSize * b : 0)), cute::_1())); +} + +template <> +CUTLASS_DEVICE void VarSeqLenTraits::init(int bidb) { + actual_seq_len = + seq_used ? 
seq_used[bidb] : (cu_seq_len[bidb + 1] - cu_seq_len[bidb]); +} + +template <> +template +CUTLASS_DEVICE auto VarSeqLenTraits::get_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded) const { + auto g_offset = local_tile( + m_tensor(_, _, bidh), + cute::make_shape(1, get<1>(tile_shape)), + make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0), _0{})); + auto g_sequence = make_tensor( + g_offset.data(), + make_layout( + cute::make_shape(actual_seq_len, get<1>(tile_shape)), + g_offset.stride() + )); + auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_, _0{})); + return g_tensor; +} + +template <> +template +CUTLASS_DEVICE auto VarSeqLenTraits::get_lse_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded) const { + auto g_offset = local_tile( + m_tensor(bidh, _), cute::make_shape(_1{}), + make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0))); + auto g_sequence = make_tensor( + g_offset.data(), + make_layout(cute::make_shape(actual_seq_len), cute::make_shape(_1{}))); + auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_)); + return g_tensor; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/sequence-model.yaml b/sequence-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..435cf0501a3c4b24efd35d72fbcabcaf19748589 --- /dev/null +++ b/sequence-model.yaml @@ -0,0 +1 @@ +_target_: src.tasks.seq.SequenceModel diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..12e86667c4cd3486466a36db186f812b88156131 --- /dev/null +++ b/setup.py @@ -0,0 +1,295 @@ +# Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + +import sys +import warnings +import os +import re +import shutil +import ast +from pathlib import Path +from packaging.version import parse, Version +import platform + +from setuptools import setup, find_packages +import subprocess + +import urllib.request +import urllib.error +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + +import torch +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME + + +# with open("../README.md", "r", encoding="utf-8") as fh: +with open("../README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + + +# ninja build does not work unless include_dirs are abs path +this_dir = os.path.dirname(os.path.abspath(__file__)) + +PACKAGE_NAME = "flashattn-hopper" + +BASE_WHEEL_URL = "https://github.com/Dao-AILab/flash-attention/releases/download/{tag_name}/{wheel_name}" + +# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels +# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation +FORCE_BUILD = os.getenv("FAHOPPER_FORCE_BUILD", "FALSE") == "TRUE" +SKIP_CUDA_BUILD = os.getenv("FAHOPPER_SKIP_CUDA_BUILD", "FALSE") == "TRUE" +# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI +FORCE_CXX11_ABI = os.getenv("FAHOPPER_FORCE_CXX11_ABI", "FALSE") == "TRUE" + + +def get_platform(): + """ + Returns the platform name as used in wheel filenames. 
+ """ + if sys.platform.startswith("linux"): + return "linux_x86_64" + elif sys.platform == "darwin": + mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) + return f"macosx_{mac_version}_x86_64" + elif sys.platform == "win32": + return "win_amd64" + else: + raise ValueError("Unsupported platform: {}".format(sys.platform)) + + +def get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) + output = raw_output.split() + release_idx = output.index("release") + 1 + bare_metal_version = parse(output[release_idx].split(",")[0]) + + return raw_output, bare_metal_version + + +def check_if_cuda_home_none(global_option: str) -> None: + if CUDA_HOME is not None: + return + # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary + # in that case. + warnings.warn( + f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " + "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " + "only images whose names contain 'devel' will provide nvcc." + ) + + +def append_nvcc_threads(nvcc_extra_args): + return nvcc_extra_args + ["--threads", "4"] + + +cmdclass = {} +ext_modules = [] + +# We want this even if SKIP_CUDA_BUILD because when we run python setup.py sdist we want the .hpp +# files included in the source distribution, in case the user compiles from source. +subprocess.run(["git", "submodule", "update", "--init", "../csrc/cutlass"]) + +if not SKIP_CUDA_BUILD: + print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + + check_if_cuda_home_none("--fahopper") + cc_flag = [] + _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME) + if bare_metal_version < Version("12.3"): + raise RuntimeError("FA Hopper is only supported on CUDA 12.3 and above") + cc_flag.append("-gencode") + cc_flag.append("arch=compute_90a,code=sm_90a") + + # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as + # torch._C._GLIBCXX_USE_CXX11_ABI + # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920 + if FORCE_CXX11_ABI: + torch._C._GLIBCXX_USE_CXX11_ABI = True + repo_dir = Path(this_dir).parent + cutlass_dir = repo_dir / "csrc" / "cutlass" + sources = [ + "flash_api.cpp", + "flash_fwd_hdim64_fp16_sm90.cu", + "flash_fwd_hdim64_bf16_sm90.cu", + "flash_fwd_hdim128_fp16_sm90.cu", + "flash_fwd_hdim128_bf16_sm90.cu", + "flash_fwd_hdim256_fp16_sm90.cu", + "flash_fwd_hdim256_bf16_sm90.cu", + "flash_bwd_hdim64_fp16_sm90.cu", + "flash_bwd_hdim96_fp16_sm90.cu", + "flash_bwd_hdim128_fp16_sm90.cu", + # "flash_bwd_hdim256_fp16_sm90.cu", + "flash_bwd_hdim64_bf16_sm90.cu", + "flash_bwd_hdim96_bf16_sm90.cu", + "flash_bwd_hdim128_bf16_sm90.cu", + "flash_fwd_hdim64_e4m3_sm90.cu", + "flash_fwd_hdim128_e4m3_sm90.cu", + "flash_fwd_hdim256_e4m3_sm90.cu" + ] + nvcc_flags = [ + "-O3", + # "-O0", + "-std=c++17", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT16_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT162_OPERATORS__", + "-U__CUDA_NO_BFLOAT162_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--use_fast_math", + "--ptxas-options=-v", # printing out number of registers + 
"--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage", # printing out number of registers + "-lineinfo", + "-DCUTLASS_DEBUG_TRACE_LEVEL=0", # Can toggle for debugging + "-DNDEBUG", # Important, otherwise performance is severely impacted + ] + include_dirs = [ + # Path(this_dir) / "fmha-pipeline", + # repo_dir / "lib", + # repo_dir / "include", + cutlass_dir / "include", + # cutlass_dir / "examples" / "common", + # cutlass_dir / "tools" / "util" / "include", + ] + + ext_modules.append( + CUDAExtension( + name="flashattn_hopper_cuda", + sources=sources, + extra_compile_args={ + "cxx": ["-O3", "-std=c++17"], + # "cxx": ["-O0", "-std=c++17"], + "nvcc": append_nvcc_threads( + nvcc_flags + cc_flag + ), + }, + include_dirs=include_dirs, + # Without this we get and error about cuTensorMapEncodeTiled not defined + libraries=["cuda"] + ) + ) + # ext_modules.append( + # CUDAExtension( + # name="flashattn_hopper_cuda_ws", + # sources=sources, + # extra_compile_args={ + # "cxx": ["-O3", "-std=c++17"], + # "nvcc": append_nvcc_threads( + # nvcc_flags + ["-DEXECMODE=1"] + cc_flag + # ), + # }, + # include_dirs=include_dirs, + # # Without this we get and error about cuTensorMapEncodeTiled not defined + # libraries=["cuda"] + # ) + # ) + + +def get_package_version(): + with open(Path(this_dir) / "__init__.py", "r") as f: + version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + local_version = os.environ.get("FLASHATTN_HOPPER_LOCAL_VERSION") + if local_version: + return f"{public_version}+{local_version}" + else: + return str(public_version) + + +def get_wheel_url(): + # Determine the version numbers that will be used to determine the correct wheel + # We're using the CUDA version used to build torch, not the one currently installed + # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) + torch_cuda_version = parse(torch.version.cuda) + torch_version_raw = parse(torch.__version__) + # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.2 + # to save CI time. Minor versions should be compatible. + torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.2") + python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform_name = get_platform() + package_version = get_package_version() + # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" + cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}" + torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" + cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() + + # Determine wheel URL based on CUDA version, torch version, python version and OS + wheel_filename = f"{PACKAGE_NAME}-{package_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" + wheel_url = BASE_WHEEL_URL.format(tag_name=f"v{package_version}", wheel_name=wheel_filename) + return wheel_url, wheel_filename + + +class CachedWheelsCommand(_bdist_wheel): + """ + The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot + find an existing wheel (which is currently the case for all installs). We use + the environment parameters to detect whether there is already a pre-built version of a compatible + wheel available and short-circuits the standard full build pipeline. 
+ """ + + def run(self): + if FORCE_BUILD: + return super().run() + + wheel_url, wheel_filename = get_wheel_url() + print("Guessing wheel URL: ", wheel_url) + try: + urllib.request.urlretrieve(wheel_url, wheel_filename) + + # Make the archive + # Lifted from the root wheel processing command + # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + print("Raw wheel path", wheel_path) + shutil.move(wheel_filename, wheel_path) + except urllib.error.HTTPError: + print("Precompiled wheel not found. Building from source...") + # If the wheel could not be downloaded, build from source + super().run() + +setup( + name=PACKAGE_NAME, + version=get_package_version(), + packages=find_packages( + exclude=( + "build", + "csrc", + "include", + "tests", + "dist", + "docs", + "benchmarks", + ) + ), + py_modules=["flash_attn_interface"], + description="FlashAttention-3", + long_description=long_description, + long_description_content_type="text/markdown", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: Unix", + ], + ext_modules=ext_modules, + cmdclass={"bdist_wheel": CachedWheelsCommand, "build_ext": BuildExtension} + if ext_modules + else { + "bdist_wheel": CachedWheelsCommand, + }, + python_requires=">=3.8", + install_requires=[ + "torch", + "einops", + "packaging", + "ninja", + ], +) diff --git a/sgd.yaml b/sgd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43b834653e8735acf54ca3b21f89cc480a65ac3e --- /dev/null +++ b/sgd.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: torch.optim.SGD diff --git a/smoke.yaml b/smoke.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eac3dd286536a02635797f834e93ebb63007f009 --- /dev/null +++ b/smoke.yaml @@ -0,0 +1,22 @@ +# @package _global_ +# Smoke test: disable logging and model checkpointing + +logger: + wandb: + mode: disabled + +callbacks: + model_checkpoint: null + model_checkpoint_progress: null + +hydra: + # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ + # sets level of only chosen command line loggers to 'DEBUG' + # verbose: [src.train, src.utils.utils] + + # sets output paths for all file logs to 'logs/debug/' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/multirun_${now:%Y-%m-%d_%H-%M-%S} + subdir: ${hydra.job.num} diff --git a/softmax.h b/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..79433b82e71754e05b0aa013c0c7d4f03070e51c --- /dev/null +++ b/softmax.h @@ -0,0 +1,234 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include + +#include + +#include + +#include "utils.h" + +#include "cutlass/fast_math.h" + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ __forceinline__ void thread_reduce_(Tensor const &tensor, Tensor &summary, Operator &op) { + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); + static_assert(Layout1::rank == 1, "Only support 1D Tensor"); + CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); mi++) { + summary(mi) = zero_init ? tensor(mi, 0) : op(summary(mi), tensor(mi, 0)); + #pragma unroll + for (int ni = 1; ni < size<1>(tensor); ni++) { + summary(mi) = op(summary(mi), tensor(mi, ni)); + } + } +} + +template +__device__ __forceinline__ void quad_allreduce_(Tensor &dst, Tensor &src, Operator &op) { + CUTE_STATIC_ASSERT_V(size(dst) == size(src)); + #pragma unroll + for (int i = 0; i < size(dst); i++){ + dst(i) = Allreduce<4>::run(src(i), op); + } +} + +template +__device__ __forceinline__ void reduce_(Tensor const& tensor, Tensor &summary, Operator &op) { + thread_reduce_(tensor, summary, op); + quad_allreduce_(summary, summary, op); +} + +template +__device__ __forceinline__ void reduce_max(Tensor const& tensor, Tensor &max){ + MaxOp max_op; + reduce_(tensor, max, max_op); +} + +template +__device__ __forceinline__ void reduce_sum(Tensor const& tensor, Tensor &sum){ + SumOp sum_op; + thread_reduce_(tensor, sum, sum_op); + if constexpr (warp_reduce) { quad_allreduce_(sum, sum, sum_op); } +} + +__forceinline__ __device__ __half2 half_exp(__half2 x) { + uint32_t tmp_out, tmp_in; + tmp_in = reinterpret_cast(x); + asm ("ex2.approx.f16x2 %0, %1;\n" + : "=r"(tmp_out) + : "r"(tmp_in)); + __half2 out = reinterpret_cast<__half2&>(tmp_out); + return out; +} + +// Apply the exp to all the elements. +template +__forceinline__ __device__ void max_scale_exp2_sum(Tensor &tensor, Tensor &max, Tensor &sum, const float scale) { + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); static_assert(Layout1::rank == 1, "Only support 1D Tensor"); CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + MaxOp max_op; + max(mi) = zero_init ? tensor(mi, 0) : max_op(max(mi), tensor(mi, 0)); + #pragma unroll + for (int ni = 1; ni < size<1>(tensor); ni++) { + max(mi) = max_op(max(mi), tensor(mi, ni)); + } + max(mi) = Allreduce<4>::run(max(mi), max_op); + // If max is -inf, then all elements must have been -inf (possibly due to masking). + // We don't want (-inf - (-inf)) since that would give NaN. + const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale; + sum(mi) = 0; + #pragma unroll + for (int ni = 0; ni < size<1>(tensor); ++ni) { + // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + // max * log_2(e)) This allows the compiler to use the ffma + // instruction instead of fadd and fmul separately. + tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled); + sum(mi) += tensor(mi, ni); + } + } +} + +// Apply the exp to all the elements. +template +__forceinline__ __device__ void scale_apply_exp2(Tensor &tensor, Tensor const &max, const float scale) { + constexpr static float max_offset = Use_max_offset ? 
8.0f : 0.0f; + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); + static_assert(Layout1::rank == 1, "Only support 1D Tensor"); + CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + // If max is -inf, then all elements must have been -inf (possibly due to masking). + // We don't want (-inf - (-inf)) since that would give NaN. + // If we don't have float around M_LOG2E the multiplication is done in fp64. + const float max_scaled = Check_inf + ? (max(mi) == -INFINITY ? 0.f : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset) + : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset; + #pragma unroll + for (int ni = 0; ni < size<1>(tensor); ++ni) { + // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + // max * log_2(e)) This allows the compiler to use the ffma + // instruction instead of fadd and fmul separately. + tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax { + constexpr static bool Use_max_offset = Use_max_offset_; + // constexpr static float max_offset = Use_max_offset ? 8.0f : 0.0f; + // constexpr static float max_offset_E = max_offset * float(M_LN2); + + using TensorT = decltype(make_tensor(Shape>{})); + TensorT row_max, row_sum; + + CUTLASS_DEVICE Softmax() {}; + + template + __forceinline__ __device__ TensorT max(Tensor0 &acc_s, float softmax_scale_log2) { + // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) + Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout())); + static_assert(decltype(size<0>(scores))::value == kNRows); + TensorT scores_scale; + if constexpr (Is_first) { + flash::template reduce_max(scores, row_max); + cute::fill(scores_scale, 1.f); + } else { + Tensor scores_max_prev = make_fragment_like(row_max); + cute::copy(row_max, scores_max_prev); + flash::template reduce_max(scores, row_max); + #pragma unroll + for (int mi = 0; mi < size(row_max); ++mi) { + float scores_max_cur = !Check_inf + ? row_max(mi) + : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi)); + scores_scale(mi) = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2); + row_sum(mi) *= scores_scale(mi); + } + } + return scores_scale; + }; + + template + __forceinline__ __device__ TensorT online_softmax(Tensor0 &acc_s, float softmax_scale_log2) { + // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) + Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout())); + static_assert(decltype(size<0>(scores))::value == kNRows); + TensorT scores_scale; + if constexpr (Is_first) { + flash::template reduce_max(scores, row_max); + flash::template scale_apply_exp2(scores, row_max, softmax_scale_log2); + flash::reduce_sum(scores, row_sum); + cute::fill(scores_scale, 1.f); + // if (cute::thread0()) { print_tensor(scores); printf("\n scale = %f\n", softmax_scale_log2); print_tensor(row_sum); } + } else { + // Tensor scores_max_prev = make_fragment_like(row_max); + // cute::copy(row_max, scores_max_prev); + // flash::template reduce_max(scores, row_max); + // // if (cute::thread0()) { print_tensor(scores); printf("\n"); print_tensor(row_max); printf("\n"); } + // #pragma unroll + // for (int mi = 0; mi < size(row_max); ++mi) { + // float scores_max_cur = !Check_inf + // ? 
row_max(mi) + // : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi)); + // scores_scale(mi) = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2); + // row_sum(mi) *= scores_scale(mi); + // } + flash::template scale_apply_exp2(scores, row_max, softmax_scale_log2); + // We don't do the reduce across threads here since we don't need to use the row_sum. + // We do that reduce at the end when we need to normalize the softmax. + flash::reduce_sum(scores, row_sum); + } + return scores_scale; + }; + + template + __forceinline__ __device__ TensorT finalize(Tensor0 &acc_s, float softmax_scale_log2, float rp_dropout=1.0) { + constexpr static float max_offset_E = Use_max_offset ? 8.0f * float(M_LN2) : 0.0f; + // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) + Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout())); + static_assert(decltype(size<0>(scores))::value == kNRows); + SumOp sum_op; + quad_allreduce_(row_sum, row_sum, sum_op); + TensorT scores_scale; + #pragma unroll + for (int mi = 0; mi < size(row_max); ++mi) { + float sum = row_sum(mi); + float inv_sum = (sum == 0.f || sum != sum) ? 0.f : 1.f / sum; + row_sum(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : (row_max(mi) * softmax_scale_log2) * float(M_LN2) - max_offset_E + __logf(sum); + scores_scale(mi) = !Is_dropout ? inv_sum : inv_sum * rp_dropout; + } + return scores_scale; + }; + + template + __forceinline__ __device__ void rescale_o(Tensor1 &acc_o, TensorT const &scores_scale) { + // Reshape acc_o from (MMA=4, MMA_M, MMA_K) to (nrow=(2, MMA_M), ncol=(2, MMA_K)) + Tensor acc_o_rowcol = make_tensor(acc_o.data(), flash::convert_layout_acc_rowcol(acc_o.layout())); + static_assert(decltype(size<0>(acc_o_rowcol))::value == kNRows); + #pragma unroll + for (int mi = 0; mi < size(row_max); ++mi) { + #pragma unroll + for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) { acc_o_rowcol(mi, ni) *= scores_scale(mi); } + } + }; + +}; + +} // namespace flash diff --git a/static_switch.h b/static_switch.h new file mode 100644 index 0000000000000000000000000000000000000000..d9ec6222498987673280c62af3af690675bd9a06 --- /dev/null +++ b/static_switch.h @@ -0,0 +1,79 @@ +// Inspired by +// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` +// + +#define BOOL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + if (COND) { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +#define PREC_SWITCH(PRECTYPE, ...) 
\ + [&] { \ + if (PRECTYPE == 1) { \ + using kPrecType = cutlass::half_t; \ + constexpr static bool kSoftFp16 = false; \ + constexpr static bool kHybrid = false; \ + return __VA_ARGS__(); \ + } else if (PRECTYPE == 2) { \ + using kPrecType = cutlass::float_e4m3_t; \ + constexpr static bool kSoftFp16 = false; \ + constexpr static bool kHybrid = false; \ + return __VA_ARGS__(); \ + } else if (PRECTYPE == 3) { \ + using kPrecType = cutlass::float_e4m3_t; \ + constexpr static bool kSoftFp16 = false; \ + constexpr static bool kHybrid = true; \ + return __VA_ARGS__(); \ + } else if (PRECTYPE == 4) { \ + using kPrecType = cutlass::float_e4m3_t; \ + constexpr static bool kSoftFp16 = true; \ + constexpr static bool kHybrid = false; \ + return __VA_ARGS__(); \ + } \ + }() + +#define HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM == 64) { \ + constexpr static int kHeadSize = 64; \ + return __VA_ARGS__(); \ + } else if (HEADDIM == 128) { \ + constexpr static int kHeadSize = 128; \ + return __VA_ARGS__(); \ + } else if (HEADDIM == 256) { \ + constexpr static int kHeadSize = 256; \ + return __VA_ARGS__(); \ + } \ + }() + +#define SEQLEN_SWITCH(USE_VAR_SEQ_LEN, NAME, ...) \ + [&] { \ + bool useSeqLen = USE_VAR_SEQ_LEN; \ + if (useSeqLen) { \ + using NAME = flash::VarSeqLenTraits; \ + return __VA_ARGS__(); \ + } else { \ + using NAME = flash::FixedSeqLenTraits; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/step.yaml b/step.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0d9a0ce808720175c708b3f1a7d55db655bec28 --- /dev/null +++ b/step.yaml @@ -0,0 +1,3 @@ +# @package train.scheduler +_target_: torch.optim.lr_scheduler.StepLR +step_size: ??? diff --git a/tensorboard.yaml b/tensorboard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..acd1fa411d0407535066cd809dbb2a11915a4154 --- /dev/null +++ b/tensorboard.yaml @@ -0,0 +1,10 @@ +# https://www.tensorflow.org/tensorboard/ + +tensorboard: + _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger + save_dir: "tensorboard/" + name: "default" + version: ${name} + log_graph: False + default_hp_metric: True + prefix: "" diff --git a/test_baichuan.py b/test_baichuan.py new file mode 100644 index 0000000000000000000000000000000000000000..1d2964bd5e25ab14a124afcdff3776f17ee137d8 --- /dev/null +++ b/test_baichuan.py @@ -0,0 +1,460 @@ +# Copyright (c) 2023, Tri Dao. 
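+# Tests that the flash_attn GPT port of Baichuan / Baichuan2 matches the HF reference:
+# state-dict remapping, fp16 forward parity, tensor-parallel forward, and generation
+# with and without CUDA graphs. The single-GPU tests can be run with, e.g.,
+#     pytest -q -s tests/models/test_baichuan.py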
+import os +import time +from pathlib import Path + +import torch +import pytest + +from einops import rearrange + +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM + +from flash_attn.models.gpt import ( + GPTLMHeadModel, + combine_state_dicts_tp, + shard_state_dict_tp, +) +from flash_attn.models.baichuan import ( + remap_state_dict_hf_baichuan, + baichuan_config_to_gpt2_config, +) +from flash_attn.utils.distributed import all_gather_raw +from flash_attn.utils.pretrained import state_dict_from_pretrained +from flash_attn.utils.generation import update_graph_cache + + +@pytest.mark.parametrize( + "model_name", + [ + "baichuan-inc/Baichuan-7B", + "baichuan-inc/Baichuan-13B-Base", + "baichuan-inc/Baichuan2-7B-Base", + "baichuan-inc/Baichuan2-13B-Base", + ], +) +def test_baichuan_state_dict(model_name): + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert len(state_dict.keys()) == len(pretrained_state_dict.keys()) + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + "model_name", + [ + "baichuan-inc/Baichuan-7B", + "baichuan-inc/Baichuan-13B-Base", + "baichuan-inc/Baichuan2-7B-Base", + "baichuan-inc/Baichuan2-13B-Base", + ], +) +def test_baichuan_optimized(model_name): + """Check that our implementation of Baichuan (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
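+    Concretely, the max absolute error of our fp16 output against the fp32 reference must
+    stay within 3x the max absolute error of the HF fp16 output against the same reference.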
+ """ + dtype = torch.float16 + device = "cuda" + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=dtype, + device_map={"": device}, + trust_remote_code=True, + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/models/test_baichuan.py -k "test_baichuan_parallel_forward" +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize( + "model_name", + [ + "baichuan-inc/Baichuan-7B", + "baichuan-inc/Baichuan-13B-Base", + "baichuan-inc/Baichuan2-7B-Base", + "baichuan-inc/Baichuan2-13B-Base", + ], +) +def test_baichuan_parallel_forward(model_name, world_size): + """Check that our implementation of Baichuan (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
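+    For the tensor-parallel path the tolerance is tighter: the max absolute error against
+    the fp32 reference must stay within 2x the corresponding HF fp16 error.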
+ """ + from apex.transformer import parallel_state + + dtype = torch.float16 + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... d -> b ... (n d)", b=batch_size) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize( + "model_name", ["baichuan-inc/Baichuan-7B", "baichuan-inc/Baichuan-13B-Base"] +) +def test_baichuan_generation(model_name): + dtype = torch.float16 + device = "cuda" + 
config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 2048 + max_length = 2048 + 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1].to(device=device) + del model_ref + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + model(input_ids) # Warm up + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) + + +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/models/test_baichuan.py -k "baichuan_parallel_generation" +@pytest.mark.parametrize("world_size", [2]) 
+@pytest.mark.parametrize("model_name", ["baichuan-inc/Baichuan-7B"]) +def test_baichuan_parallel_generation(model_name, world_size): + """Check that our implementation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. + """ + from apex.transformer import parallel_state + + dtype = torch.float16 + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = False + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 * world_size + config.sequence_parallel = False # Need to set this to False for generation + + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + # Need this, otherwise when we capture the graph the process for GPU 1 would run on both + # GPU0 and GPU1 and things would hang + torch.cuda.set_device(device) + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + print("Without CUDA graph") + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + with torch.inference_mode(): + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.inference_mode(): 
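+            # Re-score the HF-generated sequences with the fp32 reference model to get
+            # teacher-forced reference logits for the generated positions.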
+ logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + logits_hf = torch.stack(out_hf.scores, dim=1) + + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + hf_error = (logits_hf - logits_ref).abs().max().item() + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) diff --git a/test_bert.py b/test_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..4c519b37e0e37603d21d517e9fb2175791204a75 --- /dev/null +++ b/test_bert.py @@ -0,0 +1,324 @@ +import re +from collections import OrderedDict + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange +from transformers import BertConfig +from transformers.models.bert.modeling_bert import BertForPreTraining as BertForPreTrainingHF +from transformers.models.bert.modeling_bert import BertModel as BertModelHF + +from flash_attn.models.bert import ( + BertForPreTraining, + BertModel, + inv_remap_state_dict, + remap_state_dict, +) +from flash_attn.utils.pretrained import state_dict_from_pretrained + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +# @pytest.mark.parametrize('model_name', ["bert-base-uncased"]) +def test_bert_state_dict(model_name): + config = BertConfig.from_pretrained(model_name) + pretrained_state_dict = remap_state_dict(state_dict_from_pretrained(model_name), config) + model = BertForPreTraining(config) + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +def get_hf_models(model_name, config, dtype): + pretrained_state_dict = state_dict_from_pretrained(model_name) + + def key_mapping_ln_gamma_beta(key): + key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key) + key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key) + return key + + pretrained_state_dict = OrderedDict( + (key_mapping_ln_gamma_beta(k), v) for k, v in pretrained_state_dict.items() + ) + model_hf = BertForPreTrainingHF(config) + # Missing key(s) in state_dict: "bert.embeddings.position_ids", "cls.predictions.decoder.bias" + # position_ids is a buffer, and predictions.decoder.bias is tied to predictions.bias. + model_hf.load_state_dict(pretrained_state_dict, strict=False) + model_hf.cuda().to(dtype=dtype) + return model_hf + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased"]) +def test_bert_non_optimized(model_name): + """Check that our implementation of BERT (without any optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
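+    No flash-attn or fused kernels are enabled here, so the comparison mainly exercises the
+    state-dict remapping and the module structure itself.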
+ """ + dtype = torch.float16 + config = BertConfig.from_pretrained(model_name) + + model = BertForPreTraining.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = get_hf_models(model_name, config, torch.float32) + model_hf = get_hf_models(model_name, config, dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + attention_mask = torch.arange(max_seqlen, device="cuda")[None, :] < seqlens[:, None] + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.bert(input_ids, attention_mask=attention_mask) + sequence_output, pooled_output = out.last_hidden_state, out.pooler_output + out_hf = model_hf.bert(input_ids, attention_mask=attention_mask) + sequence_output_hf, pooled_output_hf = out_hf.last_hidden_state, out_hf.pooler_output + out_ref = model_ref.bert(input_ids, attention_mask=attention_mask) + sequence_output_ref, pooled_output_ref = out_ref.last_hidden_state, out_ref.pooler_output + + print(f"Output max diff: {(sequence_output - sequence_output_ref).abs().max().item()}") + print(f"Output mean diff: {(sequence_output - sequence_output_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(sequence_output_hf - sequence_output_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(sequence_output_hf - sequence_output_ref).abs().mean().item()}") + assert (sequence_output - sequence_output_ref).abs().max().item() < 3 * ( + sequence_output_hf - sequence_output_ref + ).abs().max().item() + assert (pooled_output - pooled_output_ref).abs().max().item() < 3 * ( + pooled_output_hf - pooled_output_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +# @pytest.mark.parametrize('model_name', ["bert-base-uncased"]) +def test_bert_optimized(model_name): + """Check that our implementation of BERT (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + config = BertConfig.from_pretrained(model_name) + # Our implementation of fused_mlp assumes the activation is + # nn.GELU(approximate='tanh'). Huggingface calls it "gelu_new", "gelu_fast", or "gelu_pytorch_tanh". + # If you just want "gelu", disable fused_mlp. 
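+    # e.g., to keep HF's exact "gelu" instead: config.hidden_act = "gelu"; config.fused_mlp = False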
+ config.hidden_act = "gelu_new" + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = BertForPreTraining.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = get_hf_models(model_name, config, torch.float32) + model_hf = get_hf_models(model_name, config, dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + attention_mask = torch.arange(max_seqlen, device="cuda")[None, :] < seqlens[:, None] + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.bert(input_ids, attention_mask=attention_mask) + sequence_output, pooled_output = out.last_hidden_state, out.pooler_output + out_hf = model_hf.bert(input_ids, attention_mask=attention_mask) + sequence_output_hf, pooled_output_hf = out_hf.last_hidden_state, out_hf.pooler_output + # Need to zero out the padded tokens in the sequence before comparison. + sequence_output_hf[~attention_mask, :] = 0.0 + out_ref = model_ref.bert(input_ids, attention_mask=attention_mask) + sequence_output_ref, pooled_output_ref = out_ref.last_hidden_state, out_ref.pooler_output + sequence_output_ref[~attention_mask, :] = 0.0 + + print( + f"BertModel output max diff: {(sequence_output - sequence_output_ref).abs().max().item()}" + ) + print( + f"BertModel output mean diff: {(sequence_output - sequence_output_ref).abs().mean().item()}" + ) + print( + f"HF fp16 BertModel max diff: {(sequence_output_hf - sequence_output_ref).abs().max().item()}" + ) + print( + f"HF fp16 BertModel mean diff: {(sequence_output_hf - sequence_output_ref).abs().mean().item()}" + ) + assert (sequence_output - sequence_output_ref).abs().max().item() < 4 * ( + sequence_output_hf - sequence_output_ref + ).abs().max().item() + assert (pooled_output - pooled_output_ref).abs().max().item() < 4 * ( + pooled_output_hf - pooled_output_ref + ).abs().max().item() + + out = model(input_ids, attention_mask=attention_mask) + prediction_scores, seq_relationship_scores = out.prediction_logits, out.seq_relationship_logits + # Need to zero out the padded tokens in the sequence before comparison. 
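+    # (outputs at padded positions are not meaningful and may differ between implementations,
+    #  so they are zeroed on all sides before comparing)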
+ prediction_scores = prediction_scores.clone() + prediction_scores[~attention_mask, :] = 0.0 + out_hf = model_hf(input_ids, attention_mask=attention_mask) + prediction_scores_hf, seq_relationship_scores_hf = ( + out_hf.prediction_logits, + out_hf.seq_relationship_logits, + ) + prediction_scores_hf[~attention_mask, :] = 0.0 + out_ref = model_ref(input_ids, attention_mask=attention_mask) + prediction_scores_ref, seq_relationship_scores_ref = ( + out_ref.prediction_logits, + out_ref.seq_relationship_logits, + ) + prediction_scores_ref[~attention_mask, :] = 0.0 + + print( + f"prediction_scores max diff: {(prediction_scores - prediction_scores_ref).abs().max().item()}" + ) + print( + f"prediction_scores mean diff: {(prediction_scores - prediction_scores_ref).abs().mean().item()}" + ) + print( + f"HF fp16 prediction_scoresff: {(prediction_scores_hf - prediction_scores_ref).abs().max().item()}" + ) + print( + f"HF fp16 prediction_scoresiff: {(prediction_scores_hf - prediction_scores_ref).abs().mean().item()}" + ) + assert (prediction_scores - prediction_scores_ref).abs().max().item() < 2 * ( + prediction_scores_hf - prediction_scores_ref + ).abs().max().item() + assert (seq_relationship_scores - seq_relationship_scores_ref).abs().max().item() < 2 * ( + seq_relationship_scores_hf - seq_relationship_scores_ref + ).abs().max().item() + + +@pytest.mark.parametrize("last_layer_subset", [False, True]) +# @pytest.mark.parametrize('last_layer_subset', [True]) +@pytest.mark.parametrize("has_key_padding_mask", [True, False]) +# @pytest.mark.parametrize('has_key_padding_mask', [True]) +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +# @pytest.mark.parametrize('model_name', ["bert-base-uncased"]) +def test_bert_dense_seq_output(model_name, has_key_padding_mask, last_layer_subset): + """Check that our implementation of BERT (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + config = BertConfig.from_pretrained(model_name) + # Our implementation of fused_mlp assumes the activation is + # nn.GELU(approximate='tanh'). Huggingface calls it "gelu_new", "gelu_fast", or "gelu_pytorch_tanh". + # If you just want "gelu", disable fused_mlp. 
+ config.hidden_act = "gelu_new" + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.dense_seq_output = True + config.last_layer_subset = last_layer_subset + config.use_xentropy = True + + model = BertForPreTraining.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = get_hf_models(model_name, config, torch.float32) + model_hf = get_hf_models(model_name, config, dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + if has_key_padding_mask: + attention_mask = torch.arange(max_seqlen, device="cuda")[None, :] < seqlens[:, None] + else: + attention_mask = None + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + labels = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + if attention_mask is not None: + labels[~attention_mask] = 0 + labels[(torch.rand(batch_size, max_seqlen, device="cuda") > 0.15)] = 0 + masked_tokens_mask = labels.flatten() > 0 + next_sequence_label = torch.randint(0, 2, (batch_size,), device="cuda") + + out = model( + input_ids, + attention_mask=attention_mask, + labels=labels, + next_sentence_label=next_sequence_label, + ) + prediction_scores, seq_relationship_scores = out.prediction_logits, out.seq_relationship_logits + out_hf = model_hf( + input_ids, + attention_mask=attention_mask, + labels=labels, + next_sentence_label=next_sequence_label, + ) + prediction_scores_hf, seq_relationship_scores_hf = ( + out_hf.prediction_logits, + out_hf.seq_relationship_logits, + ) + prediction_scores_hf = rearrange(prediction_scores_hf, "b s d -> (b s) d")[masked_tokens_mask] + out_ref = model_ref( + input_ids, + attention_mask=attention_mask, + labels=labels, + next_sentence_label=next_sequence_label, + ) + prediction_scores_ref, seq_relationship_scores_ref = ( + out_ref.prediction_logits, + out_ref.seq_relationship_logits, + ) + prediction_scores_ref = rearrange(prediction_scores_ref, "b s d -> (b s) d")[masked_tokens_mask] + + print( + f"prediction_scores max diff: {(prediction_scores - prediction_scores_ref).abs().max().item()}" + ) + print( + f"prediction_scores mean diff: {(prediction_scores - prediction_scores_ref).abs().mean().item()}" + ) + print( + f"HF fp16 prediction_scoresff: {(prediction_scores_hf - prediction_scores_ref).abs().max().item()}" + ) + print( + f"HF fp16 prediction_scoresiff: {(prediction_scores_hf - prediction_scores_ref).abs().mean().item()}" + ) + assert (prediction_scores - prediction_scores_ref).abs().max().item() < 2 * ( + prediction_scores_hf - prediction_scores_ref + ).abs().max().item() + assert (seq_relationship_scores - seq_relationship_scores_ref).abs().max().item() < 2 * ( + seq_relationship_scores_hf - seq_relationship_scores_ref + ).abs().max().item() + # The loss calculation from HF is wrong: it doesn't ignore the labels that are 0. + # assert (out.loss - out_ref.loss).abs().max().item() < 2 * (out_hf.loss - out_ref.loss).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +def test_inv_remap_state_dict(model_name: str): + """ + Verify that we can convert a HF BERT model to flash_attn and back. 
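+    (remap_state_dict followed by inv_remap_state_dict should reproduce every tensor of the
+    original HF checkpoint.)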
+ """ + + state_dict = state_dict_from_pretrained(model_name) + config = BertConfig.from_pretrained(model_name) + + flash_state_dict = remap_state_dict(state_dict, config) + recovered_state_dict = inv_remap_state_dict(flash_state_dict, config) + + assert set(state_dict.keys()) == set(recovered_state_dict.keys()) + + for k in state_dict.keys(): + assert state_dict[k].shape == recovered_state_dict[k].shape + torch.testing.assert_close(state_dict[k], recovered_state_dict[k], rtol=1e-6, atol=1e-6) diff --git a/test_bigcode.py b/test_bigcode.py new file mode 100644 index 0000000000000000000000000000000000000000..b69038dde7d6cdbbb7498b33995b96cb3aa2f06a --- /dev/null +++ b/test_bigcode.py @@ -0,0 +1,204 @@ +import time + +import pytest +import torch +from transformers import AutoTokenizer, GPTBigCodeConfig +from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeForCausalLM + +from flash_attn.models.bigcode import bigcode_config_to_gpt2_config, inv_remap_state_dict_hf_bigcode +from flash_attn.models.gpt import GPTLMHeadModel, remap_state_dict_hf_bigcode +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_bigcode_state_dict(model_name): + config = bigcode_config_to_gpt2_config(GPTBigCodeConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_bigcode( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_bigcode_optimized(model_name): + """Check that our implementation of BigCode (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = bigcode_config_to_gpt2_config(GPTBigCodeConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTBigCodeForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state + logits_ref = model_ref(input_ids).logits + del model_ref + + model_hf = GPTBigCodeForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_bigcode_generation(model_name): + """Check that our implementation of BigCode (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = bigcode_config_to_gpt2_config(GPTBigCodeConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = GPTBigCodeForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = GPTBigCodeForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert (logits_cg - logits_ref).abs().max().item() < 2 * hf_error + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_inv_remap_state_dict(model_name: str): + """ + Verify that we can convert a HF BigCode model to flash_attn and back. 
+ """ + + state_dict = state_dict_from_pretrained(model_name) + config = GPTBigCodeConfig.from_pretrained(model_name) + + flash_state_dict = remap_state_dict_hf_bigcode(state_dict, config) + recovered_state_dict = inv_remap_state_dict_hf_bigcode(flash_state_dict, config) + + assert set(state_dict.keys()) == set(recovered_state_dict.keys()) + + for k in state_dict.keys(): + assert state_dict[k].shape == recovered_state_dict[k].shape + torch.testing.assert_close(state_dict[k], recovered_state_dict[k], rtol=1e-6, atol=1e-6) diff --git a/test_block_parallel.py b/test_block_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..d74cfa11fbe8b7c3fcdbf344376ac12bc09a3b9e --- /dev/null +++ b/test_block_parallel.py @@ -0,0 +1,273 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_block_parallel.py + +import math +from functools import partial + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from einops import rearrange +from flash_attn.modules.block import Block +from flash_attn.modules.mha import MHA, ParallelMHA +from flash_attn.modules.mlp import FusedMLP, ParallelFusedMLP +from flash_attn.utils.distributed import allreduce_sequence_parallel_grad + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [True]) +@pytest.mark.parametrize("dim", [1024]) +def test_block_parallel(dim, sequence_parallel, world_size, dtype): + head_dim = 64 + assert dim % head_dim == 0 + num_heads = dim // head_dim + assert num_heads % world_size == 0 + rtol, atol = (3e-3, 5e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + x_pt = torch.randn(batch_size * seqlen, dim, device=device, dtype=dtype, requires_grad=True) + residual_pt = torch.randn(batch_size * seqlen, dim, device=device, requires_grad=True) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. 
+ g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + residual = ( + tensor_parallel.scatter_to_sequence_parallel_region(residual_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + residual = residual_pt.detach().clone().requires_grad_() + + mixer_cls_pt = partial( + MHA, + num_heads=num_heads, + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + device=device, + dtype=dtype, + ) + mlp_cls_pt = partial(FusedMLP, hidden_features=4 * dim, device=device, dtype=dtype) + norm_cls = partial(nn.LayerNorm, device=device, dtype=dtype) + model_pt = Block(dim, mixer_cls_pt, mlp_cls_pt, norm_cls, fused_dropout_add_ln=True) + with torch.no_grad(): + nn.init.normal_(model_pt.norm1.weight) + nn.init.normal_(model_pt.norm1.bias) + nn.init.normal_(model_pt.norm2.weight) + nn.init.normal_(model_pt.norm2.bias) + + mixer_cls = partial( + ParallelMHA, + num_heads=num_heads, + process_group=parallel_state.get_tensor_model_parallel_group(), + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + mlp_cls = partial( + ParallelFusedMLP, + hidden_features=4 * dim, + process_group=parallel_state.get_tensor_model_parallel_group(), + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + model = Block( + dim, + mixer_cls, + mlp_cls, + norm_cls, + fused_dropout_add_ln=True, + sequence_parallel=sequence_parallel, + mark_shared_params=True, + ) + + partition_dim = dim // world_size + partition_hidden_dim = 4 * dim // world_size + with torch.no_grad(): + model.mixer.Wqkv.weight.copy_( + rearrange( + rearrange(model_pt.mixer.Wqkv.weight, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ) + ) + model.mixer.Wqkv.bias.copy_( + rearrange( + rearrange(model_pt.mixer.Wqkv.bias, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ) + ) + model.mixer.out_proj.weight.copy_( + model_pt.mixer.out_proj.weight[:, rank * partition_dim : (rank + 1) * partition_dim] + ) + if rank == 0: + model.mixer.out_proj.bias.copy_(model_pt.mixer.out_proj.bias) + model.mlp.fc1.weight.copy_( + model_pt.mlp.fc1.weight[rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim] + ) + model.mlp.fc1.bias.copy_( + model_pt.mlp.fc1.bias[rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim] + ) + model.mlp.fc2.weight.copy_( + model_pt.mlp.fc2.weight[ + :, rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim + ] + ) + if rank == 0: + model.mlp.fc2.bias.copy_(model_pt.mlp.fc2.bias) + model.norm1.weight.copy_(model_pt.norm1.weight) + model.norm1.bias.copy_(model_pt.norm1.bias) + model.norm2.weight.copy_(model_pt.norm2.weight) + model.norm2.bias.copy_(model_pt.norm2.bias) + + mixer_kwargs = {"seqlen": seqlen} + out, out_residual = model(x, residual, mixer_kwargs=mixer_kwargs) + out_pt, out_residual_pt = model_pt( + rearrange(x_pt, "(b s) d -> b s d", s=seqlen), + rearrange(residual_pt, "(b s) d -> b s d", s=seqlen), + ) + out_pt, out_residual_pt = [rearrange(x, "b s d -> (b s) d") for x in [out_pt, out_residual_pt]] + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else 
out_pt, + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + out_residual, + out_residual_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_residual_pt, + rtol=rtol, + atol=atol, + ) + + (out_pt + 2 * out_residual_pt).backward(g) + (out + 2 * out_residual).backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + allreduce_sequence_parallel_grad(model, parallel_state.get_tensor_model_parallel_group()) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol / 10, # magnitude of x.grad is quite small + ) + assert torch.allclose( + residual.grad, + residual_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else residual_pt.grad, + rtol=rtol, + atol=atol, + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.mixer.Wqkv.weight.grad, + rearrange( + rearrange(model_pt.mixer.Wqkv.weight.grad, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ), + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.mixer.Wqkv.bias.grad, + rearrange( + rearrange(model_pt.mixer.Wqkv.bias.grad, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ), + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.mixer.out_proj.weight.grad, + model_pt.mixer.out_proj.weight.grad[:, rank * partition_dim : (rank + 1) * partition_dim], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.mixer.out_proj.bias.grad, + model_pt.mixer.out_proj.bias.grad, + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.mlp.fc1.weight.grad, + model_pt.mlp.fc1.weight.grad[ + rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim + ], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.mlp.fc1.bias.grad, + model_pt.mlp.fc1.bias.grad[rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim], + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.mlp.fc2.weight.grad, + model_pt.mlp.fc2.weight.grad[ + :, rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim + ], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.mlp.fc2.bias.grad, model_pt.mlp.fc2.bias.grad, rtol=rtol, atol=atol * 5 + ) + + assert torch.allclose( + model.norm1.weight.grad, model_pt.norm1.weight.grad, rtol=rtol, atol=atol * 5 + ) + assert torch.allclose(model.norm1.bias.grad, model_pt.norm1.bias.grad, rtol=rtol, atol=atol * 5) + assert torch.allclose( + model.norm2.weight.grad, model_pt.norm2.weight.grad, rtol=rtol, atol=atol * 5 + ) + assert torch.allclose(model.norm2.bias.grad, model_pt.norm2.bias.grad, rtol=rtol, atol=atol * 5) diff --git a/test_btlm.py b/test_btlm.py new file mode 100644 index 0000000000000000000000000000000000000000..eb5316bbe17ca074a0a2b7b758c1864597df3607 --- /dev/null +++ b/test_btlm.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023, Tri Dao. 
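+# Tests for the flash_attn port of BTLM (cerebras/btlm-3b-8k-base): state-dict remapping,
+# fp16 forward parity against the HF reference, generation with and without CUDA graphs,
+# and weight-initialization statistics.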
+import time + +import torch +import pytest + +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM + +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.btlm import btlm_config_to_gpt2_config, remap_state_dict_hf_btlm +from flash_attn.utils.pretrained import state_dict_from_pretrained +from flash_attn.utils.generation import update_graph_cache + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_state_dict(model_name): + config = btlm_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert len(state_dict.keys()) == len(pretrained_state_dict.keys()) + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_optimized(model_name): + """Check that our implementation of Btlm (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = btlm_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.fused_bias_fc = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=dtype, + device_map={"": device}, + trust_remote_code=True, + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - 
logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_generation(model_name): + dtype = torch.float16 + device = "cuda" + config = btlm_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.fused_bias_fc = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 2048 + max_length = 2048 + 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1].to(device=device) + del model_ref + + pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + model(input_ids) # Warm up + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + + 
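+    # Acceptance criteria for the checks below: both the full forward pass over the generated
+    # sequence (logits_parallel) and the incremental-decoding logits must stay within 2x the
+    # HF fp16 error relative to the fp32 reference, and CUDA-graph decoding must produce
+    # exactly the same logits as eager decoding.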
assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_init(model_name): + dtype = torch.float32 + device = "cuda" + btlm_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + config = btlm_config_to_gpt2_config(btlm_config) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model_ref = AutoModelForCausalLM.from_config(btlm_config, trust_remote_code=True).to(device) + + assert model.transformer.embeddings.word_embeddings.weight.mean().abs() < 1e-4 + assert ( + model.transformer.embeddings.word_embeddings.weight.std() + - model_ref.transformer.wte.weight.std() + ).abs() < 1e-4 + assert model.lm_head.weight.mean().abs() < 1e-4 + assert (model.lm_head.weight.std() - model_ref.lm_head.weight.std()).abs() < 1e-4 + for l in range(config.n_layer): + assert model.transformer.layers[l].mixer.Wqkv.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mixer.Wqkv.weight.std() + - model_ref.transformer.h[l].attn.c_attn.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mixer.Wqkv.bias.abs().max() == 0.0 + assert model.transformer.layers[l].mixer.out_proj.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mixer.out_proj.weight.std() + - model_ref.transformer.h[l].attn.c_proj.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mixer.out_proj.bias.abs().max() == 0.0 + assert model.transformer.layers[l].mlp.fc1.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mlp.fc1.weight.std() + - model_ref.transformer.h[l].mlp.c_fc.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mlp.fc1.bias.abs().max() == 0.0 + assert model.transformer.layers[l].mlp.fc2.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mlp.fc2.weight.std() + - model_ref.transformer.h[l].mlp.c_proj.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mlp.fc2.bias.abs().max() == 0.0 diff --git a/test_cross_entropy.py b/test_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..9d67f5906a17cb3511e0557be71663332519b36b --- /dev/null +++ b/test_cross_entropy.py @@ -0,0 +1,68 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange +from flash_attn.losses.cross_entropy import CrossEntropyLoss + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize( + "dtype", [torch.float16, torch.float32] + ([torch.bfloat16] if is_sm8x else []) +) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("inplace_backward", [False, True]) +# @pytest.mark.parametrize("inplace_backward", [False]) +@pytest.mark.parametrize("lse_square_scale", [0.0, 1e-2]) +@pytest.mark.parametrize("return_z_loss", [False, True]) +# @pytest.mark.parametrize("lse_square_scale", [1e-2]) +@pytest.mark.parametrize("logit_scale", [1.0, 0.7]) +# @pytest.mark.parametrize("logit_scale", [1.0]) +@pytest.mark.parametrize("smoothing", [0.0, 0.9]) +# @pytest.mark.parametrize("smoothing", [0.0]) +@pytest.mark.parametrize("vocab_size", [50257, 128 * 1024]) # test vocab larger than 64k for split +# @pytest.mark.parametrize("vocab_size", [12]) +def test_cross_entropy_loss( + vocab_size, smoothing, logit_scale, lse_square_scale, return_z_loss, inplace_backward, dtype +): + device = "cuda" + rtol, atol 
= (1e-5, 1e-6) if dtype == torch.float32 else (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 1 if dtype == torch.float32 else 4 # Otherwise OOM + seqlen = 4096 if lse_square_scale == 0.0 and logit_scale == 1.0 else 1024 # Otherwise OOM + x_pt = torch.randn( + batch_size * seqlen, vocab_size, device=device, dtype=dtype, requires_grad=True + ) + x = x_pt.detach().clone().requires_grad_() + y = torch.randint(0, vocab_size, (batch_size * seqlen,), dtype=torch.long, device=device) + if batch_size * seqlen > 10: + y[torch.randperm(batch_size * seqlen)[:10]] = -100 + model_pt = torch.nn.CrossEntropyLoss(label_smoothing=smoothing) + model = CrossEntropyLoss( + label_smoothing=smoothing, + logit_scale=logit_scale, + lse_square_scale=lse_square_scale, + return_z_loss=return_z_loss, + inplace_backward=inplace_backward, + ) + if return_z_loss: + out, out_z_loss = model(x, y) + else: + out = model(x, y) + x_pt_scaled = (x_pt.float() * logit_scale) if logit_scale != 1.0 else x_pt.float() + out_pt = model_pt(x_pt_scaled, y) + if lse_square_scale > 0.0: + lse_pt = torch.logsumexp(x_pt_scaled, dim=-1) + z_loss_pt = lse_square_scale * (lse_pt[y != -100] ** 2).mean() + if return_z_loss: + assert torch.allclose(out_z_loss, z_loss_pt, rtol=rtol, atol=atol) + out_pt += z_loss_pt + assert torch.allclose(out, out_pt, rtol=1e-5, atol=1e-6) + + g = torch.randn_like(out) + out_pt.backward(g) + out.backward(g) + assert torch.allclose(x.grad, x_pt.grad, rtol=rtol, atol=atol) diff --git a/test_cross_entropy_parallel.py b/test_cross_entropy_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..c8b97fc291d7004e106b2dbd0e115ab290c67f78 --- /dev/null +++ b/test_cross_entropy_parallel.py @@ -0,0 +1,88 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/losses/test_cross_entropy_parallel.py + +import math + +import pytest +import torch +from apex.transformer import parallel_state, tensor_parallel +from flash_attn.losses.cross_entropy import CrossEntropyLoss + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize( + "dtype", [torch.float16, torch.float32] + ([torch.bfloat16] if is_sm8x else []) +) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("inplace_backward", [False, True]) +# @pytest.mark.parametrize("inplace_backward", [False]) +@pytest.mark.parametrize("lse_square_scale", [0.0, 1e-2]) +# @pytest.mark.parametrize("lse_square_scale", [0.0]) +@pytest.mark.parametrize("logit_scale", [0.7]) +# @pytest.mark.parametrize("logit_scale", [1.0]) +@pytest.mark.parametrize("smoothing", [0.0, 0.9]) +# @pytest.mark.parametrize("smoothing", [0.0]) +@pytest.mark.parametrize("vocab_size", [50264, 256 * 1024]) # test vocab larger than 64k for split +# @pytest.mark.parametrize("vocab_size", [50264]) # test vocab larger than 64k for split +# @pytest.mark.parametrize("world_size", [1, 2]) +@pytest.mark.parametrize("world_size", [2]) +def test_cross_entropy_loss_parallel( + vocab_size, world_size, smoothing, logit_scale, lse_square_scale, inplace_backward, dtype +): + assert vocab_size % world_size == 0 + rtol, atol = ( + (1e-5, 2e-5) + if dtype == torch.float32 + else ((1e-3, 1e-4) if dtype == torch.float16 else (1e-2, 3e-3)) + ) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + partition_vocab_size = vocab_size // world_size + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= 
torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 128 + x_pt = ( + torch.randn(batch_size * seqlen, vocab_size, device=device, dtype=dtype) * 10 + ).requires_grad_() + x = ( + tensor_parallel.scatter_to_tensor_model_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + y = torch.randint(0, vocab_size, (batch_size * seqlen,), dtype=torch.long, device=device) + y[torch.randperm(batch_size * seqlen)[:10]] = -100 + model_pt = torch.nn.CrossEntropyLoss(label_smoothing=smoothing, reduction="none") + model = CrossEntropyLoss( + label_smoothing=smoothing, + logit_scale=logit_scale, + reduction="none", + lse_square_scale=lse_square_scale, + inplace_backward=inplace_backward, + process_group=parallel_state.get_tensor_model_parallel_group(), + ) + out = model(x, y) + out_pt = model_pt(x_pt.float() * logit_scale, y) + if lse_square_scale > 0.0: + lse_pt = torch.logsumexp(x_pt.float() * logit_scale, dim=-1) + out_pt += lse_square_scale * lse_pt.square() + out_pt.masked_fill_(y == -100, 0.0) + assert torch.allclose(out, out_pt, rtol=1e-5, atol=1e-6) + + g = torch.randn_like(out) + out_pt.backward(g) + out.backward(g) + assert torch.allclose( + x.grad, + x_pt.grad[:, (rank * partition_vocab_size) : (rank + 1) * partition_vocab_size], + rtol=rtol, + atol=atol, + ) + + parallel_state.destroy_model_parallel() diff --git a/test_dropout_layer_norm.py b/test_dropout_layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..ccc8c8ea0692c282301b7e55d7254d767df3900f --- /dev/null +++ b/test_dropout_layer_norm.py @@ -0,0 +1,1189 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn.ops.layer_norm import ( + DropoutAddLayerNorm, + dropout_add_layer_norm, + dropout_add_layer_norm_parallel_residual, + dropout_add_layer_norm_subset, +) +from flash_attn.ops.rms_norm import ( + DropoutAddRMSNorm, + dropout_add_rms_norm, + dropout_add_rms_norm_parallel_residual, + dropout_add_rms_norm_subset, +) + +try: + from apex.normalization import FusedRMSNorm + from apex.normalization.fused_layer_norm import fused_rms_norm_affine +except: + FusedRMSNorm, fused_rms_norm_affine = None, None + + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +@pytest.mark.parametrize("has_colscale", [True, False]) +# @pytest.mark.parametrize('has_colscale', [False]) +@pytest.mark.parametrize("has_rowscale", [True, False]) +# @pytest.mark.parametrize('has_rowscale', [True]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize('has_residual', [False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float16, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 
4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_residual, + has_rowscale, + has_colscale, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and FusedRMSNorm is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + layer_norm_cls = torch.nn.LayerNorm if not is_rms_norm else FusedRMSNorm + our_layer_norm_cls = DropoutAddLayerNorm if not is_rms_norm else DropoutAddRMSNorm + our_layer_norm_func = dropout_add_layer_norm if not is_rms_norm else dropout_add_rms_norm + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + if has_rowscale: + rowscale = torch.empty(batch_size, seqlen, device=device, dtype=input_dtype) + survival_rate = 0.87 + rowscale = rowscale.bernoulli_(survival_rate) / survival_rate + x0_scaled_pt = x0_pt * rearrange(rowscale, "... -> ... 1") + x0_scaled_ref = x0_ref * rearrange(rowscale, "... -> ... 
1") + else: + rowscale = None + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + if has_colscale: + x0_scaled_pt = x0_scaled_pt * colscale_pt + x0_scaled_ref = x0_scaled_ref * colscale_ref + model_pt = layer_norm_cls(hidden_size).to(device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + if not is_rms_norm: + torch.nn.init.normal_(model_pt.bias) + model_ref = layer_norm_cls(hidden_size).to(device=device, dtype=torch.float32) + model = our_layer_norm_cls(hidden_size, p=dropout_p, device=device, dtype=weight_dtype) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model_ref.weight.copy_(model_pt.weight) + if not is_rms_norm: + model.bias.copy_(model_pt.bias) + model_ref.bias.copy_(model_pt.bias) + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, dmask = our_layer_norm_func( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + rowscale=rowscale, + layerscale=colscale, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + assert out.dtype == input_dtype + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype) + out_ref = model_ref(residual_ref) + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + out_pt.backward(g) + out.backward(g) + out_ref.backward(g) + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 3 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 3e-5 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +@pytest.mark.parametrize("hidden_size", [768, 1024, 1280, 1536, 1600, 2048, 2560, 3072, 4096, 5120]) +def test_dropout_layer_norm_eval(hidden_size, input_dtype, residual_dtype, weight_dtype): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + dropout_p = 0.37 + # set seed + torch.random.manual_seed(0) + batch_size = 32 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = 
x0_pt.detach().clone().float().requires_grad_() + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model = DropoutAddLayerNorm(hidden_size, p=dropout_p, device=device, dtype=weight_dtype) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + model_pt.eval() + model.eval() + model_ref.eval() + out = model(x0, res) + residual_pt = (x0_pt.float() + res_pt.float()).to(dtype=residual_dtype) + residual_ref = x0_ref + res_ref + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(input_dtype) + out_ref = model_ref(residual_ref) + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +@pytest.mark.parametrize("has_colscale", [True, False]) +@pytest.mark.parametrize("has_rowscale", [True, False]) +@pytest.mark.parametrize("has_residual", [True, False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('has_colscale', [True]) +# @pytest.mark.parametrize('has_rowscale', [False]) +# @pytest.mark.parametrize('has_residual', [True]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float32, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_prenorm_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_residual, + has_rowscale, + has_colscale, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and FusedRMSNorm is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + layer_norm_cls = torch.nn.LayerNorm if not is_rms_norm else FusedRMSNorm + our_layer_norm_cls = DropoutAddLayerNorm if not is_rms_norm else DropoutAddRMSNorm + our_layer_norm_func = dropout_add_layer_norm if not is_rms_norm else dropout_add_rms_norm + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 2e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = 
colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + if has_rowscale: + rowscale = torch.empty(batch_size, seqlen, device=device, dtype=input_dtype) + survival_rate = 0.87 + rowscale = rowscale.bernoulli_(survival_rate) / survival_rate + x0_scaled_pt = x0_pt * rearrange(rowscale, "... -> ... 1") + x0_scaled_ref = x0_ref * rearrange(rowscale, "... -> ... 1") + else: + rowscale = None + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + if has_colscale: + x0_scaled_pt = x0_scaled_pt * colscale_pt + x0_scaled_ref = x0_scaled_ref * colscale_ref + model_pt = layer_norm_cls(hidden_size).to(device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + if not is_rms_norm: + torch.nn.init.normal_(model_pt.bias) + model_ref = layer_norm_cls(hidden_size).to(device=device, dtype=torch.float32) + model = our_layer_norm_cls( + hidden_size, prenorm=True, p=dropout_p, device=device, dtype=weight_dtype + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model_ref.weight.copy_(model_pt.weight) + if not is_rms_norm: + model.bias.copy_(model_pt.bias) + model_ref.bias.copy_(model_pt.bias) + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, residual, dmask = our_layer_norm_func( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + rowscale=rowscale, + layerscale=colscale, + prenorm=True, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype) + out_ref = model_ref(residual_ref) + assert out.dtype == input_dtype + assert residual.dtype == residual_dtype + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + (out_pt * F.sigmoid(residual_pt)).backward(g) + (out * F.sigmoid(residual)).backward(g) + (out_ref * F.sigmoid(residual_ref.to(dtype=residual_dtype))).backward(g) + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 2 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 2e-4 + if not is_rms_norm: + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 2e-4 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + 
[(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +@pytest.mark.parametrize("hidden_size", [768, 1024, 1280, 1536, 1600, 2048, 2560, 3072, 4096, 5120]) +def test_dropout_layer_norm_prenorm_eval(hidden_size, input_dtype, residual_dtype, weight_dtype): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + dropout_p = 0.37 + # set seed + torch.random.manual_seed(0) + batch_size = 32 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model = DropoutAddLayerNorm( + hidden_size, prenorm=True, p=dropout_p, device=device, dtype=weight_dtype + ) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + model_pt.eval() + model.eval() + model_ref.eval() + out, residual = model(x0, res) + residual_pt = (x0_pt.float() + res_pt.float()).to(dtype=residual_dtype) + residual_ref = x0_ref + res_ref + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(input_dtype) + out_ref = model_ref(residual_ref) + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + +@pytest.mark.parametrize("has_colscale", [True, False]) +@pytest.mark.parametrize("has_residual", [True, False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('has_colscale', [True]) +# @pytest.mark.parametrize('has_residual', [True]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float32, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_subset_training( + hidden_size, input_dtype, residual_dtype, weight_dtype, dropout_p, has_residual, has_colscale +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 2e-4) + # set seed + torch.random.manual_seed(0) + 
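+    # Setup overview for the subset (drop-path) variant built below: whole batch elements are
+    # dropped with probability drop_path_rate, x0_subset / out_subset hold 1-based indices of
+    # the surviving rows (0 marks dropped rows), and rowscale_const rescales the kept rows by
+    # 1 / (1 - drop_path_rate).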
batch_size = 8 + seqlen = 512 + drop_path_rate = 0.4 + drop_path_scale = 1 / (1 - drop_path_rate) + + def generate_droppath_masks(batch_size, seqlen, drop_path_rate, device): + # Do it on CPU so we can get the numrows (with .item()) without GPU-CPU sync + mask_batch = torch.rand(batch_size) < 1 - drop_path_rate + numrows = (mask_batch).sum().item() * seqlen + mask_batch = mask_batch.to(device=device, non_blocking=True) + mask_batch_seqlen = repeat(mask_batch, "b -> (b s)", s=seqlen) + subset = torch.cumsum(mask_batch_seqlen, dim=0, dtype=torch.int32).masked_fill_( + ~mask_batch_seqlen, 0 + ) + return mask_batch, numrows, rearrange(subset, "(b s) -> b s", b=batch_size) + + x0_mask_batch, x0_numrows, x0_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + out_mask_batch, out_numrows, out_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone()[x0_mask_batch].requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0_pt, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + + if has_colscale: + x0_scaled_pt = x0_pt * colscale_pt + x0_scaled_ref = x0_ref * colscale_ref + else: + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + model = DropoutAddLayerNorm( + hidden_size, prenorm=False, p=dropout_p, device=device, dtype=weight_dtype + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, dmask = dropout_add_layer_norm_subset( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + layerscale=colscale, + x0_subset=x0_subset, + out_subset=out_subset, + rowscale_const=drop_path_scale, + out_numrows=out_numrows, + prenorm=False, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + + x0_scaled_pt = ( + x0_scaled_pt.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + x0_scaled_ref = ( + x0_scaled_ref.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + dmask_expanded = torch.zeros_like(x0_pt, dtype=torch.uint8) + dmask_expanded[x0_mask_batch] = dmask + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + 
residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype)[out_mask_batch] + out_ref = model_ref(residual_ref)[out_mask_batch] + assert out.dtype == input_dtype + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + out_pt.backward(g) + out.backward(g) + out_ref.backward(g) + assert (x0.grad - x0_ref.grad[x0_mask_batch]).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad)[ + x0_mask_batch + ].abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 2 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 2e-4 + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 2e-4 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("has_colscale", [True, False]) +@pytest.mark.parametrize("has_residual", [True, False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('has_colscale', [True]) +# @pytest.mark.parametrize('has_residual', [True]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float32, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_subset_prenorm_training( + hidden_size, input_dtype, residual_dtype, weight_dtype, dropout_p, has_residual, has_colscale +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 2e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + drop_path_rate = 0.4 + drop_path_scale = 1 / (1 - drop_path_rate) + + def generate_droppath_masks(batch_size, seqlen, drop_path_rate, device): + # Do it on CPU so we can get the numrows (with .item()) without GPU-CPU sync + mask_batch = torch.rand(batch_size) < 1 - drop_path_rate + numrows = (mask_batch).sum().item() * seqlen + mask_batch = mask_batch.to(device=device, non_blocking=True) + mask_batch_seqlen = repeat(mask_batch, "b -> (b s)", s=seqlen) + subset = torch.cumsum(mask_batch_seqlen, dim=0, dtype=torch.int32).masked_fill_( + ~mask_batch_seqlen, 0 + ) + return mask_batch, numrows, rearrange(subset, "(b s) -> b s", b=batch_size) + + x0_mask_batch, x0_numrows, x0_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + out_mask_batch, out_numrows, out_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, 
requires_grad=True + ) + x0 = x0_pt.detach().clone()[x0_mask_batch].requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0_pt, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + + if has_colscale: + x0_scaled_pt = x0_pt * colscale_pt + x0_scaled_ref = x0_ref * colscale_ref + else: + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + model = DropoutAddLayerNorm( + hidden_size, prenorm=True, p=dropout_p, device=device, dtype=weight_dtype + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, residual, dmask = dropout_add_layer_norm_subset( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + layerscale=colscale, + x0_subset=x0_subset, + out_subset=out_subset, + rowscale_const=drop_path_scale, + out_numrows=out_numrows, + prenorm=True, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + + x0_scaled_pt = ( + x0_scaled_pt.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + x0_scaled_ref = ( + x0_scaled_ref.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + dmask_expanded = torch.zeros_like(x0_pt, dtype=torch.uint8) + dmask_expanded[x0_mask_batch] = dmask + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype)[out_mask_batch] + out_ref = model_ref(residual_ref)[out_mask_batch] + assert out.dtype == input_dtype + assert residual.dtype == residual_dtype + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + (out_pt * F.sigmoid(residual_pt[out_mask_batch]) + residual_pt.mean(0, keepdim=True)).backward( + g + ) + (out * F.sigmoid(residual[out_mask_batch]) + residual.mean(0, keepdim=True)).backward(g) + ( + out_ref * F.sigmoid(residual_ref[out_mask_batch].to(dtype=residual_dtype)) + + residual_ref.mean(0, keepdim=True) + ).backward(g) + assert (x0.grad - x0_ref.grad[x0_mask_batch]).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad)[ + x0_mask_batch + ].abs().max() + 1e-4 + if 
has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 2 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 2e-4 + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 2e-4 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize('is_rms_norm', [False]) +@pytest.mark.parametrize("tied_norm", [False, True]) +# @pytest.mark.parametrize('tied_norm', [False]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize('has_residual', [False]) +@pytest.mark.parametrize("has_x1", [True, False]) +# @pytest.mark.parametrize('has_x1', [True]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +# @pytest.mark.parametrize('weight_dtype', [torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float16, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_parallel_residual_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_x1, + has_residual, + tied_norm, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and fused_rms_norm_affine is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + our_layer_norm_func = ( + dropout_add_layer_norm_parallel_residual + if not is_rms_norm + else dropout_add_rms_norm_parallel_residual + ) + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_x1: + x1_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x1 = x1_pt.detach().clone().requires_grad_() + x1_ref = x1_pt.detach().clone().float().requires_grad_() + else: + x1 = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + weight0 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias0 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight0_pt = weight0.detach().clone().requires_grad_() + weight0_ref = weight0.detach().clone().float().requires_grad_() + bias0_pt = 
bias0.detach().clone().requires_grad_() if bias0 is not None else None + bias0_ref = bias0.detach().clone().float().requires_grad_() if bias0 is not None else None + if not tied_norm: + weight1 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias1 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight1_pt = weight1.detach().clone().requires_grad_() + weight1_ref = weight1.detach().clone().float().requires_grad_() + bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None + bias1_ref = bias1.detach().clone().float().requires_grad_() if bias1 is not None else None + else: + weight1, bias1 = None, None + epsilon = 1e-5 + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + + out0, out1, dmask0, dmask1 = our_layer_norm_func( + x0, + x1, + res, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + assert out0.dtype == input_dtype + if not tied_norm: + assert out1.dtype == input_dtype + print(f"Actual dropout fraction: {1 - dmask0.float().mean().item()}") + if has_residual: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = ( + (x0_ref * dmask0.float()) / (1 - dropout_p) + + (x1_ref * dmask1.float()) / (1 - dropout_p) + ) + res_ref + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p) + res_pt.float()).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + res_ref + else: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + ).to(dtype=residual_dtype) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + ( + x1_ref * dmask1.float() + ) / (1 - dropout_p) + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + if not is_rms_norm: + out0_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), (hidden_size,), weight0_pt, bias0_pt, eps=epsilon + ).to(dtype=input_dtype) + out0_ref = F.layer_norm(residual_ref, (hidden_size,), weight0_ref, bias0_ref, eps=epsilon) + if not tied_norm: + out1_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), + (hidden_size,), + weight1_pt, + bias1_pt, + eps=epsilon, + ).to(dtype=input_dtype) + out1_ref = F.layer_norm( + residual_ref, (hidden_size,), weight1_ref, bias1_ref, eps=epsilon + ) + else: + out0_pt = fused_rms_norm_affine( + residual_pt.to(dtype=weight_dtype), weight0_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out0_ref = fused_rms_norm_affine(residual_ref, weight0_ref, (hidden_size,), eps=epsilon) + if not tied_norm: + out1_pt = fused_rms_norm_affine( + residual_pt.to(dtype=weight_dtype), weight1_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out1_ref = fused_rms_norm_affine(residual_ref, weight1_ref, (hidden_size,), eps=epsilon) + + assert (out0 - out0_ref).abs().max() <= 4 * (out0_pt - out0_ref).abs().max() + 1e-4 + if not tied_norm: + assert (out1 - out1_ref).abs().max() <= 4 * (out1_pt - out1_ref).abs().max() + 1e-4 + + g0 = torch.randn_like(out0) / batch_size + if tied_norm: + out0.backward(g0) + out0_pt.backward(g0) + out0_ref.backward(g0) + else: + g1 = 
torch.randn_like(out1) / batch_size + (out0 * g0 + out1 * g1).sum().backward() + (out0_pt * g0 + out1_pt * g1).sum().backward() + (out0_ref * g0 + out1_ref * g1).sum().backward() + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_x1: + assert (x1.grad - x1_ref.grad).abs().max() <= 4 * ( + x1_pt.grad - x1_ref.grad + ).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (weight0.grad - weight0_ref.grad).abs().max() <= 3 * ( + weight0_pt.grad - weight0_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias0.grad - bias0_ref.grad).abs().max() <= 2 * ( + bias0_pt.grad - bias0_ref.grad + ).abs().max() + 3e-5 + if not tied_norm: + assert (weight1.grad - weight1_ref.grad).abs().max() <= 3 * ( + weight1_pt.grad - weight1_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias1.grad - bias1_ref.grad).abs().max() <= 2 * ( + bias1_pt.grad - bias1_ref.grad + ).abs().max() + 3e-5 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize('is_rms_norm', [False]) +@pytest.mark.parametrize("tied_norm", [False, True]) +# @pytest.mark.parametrize('tied_norm', [False]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize('has_residual', [False]) +@pytest.mark.parametrize("has_x1", [True, False]) +# @pytest.mark.parametrize('has_x1', [True]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +# @pytest.mark.parametrize('weight_dtype', [torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float16, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_parallel_residual_prenorm_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_x1, + has_residual, + tied_norm, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and fused_rms_norm_affine is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + our_layer_norm_func = ( + dropout_add_layer_norm_parallel_residual + if not is_rms_norm + else dropout_add_rms_norm_parallel_residual + ) + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_x1: + x1_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x1 = x1_pt.detach().clone().requires_grad_() + x1_ref = x1_pt.detach().clone().float().requires_grad_() + else: + x1 = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, 
requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + weight0 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias0 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight0_pt = weight0.detach().clone().requires_grad_() + weight0_ref = weight0.detach().clone().float().requires_grad_() + bias0_pt = bias0.detach().clone().requires_grad_() if bias0 is not None else None + bias0_ref = bias0.detach().clone().float().requires_grad_() if bias0 is not None else None + if not tied_norm: + weight1 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias1 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight1_pt = weight1.detach().clone().requires_grad_() + weight1_ref = weight1.detach().clone().float().requires_grad_() + bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None + bias1_ref = bias1.detach().clone().float().requires_grad_() if bias1 is not None else None + else: + weight1, bias1 = None, None + epsilon = 1e-5 + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + + out0, out1, residual, dmask0, dmask1 = our_layer_norm_func( + x0, + x1, + res, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + prenorm=True, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + assert out0.dtype == input_dtype + if not tied_norm: + assert out1.dtype == input_dtype + print(f"Actual dropout fraction: {1 - dmask0.float().mean().item()}") + if has_residual: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = ( + (x0_ref * dmask0.float()) / (1 - dropout_p) + + (x1_ref * dmask1.float()) / (1 - dropout_p) + ) + res_ref + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p) + res_pt.float()).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + res_ref + else: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + ).to(dtype=residual_dtype) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + ( + x1_ref * dmask1.float() + ) / (1 - dropout_p) + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + if not is_rms_norm: + out0_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), (hidden_size,), weight0_pt, bias0_pt, eps=epsilon + ).to(dtype=input_dtype) + out0_ref = F.layer_norm(residual_ref, (hidden_size,), weight0_ref, bias0_ref, eps=epsilon) + if not tied_norm: + out1_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), + (hidden_size,), + weight1_pt, + bias1_pt, + eps=epsilon, + ).to(dtype=input_dtype) + out1_ref = F.layer_norm( + residual_ref, (hidden_size,), weight1_ref, bias1_ref, eps=epsilon + ) + else: + out0_pt = fused_rms_norm_affine( + residual_pt.to(dtype=weight_dtype), weight0_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out0_ref = fused_rms_norm_affine(residual_ref, weight0_ref, (hidden_size,), eps=epsilon) + if not tied_norm: + out1_pt = fused_rms_norm_affine( + 
residual_pt.to(dtype=weight_dtype), weight1_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out1_ref = fused_rms_norm_affine(residual_ref, weight1_ref, (hidden_size,), eps=epsilon) + + assert (out0 - out0_ref).abs().max() <= 4 * (out0_pt - out0_ref).abs().max() + 1e-4 + if not tied_norm: + assert (out1 - out1_ref).abs().max() <= 4 * (out1_pt - out1_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + g0 = torch.randn_like(out0) / batch_size + if tied_norm: + (out0 * F.sigmoid(residual)).backward(g0) + (out0_pt * F.sigmoid(residual_pt)).backward(g0) + (out0_ref * F.sigmoid(residual_ref)).backward(g0) + else: + g1 = torch.randn_like(out1) / batch_size + (out0 * F.sigmoid(residual) * g0 + out1 * g1).sum().backward() + (out0_pt * F.sigmoid(residual_pt) * g0 + out1_pt * g1).sum().backward() + (out0_ref * F.sigmoid(residual_ref) * g0 + out1_ref * g1).sum().backward() + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_x1: + assert (x1.grad - x1_ref.grad).abs().max() <= 4 * ( + x1_pt.grad - x1_ref.grad + ).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (weight0.grad - weight0_ref.grad).abs().max() <= 3 * ( + weight0_pt.grad - weight0_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias0.grad - bias0_ref.grad).abs().max() <= 2 * ( + bias0_pt.grad - bias0_ref.grad + ).abs().max() + 3e-5 + if not tied_norm: + assert (weight1.grad - weight1_ref.grad).abs().max() <= 3 * ( + weight1_pt.grad - weight1_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias1.grad - bias1_ref.grad).abs().max() <= 2 * ( + bias1_pt.grad - bias1_ref.grad + ).abs().max() + 3e-5 + + +def test_dropout_layer_norm_randomness(): + hidden_size = 256 + dtype = torch.float32 + dropout_p = 0.1 + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0 = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=dtype, requires_grad=True + ) + res = torch.randn_like(x0, dtype=dtype, requires_grad=True) + model = DropoutAddLayerNorm(hidden_size, p=dropout_p, device=device, dtype=dtype) + torch.random.manual_seed(42) + _, dmask0 = dropout_add_layer_norm( + x0, res, model.weight, model.bias, model.p, model.eps, return_dropout_mask=True + ) + # Subsequent call should have a different dropout mask + _, dmask1 = dropout_add_layer_norm( + x0, res, model.weight, model.bias, model.p, model.eps, return_dropout_mask=True + ) + torch.random.manual_seed(42) + # Resetting the seed, should get the same dropout mask + _, dmask2 = dropout_add_layer_norm( + x0, res, model.weight, model.bias, model.p, model.eps, return_dropout_mask=True + ) + assert not torch.equal(dmask0, dmask1) + assert torch.equal(dmask0, dmask2) diff --git a/test_embedding_parallel.py b/test_embedding_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..c99293620bd6633ecced4b35611b441c5ee0aacc --- /dev/null +++ b/test_embedding_parallel.py @@ -0,0 +1,106 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_embedding_parallel.py + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from apex.transformer import parallel_state +from einops import rearrange +from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings + +is_sm8x = 
torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_pos_emb", [True, False]) +# @pytest.mark.parametrize('has_pos_emb', [True]) +@pytest.mark.parametrize("dim", [1024]) +def test_embedding_parallel(dim, has_pos_emb, sequence_parallel, world_size, dtype): + vocab_size = 50264 + seqlen = 2048 + assert vocab_size % world_size == 0 + assert dim % world_size == 0 + rtol, atol = (3e-3, 5e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + input_ids_pt = torch.randint(0, vocab_size, (batch_size, seqlen), device=device) + input_ids = input_ids_pt.detach().clone() + + model_pt = GPT2Embeddings( + dim, vocab_size, seqlen if has_pos_emb else 0, device=device, dtype=dtype + ) + model = ParallelGPT2Embeddings( + dim, + vocab_size, + seqlen if has_pos_emb else 0, + parallel_state.get_tensor_model_parallel_group(), + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + partition_vocab_size = vocab_size // world_size + partition_dim = dim // world_size + with torch.no_grad(): + model.word_embeddings.weight.copy_( + model_pt.word_embeddings.weight[ + rank * partition_vocab_size : (rank + 1) * partition_vocab_size + ] + ) + if has_pos_emb: + model.position_embeddings.weight.copy_( + model_pt.position_embeddings.weight[ + :, rank * partition_dim : (rank + 1) * partition_dim + ] + ) + + out = model(input_ids, combine_batch_seqlen_dim=True) + out_pt = rearrange(model_pt(input_ids), "b s d -> (b s) d") + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + g = torch.randn_like(out_pt) + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + model.word_embeddings.weight.grad, + model_pt.word_embeddings.weight.grad[ + rank * partition_vocab_size : (rank + 1) * partition_vocab_size + ], + rtol=rtol, + atol=atol, + ) + if has_pos_emb: + assert torch.allclose( + model.position_embeddings.weight.grad, + model_pt.position_embeddings.weight.grad[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + rtol=rtol, + atol=atol, + ) diff --git a/test_falcon.py b/test_falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..582f907b4b1ba5765669ab4f18abd3b061e27df8 --- /dev/null +++ b/test_falcon.py @@ -0,0 +1,408 @@ +# Copyright (c) 2023, Tri Dao. 
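+# This file exercises the Falcon ports end to end: remapping the HF state dict,
+# the optimized forward pass checked against the HF implementation (fp16 output
+# compared through an fp32 reference), tensor-parallel forward, and generation
+# with and without CUDA graphs.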
+ +import os +import time +from pathlib import Path + +current_dir = Path(__file__).parent.absolute() + +import pytest +import torch +from einops import rearrange +from flash_attn.models.falcon import falcon_config_to_gpt2_config, remap_state_dict_hf_falcon +from flash_attn.models.gpt import GPTLMHeadModel, combine_state_dicts_tp, shard_state_dict_tp +from flash_attn.utils.distributed import all_gather_raw +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + + +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-7b", "tiiuae/falcon-40b"]) +def test_falcon_state_dict(model_name): + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + pretrained_state_dict = remap_state_dict_hf_falcon( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-7b"]) +def test_falcon_optimized(model_name): + """Check that our implementation (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map={"": device}, trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: 
{(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# torchrun --no_python --nproc_per_node=4 pytest -q -s tests/models/test_falcon.py -k "falcon_parallel_forward" +# We want to run this on a machine with 4 x A100 80GB or 8 x A100 40GB so we have enough +# memory to run the model in fp32. +@pytest.mark.parametrize("world_size", [4]) +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-40b"]) +def test_falcon_parallel_forward(model_name, world_size): + from apex.transformer import parallel_state + + dtype = torch.float16 + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = False + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = False + config.residual_in_fp32 = True + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + pretrained_state_dict = remap_state_dict_hf_falcon( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... d -> b ... 
(n d)", b=batch_size) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-7b"]) +def test_falcon_generation(model_name): + """Check that our implementation (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map={"": device}, trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert torch.equal(logits_cg, logits) + + +# torchrun --no_python --nproc_per_node=4 pytest -q -s tests/models/test_falcon.py -k "falcon_parallel_generation" +# We want to run this on a machine with 4 x A100 80GB or 8 x A100 40GB so we have enough +# memory to run the model in fp32. 
+@pytest.mark.parametrize("world_size", [4]) +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-40b"]) +def test_falcon_parallel_generation(model_name, world_size): + """Check that our implementation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. + """ + from apex.transformer import parallel_state + + dtype = torch.float16 + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = False + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = False + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 * world_size + config.sequence_parallel = False # Need to set this to False for generation + + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + # Need this, otherwise when we capture the graph the process for GPU 1 would run on both + # GPU0 and GPU1 and things would hang + torch.cuda.set_device(device) + + pretrained_state_dict = remap_state_dict_hf_falcon( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + print("Without CUDA graph") + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + with torch.inference_mode(): + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.inference_mode(): + logits_ref = 
model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + logits_hf = torch.stack(out_hf.scores, dim=1) + + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + hf_error = (logits_hf - logits_ref).abs().max().item() + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert torch.equal(logits_cg, logits) diff --git a/test_flash_attn.py b/test_flash_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..72d55134e58cc2f90f2922244ac99a0c19fa32b6 --- /dev/null +++ b/test_flash_attn.py @@ -0,0 +1,2525 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn import ( + flash_attn_func, + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, + flash_attn_with_kvcache, +) +from flash_attn.bert_padding import pad_input, unpad_input +from flash_attn.flash_attn_interface import _get_block_size_n +from flash_attn.layers.rotary import apply_rotary_emb + +MAX_HEADDIM_SM8x = 192 + + +is_sm75 = torch.cuda.get_device_capability("cuda") == (7, 5) +is_sm8x = torch.cuda.get_device_capability("cuda")[0] == 8 +is_sm80 = torch.cuda.get_device_capability("cuda") == (8, 0) +is_sm90 = torch.cuda.get_device_capability("cuda") == (9, 0) + + +def attn_bias_from_alibi_slopes( + slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False, key_leftpad=None +): + batch, nheads = slopes.shape + device = slopes.device + slopes = rearrange(slopes, "b h -> b h 1 1") + if causal: + return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes + else: + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + relative_pos = torch.abs(row_idx + sk - sq - col_idx) + return -slopes * relative_pos.to(dtype=slopes.dtype) + + +def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): + assert mode in ["full", "random", "third"] + if mode == "full": + lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) + elif mode == "random": + lengths = torch.randint( + max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device + ) + elif mode == "third": + lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) + padding_mask = ( + repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths + ) + return padding_mask + + +def generate_qkv( + q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, d) + k: (batch_size, seqlen_k, nheads_k, d) + v: (batch_size, seqlen_k, 
nheads_k, d) + query_padding_mask: (batch_size, seqlen), bool + key_padding_mask: (batch_size, seqlen), bool + """ + assert not (kvpacked and qkvpacked) + batch_size, seqlen_q, nheads, d = q.shape + _, seqlen_k, nheads_k, _ = k.shape + assert k.shape == (batch_size, seqlen_k, nheads_k, d) + assert v.shape == (batch_size, seqlen_k, nheads_k, d) + + if query_padding_mask is not None: + q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) + output_pad_fn = lambda output_unpad: pad_input( + output_unpad, indices_q, batch_size, seqlen_q + ) + else: + q_unpad = rearrange(q, "b s h d -> (b s) h d") + cu_seqlens_q = torch.arange( + 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device + ) + max_seqlen_q = seqlen_q + output_pad_fn = lambda output_unpad: rearrange( + output_unpad, "(b s) h d -> b s h d", b=batch_size + ) + + if key_padding_mask is not None: + k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) + v_unpad, _, _, _ = unpad_input(v, key_padding_mask) + else: + k_unpad = rearrange(k, "b s h d -> (b s) h d") + v_unpad = rearrange(v, "b s h d -> (b s) h d") + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device + ) + max_seqlen_k = seqlen_k + + if qkvpacked: + assert (query_padding_mask == key_padding_mask).all() + assert nheads == nheads_k + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + qkv = torch.stack([q, k, v], dim=2) + if query_padding_mask is not None: + dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) + else: + dqkv_pad_fn = lambda dqkv_unpad: rearrange( + dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + qkv_unpad.detach().requires_grad_(), + cu_seqlens_q, + max_seqlen_q, + qkv.detach().requires_grad_(), + output_pad_fn, + dqkv_pad_fn, + ) + elif kvpacked: + kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) + kv = torch.stack([k, v], dim=2) + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) + else: + dkv_pad_fn = lambda dkv_unpad: rearrange( + dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + q_unpad.detach().requires_grad_(), + kv_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + kv.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) + else: + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k) + else: + dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size) + return ( + q_unpad.detach().requires_grad_(), + k_unpad.detach().requires_grad_(), + v_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + k.detach().requires_grad_(), + v.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) + + +def construct_local_mask( + seqlen_q, + seqlen_k, + window_size=(-1, -1), # -1 means infinite window size + query_padding_mask=None, + key_padding_mask=None, + device=None, + key_leftpad=None, +): + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 
1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + if window_size[0] < 0: + return col_idx > row_idx + sk - sq + window_size[1] + else: + sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk + return torch.logical_or( + col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk), + col_idx < row_idx + sk - sq - window_size[0], + ) + + +def attention_ref( + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k: (batch_size, seqlen_k, nheads_k, head_dim) + v: (batch_size, seqlen_k, nheads_k, head_dim) + query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) + causal: whether to apply causal masking + window_size: (int, int), left and right window size + upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast + output back to fp16/bf16. + reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) + without changing the math. This is to estimate the numerical error from operation + reordering. 
+ Output: + output: (batch_size, seqlen_q, nheads, head_dim) + attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout + """ + if causal: + window_size = (window_size[0], 0) + dtype_og = q.dtype + if upcast: + q, k, v = q.float(), k.float(), v.float() + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) + v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) + d = q.shape[-1] + if not reorder_ops: + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + else: + scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d)) + if softcap > 0: + scores = scores / softcap + scores = scores.tanh() + scores = scores * softcap + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + key_leftpad=key_leftpad, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias + attention = torch.softmax(scores, dim=-1).to(v.dtype) + # Some rows might be completely masked out so we fill them with zero instead of NaN + if window_size[0] >= 0 or window_size[1] >= 0: + attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0) + # We want to mask here so that the attention matrix doesn't have any NaNs + # Otherwise we'll get NaN in dV + if query_padding_mask is not None: + attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + dropout_scaling = 1.0 / (1 - dropout_p) + # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling + # output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + if dropout_mask is not None: + attention_drop = attention.masked_fill(~dropout_mask, 0.0) + else: + attention_drop = attention + output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling) + if query_padding_mask is not None: + output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) + return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) + + +def attention_kvpacked_ref( + q, + kv, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + return attention_ref( + q, + kv[:, :, 0], + kv[:, :, 1], + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + key_leftpad=key_leftpad, + ) + + +def attention_qkvpacked_ref( + qkv, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, +): + return attention_ref( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + key_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + ) + + +def generate_sparsity_mask(seqlen, sparsity=0.3): + repeats = seqlen // 16 // 2 + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([0, 1] * 
repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + nrow, ncol = seqlen // 16, seqlen // 256 + mask = torch.rand(nrow, ncol, device="cuda") < sparsity + return mask + + +def attention_blocksparse_ref(qkv, blockmask, attn_mask, dropout_p, dropout_mask): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + blockmask: (seqlen / 16, seqlen / 256) + attn_mask: (batch_size, seqlen) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen, seqlen) + Output: + output: (batch_size, seqlen, nheads, head_dim) + attention: softmax after dropout + """ + q, k, v = qkv.float().unbind(dim=2) + d = qkv.shape[-1] + seqlen = qkv.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + scores.masked_fill_(rearrange(~attn_mask, "b s -> b 1 1 s"), float("-inf")) + blockmask = repeat(blockmask, "s_16 s_256 -> (s_16 16) (s_256 256)") + blockmask = blockmask[:seqlen, :seqlen] + scores.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), float("-inf")) + attention = torch.softmax(scores, dim=-1) + attention = attention.masked_fill(rearrange(~attn_mask, "b s -> b 1 s 1"), 0.0) + attention = attention.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), 0.0) + attention_drop = attention.masked_fill(~dropout_mask, 0.0) / (1 - dropout_p) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + output.masked_fill_(rearrange(~attn_mask, "b s -> b s 1 1"), 0) + return output.to(dtype=qkv.dtype), attention.to(dtype=qkv.dtype) + + +def convert_flash_attn_S_to_softmax( + S, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + head_dim, + is_dropout, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """FlashAttention stores the S matrix in a different way. + Arguments: + S: (batch_size, nheads, seqlen_q_rounded, seqlen_k_rounded) + query_padding_mask: (batch_size, seqlen_q_rounded) + key_padding_mask: (batch_size, seqlen_k_rounded) + """ + if causal: + window_size = (window_size[0], 0) + seqlen_q_rounded, seqlen_k_rounded = S.shape[-2:] + S_converted = S + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + S.device, + ) + local_mask = F.pad( + local_mask, + (0, seqlen_k_rounded - seqlen_k, 0, seqlen_q_rounded - seqlen_q), + value=True, + ) + S_converted = S_converted.masked_fill(local_mask, 0.0) + + # Need to zero out things not in attention_mask in case S was initialized with random values + # and some of those values aren't overwritten. 
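+    # (S arrives with rounded shapes, seqlen_q_rounded x seqlen_k_rounded; the padding
+    # masks are padded up to those shapes below, and the result is cropped back to
+    # [:seqlen_q, :seqlen_k] at the end of this function.)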
+ seqlen_q_og = ( + query_padding_mask.shape[-1] if query_padding_mask is not None else seqlen_q_rounded + ) + if query_padding_mask is not None: + query_padding_mask = F.pad(query_padding_mask, (0, seqlen_q_rounded - seqlen_q_og)) + S_converted = S_converted.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + seqlen_k_og = key_padding_mask.shape[-1] if key_padding_mask is not None else seqlen_k + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, seqlen_k_rounded - seqlen_k_og)) + S_converted = S_converted.masked_fill(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0) + S_converted = F.pad(S_converted, (0, 0, 0, seqlen_q_og - seqlen_q_rounded)) + S_converted = F.pad(S_converted, (0, seqlen_k_og - seqlen_k_rounded)) + return S_converted[:, :, :seqlen_q, :seqlen_k] + + +def normalize_flash_attn_S( + attn_unnorm, + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + is_dropout=False, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k, v: (batch_size, seqlen_k, nheads, head_dim) + key_padding_mask: (batch_size, seqlen_q) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + Output: + softmax_lse: (batch_size, nheads, seqlen_q) + softmax_max: (batch_size, nheads, seqlen_q) + """ + if causal: + window_size = (window_size[0], 0) + q, k, v = q.float(), k.float(), v.float() + _, seqlen_q, _, head_dim = q.shape + seqlen_k = k.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(head_dim), k) + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias.to(dtype=scores.dtype) + block_size_n = _get_block_size_n(scores.device, head_dim, is_dropout, causal) + scores_block = scores.split(block_size_n, dim=-1) + lse_block = torch.stack([torch.logsumexp(s, dim=-1) for s in scores_block], dim=-1) + lse = torch.logsumexp(lse_block, dim=-1) + # lse could be -inf (i.e. all values in scores are -inf), and we want to set those to inf + # so that when we do torch.exp(m - lse), we get 0.0 instead of NaN. + lse[lse == float("-inf")] = float("inf") + scores_max_block = torch.stack([torch.amax(s, dim=-1) for s in scores_block], dim=-1) + cummax_block = torch.cummax(scores_max_block.flip(-1), dim=-1).values.flip(-1).unbind(dim=-1) + attn_unnorm_block = attn_unnorm.split(block_size_n, dim=-1) + attn_norm = torch.cat( + [ + a * rearrange(torch.exp(m - lse), "b h s -> b h s 1") + for a, m in zip(attn_unnorm_block, cummax_block) + ], + dim=-1, + ) + if query_padding_mask is not None: + attn_norm.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + return attn_norm.to(dtype=attn_unnorm.dtype) + + +def get_dropout_fraction( + dropout_mask, + query_padding_mask=None, + key_padding_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k), bool. True means keep, False means drop. 
+ query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + """ + if causal: + window_size = (window_size[0], 0) + batch_size, nheads, seqlen_q, seqlen_k = dropout_mask.shape + dropped = ~dropout_mask + valid = torch.ones_like(dropout_mask) + if query_padding_mask is not None: + dropped.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + valid.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + if key_padding_mask is not None: + dropped.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + valid.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + dropout_mask.device, + ) + dropped.masked_fill_(local_mask, False) + valid.masked_fill_(local_mask, False) + dropped_total = dropped.sum() + return dropped.sum() / valid.sum() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [False]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128]) +# @pytest.mark.parametrize("d", [64]) +# @pytest.mark.parametrize('seqlen', [128, 256, 384, 512, 768, 1024, 2048]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 384, 768, 1024, 1025, 2048]) +# @pytest.mark.parametrize("seqlen", [512]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): + if seqlen >= 2048 and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30: + pytest.skip() # Reference implementation OOM + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, S_dmask = flash_attn_qkvpacked_func( + qkv, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + attn = normalize_flash_attn_S( + attn_unnorm, + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + None, + None, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = 
get_dropout_fraction( + dropout_mask, None, None, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, None, attn_bias, dropout_p, dropout_mask, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + # v = qkv[:, :, 2].float() + # qk = torch.einsum('bshd,bthd->bhst', qkv[:, :, 0], qkv[:, :, 1]).float() + # if causal: + # causal_mask = torch.triu(torch.ones(seqlen, seqlen, dtype=torch.bool, device=qkv.device), 1) + # qk.masked_fill_(causal_mask, float('-inf')) + # m = qk.amax(-1, keepdim=True) + # s_tmp = torch.exp((qk - m) / math.sqrt(d)) + # p_tmp = torch.softmax(qk / math.sqrt(d), -1) + # p_dropped = p_tmp if dropout_mask is None else p_tmp.masked_fill(~dropout_mask, 0) + # lse_ref = torch.logsumexp(qk / math.sqrt(d), -1) + # qk_max1 = torch.max(qk[:, :, 128:, 192:], -1, keepdim=True).values + # qk_max2 = torch.max(qk[:, :, 128:, 128:], -1, keepdim=True).values + # qk_max3 = torch.max(qk[:, :, 128:, 64:], -1, keepdim=True).values + # qk_max4 = torch.max(qk[:, :, 128:, :], -1, keepdim=True).values + # o1 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, 192:] - qk_max1) / math.sqrt(d)), v[:, 192:]) + # o2 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, 128:] - qk_max2) / math.sqrt(d)), v[:, 128:]) + # o3 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, 64:] - qk_max3) / math.sqrt(d)), v[:, 64:]) + # o4 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, :] - qk_max4) / math.sqrt(d)), v[:, :]) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + # do_o = (g.float() * out.float()).sum(-1) + # dv_tmp = torch.einsum('bhts,bthd->bshd', attn_pt[:, :, :64], g[:, :64]) + # dv_tmp1 = torch.einsum('bhts,bthd->bshd', attn_pt[:, :, 64:], g[:, 64:]) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + (dqkv,) = torch.autograd.grad(out, qkv, g) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
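+    # out_pt is the same reference math run in the low-precision dtype (upcast=False,
+    # reorder_ops=True), so (out_pt - out_ref) measures the noise inherent to fp16/bf16
+    # attention; bounding FlashAttention's error by a small multiple of that noise avoids
+    # hard-coded absolute tolerances.
+    # A minimal sketch of this relative-error check as a reusable helper (hypothetical
+    # name, not part of the test suite):
+    # def assert_error_within_factor(out, out_pt, out_ref, factor=2.0):
+    #     err = (out - out_ref).abs().max().item()
+    #     baseline = (out_pt - out_ref).abs().max().item()
+    #     assert err <= factor * baseline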
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dqkv - dqkv_ref).abs().max().item() <= 2 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [64]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 257, 384, 512, 768, 1025, 2048]) +# @pytest.mark.parametrize('seqlen', [128]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +def test_flash_attn_varlen_qkvpacked( + seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype +): + if seqlen >= 2048 and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30: + pytest.skip() # Reference implementation OOM + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 5 + nheads = 6 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + + key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen, seqlen, key_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn = generate_qkv( + *qkv.unbind(dim=2), key_padding_mask, key_padding_mask, qkvpacked=True + ) + + out_unpad, sm_lse, S_dmask = flash_attn_varlen_qkvpacked_func( + qkv_unpad, + cu_seqlens, + max_seqlen, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + key_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + attn = normalize_flash_attn_S( + attn_unnorm, + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + key_padding_mask, + key_padding_mask, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, 
key_padding_mask, key_padding_mask, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + (dqkv_unpad,) = torch.autograd.grad(out, qkv_unpad, g) + dqkv = dqkv_pad_fn(dqkv_unpad) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dqkv - dqkv_ref).abs().max().item() <= 2 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +# @pytest.mark.parametrize("kvpacked", [False]) +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# @pytest.mark.parametrize("mha_type", ["mha"]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +@pytest.mark.parametrize("softcap", [0.0, 50.0]) +def test_flash_attn_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked, softcap +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if softcap > 0.0 and dropout_p > 0.0: + pytest.skip("Softcap and dropout not supported together") + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 6 if softcap == 0.0 else 4 # softcap reference impl takes more memory + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if softcap > 0: + # Ensure the values of qk are at least within softcap range. 
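+        # The reference applies scores = softcap * tanh(scores / softcap), so scaling q by
+        # softcap pushes typical |q.k| up to the same order as softcap and the capping
+        # nonlinearity is actually exercised instead of staying in tanh's near-linear region.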
+ q = q * softcap + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + out, lse, S_dmask = flash_attn_kvpacked_func( + q, + kv, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + out, lse, S_dmask = flash_attn_func( + q, + k, + v, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + attn = normalize_flash_attn_S( + attn_unnorm, + q, + k_rep, + v_rep, + None, + None, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, None, None, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + if kvpacked: + ( + dq, + dkv, + ) = torch.autograd.grad(out, (q, kv), g) + dk, dv = dkv.unbind(2) + ( + dq_ref, + dkv_ref, + ) = 
torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +# @pytest.mark.parametrize('kvpacked', [False]) +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# @pytest.mark.parametrize('mha_type', ["mqa"]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [True]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [64]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 147), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +@pytest.mark.parametrize("softcap", [0.0, 50.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +def test_flash_attn_varlen_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, 
deterministic, mha_type, dtype, kvpacked, softcap +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if softcap > 0.0 and dropout_p > 0.0: + pytest.skip("Softcap and dropout not supported together") + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 6 if softcap == 0.0 else 4 # softcap reference impl takes more memory + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if softcap > 0: + # Ensure the values of qk are at least within softcap range. + q = q * softcap + + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen_q, seqlen_k, query_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + ( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + kv, + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) = generate_qkv(q, *kv.unbind(dim=2), query_padding_mask, key_padding_mask, kvpacked=True) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_kvpacked_func( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + attn = 
normalize_flash_attn_S( + attn_unnorm, + q, + k_rep, + v_rep, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, + query_padding_mask, + key_padding_mask, + causal=causal, + window_size=window_size, + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + if ((d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90)): + if kvpacked: + ( + dq_unpad, + dkv_unpad, + ) = torch.autograd.grad(out, (q_unpad, kv_unpad), g) + dk, dv = dkv_pad_fn(dkv_unpad).unbind(2) + ( + dq_ref, + dkv_ref, + ) = torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq_unpad, + dk_unpad, + dv_unpad, + ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g) + dk = dk_pad_fn(dk_unpad) + dv = dk_pad_fn(dv_unpad) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + dq = dq_pad_fn(dq_unpad) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
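+ # out_ref is the reference attention computed with upcast to fp32, while out_pt reruns the + # same reference in the original dtype (upcast=False, reorder_ops=True), so the error of + # out_pt against out_ref serves as the precision baseline for the asserts below.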
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.04) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64, 128]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [True]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_causal(seqlen_q, seqlen_k, swap_sq_sk, d, local, dtype): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + causal = True + # set seed + torch.random.manual_seed(0) + batch_size = 8 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + out = flash_attn_func(q, k, v, 0.0, causal=causal, window_size=window_size) + out_ref, attn_ref = attention_ref( + q, k, v, None, None, None, 0.0, None, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + None, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - 
dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item() + 1e-5 + assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item() + 1e-5 + assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item() + 1e-5 + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [True]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# TODO: add smaller page sizes when https://github.com/Dao-AILab/flash-attention/pull/824 is merged +@pytest.mark.parametrize("paged_kv_block_size", [None, 256, 512]) +# @pytest.mark.parametrize("seqlen_q,seqlen_k", [(256, 128)]) +def test_flash_attn_varlen_causal( + seqlen_q, seqlen_k, swap_sq_sk, d, local, paged_kv_block_size, dtype +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + causal = True + # set seed + torch.random.manual_seed(0) + batch_size = 8 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + + if paged_kv_block_size is None: + k = torch.randn( + batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + block_table = None + else: + k, v, block_table, k_cache_paged, v_cache_paged, num_blocks = _generate_block_kvcache( + seqlen_k, paged_kv_block_size, batch_size, nheads, d, device, dtype + ) + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + key_padding_mask = 
generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out_unpad = flash_attn_varlen_func( + q_unpad, + k_unpad if paged_kv_block_size is None else k_cache_paged, + v_unpad if paged_kv_block_size is None else v_cache_paged, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 0.0, + causal=causal, + window_size=window_size, + block_table=block_table, + ) + out = output_pad_fn(out_unpad) + out_ref, attn_ref = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + None, + 0.0, + None, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + None, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + test_backward = block_table is None + if test_backward: + ( + dq_unpad, + dk_unpad, + dv_unpad, + ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g) + dq = dq_pad_fn(dq_unpad) + dk = dk_pad_fn(dk_unpad) + dv = dk_pad_fn(dv_unpad) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
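+ # The 1e-5 absolute term below keeps the tolerance non-zero when out_pt happens to + # match out_ref exactly.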
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + if test_backward: + assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item() + 1e-5 + assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item() + 1e-5 + assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item() + 1e-5 + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [False]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (3, 1024), + (1, 339), + (64, 800), + (3, 799), + (64, 2048), + (16, 20000), + (16, 100000), + (128, 128), + (256, 256), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_splitkv( + seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, alibi, deterministic, dtype +): + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 1 + nheads = 12 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, _ = flash_attn_func( + q, + k, + v, + 0.0, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out_ref, attn_ref = attention_ref( + q, k, v, None, None, attn_bias, 0.0, None, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( 
+ dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + mult = 2 if not alibi else 8 + assert (dq - dq_ref).abs().max().item() <= mult * (dq_pt - dq_ref).abs().max().item() + 2e-4 + assert (dk - dk_ref).abs().max().item() <= mult * (dk_pt - dk_ref).abs().max().item() + 2e-4 + assert (dv - dv_ref).abs().max().item() <= mult * (dv_pt - dv_ref).abs().max().item() + 2e-4 + + +# @pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("num_splits", [1, 0]) +# @pytest.mark.parametrize("num_splits", [1]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# @pytest.mark.parametrize("mha_type", ["mha"]) +@pytest.mark.parametrize("new_kv", [False, True]) +# @pytest.mark.parametrize("new_kv", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [False]) +@pytest.mark.parametrize("seqlen_new_eq_seqlen_q", [True, False]) +# @pytest.mark.parametrize("seqlen_new_eq_seqlen_q", [True]) +@pytest.mark.parametrize("rotary_interleaved", [False, True]) +# @pytest.mark.parametrize("rotary_interleaved", [False]) +@pytest.mark.parametrize("rotary_fraction", [0.0, 0.5, 1.0]) +# @pytest.mark.parametrize("rotary_fraction", [0.0]) +@pytest.mark.parametrize("paged_kv_block_size", [None, 256]) +# @pytest.mark.parametrize("paged_kv_block_size", [256, 512]) +# @pytest.mark.parametrize("paged_kv_block_size", [None]) +@pytest.mark.parametrize("has_leftpad", [False, True]) +# @pytest.mark.parametrize("has_leftpad", [True]) +# @pytest.mark.parametrize("has_batch_idx", [False, True]) +@pytest.mark.parametrize("has_batch_idx", [False]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 128, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [128]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 128), + (1, 339), + (3, 1024), + (64, 800), + (64, 256), + (3, 799), + (64, 2048), + (16, 20000), + (1, 128 * 1024), + (16, 128 * 1024), + (128, 128), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_kvcache( + seqlen_q, + seqlen_k, + d, 
+ has_batch_idx, + has_leftpad, + paged_kv_block_size, + rotary_fraction, + rotary_interleaved, + seqlen_new_eq_seqlen_q, + causal, + local, + alibi, + new_kv, + mha_type, + num_splits, + dtype, +): + if seqlen_q > seqlen_k and new_kv: + pytest.skip() + if not new_kv and rotary_fraction > 0.0: + pytest.skip() + if has_batch_idx and paged_kv_block_size is not None: + pytest.skip() + if has_leftpad and paged_kv_block_size is not None: + pytest.skip() + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 2 + batch_size_cache = batch_size if not has_batch_idx else batch_size * 2 + nheads = 6 + # rotary_dim must be a multiple of 16, and must be <= d + rotary_dim = math.floor(int(rotary_fraction * d) / 16) * 16 + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype) + seqlen_new = seqlen_q if seqlen_new_eq_seqlen_q else torch.randint(1, seqlen_q + 1, (1,)).item() + if new_kv: + k = torch.randn(batch_size, seqlen_new, nheads_k, d, device=device, dtype=dtype) + v = torch.randn(batch_size, seqlen_new, nheads_k, d, device=device, dtype=dtype) + else: + k, v = None, None + if paged_kv_block_size is None: + k_cache = torch.randn(batch_size_cache, seqlen_k, nheads_k, d, device=device, dtype=dtype) + v_cache = torch.randn(batch_size_cache, seqlen_k, nheads_k, d, device=device, dtype=dtype) + block_table = None + else: + ( + k_cache, + v_cache, + block_table, + k_cache_paged, + v_cache_paged, + num_blocks, + ) = _generate_block_kvcache( + seqlen_k, paged_kv_block_size, batch_size, nheads_k, d, device, dtype + ) + cache_seqlens = torch.randint( + 0 if new_kv else 1, + # If we don't use seqlen_q in the case of causal and rotary, cos/sin won't be long enough + ( + (seqlen_k - (seqlen_q if (causal or local) and rotary_dim > 1 else seqlen_new) + 1) + if new_kv + else (seqlen_k + 1) + ), + (batch_size,), + dtype=torch.int32, + device=device, + ) + if has_leftpad: + cache_leftpad = torch.cat([torch.randint(0, cache_seqlens[i].item(), (1,), dtype=torch.int32, device=device) + if cache_seqlens[i].item() > 0 else torch.zeros(1, dtype=torch.int32, device=device) + for i in range(batch_size)]) + else: + cache_leftpad = None + arange = rearrange(torch.arange(seqlen_k, device=device), "s -> 1 s") + cache_seqlens_expanded = rearrange(cache_seqlens, "b -> b 1") + key_padding_mask = arange < cache_seqlens_expanded + (seqlen_new if new_kv else 0) + if has_leftpad: + key_padding_mask = torch.logical_and( + key_padding_mask, arange >= cache_leftpad.unsqueeze(-1).expand(-1, seqlen_k) + ) + if has_batch_idx: + cache_batch_idx = torch.randperm(batch_size_cache, dtype=torch.int32, device=device)[ + :batch_size + ] + else: + cache_batch_idx = None + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen_q, seqlen_k, None, key_padding_mask, causal=causal, key_leftpad=cache_leftpad + ) + else: + alibi_slopes, attn_bias = None, None + # cache_seqlens = torch.tensor([64], dtype=torch.int32, device=device) + if rotary_dim > 0: + angle = ( + torch.rand( + seqlen_k if paged_kv_block_size is None else num_blocks * paged_kv_block_size, + rotary_dim // 2, + device=device, + ) + * 2 + * math.pi + ) + cos = torch.cos(angle).to(dtype=dtype) + sin = torch.sin(angle).to(dtype=dtype) + if causal or local: + 
q_ro = apply_rotary_emb( + q, cos, sin, seqlen_offsets=cache_seqlens, interleaved=rotary_interleaved + ) + else: + q_ro = rearrange( + apply_rotary_emb( + rearrange(q, "b s h d -> b 1 (s h) d"), + cos, + sin, + seqlen_offsets=cache_seqlens, + interleaved=rotary_interleaved, + ), + "b 1 (s h) d -> b s h d", + s=seqlen_q, + ) + # q_ro = q + k_ro = apply_rotary_emb( + k, cos, sin, seqlen_offsets=cache_seqlens, interleaved=rotary_interleaved + ) + else: + cos, sin = None, None + q_ro, k_ro = q, k + # k_cache[:, 64:] = -1 + k_cache_ref = ( + k_cache if not has_batch_idx else k_cache[cache_batch_idx.to(dtype=torch.long)] + ).clone() + v_cache_ref = ( + v_cache if not has_batch_idx else v_cache[cache_batch_idx.to(dtype=torch.long)] + ).clone() + if new_kv: + update_mask = torch.logical_and( + cache_seqlens_expanded <= arange, arange < cache_seqlens_expanded + seqlen_new + ) + k_cache_ref[update_mask] = rearrange(k_ro, "b s ... -> (b s) ...") + v_cache_ref[update_mask] = rearrange(v, "b s ... -> (b s) ...") + k_cache_rep = repeat(k_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_cache_rep = repeat(v_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k) + out = flash_attn_with_kvcache( + q, + k_cache if paged_kv_block_size is None else k_cache_paged, + v_cache if paged_kv_block_size is None else v_cache_paged, + k, + v, + rotary_cos=cos, + rotary_sin=sin, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_batch_idx, + cache_leftpad=cache_leftpad, + block_table=block_table, + causal=causal, + window_size=window_size, + rotary_interleaved=rotary_interleaved, + alibi_slopes=alibi_slopes, + num_splits=num_splits, + ) + # out = flash_attn_with_kvcache( + # q, k_cache, v_cache, cache_seqlens=cache_seqlens, causal=causal, window_size=window_size + # ) + # out = flash_attn_with_kvcache(q, k_cache, v_cache, causal=causal, window_size=window_size) + # qk = torch.einsum("bqhd,bkhd->bhqk", q, k_cache_ref) + # m = qk.amax(-1, keepdim=True) + # s_tmp = torch.exp((qk - m) / math.sqrt(d)) + # o1 = torch.einsum('bhst,bthd->bshd', s_tmp, v_cache_ref) + # lse_ref = torch.logsumexp(qk / math.sqrt(d), -1) + # probs = torch.softmax(qk, dim=-1) + out_ref, _ = attention_ref( + q_ro, + k_cache_rep, + v_cache_rep, + None, + key_padding_mask, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + key_leftpad=cache_leftpad, + ) + out_pt, _ = attention_ref( + q_ro, + k_cache_rep, + v_cache_rep, + None, + key_padding_mask, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + key_leftpad=cache_leftpad, + ) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + if new_kv: + if paged_kv_block_size is None: + k_cache_select = ( + k_cache if not has_batch_idx else k_cache[cache_batch_idx.to(dtype=torch.long)] + ) + v_cache_select = ( + v_cache if not has_batch_idx else v_cache[cache_batch_idx.to(dtype=torch.long)] + ) + else: + k_cache_select = rearrange( + k_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... 
-> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + v_cache_select = rearrange( + v_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... -> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + assert torch.allclose(k_cache_select, k_cache_ref, rtol=1e-3, atol=1e-3) + assert torch.equal(v_cache_select, v_cache_ref) + mult = 3 if not alibi else 5 + assert (out - out_ref).abs().max().item() <= mult * (out_pt - out_ref).abs().max().item() + 1e-5 + + +def _generate_block_kvcache(seqlen_k, paged_kv_block_size, batch_size, nheads_k, d, device, dtype): + num_blocks = math.ceil(seqlen_k / paged_kv_block_size) * batch_size * 3 + k_cache_paged = torch.randn( + num_blocks, paged_kv_block_size, nheads_k, d, device=device, dtype=dtype + ) + v_cache_paged = torch.randn( + num_blocks, paged_kv_block_size, nheads_k, d, device=device, dtype=dtype + ) + block_table = rearrange( + torch.randperm(num_blocks, dtype=torch.int32, device=device), + "(b nblocks) -> b nblocks", + b=batch_size, + ) + k_cache = rearrange( + # pytorch 1.12 doesn't have indexing with int32 + k_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... -> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + v_cache = rearrange( + v_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... -> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + return k_cache, v_cache, block_table, k_cache_paged, v_cache_paged, num_blocks + + +# @pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 56, 64, 80, 96, 128]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [128]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (239, 1), + (3, 799), + (799, 3), + (1024, 128), + (97, 97), + (128, 128), + (200, 200), + (256, 256), + (257, 257), + (384, 384), + (512, 512), + (768, 768), + (1024, 1024), + ], +) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +def test_flash_attn_race_condition(seqlen_q, seqlen_k, d, dropout_p, causal, dtype): + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 60 # Sometimes we need large batch size for the race conditions to trigger + nheads = 4 + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + torch.random.manual_seed(42) + out0, lse0, _ = flash_attn_func(q, k, v, dropout_p, causal=causal, return_attn_probs=True) + g = torch.randn_like(out0) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + ( + dq0, + dk0, + dv0, + ) = torch.autograd.grad(out0, (q, k, v), g) + # Numerical error if we just do any arithmetic on dq + dq_atol = 2 * ((dq0 + 0.3 - 0.3) - dq0).abs().max().item() + + for i in range(250): + torch.random.manual_seed(42) + out, lse, _ = flash_attn_func(q, k, v, dropout_p, causal=causal, return_attn_probs=True) + assert torch.equal(out, out0) + assert 
torch.equal(lse, lse0) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + dq_equal = torch.allclose(dq, dq0, atol=dq_atol) + if not dq_equal: + print(f"Iter {i}, {dq_atol = }, dQ max diff: {(dq - dq0).abs().max().item()}") + assert torch.equal(dv, dv0) + assert torch.equal(dk, dk0) + assert dq_equal + + +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [16, 32, 64]) +# @pytest.mark.parametrize('d', [16]) +@pytest.mark.parametrize("seqlen", [1, 2, 5, 17, 128]) +# @pytest.mark.parametrize('seqlen', [2]) +def test_flash_attn_bwd_overflow(seqlen, d, causal, dtype): + """We previously had a bug where not masking elements beyond seqlen_k caused NaN in dQ, + in the case where seqlen % 128 != 0. + """ + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 2 + nheads = 5 + q = torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 5 + k, v = [ + torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 3 + for _ in range(2) + ] + q.requires_grad_(True) + k.requires_grad_(True) + v.requires_grad_(True) + out = flash_attn_func(q, k, v, causal=causal) + g = torch.randn_like(out) + out.backward(g) + q_pt = q.detach().clone().requires_grad_(True) + k_pt = k.detach().clone().requires_grad_(True) + v_pt = v.detach().clone().requires_grad_(True) + out_pt, _ = attention_ref(q_pt, k_pt, v_pt, causal=causal, upcast=False, reorder_ops=True) + out_pt.backward(g) + q_ref = q.detach().clone().requires_grad_(True) + k_ref = k.detach().clone().requires_grad_(True) + v_ref = v.detach().clone().requires_grad_(True) + out_ref, attn_ref = attention_ref(q_ref, k_ref, v_ref, causal=causal) + out_ref.backward(g) + print(f"dQ max diff: {(q.grad - q_ref.grad).abs().max().item()}") + print(f"dK max diff: {(k.grad - k_ref.grad).abs().max().item()}") + print(f"dV max diff: {(v.grad - v_ref.grad).abs().max().item()}") + print(f"dQ Pytorch max diff: {(q_pt.grad - q_ref.grad).abs().max().item()}") + print(f"dK Pytorch max diff: {(k_pt.grad - k_ref.grad).abs().max().item()}") + print(f"dV Pytorch max diff: {(v_pt.grad - v_ref.grad).abs().max().item()}") + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + assert (q.grad - q_ref.grad).abs().max().item() <= 5 * ( + q_pt.grad - q_ref.grad + ).abs().max().item() + 1e-3 + assert (k.grad - k_ref.grad).abs().max().item() <= 5 * ( + k_pt.grad - k_ref.grad + ).abs().max().item() + 1e-3 + assert (v.grad - v_ref.grad).abs().max().item() <= 5 * ( + v_pt.grad - v_ref.grad + ).abs().max().item() + 1e-3 + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [64, 128]) +# @pytest.mark.parametrize('d', [64]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 256]) +# @pytest.mark.parametrize('seqlen', [128]) +def test_flash_attn_bwd_transpose(seqlen, d, causal, dtype): + """We previously had a bug where we were using the wrong strides of dout, which shows up + when dout is not contiguous. 
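+ The output is rearranged to (seqlen, batch, ...) and the gradient g is taken from a + stride-2 slice, so the dout reaching the backward pass is non-contiguous.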
+ """ + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 5 + nheads = 2 + q, k, v = [ + torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda", requires_grad=True) + for _ in range(3) + ] + out = rearrange(flash_attn_func(q, k, v, causal=causal), "b s ... -> s b ...") + # So g is not contiguous + g = torch.randn(seqlen, 2 * batch_size, nheads, d, dtype=dtype, device="cuda")[:, ::2] + out.backward(g) + q_pt = q.detach().clone().requires_grad_(True) + k_pt = k.detach().clone().requires_grad_(True) + v_pt = v.detach().clone().requires_grad_(True) + out_pt, attn_pt = attention_ref(q_pt, k_pt, v_pt, causal=causal, upcast=False, reorder_ops=True) + out_pt = rearrange(out_pt, "b s ... -> s b ...") + out_pt.backward(g) + q_ref = q.detach().clone().requires_grad_(True) + k_ref = k.detach().clone().requires_grad_(True) + v_ref = v.detach().clone().requires_grad_(True) + out_ref, attn_ref = attention_ref(q_ref, k_ref, v_ref, causal=causal) + out_ref = rearrange(out_ref, "b s ... -> s b ...") + out_ref.backward(g) + print(f"dQ max diff: {(q.grad - q_ref.grad).abs().max().item()}") + print(f"dK max diff: {(k.grad - k_ref.grad).abs().max().item()}") + print(f"dV max diff: {(v.grad - v_ref.grad).abs().max().item()}") + print(f"dQ Pytorch max diff: {(q_pt.grad - q_ref.grad).abs().max().item()}") + print(f"dK Pytorch max diff: {(k_pt.grad - k_ref.grad).abs().max().item()}") + print(f"dV Pytorch max diff: {(v_pt.grad - v_ref.grad).abs().max().item()}") + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + assert (q.grad - q_ref.grad).abs().max().item() <= 2 * ( + q_pt.grad - q_ref.grad + ).abs().max().item() + assert (k.grad - k_ref.grad).abs().max().item() <= 2 * ( + k_pt.grad - k_ref.grad + ).abs().max().item() + assert (v.grad - v_ref.grad).abs().max().item() <= 2 * ( + v_pt.grad - v_ref.grad + ).abs().max().item() + + +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [16, 32, 64]) +# @pytest.mark.parametrize('d', [16]) +def test_flash_attn_bwd_varlen_overflow(d, causal, dtype): + """We previously had a bug where not masking elements beyond seqlen_k caused NaN in dQ, + in the case where seqlen % 128 != 0 or varlen. 
+ """ + device = "cuda" + # set seed + torch.random.manual_seed(0) + nheads = 5 + q_cuseqlen = torch.tensor([0, 76, 110, 256], device=device, dtype=torch.int32) + k_cuseqlen = torch.tensor([0, 1, 2, 3], device=device, dtype=torch.int32) + Mq = 256 + Mk = 3 + + q = torch.randn([Mq, nheads, d], dtype=dtype, device=device) * 3 + k, v = [torch.randn([Mk, nheads, d], dtype=dtype, device=device) * 3 for _ in range(2)] + q.requires_grad_(True) + k.requires_grad_(True) + v.requires_grad_(True) + + out = flash_attn_varlen_func(q, k, v, q_cuseqlen, k_cuseqlen, Mq, Mk, causal=causal) + g = torch.randn_like(out) + out.backward(g) + + assert not q.grad.isnan().any() + assert not k.grad.isnan().any() + assert not v.grad.isnan().any() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [False]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + out = flash_attn_func(q, k, v, 0.0, causal=causal, window_size=window_size, deterministic=True) + + g = torch.randn_like(out) + dq0, dk0, dv0 = torch.autograd.grad(out, (q, k, v), g, retain_graph=True) + for _ in range(50): + dq, dk, dv = torch.autograd.grad(out, (q, k, v), g, retain_graph=True) + assert torch.equal(dv, dv0) + assert torch.equal(dk, dk0) + assert torch.equal(dq, dq0) + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 
80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [True]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# @pytest.mark.parametrize("seqlen_q,seqlen_k", [(256, 128)]) +def test_flash_attn_varlen_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 2 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out = flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 0.0, + causal=causal, + window_size=window_size, + deterministic=True, + ) + + g = torch.randn_like(out) + dq0, dk0, dv0 = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g, retain_graph=True) + for _ in range(50): + dq, dk, dv = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g, retain_graph=True) + assert torch.equal(dv, dv0) + assert torch.equal(dk, dk0) + assert torch.equal(dq, dq0) diff --git a/test_flash_attn_ck.py b/test_flash_attn_ck.py new file mode 100644 index 0000000000000000000000000000000000000000..fbcb51cefee1055ed4b7a1d3991489d12de4a6e5 --- /dev/null +++ b/test_flash_attn_ck.py @@ -0,0 +1,754 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn import ( + flash_attn_func, + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, +) + +from test_flash_attn import ( + attn_bias_from_alibi_slopes, + convert_flash_attn_S_to_softmax, + generate_qkv, + generate_random_padding_mask, + attention_ref, + attention_kvpacked_ref, + attention_qkvpacked_ref, +) + +def is_bwd_hdim_supported(d): + return d <= 128 and d % 2 == 0 + + +def ck_randval_to_dropout_mask(randval, p): + # If p = 0.3, randval in 255 * (0.7, 1.0] will be dropout + # randval in 255 * [0, 0.7] will be kept + # If return dropout_mask >=0, value will be kept + return torch.floor(255.0 * (1 - p) - randval) + + +def pad_rearrange_dropout_mask_hts_to_bhss(S_dmask, cu_seqlens_q, seqlen_q_rounded, seqlen_k_rounded): + """ pad + rearrange [nheads, total_q, max_seqlen_k] into [b, nheads, 
seqlen_q_rounded, seqlen_k_rounded] + Arguments: + S_dmask: (nheads, total_q, max_seqlen_k) + cu_seqlens_q: (b + 1) + Output: + S_dmask: (b, nheads, seqlen_q_rounded, seqlen_k_rounded) + """ + batch_size = cu_seqlens_q.numel() - 1 + seqlens_q = torch.roll(cu_seqlens_q, shifts = -1) - cu_seqlens_q + seqlens_q = seqlens_q[0:batch_size].tolist() + S_dmask = torch.split(S_dmask, seqlens_q, dim=1) + # [(nheads, seqlen_q0, max_seqlen_k), (nheads, seqlen_q1, max_seqlen_k), ..., (nheads, seqlen_qb, max_seqlen_k)] + masks = () + for mask in S_dmask: + # (nheads, seqlen_qi, max_seqlen_k) -> (nheads, seqlen_q_rounded, seqlen_k_rounded) + mask = F.pad(mask, (0, seqlen_k_rounded - mask.shape[2], 0, seqlen_q_rounded - mask.shape[1], 0, 0)).unsqueeze(1) + masks = masks + (mask, ) + S_dmask = torch.cat(masks, dim=1) + + S_dmask = S_dmask.transpose(0, 1) + return S_dmask + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 384, 768, 1024, 1025, 2048]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): + if d > 256: + pytest.skip() + + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, S_dmask = flash_attn_qkvpacked_func( + qkv, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + # CK does not return P. Hence, we don't test the attn here. + else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, None, attn_bias, dropout_p, dropout_mask, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + (dqkv,) = torch.autograd.grad(out, qkv, g) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dqkv - dqkv_ref).abs().max().item() <= 10 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 257, 384, 512, 768, 1025, 2048]) +@pytest.mark.parametrize("dropout_p", [0, 0.17]) +def test_flash_attn_varlen_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): + if d > 256: + pytest.skip() + + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 5 + nheads = 6 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + + key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen, seqlen, key_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn = generate_qkv( + *qkv.unbind(dim=2), key_padding_mask, key_padding_mask, qkvpacked=True + ) + + out_unpad, sm_lse, S_dmask = flash_attn_varlen_qkvpacked_func( + qkv_unpad, + cu_seqlens, + max_seqlen, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask = pad_rearrange_dropout_mask_hts_to_bhss(S_dmask, cu_seqlens, seqlen, seqlen) + + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + key_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + + dropout_mask = S_dmask_converted >= 0 + # CK does not return P. Hence, we don't test the attn here. 
+ else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + (dqkv_unpad,) = torch.autograd.grad(out, qkv_unpad, g) + dqkv = dqkv_pad_fn(dqkv_unpad) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dqkv - dqkv_ref).abs().max().item() <= 10 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +@pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 96, 111, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +def test_flash_attn_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked +): + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, 
requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + out, lse, S_dmask = flash_attn_kvpacked_func( + q, + kv, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + out, lse, S_dmask = flash_attn_func( + q, + k, + v, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + # CK does not return P. Hence, we don't test the attn here. + else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
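+    # (out_pt above was computed with upcast=False and reorder_ops=True, i.e. in the same
+    # low precision as the kernel, so it sets the scale of acceptable rounding error.)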
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + if kvpacked: + ( + dq, + dkv, + ) = torch.autograd.grad(out, (q, kv), g) + dk, dv = dkv.unbind(2) + ( + dq_ref, + dkv_ref, + ) = torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dq - dq_ref).abs().max().item() <= 10 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 10 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 10 * (dv_pt - dv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +@pytest.mark.parametrize("deterministic", [False, True]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 147), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +def test_flash_attn_varlen_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked +): + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + 
key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen_q, seqlen_k, query_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + ( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + kv, + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) = generate_qkv(q, *kv.unbind(dim=2), query_padding_mask, key_padding_mask, kvpacked=True) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_kvpacked_func( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask = pad_rearrange_dropout_mask_hts_to_bhss(S_dmask, cu_seqlens_q, seqlen_q, seqlen_k) + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + # CK does not return P. Hence, we don't test the attn here. 
+ else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most 4 times the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + if kvpacked: + ( + dq_unpad, + dkv_unpad, + ) = torch.autograd.grad(out, (q_unpad, kv_unpad), g) + dk, dv = dkv_pad_fn(dkv_unpad).unbind(2) + ( + dq_ref, + dkv_ref, + ) = torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq_unpad, + dk_unpad, + dv_unpad, + ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g) + dk = dk_pad_fn(dk_unpad) + dv = dk_pad_fn(dv_unpad) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + dq = dq_pad_fn(dq_unpad) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dq - dq_ref).abs().max().item() <= 10 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 10 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 10 * (dv_pt - dv_ref).abs().max().item() diff --git a/test_fused_dense.py b/test_fused_dense.py new file mode 100644 index 0000000000000000000000000000000000000000..084dd5f4b8ee7c0c688f60409c644022e6c00a81 --- /dev/null +++ b/test_fused_dense.py @@ -0,0 +1,172 @@ +import math +from functools import partial + +import pytest +import torch 
+import torch.nn.functional as F +from einops import rearrange +from flash_attn.ops.fused_dense import FusedDense, FusedMLP + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("return_residual", [False, True]) +@pytest.mark.parametrize("has_bias", [True, False]) +@pytest.mark.parametrize("out_features", [1024, 4096]) +@pytest.mark.parametrize("in_features", [1024, 4096]) +def test_fused_linear_bias(in_features, out_features, has_bias, return_residual, dtype): + device = "cuda" + rtol, atol = (3e-3, 1e-2) if dtype == torch.bfloat16 else (3e-3, 1e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x_pt = torch.randn( + batch_size, seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + x = x_pt.detach().clone().requires_grad_() + model_pt = torch.nn.Linear(in_features, out_features, bias=has_bias, device=device, dtype=dtype) + model = FusedDense( + in_features, + out_features, + bias=has_bias, + return_residual=return_residual, + device=device, + dtype=dtype, + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + if has_bias: + model.bias.copy_(model_pt.bias) + out_pt = model_pt(x_pt) + if not return_residual: + out = model(x) + else: + out, x_copy = model(x) + x_copy = ( + x_copy[..., :out_features] + if out_features < in_features + else F.pad(x_copy, (0, out_features - in_features)) + ) + x_pt_copy = ( + x_pt[..., :out_features] + if out_features < in_features + else F.pad(x_pt, (0, out_features - in_features)) + ) + # Just add some random function of the residual + out_pt = out_pt + F.gelu(x_pt_copy) + out = out + F.gelu(x_copy) + + # with torch.no_grad(): + # out_fl = F.linear(x_pt.float(), model.weight.float(), model.bias.float()).half() + assert torch.allclose(out, out_pt, rtol=rtol, atol=atol) + + # If we don't divide by batch_size, the gradient gets a bit too large. 
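+    # (Scaling the upstream gradient keeps the fp16/bf16 weight- and bias-gradient
+    # comparisons below within their widened tolerances.)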
+ g = torch.randn_like(out) / 32 + out_pt.backward(g) + out.backward(g) + assert torch.allclose(x.grad, x_pt.grad, rtol=rtol, atol=atol) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose(model.weight.grad, model_pt.weight.grad, rtol=rtol, atol=atol * 10) + if has_bias: + assert torch.allclose(model.bias.grad, model_pt.bias.grad, rtol=rtol, atol=atol * 5) + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("heuristic", ["auto", -1]) +# @pytest.mark.parametrize('heuristic', ['auto']) +@pytest.mark.parametrize("checkpoint_lvl", [0, 1, 2]) +# @pytest.mark.parametrize('checkpoint_lvl', [1]) +@pytest.mark.parametrize("return_residual", [False, True]) +# @pytest.mark.parametrize('return_residual', [False]) +@pytest.mark.parametrize("has_bias2", [True, False]) +@pytest.mark.parametrize("has_bias1", [True, False]) +# @pytest.mark.parametrize('has_bias2', [True]) +# @pytest.mark.parametrize('has_bias1', [True]) +@pytest.mark.parametrize("activation", ["gelu_approx", "relu"]) +# @pytest.mark.parametrize('activation', ['relu']) +@pytest.mark.parametrize("out_features", [1024, 4096]) +@pytest.mark.parametrize("in_features", [1024, 4096]) +# @pytest.mark.parametrize('out_features', [4096]) +# @pytest.mark.parametrize('in_features', [1024]) +def test_fused_mlp( + in_features, + out_features, + activation, + has_bias1, + has_bias2, + return_residual, + checkpoint_lvl, + heuristic, + dtype, +): + device = "cuda" + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 1e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x_pt = torch.randn( + batch_size, seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + x = x_pt.detach().clone().requires_grad_() + model_pt_fc1 = torch.nn.Linear( + in_features, out_features, bias=has_bias1, device=device, dtype=dtype + ) + model_pt_fc2 = torch.nn.Linear( + out_features, in_features, bias=has_bias2, device=device, dtype=dtype + ) + model = FusedMLP( + in_features, + out_features, + in_features, + activation=activation, + bias1=has_bias1, + bias2=has_bias2, + return_residual=return_residual, + checkpoint_lvl=checkpoint_lvl, + heuristic=heuristic, + device=device, + dtype=dtype, + ) + with torch.no_grad(): + model.fc1.weight.copy_(model_pt_fc1.weight) + if has_bias1: + model.fc1.bias.copy_(model_pt_fc1.bias) + model.fc2.weight.copy_(model_pt_fc2.weight) + if has_bias2: + model.fc2.bias.copy_(model_pt_fc2.bias) + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else partial(F.relu, inplace=True) + ) + out_pt = model_pt_fc2(activation_fn(model_pt_fc1(x_pt))) + if not return_residual: + out = model(x) + else: + out, x_copy = model(x) + # Just add some random function of the residual + out_pt = out_pt + F.gelu(x_pt) + out = out + F.gelu(x_copy) + assert torch.allclose(out, out_pt, rtol=rtol, atol=atol) + + # If we don't divide by batch_size, the gradient gets a bit too large. 
+ g = torch.randn_like(out) / 32 + out_pt.backward(g) + out.backward(g) + # The error for relu is higher still + if activation == "relu": + atol = 1e-1 if dtype == torch.bfloat16 else 5e-2 + assert torch.allclose(x.grad, x_pt.grad, rtol=rtol, atol=atol) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.fc1.weight.grad, model_pt_fc1.weight.grad, rtol=rtol, atol=atol * 10 + ) + if has_bias1: + assert torch.allclose(model.fc1.bias.grad, model_pt_fc1.bias.grad, rtol=rtol, atol=atol * 5) + assert torch.allclose( + model.fc2.weight.grad, model_pt_fc2.weight.grad, rtol=rtol, atol=atol * 10 + ) + if has_bias2: + assert torch.allclose(model.fc2.bias.grad, model_pt_fc2.bias.grad, rtol=rtol, atol=atol * 5) diff --git a/test_fused_dense_parallel.py b/test_fused_dense_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..df0e70bcae9d41fbd4af60f0a187709ef8032a31 --- /dev/null +++ b/test_fused_dense_parallel.py @@ -0,0 +1,237 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/ops/test_fused_dense_parallel.py + +import math + +import pytest +import torch +import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from flash_attn.ops.fused_dense import ColumnParallelLinear, FusedDense, FusedMLP, ParallelFusedMLP + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_bias", [True, False]) +# @pytest.mark.parametrize('has_bias', [False]) +@pytest.mark.parametrize("out_features", [1024]) +@pytest.mark.parametrize("in_features", [4096]) +def test_fused_linear_bias( + in_features, out_features, has_bias, sequence_parallel, world_size, dtype +): + assert out_features % world_size == 0 + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 512 + assert batch_size * seqlen % world_size == 0 + x_pt = torch.randn( + batch_size * seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt = torch.nn.Linear(in_features, out_features, bias=has_bias, device=device, dtype=dtype) + partition_out_features = out_features // world_size + model = ColumnParallelLinear( + in_features, + out_features, + parallel_state.get_tensor_model_parallel_group(), + bias=has_bias, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + with torch.no_grad(): + model.weight.copy_( + model_pt.weight[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + if has_bias: + model.bias.copy_( + 
model_pt.bias[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + + out = model(x) + out_pt = model_pt(x_pt) + assert torch.allclose( + out, + out_pt[:, rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol, + ) + + # If we don't divide by batch_size, the gradient gets a bit too large. + g = torch.randn_like(out_pt) / 32 + out_pt.backward(g) + out.backward(g[:, rank * partition_out_features : (rank + 1) * partition_out_features]) + parallel_state.destroy_model_parallel() + + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol, + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.weight.grad, + model_pt.weight.grad[rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol * 10, + ) + if has_bias: + assert torch.allclose( + model.bias.grad, + model_pt.bias.grad[rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol * 5, + ) + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_bias2", [True, False]) +# @pytest.mark.parametrize('has_bias2', [True]) +@pytest.mark.parametrize("out_features", [4096]) +@pytest.mark.parametrize("in_features", [1024]) +def test_fused_mlp(in_features, out_features, has_bias2, sequence_parallel, world_size, dtype): + assert out_features % world_size == 0 + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 512 + assert batch_size * seqlen % world_size == 0 + x_pt = torch.randn( + batch_size * seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. 
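+    # (g matches x_pt's shape because ParallelFusedMLP projects back to in_features; the
+    # full-size gradient is sliced per rank below when sequence_parallel is enabled.)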
+ g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt_fc1 = torch.nn.Linear(in_features, out_features, device=device, dtype=dtype) + model_pt_fc2 = torch.nn.Linear( + out_features, in_features, bias=has_bias2, device=device, dtype=dtype + ) + partition_out_features = out_features // world_size + partition_in_features = in_features // world_size + model = ParallelFusedMLP( + in_features, + out_features, + in_features, + process_group=parallel_state.get_tensor_model_parallel_group(), + bias2=has_bias2 and rank == 0, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + + with torch.no_grad(): + model.fc1.weight.copy_( + model_pt_fc1.weight[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + model.fc1.bias.copy_( + model_pt_fc1.bias[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + model.fc2.weight.copy_( + model_pt_fc2.weight[ + :, rank * partition_out_features : (rank + 1) * partition_out_features + ] + ) + if has_bias2 and rank == 0: + model.fc2.bias.copy_(model_pt_fc2.bias) + + out = model(x) + out_pt = model_pt_fc2(F.gelu(model_pt_fc1(x_pt), approximate="tanh")) + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol, + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.fc1.weight.grad, + model_pt_fc1.weight.grad[ + rank * partition_out_features : (rank + 1) * partition_out_features + ], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.fc1.bias.grad, + model_pt_fc1.bias.grad[rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.fc2.weight.grad, + model_pt_fc2.weight.grad[ + :, rank * partition_out_features : (rank + 1) * partition_out_features + ], + rtol=rtol, + atol=atol * 10, + ) + if has_bias2 and rank == 0: + assert torch.allclose(model.fc2.bias.grad, model_pt_fc2.bias.grad, rtol=rtol, atol=atol * 5) diff --git a/test_gpt.py b/test_gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..9822030051b19d7e2e688f31880a705f6046a198 --- /dev/null +++ b/test_gpt.py @@ -0,0 +1,478 @@ +import re + +import pytest +import torch +from einops import rearrange +from flash_attn.models.gpt import ( + GPTLMHeadModel, + remap_state_dict_hf_gpt2, + shard_state_dict_tp, + combine_state_dicts_tp, +) +from flash_attn.utils.generation import InferenceParams +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import GPT2Config, GPT2Tokenizer +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel as GPT2LMHeadModelHF + + +@pytest.mark.parametrize("model_name", ["gpt2", "gpt2-medium"]) +# @pytest.mark.parametrize('model_name', ["gpt2"]) +def test_gpt2_state_dict(model_name): + config = GPT2Config.from_pretrained(model_name) + 
pretrained_state_dict = remap_state_dict_hf_gpt2(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config) + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["gpt2", "gpt2-medium"]) +# @pytest.mark.parametrize('model_name', ["gpt2"]) +def test_gpt2_non_optimized(model_name): + """Check that our implementation of GPT2 (without any optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + config = GPT2Config.from_pretrained(model_name) + + model = GPTLMHeadModel.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).cuda() + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name).cuda().to(dtype=dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.transformer(input_ids) + out_hf = model_hf.transformer(input_ids).last_hidden_state + out_ref = model_ref.transformer(input_ids).last_hidden_state + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + logits = model(input_ids).logits + logits_hf = model_hf(input_ids).logits + logits_ref = model_ref(input_ids).logits + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["gpt2", "gpt2-medium"]) +# @pytest.mark.parametrize('model_name', ["gpt2"]) +def test_gpt2_optimized(model_name): + """Check that our implementation of GPT2 (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + config = GPT2Config.from_pretrained(model_name) + vocab_size_og = config.vocab_size + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 + + model = GPTLMHeadModel.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).cuda() + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name).cuda().to(dtype=dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + input_ids = torch.randint( + 0, vocab_size_og, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.transformer(input_ids) + out_hf = model_hf.transformer(input_ids).last_hidden_state + out_ref = model_ref.transformer(input_ids).last_hidden_state + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + logits = model(input_ids).logits[..., :vocab_size_og] + logits_hf = model_hf(input_ids).logits + logits_ref = model_ref(input_ids).logits + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize('optimized', [True]) +@pytest.mark.parametrize("rotary", [False, True]) +# @pytest.mark.parametrize('rotary', [False]) +@pytest.mark.parametrize("model_name", ["gpt2"]) +def test_gpt2_generation(model_name, rotary, optimized): + """Check that our implementation of GPT2 generation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. + """ + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + if rotary: + config.n_positions = 0 + config.rotary_emb_fraction = 0.5 + config.rotary_emb_base = 24000 + config.residual_in_fp32 = True + if optimized: + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + # if not rotary, we load the weight from HF but ignore the position embeddings. + # The model would be nonsense but it doesn't matter for the test. 
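+    # (Concretely: strict=not rotary below means that with rotary enabled the HF
+    # position-embedding weights are simply dropped; without rotary everything loads strictly.)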
+ model = GPTLMHeadModel.from_pretrained( + model_name, config, strict=not rotary, device=device, dtype=dtype + ) + model.eval() + + if not rotary: + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).to(device=device) + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name, torch_dtype=dtype).to( + device=device + ) + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 25 + # input_ids = torch.randint(0, 100, (2, 10), dtype=torch.long, device='cuda') + # max_length = input_ids.shape[1] + 40 + + # Slow generation for reference + sequences = [] + scores = [] + cur_input_ids = input_ids + with torch.inference_mode(): + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + for _ in range(input_ids.shape[1] + 1, max_length): + cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1) + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1) + scores = tuple(scores) + + out = model.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out.sequences) + print(tokenizer.batch_decode(out.sequences.tolist())) + if getattr(config, "use_flash_attn", False): + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out_cg.sequences) + assert torch.equal(torch.stack(out.scores, dim=1), torch.stack(out_cg.scores, dim=1)) + + if not rotary: + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + out_ref = model_ref.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + + print( + f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print( + f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print(tokenizer.batch_decode(out_ref.sequences.tolist())) + + assert torch.all(out.sequences == sequences) + assert torch.allclose( + torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol + ) + if not rotary: + assert torch.all(out.sequences == out_ref.sequences) + assert torch.all(out.sequences == out_hf.sequences) + + assert ( + torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() < 3 * ( + torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() + + +def get_logits(model, input_ids, max_length, teacher_outputs=None, **kwargs): + out = model.generate( + input_ids=input_ids, + max_length=max_length, + teacher_outputs=teacher_outputs, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + **kwargs, + ) + return torch.stack(out.scores, dim=1) + + +@pytest.mark.parametrize("seqlen,maxlen", [(10, 20), (30, 150), (3000, 3400), (14000, 
15000)]) +# @pytest.mark.parametrize('seqlen,maxlen', [(10, 20)]) +@pytest.mark.parametrize("rotary", [None, "interleaved", "contiguous"]) +# @pytest.mark.parametrize('rotary', [None]) +@pytest.mark.parametrize("model_name", ["gpt2"]) +def test_gpt2_generation_cg(model_name, rotary, seqlen, maxlen): + """Check that decoding with CUDA graph is the same as decoding without CUDA graph.""" + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + config.n_positions = 16 * 1024 + assert seqlen <= maxlen <= config.n_positions + if rotary is not None: + config.n_positions = 0 + config.rotary_emb_dim = 32 + config.rotary_emb_interleaved = rotary == "interleaved" + config.residual_in_fp32 = True + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 1 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + teacher_outputs = torch.randint( + 0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device + ) + + logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs) + logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True) + assert torch.equal(logits, logits_cg) + + # Try increasing batch size and seqlen, then decrease them to see if it's still correct + batch_size = 3 + maxlen += 30 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + teacher_outputs = torch.randint( + 0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device + ) + logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs) + logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True) + assert torch.equal(logits, logits_cg) + + batch_size = 2 + maxlen -= 35 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + teacher_outputs = torch.randint( + 0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device + ) + logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs) + logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True) + assert torch.equal(logits, logits_cg) + + +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize("optimized", [False]) +@pytest.mark.parametrize("model_name", ["gpt2"]) +def test_gpt2_multiple_token_generation(model_name, optimized): + """Generation when we pass in multiple tokens at a time, not just one.""" + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + config.residual_in_fp32 = True + if optimized: + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + input_ids = torch.randint(0, config.vocab_size, (1, 20), dtype=torch.long, device=device) + # Reference logits + logits_ref = model(input_ids).logits + + # Run 10 tokens, then pass in another 4, then another 6, to see if we get the same logits + inference_params = InferenceParams(max_seqlen=20, max_batch_size=1) + logits_10 = 
model(input_ids[:, :10], inference_params=inference_params).logits + inference_params.seqlen_offset += 10 + position_ids = torch.arange(10, 14, dtype=torch.long, device=device) + logits_1014 = model( + input_ids[:, 10:14], position_ids=position_ids, inference_params=inference_params + ).logits + inference_params.seqlen_offset += 4 + position_ids = torch.arange(14, 20, dtype=torch.long, device=device) + logits_1420 = model( + input_ids[:, 14:20], position_ids=position_ids, inference_params=inference_params + ).logits + logits = torch.cat([logits_10, logits_1014, logits_1420], dim=1) + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + assert torch.allclose(logits, logits_ref, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("cg", [False, True]) +# @pytest.mark.parametrize("cg", [True]) +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize("optimized", [True]) +# @pytest.mark.parametrize("model_name", ["gpt2-medium"]) +@pytest.mark.parametrize("model_name", ["gpt2-xl"]) +def test_gpt2_speculative_decoding(model_name, optimized, cg): + if cg and not optimized: + pytest.skip() # CG requires use_flash_attn + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + config.residual_in_fp32 = True + if optimized: + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config_draft = GPT2Config.from_pretrained("gpt2") + config_draft.residual_in_fp32 = True + if optimized: + config_draft.use_flash_attn = True + config_draft.fused_bias_fc = True + config_draft.fused_mlp = True + config_draft.fused_dropout_add_ln = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + model_draft = GPTLMHeadModel.from_pretrained("gpt2", config_draft, device=device, dtype=dtype) + model_draft.eval() + + torch.manual_seed(0) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 100 + + from flash_attn.utils.generation import decode_speculative + + torch.manual_seed(42) + print(f"Speculative decoding, {optimized = }") + out = decode_speculative( + input_ids, + model, + model_draft, + max_length=max_length, + top_k=5, + cg=cg, + speculative_lookahead=4, + enable_timing=True, + # debug=True, + ) + print(tokenizer.batch_decode(out.sequences)) + print(f"Without speculative decoding, {cg = }") + out_og = model.generate( + input_ids, + max_length=max_length, + top_k=5, + cg=cg, + enable_timing=True, + return_dict_in_generate=True, + ) + print(tokenizer.batch_decode(out_og.sequences)) + + +@pytest.mark.parametrize( + "n_heads_q_kv", + [ + (8, 8), # Regular attention + (8, 4), # GQA + (8, 2), # MQA + ], +) +def test_gpt2_shard_unshard(n_heads_q_kv): + world_size = 2 + + config = GPT2Config.from_pretrained("gpt2") + config.vocab_size = 1024 + config.n_head, config.n_head_kv = n_heads_q_kv + model = GPTLMHeadModel(config, device="cuda", dtype=torch.float16) + state_dict = model.state_dict() + shards = [ + # NOTE: Shallow copy as `state_dict` is modified in-place + shard_state_dict_tp(dict(state_dict), config, world_size, rank) + for rank in range(world_size) + ] + state_dict2 = combine_state_dicts_tp(shards, config) + assert state_dict2.keys() == state_dict.keys() + for k in state_dict.keys(): + ref 
= state_dict[k]
+        new = state_dict2[k]
+        assert torch.allclose(ref, new, atol=0.0, rtol=0.0)
diff --git a/test_gpt_generation_parallel.py b/test_gpt_generation_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcf2bf513b39f75911344fd331977afa20d61bc1
--- /dev/null
+++ b/test_gpt_generation_parallel.py
@@ -0,0 +1,172 @@
+# Run test with:
+# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/models/test_gpt_generation_parallel.py -k "parallel"
+import os
+import re
+
+import pytest
+import torch
+from einops import rearrange
+from flash_attn.models.gpt import GPTLMHeadModel, remap_state_dict_hf_gpt2
+from flash_attn.utils.distributed import all_gather_raw
+from flash_attn.utils.pretrained import state_dict_from_pretrained
+from transformers import GPT2Config, GPT2Tokenizer
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel as GPT2LMHeadModelHF
+
+
+# @pytest.mark.parametrize('world_size', [1, 2, 4, 8])
+@pytest.mark.parametrize("world_size", [2])
+@pytest.mark.parametrize("rotary", [False, True])
+# @pytest.mark.parametrize("rotary", [False])
+@pytest.mark.parametrize("model_name", ["gpt2"])
+def test_tensor_parallel(model_name, rotary, world_size):
+    """Check that our implementation of GPT2 generation matches the HF implementation:
+    the scores in fp16 should be around the same as the HF scores in fp16, when compared to
+    the HF scores in fp32.
+    """
+    dtype = torch.float16
+    rtol, atol = 3e-3, 3e-1
+    config = GPT2Config.from_pretrained(model_name)
+    if rotary:
+        config.n_positions = 0
+        config.rotary_emb_dim = 64
+    config.residual_in_fp32 = True
+    config.use_flash_attn = True
+    config.fused_bias_fc = True
+    config.fused_mlp = True
+    config.fused_dropout_add_ln = True
+    config.pad_vocab_size_multiple = 8 * world_size
+    config.sequence_parallel = False  # Need to set this to False for generation
+
+    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
+    if not torch.distributed.is_initialized():
+        torch.distributed.init_process_group(backend="nccl", init_method="env://")
+    device = f"cuda:{torch.distributed.get_rank()}"
+    assert world_size <= torch.distributed.get_world_size()
+    # Need this, otherwise when we capture the graph the process for GPU 1 would run on both
+    # GPU 0 and GPU 1 and things would hang.
+    torch.cuda.set_device(device)
+
+    from apex.transformer import parallel_state
+
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size)
+    rank = parallel_state.get_tensor_model_parallel_rank()
+    process_group = parallel_state.get_tensor_model_parallel_group()
+
+    # If rotary, we still load the HF weights but ignore the position embeddings.
+    # The model would be nonsense but it doesn't matter for the test.
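+    # (world_size/rank below tell from_pretrained to load only this rank's shard of the
+    # pretrained weights for the tensor-parallel model.)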
+ model = GPTLMHeadModel.from_pretrained( + model_name, + config, + strict=not rotary, + device=device, + dtype=dtype, + process_group=process_group, + world_size=world_size, + rank=rank, + ) + model.eval() + + if not rotary: + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).to(device=device) + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name).to(device=device, dtype=dtype) + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + input_ids = tokenizer("Hello, my dog is cute and ", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 30 + # input_ids = torch.randint(0, 100, (1, 10), dtype=torch.long, device='cuda') + # max_length = input_ids.shape[1] + 40 + + # Slow generation for reference + sequences = [] + scores = [] + cur_input_ids = input_ids + with torch.inference_mode(): + logits, _ = all_gather_raw(model(cur_input_ids).logits[:, -1], process_group) + logits = rearrange(logits, "(n b) d -> b (n d)", b=input_ids.shape[0])[ + ..., : config.vocab_size + ] + scores.append(logits) + sequences.append(scores[-1].argmax(dim=-1)) + for _ in range(input_ids.shape[1] + 1, max_length): + cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1) + logits, _ = all_gather_raw(model(cur_input_ids).logits[:, -1], process_group) + logits = rearrange(logits, "(n b) d -> b (n d)", b=input_ids.shape[0])[ + ..., : config.vocab_size + ] + scores.append(logits) + sequences.append(scores[-1].argmax(dim=-1)) + sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1) + scores = tuple(scores) + print(sequences) + + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out.sequences) + if getattr(config, "use_flash_attn", False): + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out_cg.sequences) + + parallel_state.destroy_model_parallel() + + if not rotary: + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + out_ref = model_ref.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + + print( + f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print( + f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + + assert torch.all(out.sequences == sequences) + assert torch.allclose( + torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol + ) + assert torch.equal(torch.stack(out.scores, dim=1), torch.stack(out_cg.scores, dim=1)) + if not rotary: + assert torch.all(out.sequences == out_ref.sequences) + assert torch.all(out.sequences == out_hf.sequences) + + assert ( + torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() < 3 * ( + torch.stack(out_hf.scores, 1) - 
torch.stack(out_ref.scores, 1) + ).abs().max().item() diff --git a/test_gpt_neox.py b/test_gpt_neox.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae8aa9a24da90070aabd41b85d6bbfc6ee1aa9c --- /dev/null +++ b/test_gpt_neox.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023, Tri Dao. + +import time + +import pytest +import torch +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config, remap_state_dict_hf_gpt_neox +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoTokenizer, GPTNeoXConfig +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-neox-20b"]) +def test_gptj_state_dict(model_name): + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_gpt_neox( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + "model_name", + [ + "EleutherAI/pythia-1b", + "EleutherAI/pythia-2.8b", + "EleutherAI/gpt-neox-20b", + "togethercomputer/RedPajama-INCITE-7B-Base", + ], +) +def test_gpt_neox_optimized(model_name): + """Check that our implementation of GPT-NeoX (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = config.activation_function in [ + "gelu_fast", + "gelu_new", + "gelu_approx", + "gelu_pytorch_tanh", + ] + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Need at least 2 GPUs, otherwise we'll OOM for the 20B model + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTNeoXForCausalLM.from_pretrained(model_name, device_map="auto") + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.gpt_neox(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = GPTNeoXForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.gpt_neox(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean 
diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + assert (out - out_ref).abs().mean().item() < 2 * (out_hf - out_ref).abs().mean().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + assert (logits - logits_ref).abs().mean().item() < 2 * ( + logits_hf - logits_ref + ).abs().mean().item() diff --git a/test_gpt_parallel.py b/test_gpt_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..1674d2a60fca9a16dcf8f0e919c7adae46e7475a --- /dev/null +++ b/test_gpt_parallel.py @@ -0,0 +1,236 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/models/test_gpt_parallel.py + +import math + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from apex.transformer import parallel_state +from einops import rearrange +from flash_attn.losses.cross_entropy import CrossEntropyLoss +from flash_attn.models.gpt import GPTLMHeadModel, shard_state_dict_tp +from flash_attn.utils.distributed import allreduce_sequence_parallel_grad +from transformers import GPT2Config + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_pos_emb", [True, False]) +# @pytest.mark.parametrize('has_pos_emb', [True]) +@pytest.mark.parametrize("dim", [1024]) +def test_gpt_parallel(dim, has_pos_emb, sequence_parallel, world_size, dtype): + head_dim = 64 + assert dim % head_dim == 0 + num_heads = dim // head_dim + assert num_heads % world_size == 0 + vocab_size = 50264 + assert vocab_size % world_size == 0 + num_layers = 2 + rtol, atol = (3e-3, 1e-1) if dtype == torch.bfloat16 else (3e-3, 1e-2) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + input_ids = torch.randint(0, vocab_size, (batch_size, seqlen + 1), device=device) + + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. 
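+    # (g has one entry per token: the losses below use reduction="none", so loss.backward(g)
+    # weights each of the batch_size * seqlen per-token losses.)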
+ g = torch.randn(batch_size * seqlen, device=device) + + config = GPT2Config( + n_embd=dim, + n_head=num_heads, + n_layer=num_layers, + n_positions=seqlen if has_pos_emb else 0, + vocab_size=50257, + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + scale_attn_by_inverse_layer_idx=True, + use_flash_attn=True, + fused_mlp=True, + fused_bias_fc=True, + fused_dropout_add_ln=True, + residual_in_fp32=True, + rotary_emb_fraction=0.0 if has_pos_emb else 0.5, + pad_vocab_size_multiple=8 * world_size, + sequence_parallel=sequence_parallel, + ) + config.vocab_size = math.ceil(config.vocab_size / (8 * world_size)) * (8 * world_size) + model_pt = GPTLMHeadModel(config, device=device) + + def init_layer_norm(module): + if isinstance(module, nn.LayerNorm): + nn.init.normal_(module.weight) + nn.init.normal_(module.bias) + + model_pt.apply(init_layer_norm) + + model = GPTLMHeadModel(config, process_group=process_group, device=device) + total_nparams = sum(p.numel() for p in model_pt.parameters()) + sharded_nparams = sum(p.numel() for p in model.parameters()) + sharded_nparams_all = torch.empty(world_size, dtype=torch.long, device=device) + torch.distributed.all_gather_into_tensor( + sharded_nparams_all, torch.tensor([sharded_nparams], device=device), group=process_group + ) + shared_nparams = sum( + p.numel() for p in model.parameters() if getattr(p, "_shared_params", False) + ) + shared_nparams_all = torch.empty(world_size, dtype=torch.long, device=device) + torch.distributed.all_gather_into_tensor( + shared_nparams_all, torch.tensor([shared_nparams], device=device), group=process_group + ) + assert torch.all(shared_nparams_all == shared_nparams) + assert total_nparams == ( + (sharded_nparams_all - shared_nparams_all).sum().item() + shared_nparams + ) + + # vocab_size has been rounded up here + partition_vocab_size = config.vocab_size // world_size + partition_dim = dim // world_size + partition_hidden_dim = 4 * dim // world_size + with torch.no_grad(): + model.load_state_dict(shard_state_dict_tp(model_pt.state_dict(), config, world_size, rank)) + model.tie_weights() + + with torch.autocast(device_type="cuda", dtype=dtype): + out = model(input_ids[:, :-1]).logits + if not sequence_parallel: + out = rearrange(out, "b s d -> (b s) d") + out_pt = rearrange(model_pt(input_ids[:, :-1]).logits, "b s d -> (b s) d") + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[:, rank * partition_vocab_size : (rank + 1) * partition_vocab_size], + rtol=rtol, + atol=atol, + ) + loss_fn = CrossEntropyLoss(inplace_backward=True, reduction="none", process_group=process_group) + loss_fn_pt = CrossEntropyLoss(inplace_backward=True, reduction="none") + loss = loss_fn(out, input_ids[:, 1:].flatten()) + loss_pt = loss_fn_pt(out_pt, input_ids[:, 1:].flatten()) + assert torch.allclose(loss, loss_pt, rtol=rtol, atol=atol) + + loss_pt.backward(g) + loss.backward(g) + allreduce_sequence_parallel_grad(model, process_group) + parallel_state.destroy_model_parallel() + + grad_dict = shard_state_dict_tp( + {k: v.grad for k, v in model_pt.named_parameters()}, config, world_size, rank + ) + + assert torch.allclose( + model.transformer.embeddings.word_embeddings.weight.grad, + grad_dict["transformer.embeddings.word_embeddings.weight"], + rtol=rtol, + atol=atol * 5, + ) + if has_pos_emb: + assert torch.allclose( + model.transformer.embeddings.position_embeddings.weight.grad, + grad_dict["transformer.embeddings.position_embeddings.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( 
+ model.transformer.ln_f.weight.grad, + grad_dict["transformer.ln_f.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.ln_f.bias.grad, grad_dict["transformer.ln_f.bias"], rtol=rtol, atol=atol + ) + for i in range(num_layers): + assert torch.allclose( + model.transformer.layers[i].mixer.Wqkv.weight.grad, + grad_dict[f"transformer.layers.{i}.mixer.Wqkv.weight"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mixer.Wqkv.bias.grad, + grad_dict[f"transformer.layers.{i}.mixer.Wqkv.bias"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mixer.out_proj.weight.grad, + grad_dict[f"transformer.layers.{i}.mixer.out_proj.weight"], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.transformer.layers[i].mixer.out_proj.bias.grad, + grad_dict[f"transformer.layers.{i}.mixer.out_proj.bias"], + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.transformer.layers[i].mlp.fc1.weight.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc1.weight"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mlp.fc1.bias.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc1.bias"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mlp.fc2.weight.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc2.weight"], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.transformer.layers[i].mlp.fc2.bias.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc2.bias"], + rtol=rtol, + atol=atol * 5, + ) + + assert torch.allclose( + model.transformer.layers[i].norm1.weight.grad, + grad_dict[f"transformer.layers.{i}.norm1.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.layers[i].norm1.bias.grad, + grad_dict[f"transformer.layers.{i}.norm1.bias"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.layers[i].norm2.weight.grad, + grad_dict[f"transformer.layers.{i}.norm2.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.layers[i].norm2.bias.grad, + grad_dict[f"transformer.layers.{i}.norm2.bias"], + rtol=rtol, + atol=atol, + ) diff --git a/test_gptj.py b/test_gptj.py new file mode 100644 index 0000000000000000000000000000000000000000..496d8322523ea37b35c8ade0a5ff855968daca64 --- /dev/null +++ b/test_gptj.py @@ -0,0 +1,184 @@ +# Copyright (c) 2023, Tri Dao. 
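+
+# The tests below exercise the typical loading path for GPT-J in this repo: convert the HF config
+# with gptj_config_to_gpt2_config, let GPTLMHeadModel.from_pretrained remap the HF checkpoint
+# (remap_state_dict_hf_gptj), and turn on the fused kernels via config flags. A minimal sketch of
+# that pattern, kept as a comment so it does not run at import time; the model name, device, and
+# dtype here are only examples:
+#
+#     import torch
+#     from transformers import GPTJConfig
+#     from flash_attn.models.gpt import GPTLMHeadModel
+#     from flash_attn.models.gptj import gptj_config_to_gpt2_config
+#
+#     config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B"))
+#     config.use_flash_attn = True       # FlashAttention-2 supports headdim 256
+#     config.fused_bias_fc = True
+#     config.fused_mlp = True
+#     config.fused_dropout_add_ln = True
+#     config.residual_in_fp32 = True
+#     model = GPTLMHeadModel.from_pretrained(
+#         "EleutherAI/gpt-j-6B", config, device="cuda", dtype=torch.float16
+#     )
+#     model.eval()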
+ +import time + +import pytest +import torch +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.gptj import gptj_config_to_gpt2_config, remap_state_dict_hf_gptj +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoTokenizer, GPTJConfig +from transformers.models.gptj.modeling_gptj import GPTJForCausalLM + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-j-6B"]) +def test_gptj_state_dict(model_name): + config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_gptj(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-j-6B", "togethercomputer/GPT-JT-6B-v1"]) +def test_gptj_optimized(model_name): + """Check that our implementation of GPT-J (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTJForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state + logits_ref = model_ref(input_ids).logits + del model_ref + + model_hf = GPTJForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-j-6B"]) +def test_gptj_generation(model_name): + 
"""Check that our implementation of GPT-J (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = GPTJForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = GPTJForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert torch.equal(logits_cg, logits) diff --git a/test_layer_norm.py b/test_layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..3d92b6b32961e17acfabc69fe44c2ae84ef1b8b6 --- /dev/null +++ b/test_layer_norm.py @@ -0,0 
+1,368 @@ +# Copyright (c) 2024, Tri Dao. + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat + +from flash_attn.ops.triton.layer_norm import ( + layer_norm_fn, + layer_norm_ref, + rms_norm_ref, + layer_norm_linear_fn, +) + + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("has_weight1", [False, True]) +# @pytest.mark.parametrize("has_weight1", [True]) +@pytest.mark.parametrize("has_x1", [False, True]) +# @pytest.mark.parametrize("has_x1", [False]) +@pytest.mark.parametrize("has_rowscale", [False, True]) +# @pytest.mark.parametrize("has_rowscale", [False]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.27]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +@pytest.mark.parametrize("prenorm", [True, False]) +# @pytest.mark.parametrize("prenorm", [False]) +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize("is_rms_norm", [True]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize("has_residual", [False]) +@pytest.mark.parametrize( + "weight_dtype", [torch.float32, torch.float16] + ([torch.bfloat16] if is_sm8x else []) +) +# @pytest.mark.parametrize("weight_dtype", [torch.float32]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.float16, torch.float16)]) +@pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000, 4096]) +# @pytest.mark.parametrize("hidden_size", [256]) +def test_layer_norm( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + has_residual, + is_rms_norm, + prenorm, + dropout_p, + has_rowscale, + has_x1, + has_weight1, +): + if has_rowscale and has_x1: + pytest.skip("Not supported") + device = "cuda" + if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 5e-2 + elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 1e-2 + else: + atol = 1e-4 + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref + allclose = ( + # Sometimes x0_pt.grad is NaN + lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max() + <= 2 * (x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() + atol + or ( + # Sometimes x_pt and x_ref are the same (e.g. 
bfloat16) so we want to perturb is a bit + # by multiply and divide by 0.3 + (x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() == 0.0 + and (x - x_ref).abs().max() + <= 2 * (x_pt[~x_pt.isnan()] * 0.3 / 0.3 - x_ref[~x_pt.isnan()]).abs().max() + atol + ) + ) + x0 = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0_pt = x0.detach().clone().requires_grad_() + x0_ref = x0.detach().clone().requires_grad_() + if has_residual: + res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res_pt = res.detach().clone().requires_grad_() + res_ref = res.detach().clone().requires_grad_() + else: + res, res_pt, res_ref = None, None, None + weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm: + bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + else: + bias = None + weight_pt = weight.detach().clone().requires_grad_() + weight_ref = weight.detach().clone().requires_grad_() + bias_pt = bias.detach().clone().requires_grad_() if bias is not None else None + bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None + if has_x1: + x1 = torch.randn_like(x0, dtype=input_dtype, requires_grad=True) + x1_pt = x1.detach().clone().requires_grad_() + x1_ref = x1.detach().clone().requires_grad_() + else: + x1, x1_pt, x1_ref = None, None, None + if has_weight1: + weight1 = torch.randn( + hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + weight1_pt = weight1.detach().clone().requires_grad_() + weight1_ref = weight1.detach().clone().requires_grad_() + if not is_rms_norm: + bias1 = torch.randn( + hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + else: + bias1 = None + bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None + bias1_ref = bias1.detach().clone().requires_grad_() if bias1 is not None else None + else: + weight1, weight1_pt, weight1_ref = None, None, None + bias1, bias1_pt, bias1_ref = None, None, None + + rowscale = ( + torch.randn(batch_size, seqlen, dtype=input_dtype, device=device) + if has_rowscale + else None + ) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, *rest = layer_norm_fn( + x0, + weight, + bias, + residual=res, + x1=x1, + weight1=weight1, + bias1=bias1, + eps=1e-6, + dropout_p=dropout_p, + rowscale=rowscale, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=is_rms_norm, + return_dropout_mask=True, + ) + dropout_mask = rest[-2] if dropout_p > 0.0 else None + dropout_mask1 = rest[-1] if dropout_p > 0.0 and x1 is not None else None + out_pt = layer_norm_ref_fn( + x0_pt, + weight_pt, + bias_pt, + residual=res_pt, + x1=x1_pt, + weight1=weight1_pt, + bias1=bias1_pt, + eps=1e-6, + dropout_p=dropout_p, + rowscale=rowscale, + prenorm=prenorm, + dropout_mask=dropout_mask, + dropout_mask1=dropout_mask1, + ) + out_ref = layer_norm_ref_fn( + x0_ref, + weight_ref, + bias_ref, + residual=res_ref, + x1=x1_ref, + weight1=weight1_ref, + bias1=bias1_ref, + eps=1e-6, + dropout_p=dropout_p, + rowscale=rowscale, + prenorm=prenorm, + dropout_mask=dropout_mask, + dropout_mask1=dropout_mask1, + upcast=True, + ) + if not has_weight1: + if prenorm: + residual = rest[0] + out_pt, residual_pt = out_pt + out_ref, residual_ref = out_ref + out1, out1_pt, out1_ref = None, None, None + else: + out1 = rest.pop(0) + if prenorm: + residual = rest[0] + out_pt, out1_pt, residual_pt = out_pt + out_ref, out1_ref, 
residual_ref = out_ref + else: + out_pt, out1_pt = out_pt + out_ref, out1_ref = out_ref + assert out.dtype == input_dtype + if prenorm: + assert residual.dtype == residual_dtype + assert allclose(residual, residual_pt, residual_ref) + assert allclose(out, out_pt, out_ref) + if out1 is not None: + assert out1.dtype == input_dtype + assert allclose(out1, out1_pt, out1_ref) + if dropout_mask is not None: + dropout_fraction = 1.0 - dropout_mask.float().mean() + assert abs(dropout_fraction - dropout_p) < 0.01 + if dropout_mask1 is not None: + dropout_fraction = 1.0 - dropout_mask1.float().mean() + assert abs(dropout_fraction - dropout_p) < 0.01 + assert not torch.equal(dropout_mask, dropout_mask1) + + g = torch.randn_like(out) / batch_size + if has_weight1: + out = out * F.gelu(out1) + out_pt = out_pt * F.gelu(out1_pt) + out_ref = out_ref * F.gelu(out1_ref) + if not prenorm: + out.backward(g) + out_pt.backward(g) + out_ref.backward(g) + else: + (out * F.sigmoid(residual)).backward(g) + (out_pt * F.sigmoid(residual_pt)).backward(g) + (out_ref * F.sigmoid(residual_ref.to(dtype=residual_dtype))).backward(g) + assert allclose(x0.grad, x0_pt.grad, x0_ref.grad) + if has_residual: + assert allclose(res.grad, res_pt.grad, res_ref.grad) + if has_x1: + assert allclose(x1.grad, x1_pt.grad, x1_ref.grad) + assert allclose(weight.grad, weight_pt.grad, weight_ref.grad) + if bias is not None: + assert allclose(bias.grad, bias_pt.grad, bias_ref.grad) + if has_weight1: + assert allclose(weight1.grad, weight1_pt.grad, weight1_ref.grad) + if bias1 is not None: + assert allclose(bias1.grad, bias1_pt.grad, bias1_ref.grad) + + +@pytest.mark.parametrize("prenorm", [True, False]) +# @pytest.mark.parametrize("prenorm", [True]) +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize("is_rms_norm", [True]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize("has_residual", [False]) +@pytest.mark.parametrize("weight_dtype", [torch.float32]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.bfloat16, torch.float32)]) +@pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000]) +# @pytest.mark.parametrize("hidden_size", [256]) +def test_layer_norm_linear( + hidden_size, input_dtype, residual_dtype, weight_dtype, has_residual, is_rms_norm, prenorm +): + device = "cuda" + if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 5e-2 + elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 1e-2 + else: + atol = 1e-4 + # set seed + torch.random.manual_seed(0) + batch_size = 4 + seqlen = 512 + # batch_size = 1 + # seqlen = 1 + layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref + allclose = ( + lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max() + <= 2 * (x_pt - x_ref).abs().max() + atol + ) + x0 = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0_pt = x0.detach().clone().requires_grad_() + x0_ref = x0.detach().clone().requires_grad_() + if has_residual: + res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res_pt = res.detach().clone().requires_grad_() + res_ref = res.detach().clone().requires_grad_() + else: + res, res_pt, res_ref = None, None, None + 
norm_weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm: + norm_bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + else: + norm_bias = None + norm_weight_pt = norm_weight.detach().clone().requires_grad_() + norm_weight_ref = norm_weight.detach().clone().requires_grad_() + norm_bias_pt = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None + norm_bias_ref = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None + linear_weight = torch.empty( + 2 * hidden_size, hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + torch.nn.init.xavier_uniform_(linear_weight) + if not is_rms_norm: + linear_bias = torch.randn( + 2 * hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + else: + linear_bias = None + linear_weight_pt = linear_weight.detach().clone().requires_grad_() + linear_weight_ref = linear_weight.detach().clone().requires_grad_() + linear_bias_pt = ( + linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None + ) + linear_bias_ref = ( + linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None + ) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + with torch.autocast(device_type="cuda", dtype=input_dtype): + out, *rest = layer_norm_linear_fn( + x0, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=res, + eps=1e-6, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=is_rms_norm, + ) + out_pt, *rest_pt = layer_norm_ref_fn( + x0_pt, norm_weight_pt, norm_bias_pt, residual=res_pt, eps=1e-6, prenorm=prenorm + ) + with torch.autocast(device_type="cuda", dtype=input_dtype): + out_pt = F.linear(out_pt, linear_weight_pt, linear_bias_pt) + out_ref, *rest_ref = layer_norm_ref_fn( + x0_ref, + norm_weight_ref, + norm_bias_ref, + residual=res_ref, + eps=1e-6, + prenorm=prenorm, + upcast=True, + ) + out_ref = F.linear(out_ref.to(linear_weight_ref.dtype), linear_weight_ref, linear_bias_ref) + if prenorm: + residual = rest[0] + residual_pt = rest_pt[0] + residual_ref = rest_ref[0] + assert out.dtype == input_dtype + if prenorm: + assert residual.dtype == residual_dtype + assert allclose(residual, residual_pt, residual_ref) + assert allclose(out, out_pt, out_ref) + + g = torch.randn_like(out) / batch_size + out.backward(g) + out_pt.backward(g) + out_ref.backward(g) + assert allclose(x0.grad, x0_pt.grad, x0_ref.grad) + if has_residual: + assert allclose(res.grad, res_pt.grad, res_ref.grad) + assert allclose(norm_weight.grad, norm_weight_pt.grad, norm_weight_ref.grad) + if norm_bias is not None: + assert allclose(norm_bias.grad, norm_bias_pt.grad, norm_bias_ref.grad) + assert allclose(linear_weight.grad, linear_weight_pt.grad, linear_weight_ref.grad) + if linear_bias is not None: + assert allclose(linear_bias.grad, linear_bias_pt.grad, linear_bias_ref.grad) diff --git a/test_llama.py b/test_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..32e9cd2114ad85756bd842937ff49ec219bdb38f --- /dev/null +++ b/test_llama.py @@ -0,0 +1,633 @@ +# Copyright (c) 2023, Tri Dao. 
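+
+# The tests below load LLaMa weights in one of two ways: from a HF checkpoint via
+# remap_state_dict_hf_llama, or from the original Meta checkpoint via state_dicts_from_checkpoint,
+# remap_state_dict_meta_llama, and combine_state_dicts_tp (see
+# _pretrained_state_dict_from_checkpoint below). A minimal sketch of the HF path, kept as a
+# comment so it does not run at import time; the model name, device, and dtype are only examples:
+#
+#     import torch
+#     from transformers import AutoConfig
+#     from flash_attn.models.gpt import GPTLMHeadModel
+#     from flash_attn.models.llama import llama_config_to_gpt2_config, remap_state_dict_hf_llama
+#     from flash_attn.utils.pretrained import state_dict_from_pretrained
+#
+#     config = llama_config_to_gpt2_config(
+#         AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf", trust_remote_code=True)
+#     )
+#     config.use_flash_attn = True
+#     config.fused_bias_fc = True
+#     config.fused_mlp = False  # We don't have fused GatedMLP yet
+#     config.fused_dropout_add_ln = True
+#     config.residual_in_fp32 = True
+#     model = GPTLMHeadModel(config, device="cuda", dtype=torch.float16)
+#     model.load_state_dict(
+#         remap_state_dict_hf_llama(state_dict_from_pretrained("meta-llama/Llama-2-7b-hf"), config)
+#     )
+#     model.eval()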
+ +# To run the huggingface implementation of LLaMa (1), we first need to convert the weights: +# https://github.com/huggingface/transformers/pull/21955 +# python -m transformers.models.llama.convert_llama_weights_to_hf --input_dir $CHECKPOINT_DIR/llama --model_size 7B --output_dir $CHECKPOINT_DIR/llama/7B-hf +# and repeat for 13B, 30B, 65B + +import os +import time +from pathlib import Path + +current_dir = Path(__file__).parent.absolute() + +import shutil + +import pytest +import torch +from einops import rearrange +from flash_attn.models.gpt import GPTLMHeadModel, combine_state_dicts_tp, shard_state_dict_tp +from flash_attn.models.llama import ( + config_from_checkpoint, + inv_remap_state_dict_hf_llama, + llama_config_to_gpt2_config, + remap_state_dict_hf_llama, + remap_state_dict_meta_llama, + state_dicts_from_checkpoint, +) +from flash_attn.utils.distributed import all_gather_raw +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import LlamaConfig, LlamaTokenizer +from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers import AutoConfig + + +def _pretrained_state_dict_from_checkpoint(checkpoint_path, model_name, config, checkpoint_format): + if checkpoint_format == "meta": + ckpt_state_dicts = state_dicts_from_checkpoint(checkpoint_path, model_name) + pretrained_state_dicts = [remap_state_dict_meta_llama(s, config) for s in ckpt_state_dicts] + pretrained_state_dict = combine_state_dicts_tp(pretrained_state_dicts, config) + else: + pretrained_state_dict = state_dict_from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf" + ) + pretrained_state_dict = remap_state_dict_hf_llama(pretrained_state_dict, config) + return pretrained_state_dict + + +@pytest.mark.parametrize("model_name", ["7B"]) +def test_llama_state_dict(model_name): + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + config = llama_config_to_gpt2_config(config_from_checkpoint(checkpoint_path, model_name)) + ckpt_state_dicts = state_dicts_from_checkpoint(checkpoint_path, model_name) + pretrained_state_dict = remap_state_dict_meta_llama(ckpt_state_dicts[0], config) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +# TinyLlama-1.1B is to test MQA +@pytest.mark.parametrize( + "model_name", ["meta-llama/Llama-2-7b-hf", "PY007/TinyLlama-1.1B-step-50K-105b"] +) +def test_inv_remap_state_dict_hf_llama(model_name): + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + state_dict = state_dict_from_pretrained(model_name) + # inv_remap_state_dict_hf_llama should be the inverse of remap_state_dict_hf_llama + state_dict = {key: val for key, val in state_dict.items() if "rotary_emb.inv_freq" not in key} + pretrained_state_dict = remap_state_dict_hf_llama(state_dict, config) + state_dict_recover = inv_remap_state_dict_hf_llama(pretrained_state_dict, config) + assert set(state_dict_recover.keys()) == set(state_dict.keys()) + for key in state_dict_recover.keys(): + torch.testing.assert_close(state_dict_recover[key], state_dict[key]) + + +# TinyLlama-1.1B is to test MQA +@pytest.mark.parametrize( + "model_name", + [ + "7B", # Llama 1 + "13B", # Llama 1 + 
"meta-llama/Llama-2-13b-hf", + "codellama/CodeLlama-7b-hf", + "codellama/CodeLlama-13b-hf", + "codellama/CodeLlama-34b-hf", + "PY007/TinyLlama-1.1B-step-50K-105b", + ], +) +def test_llama_optimized(model_name): + """Check that our implementation of LLaMa (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + device = "cuda" + if "/" in model_name: # Download from HF + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + else: + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format="meta") + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + if "/" in model_name: # Download from HF + pretrained_state_dict = remap_state_dict_hf_llama( + state_dict_from_pretrained(model_name), config + ) + else: + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="meta" + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + device_map="auto", + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + torch_dtype=dtype, + device_map={"": device}, + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# torchrun 
--no_python --nproc_per_node=2 pytest -q -s tests/models/test_llama.py -k "parallel" +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize( + "model_name", ["13B", "meta-llama/Llama-2-13b-hf", "codellama/CodeLlama-34b-hf"] +) +def test_llama_parallel(model_name, world_size): + """Check that our implementation of LLaMa (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + from apex.transformer import parallel_state + + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + if "/" in model_name: # Download from HF + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + else: + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format="meta") + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + if "/" in model_name: # Download from HF + pretrained_state_dict = remap_state_dict_hf_llama( + state_dict_from_pretrained(model_name), config + ) + else: + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="meta" + ) + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... d -> b ... 
(n d)", b=batch_size) + del model + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + device_map="auto", + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + torch_dtype=dtype, + device_map="auto", + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# @pytest.mark.parametrize('model_name', ["7B", "13B"]) +@pytest.mark.parametrize("model_name", ["7B"]) +@pytest.mark.parametrize("checkpoint_format", ["meta", "hf"]) +def test_llama_generation(model_name, checkpoint_format): + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + device = "cuda" + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format) + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = LlamaTokenizer.from_pretrained(Path(checkpoint_path) / f"{model_name}-hf") + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", device_map="auto" + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1].to(device=device) + del model_ref + + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + 
checkpoint_path, model_name, config, checkpoint_format + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item()}") + + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) + + +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/models/test_llama.py -k "llama_parallel_generation" +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize( + "model_name", ["13B", "meta-llama/Llama-2-13b-hf", "codellama/CodeLlama-34b-hf"] +) +def test_llama_parallel_generation(model_name, world_size): + """Check that our implementation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. 
+ """ + from apex.transformer import parallel_state + + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + if "/" in model_name: # Download from HF + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + else: + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format="meta") + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 * world_size + config.sequence_parallel = False # Need to set this to False for generation + + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + # Need this, otherwise when we capture the graph the process for GPU 1 would run on both + # GPU0 and GPU1 and things would hang + torch.cuda.set_device(device) + + if "/" in model_name: # Download from HF + pretrained_state_dict = remap_state_dict_hf_llama( + state_dict_from_pretrained(model_name), config + ) + else: + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="meta" + ) + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + print("Without CUDA graph") + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_hf = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + torch_dtype=dtype, + device_map="auto", + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + with torch.inference_mode(): + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: 
{(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + device_map="auto", + ) + model_ref.eval() + with torch.inference_mode(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + logits_hf = torch.stack(out_hf.scores, dim=1) + + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + hf_error = (logits_hf - logits_ref).abs().max().item() + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item()}") + assert torch.equal(logits_cg, logits) + + +@torch.no_grad() +@pytest.mark.parametrize("world_size", [2]) +def test_llama_parallel_uneven_num_heads(world_size): + from apex.transformer import parallel_state + + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + num_attention_heads = world_size + 1 + model_name = f"teeny-{num_attention_heads}-heads" + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + dtype = torch.float16 + llama_config = LlamaConfig( + hidden_size=256 + * num_attention_heads, # ParallelGatedMlp hidden_features must be divisible by 256 + intermediate_size=256 * num_attention_heads * 4, + num_hidden_layers=4, + num_attention_heads=num_attention_heads, + initializer_range=0.5, # Set crazy init range so we don't have near zero weights implying a vacuous test. + ) + config = llama_config_to_gpt2_config(llama_config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + + # Create a shared test model. + if rank == 0: + LlamaForCausalLM(config=llama_config).save_pretrained(checkpoint_path / f"{model_name}-hf") + torch.distributed.barrier() + + # Run the standard forward pass test. + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="hf" + ) + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + # TODO: Avoid duplicate code. Modularize the comparison of two forward pass diffs. + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... 
d -> b ... (n d)", b=batch_size) + + if rank == 0: + model_ref = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", device_map={"": device} + ) + model_ref = model_ref.to(device=device) + model_ref.eval() + out_ref = model_ref.model(input_ids).last_hidden_state + logits_ref = model_ref(input_ids).logits + del model_ref + + model_hf = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + out_hf = model_hf.model(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + if os.path.exists(checkpoint_path / f"{model_name}-hf"): + shutil.rmtree(checkpoint_path / f"{model_name}-hf") diff --git a/test_mha_parallel.py b/test_mha_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..880cce2f679725da21bdebdbfa38cba972b26070 --- /dev/null +++ b/test_mha_parallel.py @@ -0,0 +1,160 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_mha_parallel.py + +import math + +import pytest +import torch +import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from einops import rearrange +from flash_attn.modules.mha import MHA, ParallelMHA + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("head_dim", [64, 128]) +# @pytest.mark.parametrize('head_dim', [64]) +@pytest.mark.parametrize("embed_dim", [1024, 4096]) +# @pytest.mark.parametrize('embed_dim', [1024]) +def test_mha_parallel(embed_dim, head_dim, sequence_parallel, world_size, dtype): + assert embed_dim % head_dim == 0 + num_heads = embed_dim // head_dim + assert num_heads % world_size == 0 + rtol, atol = (3e-3, 1e-2) if dtype == torch.bfloat16 else (3e-3, 1e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + x_pt = torch.randn( + batch_size * seqlen, embed_dim, 
device=device, dtype=dtype, requires_grad=True + ) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. + g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt = MHA( + embed_dim, + num_heads, + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + device=device, + dtype=dtype, + ) + partition_dim = embed_dim // world_size + model = ParallelMHA( + embed_dim, + num_heads, + parallel_state.get_tensor_model_parallel_group(), + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + + with torch.no_grad(): + model.Wqkv.weight.copy_( + rearrange( + rearrange(model_pt.Wqkv.weight, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ) + ) + model.Wqkv.bias.copy_( + rearrange( + rearrange(model_pt.Wqkv.bias, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ) + ) + model.out_proj.weight.copy_( + model_pt.out_proj.weight[:, rank * partition_dim : (rank + 1) * partition_dim] + ) + if rank == 0: + model.out_proj.bias.copy_(model_pt.out_proj.bias) + + out = model(x, seqlen=seqlen) + out_pt = rearrange(model_pt(rearrange(x_pt, "(b s) d -> b s d", s=seqlen)), "b s d -> (b s) d") + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol / 100, # magnitude of x.grad is quite small + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.Wqkv.weight.grad, + rearrange( + rearrange(model_pt.Wqkv.weight.grad, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ), + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.Wqkv.bias.grad, + rearrange( + rearrange(model_pt.Wqkv.bias.grad, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ), + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.out_proj.weight.grad, + model_pt.out_proj.weight.grad[:, rank * partition_dim : (rank + 1) * partition_dim], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.out_proj.bias.grad, model_pt.out_proj.bias.grad, rtol=rtol, atol=atol * 5 + ) diff --git a/test_mlp_parallel.py b/test_mlp_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..49051bc2be19fa8cb63ddc5887be87ac3406dce1 --- /dev/null +++ b/test_mlp_parallel.py @@ -0,0 +1,143 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_mlp_parallel.py + +import pytest +import torch 
+import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from einops import rearrange +from flash_attn.modules.mlp import GatedMlp, ParallelGatedMlp + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("activation", [F.silu, F.sigmoid]) +# @pytest.mark.parametrize('activation', [F.silu]) +@pytest.mark.parametrize("dim", [1024, 4096]) +# @pytest.mark.parametrize('dim', [1024]) +def test_mlp_parallel(dim, activation, sequence_parallel, world_size, dtype): + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + x_pt = torch.randn(batch_size * seqlen, dim, device=device, dtype=dtype, requires_grad=True) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. + g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt = GatedMlp(dim, activation=activation, device=device, dtype=dtype) + partition_dim = model_pt.fc1.weight.shape[0] // 2 // world_size + model = ParallelGatedMlp( + dim, + parallel_state.get_tensor_model_parallel_group(), + activation=activation, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + + with torch.no_grad(): + model.fc1.weight.copy_( + rearrange( + rearrange(model_pt.fc1.weight, "(two o) i -> two o i", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o i -> (two o) i", + ) + ) + model.fc1.bias.copy_( + rearrange( + rearrange(model_pt.fc1.bias, "(two o) -> two o", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o -> (two o)", + ) + ) + model.fc2.weight.copy_( + model_pt.fc2.weight[:, rank * partition_dim : (rank + 1) * partition_dim] + ) + if rank == 0: + model.fc2.bias.copy_(model_pt.fc2.bias) + + out = model(x) + out_pt = model_pt(x_pt) + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol, + ) + + 
assert torch.allclose( + model.fc1.weight.grad, + rearrange( + rearrange(model_pt.fc1.weight.grad, "(two o) i -> two o i", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o i -> (two o) i", + ), + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.fc1.bias.grad, + rearrange( + rearrange(model_pt.fc1.bias.grad, "(two o) -> two o", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o -> (two o)", + ), + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.fc2.weight.grad, + model_pt.fc2.weight.grad[:, rank * partition_dim : (rank + 1) * partition_dim], + rtol=rtol, + atol=atol, + ) + if rank == 0: + assert torch.allclose(model.fc2.bias.grad, model_pt.fc2.bias.grad, rtol=rtol, atol=atol) diff --git a/test_opt.py b/test_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..8188e36a30c0e0e869b0ea828e3ea0839db9973c --- /dev/null +++ b/test_opt.py @@ -0,0 +1,237 @@ +import re +import time + +import pytest +import torch +from einops import rearrange +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.opt import opt_config_to_gpt2_config, remap_state_dict_hf_opt +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoTokenizer, OPTConfig +from transformers.models.opt.modeling_opt import OPTForCausalLM + + +@pytest.mark.parametrize( + "model_name", ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"] +) +# @pytest.mark.parametrize('model_name', ["facebook/opt-350m"]) +def test_opt_state_dict(model_name): + config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_opt(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config) + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + "model_name", ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"] +) +# @pytest.mark.parametrize('model_name', ["facebook/opt-350m"]) +def test_opt_optimized(model_name): + """Check that our implementation of OPT (without all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name)) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = getattr(config, "prenorm", True) + config.pad_vocab_size_multiple = 8 + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + + model_ref = OPTForCausalLM.from_pretrained(model_name).to(device=device) + model_hf = OPTForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device=device) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + if model_name != "facebook/opt-350m": # The OPT-350m projects the embeddings to dimension 512 + out = model.transformer(input_ids) + out_hf = model_hf.model(input_ids).last_hidden_state + out_ref = model_ref.model(input_ids).last_hidden_state + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + logits = model(input_ids).logits + logits_hf = model_hf(input_ids).logits + logits_ref = model_ref(input_ids).logits + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize( + "model_name", + [ + "facebook/opt-125m", + "facebook/opt-350m", + "facebook/opt-1.3b", + "facebook/opt-2.7b", + "facebook/opt-6.7b", + ], +) +# @pytest.mark.parametrize('model_name', ["facebook/opt-125m"]) +def test_opt_generation(model_name): + """Check that our implementation of OPT generation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. 
+ """ + print(f"\nMODEL: {model_name}") + verbose = False + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name)) + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = getattr(config, "prenorm", True) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + # OPT tokenizer requires use_fast=False + # https://huggingface.co/docs/transformers/model_doc/opt + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) + eos_token_id = tokenizer.eos_token_id + + input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 25 + # input_ids = torch.randint(0, 100, (2, 10), dtype=torch.long, device='cuda') + # max_length = input_ids.shape[1] + 40 + + # Slow generation for reference + sequences = [] + scores = [] + cur_input_ids = input_ids + with torch.inference_mode(): + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + for _ in range(input_ids.shape[1] + 1, max_length): + cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1) + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + if eos_token_id is not None and (sequences[-1] == eos_token_id).all(): + break + sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1) + scores = tuple(scores) + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + if verbose: + print(out.sequences) + print(tokenizer.batch_decode(out.sequences.tolist())) + if getattr(config, "use_flash_attn", False): + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + if verbose: + print(out_cg.sequences) + print(tokenizer.batch_decode(out_cg.sequences.tolist())) + + del model + + model_hf = OPTForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device=device) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = OPTForCausalLM.from_pretrained(model_name).to(device=device) + model_ref.eval() + print("HF fp32") + torch.cuda.synchronize() + start = time.time() + out_ref = model_ref.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, 
output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_ref + print(tokenizer.batch_decode(out_ref.sequences.tolist())) + + if verbose: + print( + f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print( + f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + + assert torch.all(out.sequences == sequences) + assert torch.allclose( + torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol + ) + assert torch.all(out.sequences == out_ref.sequences) + assert torch.all(out.sequences == out_hf.sequences) + + assert (torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item() < 3 * ( + torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() diff --git a/test_rotary.py b/test_rotary.py new file mode 100644 index 0000000000000000000000000000000000000000..6b73ff90dcebf8b8f4743391f0b26de9356bdaae --- /dev/null +++ b/test_rotary.py @@ -0,0 +1,134 @@ +# Copyright (c) 2023, Tri Dao. + +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange +from flash_attn.layers.rotary import RotaryEmbedding, apply_rotary_emb_func, apply_rotary_emb_qkv_ +from transformers.models.gpt_neox.modeling_gpt_neox import RotaryEmbedding as RotaryEmbeddingNeoX +from transformers.models.gpt_neox.modeling_gpt_neox import ( + apply_rotary_pos_emb as apply_rotary_pos_emb_neox, +) +from transformers.models.gptj.modeling_gptj import apply_rotary_pos_emb as apply_rotary_pos_emb_gptj +from transformers.models.gptj.modeling_gptj import fixed_pos_embedding + + +# NeoX-style rotary embedding +@pytest.mark.parametrize("seqlen_offset", [0, 711]) +@pytest.mark.parametrize("rotary_emb_fraction", [0.5, 1.0]) +def test_rotary(rotary_emb_fraction, seqlen_offset): + device = "cuda" + dtype = torch.float16 + rtol, atol = (1e-3, 5e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen_total = 2048 + seqlen = seqlen_total - seqlen_offset + nheads = 16 + headdim = 128 + rotary_dim = int(headdim * rotary_emb_fraction) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, requires_grad=True + ) + qkv_og = qkv.clone().detach() # Our implementation modifies qkv inplace + rotary = RotaryEmbedding(rotary_dim, device=device) + rotary_neox = RotaryEmbeddingNeoX(rotary_dim, seqlen_total, device=device) + # Doesn't matter what tensor we pass in, rotary_neox only uses the device of the tensor + cos_neox, sin_neox = rotary_neox(qkv, seq_len=seqlen_total) + cos_neox, sin_neox = cos_neox.to(dtype=dtype), sin_neox.to(dtype=dtype) + q_pt = ( + rearrange(qkv[:, :, 0, :, :rotary_dim], "b s h d -> b h s d") + .detach() + .clone() + .requires_grad_(True) + ) + k_pt = ( + rearrange(qkv[:, :, 1, :, :rotary_dim], "b s h d -> b h s d") + .detach() + .clone() + .requires_grad_(True) + ) + q_neox, k_neox = apply_rotary_pos_emb_neox(q_pt, k_pt, cos_neox, sin_neox, offset=seqlen_offset) + out = rotary(qkv, seqlen_offset=seqlen_offset) + assert torch.allclose( + rotary._cos_cached, cos_neox[..., : rotary_dim // 2].to(dtype=dtype), rtol=rtol, atol=atol + ) + 
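# A short sketch of the convention being checked here (standard non-interleaved /
# NeoX-style rotary, stated for illustration): for each position and each pair
# (x_i, x_{i + rotary_dim/2}) with i < rotary_dim/2, the embedding applies
#
#     x_i'                  = x_i * cos_i - x_{i + rotary_dim/2} * sin_i
#     x_{i + rotary_dim/2}' = x_{i + rotary_dim/2} * cos_i + x_i * sin_i
#
# so only rotary_dim/2 distinct cos/sin values are needed per position. The NeoX
# reference duplicates them along the last dimension, which is why the comparisons
# against our cached cos/sin use the [..., : rotary_dim // 2] slice.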
assert torch.allclose( + rotary._sin_cached, sin_neox[..., : rotary_dim // 2].to(dtype=dtype), rtol=rtol, atol=atol + ) + assert torch.allclose( + rearrange(q_neox, "b h s d -> b s h d"), out[:, :, 0, :, :rotary_dim], rtol=rtol, atol=atol + ) + assert torch.allclose( + rearrange(k_neox, "b h s d -> b s h d"), out[:, :, 1, :, :rotary_dim], rtol=rtol, atol=atol + ) + assert torch.equal(out[:, :, 0:2, :, rotary_dim:], qkv_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(out[:, :, 2], qkv_og[:, :, 2]) + + g = torch.randn_like(out) + g_og = g.clone().detach() # Our implementation modifies g inplace + out.backward(g) + q_neox.backward(rearrange(g_og[:, :, 0, :, :rotary_dim], "b s h d -> b h s d")) + k_neox.backward(rearrange(g_og[:, :, 1, :, :rotary_dim], "b s h d -> b h s d")) + assert torch.allclose( + rearrange(q_pt.grad, "b h s d -> b s h d"), + qkv.grad[:, :, 0, :, :rotary_dim], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + rearrange(k_pt.grad, "b h s d -> b s h d"), + qkv.grad[:, :, 1, :, :rotary_dim], + rtol=rtol, + atol=atol, + ) + assert torch.equal(qkv.grad[:, :, 0:2, :, rotary_dim:], g_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(qkv.grad[:, :, 2], g_og[:, :, 2]) + + +# GPT-J-style rotary embedding +@pytest.mark.parametrize("seqlen_offset", [0, 711]) +@pytest.mark.parametrize("rotary_emb_fraction", [0.5, 1.0]) +def test_rotary_interleaved(rotary_emb_fraction, seqlen_offset): + device = "cuda" + dtype = torch.float16 + rtol, atol = (1e-3, 5e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen_total = 2048 + seqlen = seqlen_total - seqlen_offset + nheads = 16 + headdim = 128 + rotary_dim = int(headdim * rotary_emb_fraction) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, requires_grad=True + ) + qkv_og = qkv.clone().detach() # Our implementation modifies qkv inplace + rotary = RotaryEmbedding(rotary_dim, interleaved=True, device=device) + sincos_gptj = fixed_pos_embedding(qkv[..., :rotary_dim], seq_dim=1, seq_len=seqlen_total) + sincos_gptj = tuple(x.to(dtype=dtype) for x in sincos_gptj) + q_pt = qkv[:, :, 0, :, :rotary_dim].detach().clone().requires_grad_(True) + k_pt = qkv[:, :, 1, :, :rotary_dim].detach().clone().requires_grad_(True) + q_gptj = apply_rotary_pos_emb_gptj(q_pt, sincos_gptj, offset=seqlen_offset) + k_gptj = apply_rotary_pos_emb_gptj(k_pt, sincos_gptj, offset=seqlen_offset) + out = rotary(qkv, seqlen_offset=seqlen_offset) + assert torch.allclose(rotary._cos_cached, sincos_gptj[1], rtol=rtol, atol=atol) + assert torch.allclose(rotary._sin_cached, sincos_gptj[0], rtol=rtol, atol=atol) + assert torch.allclose(q_gptj, out[:, :, 0, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.allclose(k_gptj, out[:, :, 1, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.equal(out[:, :, 0:2, :, rotary_dim:], qkv_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(out[:, :, 2], qkv_og[:, :, 2]) + + g = torch.randn_like(out) + g_og = g.clone().detach() # Our implementation modifies g inplace + out.backward(g) + q_gptj.backward(g_og[:, :, 0, :, :rotary_dim]) + k_gptj.backward(g_og[:, :, 1, :, :rotary_dim]) + assert torch.allclose(q_pt.grad, qkv.grad[:, :, 0, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.allclose(k_pt.grad, qkv.grad[:, :, 1, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.equal(qkv.grad[:, :, 0:2, :, rotary_dim:], g_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(qkv.grad[:, :, 2], g_og[:, :, 2]) diff --git a/test_util.py b/test_util.py new file mode 100644 index 
0000000000000000000000000000000000000000..513a9b8e8fc134ce86f6a423c068e7efecfadeff --- /dev/null +++ b/test_util.py @@ -0,0 +1,254 @@ +import math + +import torch +from einops import rearrange, repeat +from flash_attn.bert_padding import pad_input, unpad_input + + +def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): + assert mode in ["full", "random", "third"] + if mode == "full": + lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) + elif mode == "random": + lengths = torch.randint( + max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device + ) + elif mode == "third": + lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) + padding_mask = ( + repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths + ) + return padding_mask + + +def generate_qkv( + q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, d) + k: (batch_size, seqlen_k, nheads_k, d) + v: (batch_size, seqlen_k, nheads_k, d) + query_padding_mask: (batch_size, seqlen), bool + key_padding_mask: (batch_size, seqlen), bool + """ + assert not (kvpacked and qkvpacked) + batch_size, seqlen_q, nheads, d = q.shape + _, seqlen_k, nheads_k, _ = k.shape + assert k.shape == (batch_size, seqlen_k, nheads_k, d) + assert v.shape == (batch_size, seqlen_k, nheads_k, d) + + if query_padding_mask is not None: + q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) + output_pad_fn = lambda output_unpad: pad_input( + output_unpad, indices_q, batch_size, seqlen_q + ) + else: + q_unpad = rearrange(q, "b s h d -> (b s) h d") + cu_seqlens_q = torch.arange( + 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device + ) + max_seqlen_q = seqlen_q + output_pad_fn = lambda output_unpad: rearrange( + output_unpad, "(b s) h d -> b s h d", b=batch_size + ) + + if key_padding_mask is not None: + k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) + v_unpad, _, _, _ = unpad_input(v, key_padding_mask) + else: + k_unpad = rearrange(k, "b s h d -> (b s) h d") + v_unpad = rearrange(v, "b s h d -> (b s) h d") + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device + ) + max_seqlen_k = seqlen_k + + if qkvpacked: + assert (query_padding_mask == key_padding_mask).all() + assert nheads == nheads_k + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + qkv = torch.stack([q, k, v], dim=2) + if query_padding_mask is not None: + dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) + else: + dqkv_pad_fn = lambda dqkv_unpad: rearrange( + dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + qkv_unpad.detach().requires_grad_(), + cu_seqlens_q, + max_seqlen_q, + qkv.detach().requires_grad_(), + output_pad_fn, + dqkv_pad_fn, + ) + elif kvpacked: + kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) + kv = torch.stack([k, v], dim=2) + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) + else: + dkv_pad_fn = lambda dkv_unpad: rearrange( + dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + q_unpad.detach().requires_grad_(), + kv_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 
q.detach().requires_grad_(), + kv.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) + else: + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k) + else: + dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size) + return ( + q_unpad.detach().requires_grad_(), + k_unpad.detach().requires_grad_(), + v_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + k.detach().requires_grad_(), + v.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) + + +def construct_local_mask( + seqlen_q, + seqlen_k, + window_size=(-1, -1), # -1 means infinite window size + query_padding_mask=None, + key_padding_mask=None, + device=None, + key_leftpad=None, +): + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + if window_size[0] < 0: + return col_idx > row_idx + sk - sq + window_size[1] + else: + sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk + return torch.logical_or( + col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk), + col_idx < row_idx + sk - sq - window_size[0], + ) + + +def attention_ref( + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k: (batch_size, seqlen_k, nheads_k, head_dim) + v: (batch_size, seqlen_k, nheads_k, head_dim) + query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) + causal: whether to apply causal masking + window_size: (int, int), left and right window size + upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast + output back to fp16/bf16. + reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) + without changing the math. This is to estimate the numerical error from operation + reordering. 
+ Output: + output: (batch_size, seqlen_q, nheads, head_dim) + attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout + """ + if causal: + window_size = (window_size[0], 0) + dtype_og = q.dtype + if upcast: + q, k, v = q.float(), k.float(), v.float() + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) + v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) + d = q.shape[-1] + if not reorder_ops: + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + else: + scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d)) + if softcap > 0: + scores /= softcap + scores = scores.tanh() + scores *= softcap + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + key_leftpad=key_leftpad, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias + attention = torch.softmax(scores, dim=-1).to(v.dtype) + # Some rows might be completely masked out so we fill them with zero instead of NaN + if window_size[0] >= 0 or window_size[1] >= 0: + attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0) + # We want to mask here so that the attention matrix doesn't have any NaNs + # Otherwise we'll get NaN in dV + if query_padding_mask is not None: + attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + dropout_scaling = 1.0 / (1 - dropout_p) + # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling + # output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + if dropout_mask is not None: + attention_drop = attention.masked_fill(~dropout_mask, 0.0) + else: + attention_drop = attention + output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling) + if query_padding_mask is not None: + output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) + return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) diff --git a/test_vit.py b/test_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..129c05ff53f5c7b7dcc79b949d7d97da0c0f5d16 --- /dev/null +++ b/test_vit.py @@ -0,0 +1,48 @@ +import re + +import pytest +import torch +from flash_attn.models.vit import vit_base_patch16_224 as flash_vit_base_patch16_224 +from timm.models.vision_transformer import vit_base_patch16_224 + + +@pytest.mark.parametrize("fused_mlp", [False, True]) +# @pytest.mark.parametrize('fused_mlp', [False]) +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize('optimized', [True]) +def test_vit(optimized, fused_mlp): + """Check that our implementation of ViT matches the timm's implementation: + the output of our forward pass in fp16 should be around the same as + timm' forward pass in fp16, when compared to timm's forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + + kwargs = {} + if optimized: + kwargs = dict(use_flash_attn=True, fused_bias_fc=True, fused_dropout_add_ln=True) + kwargs["fused_mlp"] = fused_mlp + model = flash_vit_base_patch16_224(**kwargs).to(device=device, dtype=dtype) + + model_ref = vit_base_patch16_224(pretrained=True).to(device=device) + model_timm = vit_base_patch16_224(pretrained=True).to(device=device, dtype=dtype) + + model.load_state_dict(model_ref.state_dict()) + + model.eval() + model_ref.eval() + model_timm.eval() + + torch.manual_seed(0) + batch_size = 2 + x = torch.randn(batch_size, 3, 224, 224, device=device, dtype=dtype) + out = model(x) + out_timm = model_timm(x) + out_ref = model_ref(x.float()) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"timm fp16 max diff: {(out_timm - out_ref).abs().max().item()}") + print(f"timm fp16 mean diff: {(out_timm - out_ref).abs().mean().item()}") + rtol = 2 if not fused_mlp else 8 + assert (out - out_ref).abs().max().item() < rtol * (out_timm - out_ref).abs().max().item() diff --git a/thepile.yaml b/thepile.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0f93535ce3fd213737f9466c24b8cf3d6f7dd1f --- /dev/null +++ b/thepile.yaml @@ -0,0 +1,14 @@ +_target_: src.datamodules.language_modeling_hf.LMDataModule +dataset_name: the_pile +dataset_config_name: null +tokenizer_name: gpt2 +cache_dir: ${oc.env:DATA_DIR,${data_dir}}/the_pile/cache +max_length: 2048 +add_eos: True +batch_size: 4 # per GPU +batch_size_eval: ${eval:${.batch_size} * 2} +num_workers: 64 # For preprocessing only +use_shmem: False +shuffle: True +pin_memory: True +__train_len: ${div_up:374337375694, ${.max_length}} diff --git a/tile_scheduler.hpp b/tile_scheduler.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ac93ca94b4ce0eb694b923865418ab2e805da483 --- /dev/null +++ b/tile_scheduler.hpp @@ -0,0 +1,273 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cutlass/fast_math.h" +#include "cutlass/arch/barrier.h" + +#include "named_barrier.hpp" + +namespace flash { + +/////////////////////////////////////////////////////////////////////////////// + +struct SingleTileScheduler { + +public: + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore = nullptr; + }; + + // Device side kernel params + struct Params {}; + + static Params + to_underlying_arguments(Arguments const& args) { + return {}; + } + + static dim3 + get_grid_dim(Arguments const& args, int num_sm) { + return {uint32_t(args.num_blocks_m), uint32_t(args.num_head), uint32_t(args.num_batch)}; + } + + struct WorkTileInfo { + int M_idx = 0; + int H_idx = 0; + int B_idx = 0; + bool is_valid_tile = false; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return is_valid_tile; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + return {M_idx, H_idx, B_idx}; + } + + }; + + CUTLASS_DEVICE + SingleTileScheduler(int* tile_count_smem_) { } + + CUTLASS_DEVICE + WorkTileInfo + get_initial_work() const { + return {int(blockIdx.x), int(blockIdx.y), int(blockIdx.z), true}; + } + + CUTLASS_DEVICE + void + init_consumer() const {} + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {} + + CUTLASS_DEVICE + void + broadcast_next_work(WorkTileInfo& current_work) const {} + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + return {-1, -1, -1, false}; + } + +}; + +/////////////////////////////////////////////////////////////////////////////// + +class StaticPersistentTileScheduler { + +public: + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore = nullptr; + }; + + // Device side kernel params + struct Params { + int total_blocks; + cutlass::FastDivmod m_block_divmod, head_divmod; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + return {args.num_blocks_m * args.num_head * args.num_batch, + cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head)}; + } + + static dim3 + get_grid_dim(Arguments const& args, int num_sm) { + return {uint32_t(num_sm)}; + } + + struct WorkTileInfo { + int tile_idx; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return tile_idx < params.total_blocks; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + int m_block, bidh, bidb; + bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(m_block, tile_idx)); + return {m_block, bidh, bidb}; + } + + }; + + CUTLASS_DEVICE + StaticPersistentTileScheduler(int* tile_count_smem_) {}; + + CUTLASS_DEVICE + WorkTileInfo + get_initial_work() const { + return {int(blockIdx.x)}; + } + + CUTLASS_DEVICE + void + init_consumer() const {} + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {} + + CUTLASS_DEVICE + void + broadcast_next_work(WorkTileInfo& current_work) const {} + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + return {current_work.tile_idx + int(gridDim.x)}; + } + +}; + +template +class DynamicPersistentTileScheduler { + +protected: + int* const 
tile_count_smem; + +public: + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore; + }; + + // Device side kernel params + struct Params { + int const total_blocks; + cutlass::FastDivmod const m_block_divmod, head_divmod; + int* const tile_count_semaphore; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + return {args.num_blocks_m * args.num_head * args.num_batch, + cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head), + args.tile_count_semaphore}; + } + + static dim3 + get_grid_dim(Arguments const& args, int num_sm) { + return {uint32_t(num_sm)}; + } + + struct WorkTileInfo { + int tile_idx; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return tile_idx < params.total_blocks; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + int m_block, bidh, bidb; + bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(m_block, tile_idx)); + return {m_block, bidh, bidb}; + } + + }; + + CUTLASS_DEVICE + DynamicPersistentTileScheduler(int* tile_count_smem_) : tile_count_smem(tile_count_smem_) {}; + + CUTLASS_DEVICE + WorkTileInfo + get_initial_work() const { + return {int(blockIdx.x)}; + } + + CUTLASS_DEVICE + void + init_consumer() const { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemEmpty) /*id*/); + } + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const { + if (threadIdx.x % NumProducerThreads == 0) { + current_work.tile_idx = atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x); + } + } + + CUTLASS_DEVICE + void + broadcast_next_work(WorkTileInfo& current_work) const { + cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemEmpty) /*id*/); + if (threadIdx.x % NumProducerThreads == 0) { + *tile_count_smem = current_work.tile_idx; + } + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemFull) /*id*/); + } + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + if constexpr (IsProducer && NumProducerThreads == cutlass::NumThreadsPerWarp) { + // thread 0 already has the right tile_idx, just need to broadcast to the rest of the producer threads (warp 0) + return {__shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/)}; + } else if constexpr (IsProducer && NumProducerThreads == cutlass::NumThreadsPerWarpGroup) { + // TODO: investigate optimal synchronize + int tile_idx = *tile_count_smem; + return {tile_idx}; + } else { + cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemFull) /*id*/); + int tile_idx = *tile_count_smem; + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemEmpty) /*id*/); + return {tile_idx}; + } + } + +}; + +} // flash \ No newline at end of file diff --git a/tile_scheduler_bwd.hpp b/tile_scheduler_bwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3c3c5813a707ab3becd5e742c1eab31edba9f0dc --- /dev/null +++ b/tile_scheduler_bwd.hpp @@ -0,0 +1,92 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, 
Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cutlass/fast_math.h" +#include "cutlass/arch/barrier.h" + +#include "named_barrier.hpp" + +namespace flash { + +/////////////////////////////////////////////////////////////////////////////// + +class SingleTileSchedulerBwd { + +public: + + using SharedStorage = int; + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore = nullptr; + int* const cu_seqlens = nullptr; + }; + + // Device side kernel params + struct Params { + int const num_blocks_m, num_head, num_batch; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + return {args.num_blocks_m, args.num_head, args.num_batch}; + } + + static dim3 + get_grid_shape(Params const& params, int num_sm) { + return {uint32_t(params.num_blocks_m), uint32_t(params.num_head), uint32_t(params.num_batch)}; + } + + struct WorkTileInfo { + int M_idx = 0; + int H_idx = 0; + int B_idx = 0; + bool is_valid_tile = false; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return is_valid_tile; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + return {M_idx, H_idx, B_idx}; + } + + }; + + CUTLASS_DEVICE + SingleTileSchedulerBwd(SharedStorage* const smem_scheduler) { } + + template + CUTLASS_DEVICE + WorkTileInfo + get_initial_work(Params const& params) const { + return {int(blockIdx.x), int(blockIdx.y), int(blockIdx.z), true}; + } + + CUTLASS_DEVICE + void + init_consumer() const {} + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {} + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + return {-1, -1, -1, false}; + } + +}; + + +} // flash diff --git a/type_shim.h b/type_shim.h new file mode 100644 index 0000000000000000000000000000000000000000..815ec7ec88967f3b258cf666d43b5fe995f0f2b5 --- /dev/null +++ b/type_shim.h @@ -0,0 +1,20 @@ +#include + +#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ +switch(TYPE) \ +{ \ +case at::ScalarType::Half: \ + { \ +using scalar_t = at::Half; \ +__VA_ARGS__; \ +break; \ + } \ +case at::ScalarType::BFloat16: \ + { \ +using scalar_t = at::BFloat16; \ +__VA_ARGS__; \ +break; \ + } \ +default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ +} diff --git a/usage.md b/usage.md new file mode 100644 index 0000000000000000000000000000000000000000..133bfbdb6b2a8860682fdf82baa0792b07733e8c --- /dev/null +++ b/usage.md @@ -0,0 +1,127 @@ +# FlashAttention adoption + +We've been very happy to see FlashAttention being adopted by many organizations +and research labs to speed up their training / inference (within 6 months after +FlashAttention's release, at the time of writing). +This page contains a partial list of places where FlashAttention is being used. +If you'd like to add links to your organization / product / codebase, please open a +PR or email us. We'd very much like to hear from you! + +## Integrated into machine learning frameworks + +- Pytorch: [integrated](https://github.com/pytorch/pytorch/pull/81434) into core Pytorch in nn.Transformer. + +- Huggingface's [transformers](https://github.com/huggingface/transformers) library. + [On-going](https://github.com/huggingface/transformers/pull/18439), blogpost + coming soon. 
+ +- Microsoft's [DeepSpeed](https://github.com/microsoft/DeepSpeed): + FlashAttention is [integrated](https://github.com/microsoft/DeepSpeed/blob/ec13da6ba7cabc44bb4745a64a208b8580792954/deepspeed/ops/transformer/inference/triton_ops.py) into DeepSpeed's inference engine. + +- Nvidia's [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/pull/267). This + library is a popular framework for training large transformer language models at scale. + +- MosaicML [Composer](https://github.com/mosaicml/composer) + [library](https://www.mosaicml.com/blog/gpt-3-quality-for-500k). Composer is a + library for efficient neural network training. + +- EleutherAI's [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/pull/725). This is a research library for training large transformer language models at scale based on NVIDIA's Megatron-LM and Microsoft's DeepSpeed. + +- PaddlePaddle: integrated into the framework with [API](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/nn/functional/flash_attention.py) `paddle.nn.functional.flash_attention`. + +## MLPerf benchmarks + +[MLPerf](https://mlcommons.org/en/) is a competitive machine learning performance benchmark. FlashAttention +yields the fastest BERT training on cloud instances in MLPerf training 2.0 (June +2022) and MLPerf training 2.1 (November 2022). + +- MLPerf 2.0: [IEEE Spectrum](https://spectrum.ieee.org/mlperf-rankings-2022) and [Forbes](https://www.forbes.com/sites/moorinsights/2022/07/12/google-dethrones-nvidia-in-latest-artificial-intelligence-benchmarking-tests/) articles about our submission to the MLPerf 2.0 benchmark using FlashAttention. + +- MLPerf 2.1 - + collaboration + between [Azure and Hazy Research](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/azure-collaborates-with-hazy-research-and-nvidia-to-achieve/ba-p/3667511): for the first time, we can train MLPerf BERT + in under 2 minutes on 16 nodes. + +- MLPerf 2.1 - + [Nvidia](https://developer.nvidia.com/blog/leading-mlperf-training-2-1-with-full-stack-optimizations-for-ai/): + Nvidia uses techniques from FlashAttention to make their (already extremely optimized) BERT + implementation go even faster. + +- MLPerf 2.1 - [MosaicML](https://www.mosaicml.com/blog/mlperf-nlp-nov2022): FlashAttention + helps train BERT 2.7x faster in the open division. + +## Language model training & inference + +- [PubMedGPT 2.7B](https://crfm.stanford.edu/2022/12/15/pubmedgpt.html), a + domain-specific LLM for biomedicine, by Stanford CRFM, trained on + [MosaicML](https://www.mosaicml.com/blog/introducing-pubmed-gpt) Cloud. Just + using FlashAttention nearly halves the total training time. + +- Meta's + [AITemplate](https://ai.facebook.com/blog/gpu-inference-engine-nvidia-amd-open-source/) + uses FlashAttention as part of their approach to speed up Transformer + inference (up to 5.3x on BERT). + +- Nvidia's [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) is a + state-of-the-art Transformer inference library. As of version + [5.2](https://github.com/NVIDIA/FasterTransformer/commit/b672f49e256ba7a2d4fc9691d270b60b7fc1a2ff), + FlashAttention is used as a component of FasterTransformer to speed up GPT inference. + +- [Kernl](https://github.com/ELS-RD/kernl) is a library for fast Transformer + inference. They use FlashAttention as part of their + [approach](https://twitter.com/pommedeterre33/status/1585284221014245377) to + speed up Transformers by up to 12x. 
+ +## Diffusion model training and inference + +- Huggingface's [diffusers](https://github.com/huggingface/diffusers) library + for diffusion models. FlashAttention is integrated into [diffusers + v0.7.0](https://github.com/huggingface/diffusers/releases/tag/v0.7.0). + Up to 2x faster inference and lower memory usage. + +- Colossal-AI's + [implementation](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion) + of Stable Diffusion: with FlashAttention as one of its components, it speeds up + pretraining by up to 6.5x, and reduces the hardware cost of fine-tuning by 7x. + +- Meta's + [AITemplate](https://ai.facebook.com/blog/gpu-inference-engine-nvidia-amd-open-source/) + with FlashAttention one of the components, is currently the [fastest](https://twitter.com/bing_xu_/status/1590447334055632897) Stable + Diffusion inference engine that we know of. + +- Stable Diffusion inference from + [Labml.ai](https://twitter.com/labmlai/status/1573634095732490240): 50% speedup. + +- Our own Stable Diffusion [fork](https://twitter.com/realDanFu/status/1580641495991754752) uses FlashAttention to get 3-4x speedup compared + to the original version. + +## Other models + +- [Uni-Fold](https://github.com/dptech-corp/Uni-Fold): Uni-Fold is an + open-source platform for developing protein models beyond AlphaFold. With + FlashAttention, Uni-Fold is 2.6x + [faster](https://twitter.com/guolin_ke/status/1580532071901995008) than AlphaFold. + +- [OpenFold](https://github.com/aqlaboratory/openfold): a trainable, + memory-efficient, and GPU-friendly PyTorch reproduction of AlphaFold 2. With + FlashAttention as one of its + [components](https://twitter.com/gahdritz/status/1595420944880779266), it is + up to 3x faster than AlphaFold2 to run inference on short sequences, and can + predict 2x longer structures. + +## Different implementations + +- [Triton](https://github.com/openai/triton): an [implementation](https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py) of + FlashAttention in Triton by Phil Tillet from OpenAI. Triton is a Python-based + language and compiler for parallel programming. + +- [xformers](https://github.com/facebookresearch/xformers): The xformers team + has implemented [memory-efficient + attention](https://twitter.com/fvsmassa/status/1580229170629849089) in a + similar spirit to FlashAttention. + xformers dynamically dispatches to whichever implementation is available / faster. + +- [Jax](https://github.com/google/jax): an [implementation](https://github.com/lucidrains/flash-attention-jax) + in Jax by [lucidrains](https://github.com/lucidrains/). + +- [Metal](https://developer.apple.com/metal): an [implementation](https://github.com/philipturner/metal-flash-attention) in Metal by Philip Turner. This ports FlashAttention to mobile GPU architectures such as Apple silicon. diff --git a/utils.h b/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..aaf0712ad9bf54141607cb6135fd259aaf045a4a --- /dev/null +++ b/utils.h @@ -0,0 +1,353 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#include +#endif + +#include +#include // For cute::elect_one_sync() + +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \ + exit(1); \ + } \ + } while(0) + +#define CHECK_CUDA_KERNEL_LAUNCH() CHECK_CUDA(cudaGetLastError()) + + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MaxOp { +__device__ __forceinline__ T operator()(T const & x, T const & y) { return x > y ? x : y; } +}; + +template <> +struct MaxOp { +// This is slightly faster +__device__ __forceinline__ float operator()(float const &x, float const &y) { return max(x, y); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SumOp { +__device__ __forceinline__ T operator()(T const & x, T const & y) { return x + y; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Allreduce { + static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4); + template + static __device__ __forceinline__ T run(T x, Operator &op) { + constexpr int OFFSET = THREADS / 2; + x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); + return Allreduce::run(x, op); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Allreduce<2> { +template +static __device__ __forceinline__ T run(T x, Operator &op) { + x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); + return x; +} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N)) +// For SM90, convert acc_layout from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_rowcol(Layout acc_layout) { + if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90 + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + static_assert(decltype(rank(acc_layout))::value == 3); + auto l = acc_layout; + return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l))); + } else { // SM80 + static_assert(decltype(size<0>(acc_layout))::value == 4); + static_assert(decltype(rank(acc_layout))::value == 3); + auto l = logical_divide(acc_layout, Shape<_2>{}); // ((2, 2), MMA_M, MMA_N) + return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<2>(l))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// For SM90, convert acc_layout from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_transposed_rowcol(Layout acc_layout) { + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + 
static_assert(decltype(rank(acc_layout))::value == 3); + auto l = acc_layout; + return make_layout(make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)), make_layout(get<0, 1>(l), get<1>(l))); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2) +// if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8. +// For SM90, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((2, 2, 2), MMA_M, (N / 16, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_Aregs(Layout acc_layout) { + using X = Underscore; + if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90 + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + static_assert(decltype(rank(acc_layout))::value == 3); + static_assert(decltype(rank(get<0>(acc_layout)))::value == 3); + auto l = logical_divide(get<0>(acc_layout), Shape{}); // (2, 2, (2, N / 16))) + return make_layout(make_layout(get<0>(l), get<1>(l), get<2, 0>(l)), get<1>(acc_layout), make_layout(get<2, 1>(l), get<2>(acc_layout))); + } else { // SM80 + static_assert(decltype(size<0>(acc_layout))::value == 4); + static_assert(decltype(rank(acc_layout))::value == 3); + constexpr int mma_shape_K = get<2>(typename MMA_traits::Shape_MNK{}); + static_assert(mma_shape_K == 8 || mma_shape_K == 16); + if constexpr (mma_shape_K == 8) { + return acc_layout; + } else { + auto l = logical_divide(acc_layout, Shape{}); // (4, MMA_M, (2, MMA_N / 2))) + return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l)); + } + } +}; + +// Convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((4, 2, 2), MMA_M, (N / 32, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_Aregs_fp8(Layout acc_layout) { + using X = Underscore; + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + static_assert(decltype(rank(acc_layout))::value == 3); + static_assert(decltype(rank(get<0>(acc_layout)))::value == 3); + auto l = logical_divide(get<0>(acc_layout), Shape{}); // (2, 2, (2, N / 32))) + return make_layout(make_layout(Shape<_4, _2, _2>{}), + get<1>(acc_layout), + make_layout(get<2, 1>(l), get<2>(acc_layout))); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Byte permute for fp8 kernel +template +CUTLASS_DEVICE void permute_regs_A_to_C(Fragment &accum) { + + auto data = accum.data(); + + #pragma unroll + for (int n = 0; n < size(accum); n += 8) { + uint32_t *data_32bit = reinterpret_cast(&data[n]); + auto upper = data_32bit[0]; + auto lower = data_32bit[1]; + data_32bit[0] = __byte_perm(upper, lower, 0x5410); + data_32bit[1] = __byte_perm(upper, lower, 0x7632); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ auto convert_type(Tensor const &tensor) { + using From_type = typename Engine::value_type; + constexpr int numel = decltype(size(tensor))::value; + cutlass::NumericArrayConverter convert_op; + // HACK: this requires tensor to be "contiguous" + auto frag = convert_op(*reinterpret_cast *>(tensor.data())); + return make_tensor(make_rmem_ptr(&frag), tensor.layout()); + // Tensor out = make_tensor_like(tensor); + // cute::copy(make_tensor(make_rmem_ptr(&frag), tensor.layout()), out); + // 
return out; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void gemm(TiledMma &tiled_mma, Tensor0 const &tCrA, Tensor1 const &tCrB, Tensor2 &tCrC) { + constexpr bool Is_RS = !cute::is_base_of::value; + // Need to cast away const on tCrA since warpgroup_fence_operand doesn't take const + if constexpr (Is_RS) { warpgroup_fence_operand(const_cast(tCrA)); } + warpgroup_fence_operand(tCrC); + if constexpr (arrive) { + warpgroup_arrive(); + } + if constexpr (zero_init) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + } else { + // cute::gemm(tiled_mma, tCrA, tCrB, tCrC); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + } + if constexpr (commit) { + warpgroup_commit_batch(); + } + if constexpr (wg_wait >= 0) { warpgroup_wait(); } + warpgroup_fence_operand(tCrC); + if constexpr (Is_RS) { warpgroup_fence_operand(const_cast(tCrA)); } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor const &S, + Tensor &D, Tensor const &identity_MN, + Tensor const &predicate_K, const int max_MN=0) { + CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{}); + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K + // There's no case where !Clear_OOB_K && Clear_OOB_MN + static_assert(!(Clear_OOB_MN && !Clear_OOB_K)); + #pragma unroll + for (int m = 0; m < size<1>(S); ++m) { + if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) { + #pragma unroll + for (int k = 0; k < size<2>(S); ++k) { + if (Is_even_K || predicate_K(k)) { + cute::copy(tiled_copy, S(_, m, k), D(_, m, k)); + } else if (Clear_OOB_K) { + cute::clear(D(_, m, k)); + } + } + } else if (Clear_OOB_MN) { + cute::clear(D(_, m, _)); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void write_tma( + ElemO* O, const TMACopyO& tma_store_O, + const LayoutO& layout_O, const TileShapeO& tile_shape_O, + const SMemO& sO, int m_block, int bidh, int bidb, + const SeqLenTraits& seqlen_traits_o, int write_warp_idx) { + Tensor mO = tma_store_O.get_tma_tensor(layout_O.shape()); + Tensor gO = seqlen_traits_o.get_local_tile_tensor( + mO, tile_shape_O, bidh, bidb + )(_, _, m_block); // (M, K) + auto block_tma_O = tma_store_O.get_slice(_0{}); + Tensor tOgO = block_tma_O.partition_D(gO); // (TMA, TMA_M, TMA_K) + Tensor tOsO = block_tma_O.partition_S(sO); // (TMA, TMA_M, TMA_K) + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + if (warp_idx == write_warp_idx && lane_predicate) { + cute::copy(tma_store_O, tOsO, tOgO); + tma_store_arrive(); + } + // Note: no wait here. 
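  // (The commented-out tma_store_wait below would block at this point; instead the
  // matching wait is presumably left to the caller, which should wait before the O
  // tile in shared memory is reused, so the TMA store can overlap subsequent work.)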
+ // tma_store_wait<0>(); +} + +template +__forceinline__ __device__ void write_tiled( + ElemO* O, const TiledCopyO& tiled_copy_O, + const LayoutO& layout_O, const TileShapeO& tile_shape_O, + const SMemO& sO, int m_block, int bidh, int bidb, + const SeqLenTraits& seqlen_traits_o) { + Tensor mO = make_tensor(make_gmem_ptr(O), layout_O); + Tensor gO = seqlen_traits_o.get_local_tile_tensor( + mO, tile_shape_O, bidh, bidb + )(_, _, m_block); // (M, K) + + ThrCopy thr_copy_O = tiled_copy_O.get_slice(threadIdx.x - NumCopyThreads); + Tensor tOgO = thr_copy_O.partition_D(gO); // (CPY,CPY_M,CPY_K,k) + Tensor tOsO = thr_copy_O.partition_S(sO); // (CPY,CPY_M,CPY_K) + + // Prepare for TiledCopy. + // Grouping is needed because cute::copy_if() does group_modes<1, R> for src and dst. + // After grouping, the first dim is number of elements to read together. + Tensor tOsOFlatten = cute::flatten(tOsO); + Tensor tOsOGroup = cute::group_modes<1, rank(tOsOFlatten)>(tOsOFlatten); + Tensor tOgOFlatten = cute::flatten(tOgO); + Tensor tOgOGroup = cute::group_modes<1, rank(tOgOFlatten)>(tOgOFlatten); + + // Get thread coords to global index mapping. + Tensor gOCounting = cute::make_identity_tensor(gO.shape()); + Tensor tSgOCounting = thr_copy_O.partition_D(gOCounting); + Tensor tSgOCountingFlatten = cute::flatten(tSgOCounting); + Tensor tSgOCountingGrouped = + cute::group_modes<1, rank(tSgOCountingFlatten)>(tSgOCountingFlatten); + + // Write out to GMEM. + const int kNumMsPerTile = get<0>(tile_shape_O); + int cta_m = std::min( + seqlen_traits_o.actual_seq_len - m_block * kNumMsPerTile, kNumMsPerTile + ); + if (cta_m == kNumMsPerTile) { + copy(tiled_copy_O, tOsOGroup, tOgOGroup); + } else { + auto predicate_fn = [&](auto coords) { + auto s_coords = tSgOCountingGrouped(_0{}, coords); + return elem_less(get<0>(s_coords), cta_m); + }; + copy_if(tiled_copy_O, predicate_fn, tOsOGroup, tOgOGroup); + } +} + +template +__forceinline__ __device__ void write_O( + ElemO* O, const TMACopyO& tma_copy_O, const TiledCopyO& tiled_copy_O, + const LayoutO& layout_O, const TileShapeO& tile_shape_O, + const SMemO& sO, int m_block, int bidh, int bidb, + const SeqLenTraits& seqlen_traits_o, int write_warp_idx) { + if constexpr (IsTMACopy) { + write_tma(O, tma_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o, write_warp_idx); + } else { + write_tiled(O, tiled_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/vit.py b/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..4602fd7414d251e40f9d42250c23cc974d596661 --- /dev/null +++ b/vit.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022, Tri Dao. 
+# Inspired by / adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +import math +import re +from collections import OrderedDict +from copy import deepcopy +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from timm.models.helpers import named_apply +from torch.nn.init import trunc_normal_ +from torchvision.ops import StochasticDepth + +from flash_attn.layers.patch_embed import PatchEmbed +from flash_attn.modules.block import Block +from flash_attn.modules.mha import MHA +from flash_attn.modules.mlp import FusedMLP, Mlp + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn +except ImportError: + layer_norm_fn = None + + +def create_mixer_cls( + num_heads, qkv_bias, attn_drop, use_flash_attn, fused_bias_fc, cross_attn=False +): + mixer_cls = partial( + MHA, + num_heads=num_heads, + cross_attn=cross_attn, + qkv_proj_bias=qkv_bias, + dropout=attn_drop, + fused_bias_fc=fused_bias_fc, + use_flash_attn=use_flash_attn, + ) + return mixer_cls + + +def create_mlp_cls(embed_dim, mlp_ratio, act_layer, fused_mlp): + inner_dim = int(embed_dim * mlp_ratio) + if not fused_mlp: + mlp_cls = partial(Mlp, hidden_features=inner_dim, activation=act_layer()) + else: + mlp_cls = partial(FusedMLP, hidden_features=inner_dim) + return mlp_cls + + +def create_block( + embed_dim, + num_heads, + mlp_ratio, + qkv_bias, + drop_rate, + attn_drop_rate, + drop_path1, + drop_path2, + norm_layer, + act_layer, + use_flash_attn, + fused_bias_fc, + fused_mlp, + fused_dropout_add_ln, + layer_idx=None, + n_layer=None, + last_layer_subset=False, +): + mixer_cls = create_mixer_cls( + num_heads, + qkv_bias, + attn_drop_rate, + use_flash_attn, + fused_bias_fc, + cross_attn=(last_layer_subset and layer_idx == n_layer - 1), + ) + mlp_cls = create_mlp_cls(embed_dim, mlp_ratio, act_layer, fused_mlp) + # TD [2022-10-15]: Force residual in fp32 in case of DeepSpeed + block = Block( + embed_dim, + mixer_cls, + mlp_cls, + norm_cls=norm_layer, + prenorm=True, + resid_dropout1=drop_rate, + resid_dropout2=drop_rate, + drop_path1=drop_path1, + drop_path2=drop_path2, + fused_dropout_add_ln=fused_dropout_add_ln, + residual_in_fp32=True, + ) + return block + + +class VisionTransformer(nn.Module): + """Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool="token", + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + init_values=None, + class_token=True, + no_embed_class=False, + pre_norm=False, + fc_norm=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + weight_init="", + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + use_flash_attn=False, + fused_bias_fc=False, + fused_mlp=False, + fused_dropout_add_ln=False, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + global_pool (str): type of global pooling for final sequence (default: 'token') + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + 
init_values (float): layer-scale init values
+            class_token (bool): use class token
+            fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None)
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            weight_init (str): weight init scheme
+            embed_layer (nn.Module): patch embedding layer
+            norm_layer (nn.Module): normalization layer
+            act_layer (nn.Module): MLP activation layer
+        """
+        super().__init__()
+        assert global_pool == "token", "Only support pooling with CLS token"
+        assert class_token
+        assert init_values is None, "LayerScale is not supported yet"
+        assert weight_init == ""
+        assert fc_norm is None
+        # pre_norm seems redundant, as there's a LayerNorm right at the start of each block
+        assert not pre_norm
+        use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = (
+            self.embed_dim
+        ) = embed_dim  # num_features for consistency with other models
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.no_embed_class = no_embed_class
+
+        patch_embed_extra_kwargs = (
+            {"fused_bias_fc": fused_bias_fc} if embed_layer is PatchEmbed else {}
+        )
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+            **patch_embed_extra_kwargs,
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
+        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
+
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, depth)
+        ]  # stochastic depth decay rule
+
+        # We change the order of dropout, residual and layer norm:
+        # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
+        # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
+        # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
+        # nn.Dropout probabilities is changed.
+        # This is for performance reasons: we can fuse dropout + add + layer_norm.
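+        # Concretely (a sketch of the remapping, not extra functionality): a standard
+        # pre-norm block i computes
+        #     x = x + drop_path_i(dropout(mixer(LN1(x))))
+        #     x = x + drop_path_i(dropout(mlp(LN2(x))))
+        # whereas Block below applies dropout/add/LN at the *start* of the layer, so the
+        # drop-path rate that used to close block i-1 is handed to block i as drop_path1
+        # (hence dpr[i - 1] below), while block i's own rate goes to drop_path2 (dpr[i]).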
+ self.blocks = nn.ModuleList( + [ + create_block( + embed_dim, + num_heads, + mlp_ratio, + qkv_bias, + drop_rate, + attn_drop_rate, + drop_path1=dpr[i - 1] if i > 0 else 0.0, + drop_path2=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + use_flash_attn=use_flash_attn, + fused_bias_fc=fused_bias_fc, + fused_mlp=fused_mlp, + fused_dropout_add_ln=fused_dropout_add_ln, + layer_idx=i, + n_layer=depth, + last_layer_subset=(global_pool == "token"), + ) + for i in range(depth) + ] + ) + + self.dropout = nn.Dropout(p=drop_rate) + self.drop_path = StochasticDepth(p=dpr[-1], mode="row") + self.norm = norm_layer(embed_dim) + + self.fused_dropout_add_ln = fused_dropout_add_ln + if self.fused_dropout_add_ln and layer_norm_fn is None: + raise ImportError("Triton is not installed") + + # Classifier Head + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + self.init_weights(weight_init) + + def init_weights(self, mode=""): + assert mode == "" + trunc_normal_(self.pos_embed, std=0.02) + if self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def _init_weights(self, m): + # this fn left here for compat with downstream users + init_weights_vit_timm(m) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed", "cls_token"} + + def _pos_embed(self, x): + if self.no_embed_class: + # deit-3, updated JAX (big vision) + # position embedding does not overlap with class token, add then concat + x = x + self.pos_embed + if self.cls_token is not None: + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + else: + # original timm, JAX, and deit vit impl + # pos_embed has entry for class token, concat then add + if self.cls_token is not None: + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.pos_embed + return x + + def forward_features(self, x, all_tokens=True): + """ + If all_tokens==False and self.global_pool == 'token', we only return the features for the + cls token. + """ + x = self.patch_embed(x) + hidden_states = self._pos_embed(x) + residual = None + if self.global_pool != "token" or all_tokens: + # if True: + for block in self.blocks: + hidden_states, residual = block(hidden_states, residual) + else: + for block in self.blocks[:-1]: + hidden_states, residual = block(hidden_states, residual) + # For the last layer, we only want the 1st token of the output. So we do cross-attention + # where the query is the 1st token and the key/value is the whole sequence. 
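+            # (Note: with mixer_subset=slice(0, 1) the last block attends from the CLS
+            # token only, so its attention cost is linear rather than quadratic in the
+            # number of patches; the classifier head only consumes that token anyway.)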
+            hidden_states, residual = self.blocks[-1](
+                hidden_states, residual, mixer_subset=slice(0, 1)
+            )
+        if not self.fused_dropout_add_ln:
+            residual = self.drop_path(self.dropout(hidden_states)) + residual
+            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
+        else:
+            if self.drop_path.p == 0 or not self.training:
+                rowscale = None
+            else:
+                rowscale = self.drop_path(
+                    torch.ones(
+                        hidden_states.shape[:-1],
+                        device=hidden_states.device,
+                        dtype=hidden_states.dtype,
+                    )
+                )
+            # Set prenorm=False here since we don't need the residual
+            hidden_states = layer_norm_fn(
+                hidden_states,
+                self.norm.weight,
+                self.norm.bias,
+                residual=residual,
+                eps=self.norm.eps,
+                dropout_p=self.dropout.p if self.training else 0.0,
+                rowscale=rowscale,
+                prenorm=False,
+            )
+        return hidden_states
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool:
+            x = x[:, self.num_prefix_tokens :].mean(dim=1) if self.global_pool == "avg" else x[:, 0]
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x, all_tokens=False)
+        x = self.forward_head(x)
+        return x
+
+    def load_state_dict(self, state_dict, strict=True):
+        patch_embed_weight = state_dict["patch_embed.proj.weight"]
+        if patch_embed_weight.dim() == 4:
+            # convert from Conv2d to Linear
+            state_dict["patch_embed.proj.weight"] = rearrange(
+                patch_embed_weight, "o c h w -> o (c h w)"
+            )
+
+        def key_mapping_attn(key):
+            key = re.sub(r"^blocks.(\d+).attn.qkv.", r"blocks.\1.mixer.Wqkv.", key)
+            key = re.sub(r"^blocks.(\d+).attn.proj.", r"blocks.\1.mixer.out_proj.", key)
+            return key
+
+        state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
+        n_layer = len(self.blocks)
+        # Convert from Wqkv to Wq and Wkv for cross attention (last layer)
+        if (
+            self.blocks[-1].mixer.cross_attn
+            and f"blocks.{n_layer - 1}.mixer.Wqkv.weight" in state_dict
+        ):
+            Wqkv = state_dict.pop(f"blocks.{n_layer - 1}.mixer.Wqkv.weight")
+            bqkv = state_dict.pop(f"blocks.{n_layer - 1}.mixer.Wqkv.bias")
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wq.weight"] = Wqkv[: self.embed_dim]
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wkv.weight"] = Wqkv[self.embed_dim :]
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wq.bias"] = bqkv[: self.embed_dim]
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wkv.bias"] = bqkv[self.embed_dim :]
+        return super().load_state_dict(state_dict, strict=strict)
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, "init_weights"):
+        module.init_weights()
+
+
+def vit_base_patch16_224(pretrained=False, **kwargs):
+    """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+ """ + assert not pretrained + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = VisionTransformer(**model_kwargs) + return model diff --git a/wandb.yaml b/wandb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6ae21d3aec18aa281a6d1ef5ea41199a4c04295 --- /dev/null +++ b/wandb.yaml @@ -0,0 +1,26 @@ +defaults: + - default.yaml + +watch_model: + _target_: src.callbacks.wandb_callbacks.WatchModel + log: "all" + log_freq: 100 + +upload_code_as_artifact: + _target_: src.callbacks.wandb_callbacks.UploadCodeAsArtifact + code_dir: ${work_dir}/src + +upload_ckpts_as_artifact: + _target_: src.callbacks.wandb_callbacks.UploadCheckpointsAsArtifact + ckpt_dir: "checkpoints/" + upload_best_only: True + +log_f1_precision_recall_heatmap: + _target_: src.callbacks.wandb_callbacks.LogF1PrecRecHeatmap + +log_confusion_matrix: + _target_: src.callbacks.wandb_callbacks.LogConfusionMatrix + +log_image_predictions: + _target_: src.callbacks.wandb_callbacks.LogImagePredictions + num_samples: 8 diff --git a/xentropy_kernel.cu b/xentropy_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8d8836e6e45cf13bc6cc0e7ef5917cb7b78212d1 --- /dev/null +++ b/xentropy_kernel.cu @@ -0,0 +1,760 @@ +// Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/csrc/xentropy/xentropy_kernel.cu +// TD [2022-09-17]: We make it work for bfloat16, and add an option to do the backward inplace (to save memory). +/** + * From PyTorch: + * + * Copyright (c) 2016- Facebook, Inc (Adam Paszke) + * Copyright (c) 2014- Facebook, Inc (Soumith Chintala) + * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) + * Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) + * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) + * Copyright (c) 2011-2013 NYU (Clement Farabet) + * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) + * Copyright (c) 2006 Idiap Research Institute (Samy Bengio) + * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + * + * From Caffe2: + * + * Copyright (c) 2016-present, Facebook Inc. All rights reserved. + * + * All contributions by Facebook: + * Copyright (c) 2016 Facebook Inc. + * + * All contributions by Google: + * Copyright (c) 2015 Google Inc. + * All rights reserved. + * + * All contributions by Yangqing Jia: + * Copyright (c) 2015 Yangqing Jia + * All rights reserved. + * + * All contributions from Caffe: + * Copyright(c) 2013, 2014, 2015, the respective contributors + * All rights reserved. + * + * All other contributions: + * Copyright(c) 2015, 2016 the respective contributors + * All rights reserved. + * + * Caffe2 uses a copyright model similar to Caffe: each contributor holds + * copyright over their contributions to Caffe2. The project versioning records + * all such contribution and copyright details. If a contributor wants to further + * mark their specific copyright on a particular contribution, they should + * indicate their copyright solely in the commit message of the change when it is + * committed. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + * and IDIAP Research Institute nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +#include +#include + +// https://github.com/NVIDIA/apex/blob/master/csrc/type_shim.h +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, LEVEL, NAME, ...) \ + switch(TYPE) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_##LEVEL = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_##LEVEL = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } +// #else +// #define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, LEVEL, NAME, ...) 
\ +// switch(TYPE) \ +// { \ +// case at::ScalarType::Float: \ +// { \ +// using scalar_t_##LEVEL = float; \ +// __VA_ARGS__; \ +// break; \ +// } \ +// case at::ScalarType::Half: \ +// { \ +// using scalar_t_##LEVEL = at::Half; \ +// __VA_ARGS__; \ +// break; \ +// } \ +// default: \ +// AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ +// } +// #endif + +#define ALIGN_BYTES 16 + +using Tensor = at::Tensor; +using TensorList = at::TensorList; +using ScalarType = at::ScalarType; +using at::acc_type; + +template +struct LogSoftMaxForwardEpilogue { + __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : logsum(max_input + std::log(sum)) {} + + __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_log_sum_exp) + : logsum(max_log_sum_exp) {} + + __device__ __forceinline__ OutT operator()(T input) const { + return static_cast(input - logsum); + } + + const AccumT logsum; +}; + +template +struct LogSoftMaxBackwardEpilogue { + __device__ __forceinline__ LogSoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + __device__ __forceinline__ T operator()(OutT gradOutput, OutT output) const { + return static_cast(gradOutput - std::exp(static_cast(output)) * sum); + } + + const AccumT sum; +}; + + + +const int max_threads = 1024; + +inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = std::min(dim_size / ILP, static_cast(max_threads)); + while (block_size < (max_block_size/2)) block_size *= 2; + // Launch at least a single warp - the kernel assumes that. + block_size = std::max(block_size, static_cast(32)); + return dim3(block_size); +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? 
b : a; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +// Regular kernel (fast when dim_size is large; requires inner_size == 1) +//////////////////////////////////////////////////////////////////////////////// + + +template +struct MaxFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { + return ::max(max, (AccumT)v); + } +}; + +template +struct AddFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + v; + } +}; + +template +struct SumExpFloat +{ + __device__ __forceinline__ SumExpFloat(AccumT v) + : max_k(v) {} + + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + std::exp(v - max_k); + } + + const AccumT max_k; +}; + +template class Reduction, typename AccumT> +__device__ __forceinline__ AccumT +blockReduce(AccumT* smem, AccumT val, + const Reduction& r, + AccumT defaultVal) +{ + // To avoid RaW races from chaining blockReduce calls together, we need a sync here + __syncthreads(); + + smem[threadIdx.x] = val; + + __syncthreads(); + + AccumT warpVal = defaultVal; + + // First warp will perform per-warp reductions for the remaining warps + uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1; + if (threadIdx.x < 32) { + int lane = threadIdx.x % 32; + if (lane < blockDim.x / 32) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + warpVal = r(warpVal, smem[lane * 32 + i]); + } + __syncwarp(mask); + smem[lane] = warpVal; + } + } + + __syncthreads(); + + // First thread will perform a reduction of the above per-warp reductions + AccumT blockVal = defaultVal; + + if (threadIdx.x == 0) { + for (int i = 0; i < blockDim.x / 32; ++i) { + blockVal = r(blockVal, smem[i]); + } + smem[0] = blockVal; + } + + // Sync and broadcast + __syncthreads(); + return smem[0]; +} + +template class Reduction1, template class Reduction2, typename AccumT> +__device__ __forceinline__ void +blockReduce(AccumT* smem, + AccumT* reducVal1, + AccumT val1, + const Reduction1& r1, + AccumT defaultVal1, + AccumT* reducVal2, + AccumT val2, + const Reduction2& r2, + AccumT defaultVal2) +{ + // To avoid RaW races from chaining blockReduce calls together, we need a sync here + __syncthreads(); + + smem[threadIdx.x] = val1; + smem[blockDim.x + threadIdx.x] = val2; + + __syncthreads(); + + AccumT warpVal1 = defaultVal1; + AccumT warpVal2 = defaultVal2; + + // First warp will perform per-warp reductions for the remaining warps + uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1; + if (threadIdx.x < 32) { + int lane = threadIdx.x % 32; + if (lane < blockDim.x / 32) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + warpVal1 = r1(warpVal1, smem[lane * 32 + i]); + warpVal2 = r2(warpVal2, smem[lane * 32 + i + blockDim.x]); + } + __syncwarp(mask); + smem[lane] = warpVal1; + smem[lane + blockDim.x] = warpVal2; + } + } + + __syncthreads(); + + // First thread will perform a reduction of the above per-warp reductions + AccumT blockVal1 = defaultVal1; + AccumT blockVal2 = defaultVal2; + + if (threadIdx.x == 0) { + for (int i = 0; i < blockDim.x / 32; ++i) { + blockVal1 = r1(blockVal1, smem[i]); + blockVal2 = r2(blockVal2, smem[i + blockDim.x]); + } + smem[0] = blockVal1; + smem[blockDim.x] = blockVal2; + } + + // Sync and broadcast + __syncthreads(); + *reducVal1 = smem[0]; + *reducVal2 = smem[blockDim.x]; + __syncthreads(); +} + +template class Reduction, int ILP, typename T, typename AccumT> +__device__ __forceinline__ AccumT +ilpReduce(int shift, + T* data, + int size, + 
const Reduction& r, + AccumT defaultVal) +{ + typedef typename std::aligned_storage::type LoadT; + AccumT threadVal = defaultVal; + int offset = threadIdx.x; + + // shift and do 1 + if(shift > 0){ + data -= shift; + size += shift; + if(threadIdx.x >= shift){ + threadVal = r(threadVal, data[offset]); + } + size -= blockDim.x; + data += blockDim.x; + } + int last = size % (ILP * blockDim.x); + + T v[ILP]; + LoadT* value = reinterpret_cast(&v); + + for (; offset * ILP < (size - last); offset += blockDim.x) { + *value = reinterpret_cast(data)[offset]; + + for (int j = 0; j < ILP; ++j) { + threadVal = r(threadVal, v[j]); + } + } + + offset = size - last + threadIdx.x; + // Epilogue + for (; offset < size; offset += blockDim.x) + threadVal = r(threadVal, data[offset]); + + return threadVal; +} + +template class Reduction1, template class Reduction2, int ILP, typename T, typename AccumT> +__device__ __forceinline__ void +ilpReduce(int shift, + T* data, + int size, + AccumT* reducVal1, + const Reduction1& r1, + AccumT defaultVal1, + AccumT* reducVal2, + const Reduction2& r2, + AccumT defaultVal2) +{ + typedef typename std::aligned_storage::type LoadT; + + AccumT threadVal1 = defaultVal1; + AccumT threadVal2 = defaultVal2; + int offset = threadIdx.x; + + // shift and do 1 + if(shift > 0){ + data -= shift; + size += shift; + if(threadIdx.x >= shift){ + threadVal1 = r1(threadVal1, data[offset]); + threadVal2 = r2(threadVal2, data[offset]); + } + size -= blockDim.x; + data += blockDim.x; + } + int last = size % (ILP * blockDim.x); + + T v[ILP]; + LoadT* value = reinterpret_cast(&v); + + for (; offset * ILP < (size - last); offset += blockDim.x) { + *value = reinterpret_cast(data)[offset]; + + for (int j = 0; j < ILP; ++j) { + threadVal1 = r1(threadVal1, v[j]); + threadVal2 = r2(threadVal2, v[j]); + } + } + + offset = size - last + threadIdx.x; + // Epilogue + for (; offset < size; offset += blockDim.x) { + threadVal1 = r1(threadVal1, data[offset]); + threadVal2 = r2(threadVal2, data[offset]); + } + + *reducVal1 = threadVal1; + *reducVal2 = threadVal2; +} + +template class Epilogue> +__global__ void +cunn_SoftMaxXEntropyForward( + accscalar_t *losses, + outscalar_t *max_log_sum_exp, + scalar_t *input, + int64_t *labels, + int64_t classes, + const float smoothing, + const int total_classes) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + // forward pointers to batch[blockIdx.x] + // each block handles a sample in the mini-batch + input += blockIdx.x * classes; + //output += blockIdx.x * classes; + const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t); + + int64_t label = labels[blockIdx.x]; + + // find the max and sum + accscalar_t threadMax, threadSum, max_k, sum_k; + ilpReduce( + shift, input, classes, + &threadMax, MaxFloat(), + -at::numeric_limits::max(), + &threadSum, AddFloat(), + static_cast(0)); + + blockReduce( + sdata, + &max_k, threadMax, Max(), + -at::numeric_limits::max(), + &sum_k, threadSum, Add(), + static_cast(0)); + + accscalar_t threadExp = ilpReduce(shift, input, classes, SumExpFloat(max_k), static_cast(0)); + accscalar_t sumAll = blockReduce( + sdata, threadExp, Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + + // calculate per element loss with label smoothing + // reserve max + log_sum_exp for bprop + if (threadIdx.x == 0) { + accscalar_t lse = max_k + std::log(sumAll); + accscalar_t log_prob = (label >= 0 && label < classes) ? 
epilogue(static_cast(input[label])) : 0.f; + losses[blockIdx.x] = (lse - sum_k / total_classes) * smoothing - log_prob * (1 - smoothing); + max_log_sum_exp[blockIdx.x] = lse; + } +} + +template +__device__ __forceinline__ void +apply(scalar_t *gradInput, + scalar_t *logits, + outscalar_t *max_log_sum_exp, + outscalar_t *gradOutput, + int64_t *labels, + const float smoothing, + int classes, + const int total_classes) +{ + accscalar_t smooth_positives = 1.0 - smoothing; + accscalar_t smooth_negatives = smoothing / total_classes; + accscalar_t tmpGradOutput = gradOutput[blockIdx.x]; + int64_t label = labels[blockIdx.x]; + accscalar_t coeff = max_log_sum_exp[blockIdx.x]; + + int offset = threadIdx.x; + int last = classes % (ILP * blockDim.x); + + for (; offset < classes - last; offset += blockDim.x * ILP) { + accscalar_t tmpLogits[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) { + tmpLogits[j] = static_cast(logits[offset + j * blockDim.x]); + } + +#pragma unroll + for (int j = 0; j < ILP; ++j) + gradInput[offset + j * blockDim.x] = tmpGradOutput * ( + std::exp(tmpLogits[j] - coeff) - static_cast( + (offset + j * blockDim.x == label) ? 1 : 0) * + smooth_positives - smooth_negatives); + } + + for (; offset < classes; offset += blockDim.x) + gradInput[offset] = tmpGradOutput * (std::exp( + static_cast(logits[offset]) - coeff) - + static_cast((offset == label) ? 1 : 0) * + smooth_positives - smooth_negatives); +} + + +template +__device__ __forceinline__ void +aligned_apply(int shift, + scalar_t *gradInput, + scalar_t *logits, + outscalar_t *max_log_sum_exp, + outscalar_t *gradOutput, + int64_t *labels, + const float smoothing, + int classes, + const int total_classes) +{ + accscalar_t smooth_positives = 1.0 - smoothing; + accscalar_t smooth_negatives = smoothing / total_classes; + accscalar_t tmpGradOutput = gradOutput[blockIdx.x]; + int64_t label = labels[blockIdx.x]; + accscalar_t coeff = max_log_sum_exp[blockIdx.x]; + + int offset = threadIdx.x; + + // shift and do 1 + if(shift > 0){ + logits -= shift; + gradInput -= shift; + classes += shift; + if(threadIdx.x >= shift){ + gradInput[offset] = tmpGradOutput * (std::exp( + static_cast(logits[offset]) - coeff) - + static_cast(((offset - shift) == label) ? 1 : 0) * + smooth_positives - smooth_negatives); + } + classes -= blockDim.x; + gradInput += blockDim.x; + logits += blockDim.x; + shift -= blockDim.x; + } + + int last = classes % (ILP * blockDim.x); + + typedef typename std::aligned_storage::type LoadT; + // input + scalar_t v[ILP]; + LoadT* value = reinterpret_cast(&v); + // output + scalar_t r[ILP]; + LoadT* result = reinterpret_cast(&r); + + for (; offset * ILP < (classes - last); offset += blockDim.x) { + *value = reinterpret_cast(logits)[offset]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) { + r[j] = tmpGradOutput * (std::exp( + static_cast(v[j]) - coeff) - + static_cast(((ILP * offset + j - shift) == label) ? 1 : 0) * + smooth_positives - smooth_negatives); + } + reinterpret_cast(gradInput)[offset] = *result; + } + + offset = classes - last + threadIdx.x; + for (; offset < classes; offset += blockDim.x) + gradInput[offset] = tmpGradOutput * (std::exp( + static_cast(logits[offset]) - coeff) - + static_cast(((offset - shift) == label) ? 
1 : 0) * + smooth_positives - smooth_negatives); + +} + +template class Epilogue> +__global__ void +cunn_SoftMaxXEntropyBackward( + scalar_t *gradInput, + scalar_t *logits, + outscalar_t *max_log_sum_exp, + outscalar_t *gradOutput, + int64_t *labels, + const float smoothing, + int classes, + const int total_classes) +{ + gradInput += blockIdx.x * classes; + logits += blockIdx.x * classes; + + // Do vectorized load/store when input/output have same alignment + const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t); + const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t); + if (shift == shift_){ + aligned_apply(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes, total_classes <= 0 ? classes : total_classes); + } + else { + apply(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes, total_classes <= 0 ? classes : total_classes); + } + +} + +template class Epilogue> +std::vector host_softmax_xentropy( + const Tensor & input_, + const Tensor & labels_, + const float smoothing, + const int total_classes) { + // For tensor parallel cross entropy with smoothing, we want to pass in the total number + // of classes so that smoothing can be applied correctly. If total_classes=-1, use the + // last dimension of the input tensor. + AT_ASSERTM(labels_.scalar_type() == ScalarType::Long,"Label type should be CUDA Long"); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)input_.get_device()}; + + auto input = input_.contiguous(); + Tensor max_log_sum_exp = at::empty_like(labels_, input.options().dtype(ScalarType::Float)); + Tensor losses = at::empty_like(labels_, input_.options().dtype(ScalarType::Float)); + + static_assert(std::is_same, float>::value || + std::is_same, double>::value, + "accscalar_t for half should be float or double"); + AT_ASSERTM(input.dim() == 2, "Currently only 2 dim input supported"); + AT_ASSERTM(labels_.dim() == 1, "Labels should be 1 dimensional"); + AT_ASSERTM(input.size(0) == labels_.size(0), "Input and label should have same number of examples"); + AT_ASSERTM(input.numel() > 0, "Number of classes in input should not be 0"); + + const int64_t dim = 1; + int64_t outer_size = 1; + int64_t dim_size = input.size(dim); + int64_t inner_size = 1; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + for (int64_t i = 0; i < dim; ++i) + outer_size *= input.size(i); + for (int64_t i = dim + 1; i < input.dim(); ++i) + inner_size *= input.size(i); + // This kernel spawns a block per each element in the batch. + // XXX: it assumes that inner_size == 1 + TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported"); + + dim3 grid(outer_size); + + using namespace at; + DISPATCH_FLOAT_AND_HALF_AND_BF16(input.scalar_type(), 0, "host_softmax_xentropy", + using accscalar_t = at::acc_type; + const int ILP = sizeof(float4)/sizeof(scalar_t_0); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxXEntropyForward + <<>>( + losses.data_ptr(), max_log_sum_exp.data_ptr(), + input.data_ptr(), labels_.data_ptr(), + dim_size, smoothing, total_classes <= 0 ? 
dim_size : total_classes + ); + ); + + C10_CUDA_CHECK(cudaGetLastError()); + + std::vector ret = {losses, max_log_sum_exp}; + return ret; +} + +template class Epilogue> +Tensor host_softmax_xentropy_backward( + const at::Tensor &grad_loss, + at::Tensor &logits_, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + bool inplace, + const int total_classes) { + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)grad_loss.get_device()}; + + const int64_t dim = 1; + Tensor gI = inplace ? logits_ : at::empty_like(logits_); + if (grad_loss.numel() == 0) { + return gI; + } + + auto grad = grad_loss.contiguous(); + auto logits = logits_.contiguous(); + + static_assert(std::is_same, float>::value || + std::is_same, double>::value, + "accscalar_t for half should be float or double"); + if (grad.dim() == 0) grad = grad.view(1); + + AT_ASSERTM(logits_.dim() == 2, "Currently only 2 dim input supported"); + AT_ASSERTM(labels.dim() == 1, "Labels should be 1 dimensional"); + AT_ASSERTM(logits_.numel() > 0, "Number of classes in input should not be 0"); + AT_ASSERTM(logits_.size(0) == labels.size(0), "Input and label should have same number of examples"); + AT_ASSERTM(labels.size(0) == grad.size(0), "Label and loss should have same number of examples"); + + int64_t outer_size = 1; + int64_t dim_size = logits.size(dim); + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= logits.size(i); + for (int64_t i = dim + 1; i < logits.dim(); ++i) + inner_size *= logits.size(i); + // See descriptions of kernels above. + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported"); + + dim3 grid(outer_size); + + DISPATCH_FLOAT_AND_HALF_AND_BF16(gI.scalar_type(), 0, "host_softmax_xentropy_backward", + using accscalar_t = acc_type; + const int ILP = sizeof(float4)/sizeof(scalar_t_0); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxXEntropyBackward + <<>>( + gI.data_ptr(), logits.data_ptr(), + max_log_sum_exp.data_ptr(), + grad.data_ptr(), labels.data_ptr(), + smoothing, dim_size, total_classes + ); + ); + + C10_CUDA_CHECK(cudaGetLastError()); + return gI; +} + +std::vector softmax_xentropy_cuda(const Tensor &input, const Tensor &labels, const float smoothing, const int total_classes){ + return host_softmax_xentropy(input, labels, smoothing, total_classes); +} + +at::Tensor softmax_xentropy_backward_cuda( + const at::Tensor &grad_loss, + at::Tensor &logits, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + const bool inplace, + const int total_classes) { + AT_ASSERTM((grad_loss.scalar_type() == ScalarType::Float), "expected grad types to be at::Float"); + return host_softmax_xentropy_backward(grad_loss, logits, max_log_sum_exp, labels, smoothing, inplace, total_classes); +}
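For reference, the per-example loss produced by cunn_SoftMaxXEntropyForward can be reproduced in a few lines of PyTorch when total_classes equals the local class count (i.e. no tensor parallelism). This is an illustrative sketch for sanity-checking the CUDA path, not part of the extension; the function name is ours.

    import torch

    def smoothed_xentropy_reference(logits, labels, smoothing=0.1):
        # Matches the kernel's formula: (1 - smoothing) * (lse - x[label]) + smoothing * (lse - mean(x)),
        # where lse = max + log(sum(exp(x - max))) is also returned for the backward pass.
        # (The kernel additionally treats out-of-range labels as log_prob = 0.)
        lse = torch.logsumexp(logits.float(), dim=-1)
        nll = lse - logits.float().gather(-1, labels[:, None]).squeeze(-1)
        smooth = lse - logits.float().mean(dim=-1)
        return (1 - smoothing) * nll + smoothing * smooth, lse

torch.nn.functional.cross_entropy(logits, labels, label_smoothing=smoothing, reduction="none") should yield the same per-example losses, which makes it a convenient cross-check against softmax_xentropy_cuda.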