diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..e3eb456a457edebf8c4564a930dadd63884ef6be 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +flashattention_logo.png filter=lfs diff=lfs merge=lfs -text diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..e35a781665eafa7421c30241962ef8e49588bffc --- /dev/null +++ b/AUTHORS @@ -0,0 +1 @@ +Tri Dao, trid@cs.stanford.edu \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..e546cfc1327df2b91a891fa22a7c7c0cabee3290 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,91 @@ +# Inspired by https://github.com/anibali/docker-pytorch/blob/master/dockerfiles/1.10.0-cuda11.3-ubuntu20.04/Dockerfile +# ARG COMPAT=0 +ARG PERSONAL=0 +# FROM nvidia/cuda:11.3.1-devel-ubuntu20.04 as base-0 +FROM nvcr.io/nvidia/pytorch:22.12-py3 as base + +ENV HOST docker +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 +# https://serverfault.com/questions/683605/docker-container-time-timezone-will-not-reflect-changes +ENV TZ America/Los_Angeles +RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone + +# git for installing dependencies +# tzdata to set time zone +# wget and unzip to download data +# [2021-09-09] TD: zsh, stow, subversion, fasd are for setting up my personal environment. +# [2021-12-07] TD: openmpi-bin for MPI (multi-node training) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + curl \ + ca-certificates \ + sudo \ + less \ + htop \ + git \ + tzdata \ + wget \ + tmux \ + zip \ + unzip \ + zsh stow subversion fasd \ + && rm -rf /var/lib/apt/lists/* + # openmpi-bin \ + +# Allow running runmpi as root +# ENV OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 + +# # Create a non-root user and switch to it +# RUN adduser --disabled-password --gecos '' --shell /bin/bash user \ +# && echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user +# USER user + +# All users can use /home/user as their home directory +ENV HOME=/home/user +RUN mkdir -p /home/user && chmod 777 /home/user +WORKDIR /home/user + +# Set up personal environment +# FROM base-${COMPAT} as env-0 +FROM base as env-0 +FROM env-0 as env-1 +# Use ONBUILD so that the dotfiles dir doesn't need to exist unless we're building a personal image +# https://stackoverflow.com/questions/31528384/conditional-copy-add-in-dockerfile +ONBUILD COPY dotfiles ./dotfiles +ONBUILD RUN cd ~/dotfiles && stow bash zsh tmux && sudo chsh -s /usr/bin/zsh $(whoami) +# nvcr pytorch image sets SHELL=/bin/bash +ONBUILD ENV SHELL=/bin/zsh + +FROM env-${PERSONAL} as packages + +# Disable pip cache: https://stackoverflow.com/questions/45594707/what-is-pips-no-cache-dir-good-for +ENV PIP_NO_CACHE_DIR=1 + +# # apex and pytorch-fast-transformers take a while to compile so we install them first +# TD [2022-04-28] apex is already installed. 
In case we need a newer commit: +# RUN pip install --upgrade --force-reinstall --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_multihead_attn" --global-option="--fmha" --global-option="--fast_layer_norm" --global-option="--xentropy" git+https://github.com/NVIDIA/apex.git#egg=apex + +# xgboost conflicts with deepspeed +RUN pip uninstall -y xgboost && DS_BUILD_UTILS=1 DS_BUILD_FUSED_LAMB=1 pip install deepspeed==0.7.7 + +# General packages that we don't care about the version +# zstandard to extract the_pile dataset +# psutil to get the number of cpu physical cores +# twine to upload package to PyPI +RUN pip install pytest matplotlib jupyter ipython ipdb gpustat scikit-learn spacy munch einops opt_einsum fvcore gsutil cmake pykeops zstandard psutil h5py twine gdown \ + && python -m spacy download en_core_web_sm +# hydra +RUN pip install hydra-core==1.3.1 hydra-colorlog==1.2.0 hydra-optuna-sweeper==1.2.0 pyrootutils rich +# Core packages +RUN pip install transformers==4.25.1 datasets==2.8.0 pytorch-lightning==1.8.6 triton==2.0.0.dev20221202 wandb==0.13.7 timm==0.6.12 torchmetrics==0.10.3 +# torchmetrics 0.11.0 broke hydra's instantiate + +# For MLPerf +RUN pip install git+https://github.com/mlcommons/logging.git@2.1.0 + +# Install FlashAttention +RUN pip install flash-attn==2.6.3 + +# Install CUDA extensions for fused dense +RUN pip install git+https://github.com/Dao-AILab/flash-attention@v2.6.3#subdirectory=csrc/fused_dense_lib diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..5860e4b33f3d9d85fc636137c559331d51783a5b --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
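The Dockerfile above pins `flash-attn==2.6.3` and builds the fused-dense CUDA extension from the same v2.6.3 tag. A quick sanity check along these lines (an illustrative sketch, assuming the image built cleanly and a GPU is visible inside the container) confirms that both installs resolved:

```python
# Run inside the built container; a sketch, not part of the Dockerfile itself.
import torch
import flash_attn

assert torch.cuda.is_available(), "expected an NVIDIA GPU to be visible in the container"
print("torch:", torch.__version__, "| flash-attn:", flash_attn.__version__)  # expect 2.6.3

# The extension built from csrc/fused_dense_lib; the module name is the one that
# flash_attn.ops.fused_dense imports, and an ImportError here means the CUDA build failed.
import fused_dense_lib  # noqa: F401
```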
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..021b4d0f7d3fbf523d3b0d4934f2a1f46781cb50 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,11 @@ +recursive-include csrc *.cu +recursive-include csrc *.h +recursive-include csrc *.cuh +recursive-include csrc *.cpp +recursive-include csrc *.hpp + +recursive-include flash_attn *.cu +recursive-include flash_attn *.h +recursive-include flash_attn *.cuh +recursive-include flash_attn *.cpp +recursive-include flash_attn *.hpp diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..79aefc7973fb6163b84dc149012f5101b2d399af --- /dev/null +++ b/Makefile @@ -0,0 +1,9 @@ + +clean_dist: + rm -rf dist/* + +create_dist: clean_dist + python setup.py sdist + +upload_package: create_dist + twine upload dist/* diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6bb429e123a17fa027648c4238fe404a42895f86 --- /dev/null +++ b/README.md @@ -0,0 +1,231 @@ +# Optimized Transformer implementation +This repo contains examples of how FlashAttention can be integrated into a model +(e.g., GPT, ViT) and trained end-to-end. We also provide optimized +implementations of other layers (e.g., MLP, LayerNorm, cross-entropy loss, +rotary embedding). Overall this speeds up training by 3-5x compared to the +baseline implementation from Huggingface, reaching up to 189 TFLOPs/sec per A100, +equivalent to 60.6\% model FLOPs utilization (we don't need any activation +checkpointing). All without changing the model architecture (i.e., no +approximation). + +Goals: +- Performance: we optimize for model speed and memory, especially on 1-node + (e.g., with 8 A100s). +- Flexibility: we provide optimized building blocks (MLP, attention, LayerNorm), + and the model code illustrates how these components can be put together. + The training code also aims to be model- & task-agnostic. + +Non-goals (and other resources): +- Support as many models as possible: Huggingface's + [transformers](https://github.com/huggingface/transformers) and + [timm](https://github.com/rwightman/pytorch-image-models/) are great for this. +- Large-scale distributed training: our codebase has been used for multi-GPU and multi-node + training for models up to 2.7B parameters. However, if you're looking for large-scale distributed + training techniques (e.g., pipeline parallelism, tensor parallelism), + check out [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/) and + [DeepSpeed](https://github.com/microsoft/deepspeed). +- Inference: we currently focus on training (this might change in the future). + If you want fast inference, take a look at + [FasterTransformer](https://github.com/NVIDIA/FasterTransformer). +- Production: this codebase was written during several research projects to validate ideas + on speeding up ML models. + +## Model Components + +The GPT model is implemented +[here](https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/models/gpt.py). 
+And here's an example to construct the GPT3-1.3B model with rotary embedding: +```python +from transformers.models.gpt2.configuration_gpt2 import GPT2Config +from flash_attn.models.gpt import GPTLMHeadModel + +seqlen = 2048 +hidden_dim = 2048 +nheads = 16 +n_layer = 24 +rotary_emb_fraction = 0.5 +config = GPT2Config(vocab_size=50257, n_positions=seqlen, n_embd=hidden_dim, + n_layer=n_layer, n_head=nheads, + scale_attn_by_inverse_layer_idx=True, + rotary_emb_fraction=rotary_emb_fraction, + use_flash_attn=True, fused_mlp=True, + fused_bias_fc=True, fused_dropout_add_ln=True, + pad_vocab_size_multiple=8) +model = GPTLMHeadModel(config) +``` + +We provide the following optimized components: + +1. FlashAttention: fast and memory-efficient exact attention. This makes +attention much faster and saves a lot of activation memory. As a result we don't need +to use any activation checkpointing. +```sh +pip install flash-attn +``` + +2. Fused matmul + bias (forward and backward), and fused matmul + bias + gelu +(forward and backward), adapted from Apex's +[FusedDense](https://github.com/NVIDIA/apex/tree/master/apex/fused_dense). We +make it work for bfloat16. For best performance, you should use CUDA >= 11.8. CuBLAS versions before +this doesn't have the best matmul + bias + gelu performance for bfloat16. +```sh +cd ../csrc/fused_dense_lib && pip install . +``` +3. Optimized cross-entropy loss, adapted from Apex's +[Xentropy](https://github.com/NVIDIA/apex/tree/master/apex/contrib/xentropy). We make it work for bfloat16 and support in-place backward to save memory. +```sh +cd ../csrc/xentropy && pip install . +``` +4. Fused rotary embedding: +```sh +cd ../csrc/rotary && pip install . +``` +5. Fused dropout + residual + LayerNorm, adapted from Apex's +[FastLayerNorm](https://github.com/NVIDIA/apex/tree/master/apex/contrib/layer_norm). We add dropout and residual, and make it work for both pre-norm and post-norm architecture. +This supports dimensions divisible by 8, up to 6144. +```sh +cd ../csrc/layer_norm && pip install . +``` + +## Training + +We also provide here training scripts to train GPT2 on Openwebtext and GPT3 on +The Pile as examples. Feel free to use the model in your own training setup as +well. + +We use [Hydra](https://hydra.cc/) for configuration, +[Pytorch-Lightning](https://github.com/Lightning-AI/lightning) for training, and +[Wandb](https://wandb.ai/) for logging. + +We use the template from `https://github.com/ashleve/lightning-hydra-template`. +Please read the instructions there to understand the repo structure. + +### Requirements + +Python 3.8+, Pytorch 1.12+, torchvision, einops, timm, hydra-core, +hydra-colorlog, python-dotenv, rich, pytorch-lightning, triton, flash-attn. +We recommend CUDA 11.8 (e.g., using the Nvidia's Pytorch Docker image from https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) + +We provide a Dockerfile that lists all the required packages. + +### Dataset preparation + +Running the training command would automatically download the datasets +(Openwebtext, Pile), tokenize with the GPT2 tokenizer, concatenate all the +tokens, then save this cache to disk. Alternatively, you can also prepare the +datasets as a separate step. + +The cached datasets are saved to `${DATA_DIR}/openwebtext` and +`${DATA_DIR}/the_pile`. If `${DATA_DIR}` is not set, they will be saved to +`./data/{openwebtext,the_pile}`. 
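+As a bridge between the optimized components above and the training scripts below, here is
+a minimal sketch of one forward/backward pass using the optimized cross-entropy loss. It
+assumes the `model` and `seqlen` from the GPT3-1.3B example, a CUDA device, bf16 weights,
+and random token ids, so treat it as an illustration rather than part of the training code.
+
+```python
+import torch
+
+from flash_attn.losses.cross_entropy import CrossEntropyLoss
+
+# `model` and `seqlen` come from the GPTLMHeadModel construction example above.
+model = model.to(device="cuda", dtype=torch.bfloat16)
+input_ids = torch.randint(0, 50257, (2, seqlen), device="cuda")
+
+logits = model(input_ids).logits  # (batch, seqlen, padded vocab size)
+loss_fn = CrossEntropyLoss(inplace_backward=True)  # backward reuses the logits buffer
+loss = loss_fn(
+    logits[:, :-1].reshape(-1, logits.shape[-1]),  # predict token t+1 from tokens up to t
+    input_ids[:, 1:].reshape(-1),
+)
+loss.backward()
+```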
+ +- Openwebtext: +```sh +export PYTHONPATH=$PWD:$PYTHONPATH +pytest -q -s tests/datamodules/test_language_modeling_hf.py -k "openwebtext" +``` +This takes around 1h on a 64-core CPU. The processed dataset has size 17GB. + +- The Pile: +```sh +export PYTHONPATH=$PWD:$PYTHONPATH +pytest -q -s tests/datamodules/test_language_modeling_hf.py -k "pile" +``` +This takes around 20h on a 64-core CPU. The processed dataset has size 699GB. + +### GPT2 training on Openwebtext +To train GPT2 on Openwebtext with 8 GPUs: +```sh +python run.py experiment=owt/gpt2s-flash trainer.devices=8 # 125M +python run.py experiment=owt/gpt2m-flash trainer.devices=8 # 355M +python run.py experiment=owt/gpt2l-flash trainer.devices=8 # 760M +python run.py experiment=owt/gpt2xl-flash trainer.devices=8 # 1.6B +``` +The default parameters are set for 8 x A100 80GB. + +To train with bf16 instead of fp16, add `trainer.precision=bf16`. + +### GPT3 training on The Pile +To train GPT3 on The Pile with 8 GPUs: +```sh +python run.py experiment=pile/gpt3s-flash trainer.devices=8 # 125M +python run.py experiment=pile/gpt3m-flash trainer.devices=8 # 355M +python run.py experiment=pile/gpt3l-flash trainer.devices=8 # 760M +python run.py experiment=pile/gpt3xl-flash trainer.devices=8 # 1.3B +python run.py experiment=pile/gpt3-2.7B-flash-hdim128 trainer.devices=8 # 2.7B +``` +The default parameters are set for 8 x A100 80GB. We train with bf16 by default. + +To train with rotary embedding, run the experiments `pile/gpt3{s,m,l,xl}-flash-rotary`. + +### Training options + +**Gradient accumulation**: to adjust device batch size to fit into GPU memory +(the global batch size stays the same, and gradient accumulation is calculated +automatically), set `datamodule.batch_size=blah`. + +**Multi-node**: to train on multiple nodes, add `trainer.num_nodes=blah`. + +**Speed benchmarking**: to print out iteration time, add `+callbacks.speed_monitor.verbose=True`. + +**Resumable training**: set a name to the run, and then set `resume=True` when +you resume. Training will restart at exactly the same batch. +```sh +python run.py experiment=pile/gpt3s-flash trainer.devices=8 name=pile-gpt3s-flash resume=True +``` + +## Training speed + +We measure the wallclock training speed on one node with 8 x A100 80GB SXM4 80GB (400W) with NVLink. + +FLOPs are calculated using the formula from the [Megatron-LM +paper](https://arxiv.org/abs/2104.04473) (Section 5.1), except we scale by 3/4 +to get the model FLOPs (instead of hardware FLOPs with activation +checkpointing). + + +### GPT2 (sequence length 1024) + +![GPT2 speedup](../assets/gpt2_training_efficiency.jpg) + +The implementation in this repo (FlashAttention) is 3-4x faster than the +baseline implementation from Huggingface. + +### GPT3 (sequence length 2048) + +![GPT3 speedup](../assets/gpt3_training_efficiency.jpg) + +The implementation in this repo (FlashAttention) is 3-5x faster than the +baseline implementation from Huggingface. + +For the GPT3-2.7B model, we set head dimension to 128 (instead of 80) for better efficiency. + +We include here more details on the training speed with FlashAttention on 8 x +A100 80GB. 
+ +| Model | Batch size (tokens) | Through put (tokens/sec) | Hours / 1B tokens | +| --------- | ------------------- | ------------------------ | ----------------- | +| GPT3-125M | 0.5M | 1310k | 0.21 | +| GPT3-355M | 0.5M | 503k | 0.55 | +| GPT3-760M | 0.5M | 245k | 1.13 | +| GPT3-1.3B | 1M | 169k | 1.64 | +| GPT3-2.7B | 1M | 85k | 3.27 | + +As an example, this means that one can train a GPT3-1.3B model on 26B tokens +(compute-optimal according to Chinchilla scaling) in about 43 hours on 8 x A100. + +## Training quality + +We include here the loss curve for GPT2 on Openwebtext, trained for 200B tokens. +For GPT2, the runs with FlashAttention yield the same loss curve as the runs +with the baseline implementation from Huggingface for 125M and 355M models. For +larger models the baseline implementation just takes too long. + +![GPT2 training curve](../assets/gpt2_training_curve.jpg) + +We include here the loss curve for GPT3 on The Pile, trained for 400B tokens. +The 125M, 355M, 760M models have batch size 512k tokens so this translates to +800k training steps, while the 1.3B and 2.7B models have batch size 1M tokens, +which translates to 400k training steps. + +![GPT3 training curve](../assets/gpt3_training_curve.jpg) diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2e33087c53f36e50efce2ba54b90d9a7562262d5 --- /dev/null +++ b/__init__.py @@ -0,0 +1 @@ +__version__ = "3.0.0.b1" diff --git a/acc.yaml b/acc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe7a63f5c162d0bded2b19e4e227c1eb4147442e --- /dev/null +++ b/acc.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +acc: + _target_: src.metrics.accuracy.AccuracyMine diff --git a/acc_ignore_index.yaml b/acc_ignore_index.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03364aa1af55795a2b92efec04e46037b72c5761 --- /dev/null +++ b/acc_ignore_index.yaml @@ -0,0 +1,4 @@ +# @package eval.metrics +acc: + _target_: torchmetrics.Accuracy + ignore_index: -100 diff --git a/acctop5.yaml b/acctop5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f798ae0cb6d42a9f0651f928f23c13bc4ad16c1 --- /dev/null +++ b/acctop5.yaml @@ -0,0 +1,4 @@ +# @package eval.metrics +acctop5: + _target_: src.metrics.accuracy.AccuracyMine + top_k: 5 diff --git a/activations.py b/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..b00063b6bd497e10a70a201cfe246178174aad67 --- /dev/null +++ b/activations.py @@ -0,0 +1,135 @@ +# Copied from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/model/layers/activations.py +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 + +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) +@torch.jit.script +def bias_gelu(y, bias): + x = bias + y + return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=y.dtype) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, y, bias): + """Assume that y has shape (B, D) and bias has shape (D)""" + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) + grad_y = ff * g + return grad_y.to(dtype=y.dtype), grad_y.sum(dim=(0), dtype=bias.dtype) + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(input, bias) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, input, bias) + return tmp, tmp + + +bias_gelu_impl = GeLUFunction.apply + +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) +@torch.jit.script +def gelu_fwd(x): + return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=x.dtype) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def gelu_bwd(g, x): + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) + return (ff * g).to(dtype=x.dtype) + + +class FastGeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input): + ctx.save_for_backward(input) + return gelu_fwd(input) + + @staticmethod + def backward(ctx, grad_output): + (input,) = ctx.saved_tensors + tmp = gelu_bwd(grad_output, input) + return tmp + + +fast_gelu_impl = FastGeLUFunction.apply + + +@torch.jit.script +def relu_bwd(g, x): + return torch.where(x >= 0, g, 0.0).to(dtype=x.dtype) + + +@torch.jit.script +def sqrelu_fwd(x): + r = F.relu(x) + return (r * r).to(dtype=x.dtype) + + +@torch.jit.script +def sqrelu_bwd(g, x): + return (2.0 * g * F.relu(x)).to(dtype=x.dtype) + + +swiglu_fwd_codestring = """ +template T swiglu_fwd(T x, T y) { + return float(x) * float(y) / (1.0f + ::exp(-float(x))); +} +""" +swiglu_bwd_codestring = """ +template T swiglu_bwd(T x, T y, T g, T& dx, T& dy) { + float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); + dx = x_sigmoid * (1 + float(x) * (1.0f - x_sigmoid)) * float(g) * float(y); + dy = float(x) * x_sigmoid * float(g); +} +""" +swiglu_fwd = torch.cuda.jiterator._create_jit_fn(swiglu_fwd_codestring) +swiglu_bwd = torch.cuda.jiterator._create_multi_output_jit_fn(swiglu_bwd_codestring, num_outputs=2) + + +class SwiGLUFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, y): + ctx.save_for_backward(x, y) + return swiglu_fwd(x, y) + + @staticmethod + def backward(ctx, dout): + x, y = ctx.saved_tensors + return swiglu_bwd(x, y, dout) + +swiglu = SwiGLUFunction.apply diff --git a/adam.yaml b/adam.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8821d74ced32918343f78ecb55d13c5102cf5ec --- /dev/null +++ b/adam.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: torch.optim.Adam diff --git a/adamw-apex-distributed.yaml b/adamw-apex-distributed.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..b7a5136ebe88ae64012cdf070dbc661be6577818 --- /dev/null +++ b/adamw-apex-distributed.yaml @@ -0,0 +1,3 @@ +# @package train.optimizer +_target_: apex.contrib.optimizers.distributed_fused_adam.DistributedFusedAdam +adam_w_mode: True diff --git a/adamw-apex-zero.yaml b/adamw-apex-zero.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f19d7a0445db380a3038fe141a885f3090f90fbb --- /dev/null +++ b/adamw-apex-zero.yaml @@ -0,0 +1,7 @@ +# @package train.optimizer +_target_: torch.distributed.optim.ZeroRedundancyOptimizer +_recursive_: True +optimizer_class: + _target_: apex.optimizers.FusedAdam + _partial_: True + adam_w_mode: True diff --git a/adamw-apex.yaml b/adamw-apex.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fdbf90fdfecac98313b6f51562b21f460521143c --- /dev/null +++ b/adamw-apex.yaml @@ -0,0 +1,3 @@ +# @package train.optimizer +_target_: apex.optimizers.FusedAdam +adam_w_mode: True diff --git a/adamw-zero.yaml b/adamw-zero.yaml new file mode 100644 index 0000000000000000000000000000000000000000..66ea2fd03150e3811fa9387815ab8adcb2921697 --- /dev/null +++ b/adamw-zero.yaml @@ -0,0 +1,7 @@ +# @package train.optimizer +_target_: torch.distributed.optim.ZeroRedundancyOptimizer +_recursive_: True +optimizer_class: + _target_: torch.optim.__getattribute__ + _args_: + - "AdamW" diff --git a/adamw.yaml b/adamw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02252ec1cec06b2b06ef99fdeb9e324c1035738d --- /dev/null +++ b/adamw.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: torch.optim.AdamW diff --git a/alibi.h b/alibi.h new file mode 100644 index 0000000000000000000000000000000000000000..e714233e7eb8553f0c68d2a4ce02a4784dcfbd6b --- /dev/null +++ b/alibi.h @@ -0,0 +1,74 @@ +#include + +#include + +#include +#include + +#include "utils.h" + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Alibi { + + const float alibi_slope; + const int max_seqlen_k, max_seqlen_q; + + __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q) + : alibi_slope(alibi_slope) + , max_seqlen_k(max_seqlen_k) + , max_seqlen_q(max_seqlen_q) { + }; + + + template + __forceinline__ __device__ void apply_alibi(Tensor &tensor, + const int col_idx_offset_, + const int row_idx_offset, + const int warp_row_stride) { + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout::rank == 2, "Only support 2D Tensor"); + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + if constexpr (Is_causal) { // Simpler, we add the same bias vector to all rows + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; + } + } + } + } else { // Bias depends on both row_idx and col_idx + #pragma unroll + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; + #pragma unroll + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const 
int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + } + } + } + } + } + } + +}; + +} // namespace flash diff --git a/all_params.yaml b/all_params.yaml new file mode 100644 index 0000000000000000000000000000000000000000..24a0b50486d3a579a15dd1b14e7aa45f3747aa88 --- /dev/null +++ b/all_params.yaml @@ -0,0 +1,49 @@ +_target_: pytorch_lightning.Trainer + +# default values for all trainer parameters +checkpoint_callback: True +default_root_dir: null +gradient_clip_val: 0.0 +process_position: 0 +num_nodes: 1 +num_processes: 1 +gpus: null +auto_select_gpus: False +tpu_cores: null +log_gpu_memory: null +overfit_batches: 0.0 +track_grad_norm: -1 +check_val_every_n_epoch: 1 +fast_dev_run: False +accumulate_grad_batches: 1 +max_epochs: 1 +min_epochs: 1 +max_steps: null +min_steps: null +limit_train_batches: 1.0 +limit_val_batches: 1.0 +limit_test_batches: 1.0 +val_check_interval: 1.0 +flush_logs_every_n_steps: 100 +log_every_n_steps: 50 +accelerator: null +sync_batchnorm: False +precision: 32 +weights_summary: "top" +weights_save_path: null +num_sanity_val_steps: 2 +truncated_bptt_steps: null +resume_from_checkpoint: null +profiler: null +benchmark: False +deterministic: False +reload_dataloaders_every_epoch: False +auto_lr_find: False +replace_sampler_ddp: True +terminate_on_nan: False +auto_scale_batch_size: False +prepare_data_per_node: True +plugins: null +amp_backend: "native" +amp_level: "O2" +move_metrics_to_cpu: False diff --git a/baichuan.py b/baichuan.py new file mode 100644 index 0000000000000000000000000000000000000000..97d030782187afdfa22b9ad0a9a264b9f6c0a95e --- /dev/null +++ b/baichuan.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023, GGGGGGXY, Tri Dao. + +import math +import json +import re +from pathlib import Path + +from collections import OrderedDict + +import torch +import torch.nn.functional as F + +from einops import rearrange +from transformers import GPT2Config, AutoConfig, PretrainedConfig + + +def remap_state_dict_hf_baichuan(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^model.", "transformer.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + + # Word embedding + def key_mapping_emb(key): + return re.sub( + r"^transformer.embed_tokens.", + "transformer.embeddings.word_embeddings.", + key, + ) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict[ + "transformer.embeddings.word_embeddings.weight" + ] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # Need to recompute vocab_size since Baichuan shards the word embeddings and output embeddings + # differently. 
+ vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.", + r"transformer.layers.\1.norm1.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for l in range(config.n_layer): + w1 = state_dict.pop(f"transformer.layers.{l}.mlp.gate_proj.weight") + w3 = state_dict.pop(f"transformer.layers.{l}.mlp.up_proj.weight") + # Our ordering is different + state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat( + [w3, w1], dim=0 + ) + + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).mlp.down_proj.", + r"transformer.layers.\1.mlp.fc2.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).self_attn.W_pack.", + r"transformer.layers.\1.mixer.Wqkv.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attn.o_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + for l in range(config.n_layer): + # pop rotary_emb.inv_freq from state dict + state_dict.pop(f"transformer.layers.{l}.self_attn.rotary_emb.inv_freq", None) + return state_dict + + +def baichuan_config_to_gpt2_config(baichuan_config: PretrainedConfig) -> GPT2Config: + # HACK: the config doesn't have say whether it's rotary or alibi. + # So we have to infer from the hidden size (7B -> rotary, 13B -> alibi). + # HACK: the config doesn't have say whether it uses norm head. + # So we have to infer from the vocab size + # (v1, vocab size 64k, no norm head; v2, vocab size 128k, norm head). 
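+    # Concretely: Baichuan-7B uses hidden_size 4096 (so rotary below), while Baichuan-13B
+    # uses 5120 (so ALiBi), which is what the 5000 threshold distinguishes.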
+ use_rotary = baichuan_config.hidden_size < 5000 + return GPT2Config( + vocab_size=baichuan_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=baichuan_config.hidden_size, + n_layer=baichuan_config.num_hidden_layers, + n_head=baichuan_config.num_attention_heads, + n_inner=baichuan_config.intermediate_size, + activation_function="swiglu", # Hardcode since HF calls it 'silu' + # baichuan doesn't have dropout, idk if it's because they only release the inference code + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=baichuan_config.rms_norm_eps, + initializer_range=baichuan_config.initializer_range, + bos_token_id=baichuan_config.bos_token_id, + eos_token_id=baichuan_config.eos_token_id, + # These are new arguments not in the original GPT2Config + pad_token_id=baichuan_config.pad_token_id, # Idk if this does anything + rms_norm=True, + rotary_emb_fraction=1.0 if use_rotary else 0.0, + rotary_emb_interleaved=False, + use_alibi=not use_rotary, + use_flash_attn=not use_rotary, # Alibi code path requires flash_attn + tie_word_embeddings=False, + norm_head=baichuan_config.vocab_size > 70000, + qkv_proj_bias=False, + out_proj_bias=False, + mlp_fc1_bias=False, + mlp_fc2_bias=False, + ) diff --git a/base.yaml b/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48de7a20d2d1d31e5c7166d618d16b1595d85395 --- /dev/null +++ b/base.yaml @@ -0,0 +1,82 @@ +# @package _global_ +defaults: + - override /trainer: default # choose trainer from 'configs/trainer/' + - override /model: null + - override /datamodule: openwebtext + # FusedAdam from apex speeds up the optimizer step a bit, for GPT2-small time + # per global step (i.e. batch size 512) on 8 A100s goes from 376ms to 368ms. + # For GPT2-medium time per global goes from 997ms to 972ms. + - override /optimizer: adamw-apex + - override /scheduler: linear-warmup + - override /callbacks: [default, norm-monitor] + - override /metrics: [perplexity, num-tokens] + - override /logger: wandb + +# all parameters below will be merged with parameters from default configurations set above +# this allows you to overwrite only specified parameters + +task: + _target_: src.tasks.seq.SequenceLMModel + +seed: 1111 + +trainer: + accelerator: gpu + devices: 8 + num_nodes: 1 + accumulate_grad_batches: ${div_up:${train.global_batch_size}, ${eval:${trainer.devices} * ${datamodule.batch_size} * ${trainer.num_nodes}}} + max_steps: 400000 + val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}} + check_val_every_n_epoch: null # We don't care about epoch boundary + precision: 16 + gradient_clip_val: 1.0 + strategy: null + +datamodule: + batch_size: 16 # Per GPU + batch_size_eval: ${.batch_size} # Fused dense only support batch size at most 64k + max_length: 1024 + fault_tolerant: True + ddp: ${eval:"${trainer.devices} > 1"} + +train: + gpu_mem: ${eval:"round(float(__import__('subprocess').check_output('nvidia-smi -i 0 --query-gpu=memory.total --format=csv,noheader,nounits', shell=True).strip().decode()) / 1000)"} + global_batch_size: 512 + optimizer: + lr: 6e-4 + weight_decay: 0.1 + optimizer_param_grouping: + bias_weight_decay: False + normalization_weight_decay: False + scheduler: + num_warmup_steps: ${eval:0.01 * ${trainer.max_steps}} + num_training_steps: ${trainer.max_steps} + loss_fn: + # This is faster and uses less memory than torch.nn.CrossEntropyLoss. + # It's also more numerically stable if we're using DeepSpeed 16 bits. 
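+    # inplace_backward writes the gradient of the logits into the logits tensor itself,
+    # so the (batch * seqlen, vocab)-sized gradient buffer is not materialized separately.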
+ _target_: flash_attn.losses.cross_entropy.CrossEntropyLoss + inplace_backward: True # to save memory + +eval: + log_on_step: True # 1 training epoch takes too long, we want to see metrics per train step + +callbacks: + model_checkpoint: + monitor: val/loss + mode: min + save_top_k: 3 + save_last: True + every_n_train_steps: 1000 + dirpath: ${work_dir}/checkpoints/${oc.select:name,''} + filename: step_{step} + auto_insert_metric_name: False + model_checkpoint_progress: + _target_: src.callbacks.model_checkpoint.ModelCheckpointMine + fault_tolerant: True + every_n_train_steps: 50000 + save_last: False + save_top_k: -1 # Save all the checkpoints + dirpath: ${..model_checkpoint.dirpath} + filename: progress_step_{step} + auto_insert_metric_name: False + early_stopping: null diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..15b30405f209921189b75f7307814876350e7317 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,268 @@ +# Copyright (c) 2023, Tri Dao. +""" Useful functions for writing test code. """ + +import torch +import torch.utils.benchmark as benchmark + + +def benchmark_forward( + fn, *inputs, repeats=10, desc="", verbose=True, amp=False, amp_dtype=torch.float16, **kwinputs +): + """Use Pytorch Benchmark on the forward pass of an arbitrary function.""" + if verbose: + print(desc, "- Forward pass") + + def amp_wrapper(*inputs, **kwinputs): + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + fn(*inputs, **kwinputs) + + t = benchmark.Timer( + stmt="fn_amp(*inputs, **kwinputs)", + globals={"fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def benchmark_backward( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the backward pass of an arbitrary function.""" + if verbose: + print(desc, "- Backward pass") + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + y = fn(*inputs, **kwinputs) + if type(y) is tuple: + y = y[0] + if grad is None: + grad = torch.randn_like(y) + else: + if grad.shape != y.shape: + raise RuntimeError("Grad shape does not match output shape") + + def f(*inputs, y, grad): + # Set .grad to None to avoid extra operation of gradient accumulation + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + y.backward(grad, retain_graph=True) + + t = benchmark.Timer( + stmt="f(*inputs, y=y, grad=grad)", + globals={"f": f, "inputs": inputs, "y": y, "grad": grad}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def benchmark_combined( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.""" + if verbose: + print(desc, "- Forward + Backward pass") + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + y = fn(*inputs, **kwinputs) + if type(y) is tuple: + y = y[0] + if grad is None: + grad = torch.randn_like(y) + else: + if grad.shape != y.shape: + raise RuntimeError("Grad shape does not match output shape") + + def f(grad, *inputs, **kwinputs): + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + y = fn(*inputs, 
**kwinputs) + if type(y) is tuple: + y = y[0] + y.backward(grad, retain_graph=True) + + t = benchmark.Timer( + stmt="f(grad, *inputs, **kwinputs)", + globals={"f": f, "fn": fn, "inputs": inputs, "grad": grad, "kwinputs": kwinputs}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +def benchmark_fwd_bwd( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.""" + return ( + benchmark_forward( + fn, + *inputs, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + benchmark_backward( + fn, + *inputs, + grad=grad, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + ) + + +def benchmark_all( + fn, + *inputs, + grad=None, + repeats=10, + desc="", + verbose=True, + amp=False, + amp_dtype=torch.float16, + **kwinputs, +): + """Use Pytorch Benchmark on the forward+backward pass of an arbitrary function.""" + return ( + benchmark_forward( + fn, + *inputs, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + benchmark_backward( + fn, + *inputs, + grad=grad, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + benchmark_combined( + fn, + *inputs, + grad=grad, + repeats=repeats, + desc=desc, + verbose=verbose, + amp=amp, + amp_dtype=amp_dtype, + **kwinputs, + ), + ) + + +def pytorch_profiler( + fn, + *inputs, + trace_filename=None, + backward=False, + amp=False, + amp_dtype=torch.float16, + cpu=False, + verbose=True, + **kwinputs, +): + """Wrap benchmark functions in Pytorch profiler to see CUDA information.""" + if backward: + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + out = fn(*inputs, **kwinputs) + if type(out) is tuple: + out = out[0] + g = torch.randn_like(out) + for _ in range(30): # Warm up + if backward: + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + out = fn(*inputs, **kwinputs) + if type(out) is tuple: + out = out[0] + # Backward should be done outside autocast + if backward: + out.backward(g, retain_graph=True) + activities = ([torch.profiler.ProfilerActivity.CPU] if cpu else []) + [ + torch.profiler.ProfilerActivity.CUDA + ] + with torch.profiler.profile( + activities=activities, + record_shapes=True, + # profile_memory=True, + with_stack=True, + ) as prof: + if backward: + for x in inputs: + if isinstance(x, torch.Tensor): + x.grad = None + with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp): + out = fn(*inputs, **kwinputs) + if type(out) is tuple: + out = out[0] + if backward: + out.backward(g, retain_graph=True) + if verbose: + # print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=50)) + print(prof.key_averages().table(row_limit=50)) + if trace_filename is not None: + prof.export_chrome_trace(trace_filename) + + +def benchmark_memory(fn, *inputs, desc="", verbose=True, **kwinputs): + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.synchronize() + fn(*inputs, **kwinputs) + torch.cuda.synchronize() + mem = torch.cuda.max_memory_allocated() / ((2**20) * 1000) + if verbose: + print(f"{desc} max memory: {mem}GB") + torch.cuda.empty_cache() + return mem diff --git 
a/benchmark_alibi.py b/benchmark_alibi.py new file mode 100644 index 0000000000000000000000000000000000000000..55da356270238af214e35bce1a6b479c83a51bef --- /dev/null +++ b/benchmark_alibi.py @@ -0,0 +1,275 @@ +# Copyright (c) 2024, Sanghun Cho, Tri Dao. + +import pickle +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat +from flash_attn.layers.rotary import apply_rotary_emb + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func, flash_attn_func + +try: + import xformers.ops as xops +except ImportError: + xops = None + + +def generate_cos_sin(seqlen, rotary_dim, device, dtype): + assert rotary_dim % 2 == 0 + angle = torch.rand(seqlen * 2, rotary_dim // 2, device=device) * 2 * math.pi + cos = torch.cos(angle).to(dtype=dtype) + sin = torch.sin(angle).to(dtype=dtype) + return cos, sin + + +def flash_rotary(q, k, v, cos, sin, causal=False): + # corrected by @tridao comments + q = apply_rotary_emb( + q, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True + ) + k = apply_rotary_emb( + k, cos, sin, seqlen_offsets=0, interleaved=False, inplace=True + ) + + return flash_attn_func(q, k, v, causal=causal) + + +def attn_bias_from_alibi_slopes( + slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False +): + batch, nheads = slopes.shape + device = slopes.device + slopes = rearrange(slopes, "b h -> b h 1 1") + if causal: + return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes + else: + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + relative_pos = torch.abs(row_idx + sk - sq - col_idx) + return -slopes * relative_pos.to(dtype=slopes.dtype) + + +def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + + +def attention_pytorch(q, k, v, dropout_p=0.0, causal=True, attn_bias=None): + """ + Arguments: + q, k, v: (batch_size, seqlen, nheads, head_dim) + dropout_p: float + attn_bias: (batch_size, nheads, seqlen, seqlen) or (1, nheads, seqlen, seqlen) + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, nheads, d = q.shape + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + if attn_bias is not None: + scores = rearrange(attn_bias, 'b h t s -> (b h) t s') + else: + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=q.dtype, device=q.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=1.0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, 
seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=q.dtype) + + +def time_fwd_bwd(func, *args, **kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + + +repeats = 30 +device = 'cuda' +dtype = torch.float16 + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [64, 128] +dim = 2048 +dropout_p = 0.0 + +methods = (["fa2_alibi", "torch"] + + (["xformers"] if xops is not None else []) + + ["sdpa"] + + ["fa2_baseline"] + + ["fa2_rotary"]) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + # alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + alibi_slopes = torch.rand(1, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal).to(dtype) + attn_bias = repeat(attn_bias, "1 ... -> b ...", b=batch_size) + f, b = time_fwd_bwd( + flash_attn_func, + q, k, v, + dropout_p, + causal=causal, + # alibi_slopes=alibi_slopes, + alibi_slopes=None, + repeats=repeats, + verbose=False + ) + time_f[config, "fa2_baseline"] = f + time_b[config, "fa2_baseline"] = b + + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + flash_attn_func, + q, k, v, + dropout_p, + causal=causal, + alibi_slopes=rearrange(alibi_slopes, "1 h -> h"), + # alibi_slopes=None, + repeats=repeats, + verbose=False + ) + time_f[config, "fa2_alibi"] = f + time_b[config, "fa2_alibi"] = b + + try: + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, + q, k, v, + dropout_p, + causal=causal, + attn_bias=attn_bias, + repeats=repeats, + verbose=False + ) + except: # Skip if OOM + f, b = float('nan'), float('nan') + time_f[config, "torch"] = f + time_b[config, "torch"] = b + + # F.sdpa doesn't currently (torch 2.1) dispatch to flash-attn but just to be safe + with torch.backends.cuda.sdp_kernel(enable_flash=False): + q_pt = q.detach().requires_grad_(True).transpose(1, 2) + k_pt = k.detach().requires_grad_(True).transpose(1, 2) + v_pt = v.detach().requires_grad_(True).transpose(1, 2) + f, b = time_fwd_bwd( + F.scaled_dot_product_attention, + q_pt, k_pt, v_pt, + attn_mask=attn_bias, + dropout_p=dropout_p, + is_causal=causal, + repeats=repeats, + verbose=False + ) + time_f[config, "sdpa"] = f + time_b[config, "sdpa"] = b + + if xops is not None: + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + if causal: + attn_bias_xops = xops.LowerTriangularMask().add_bias(attn_bias.expand(-1, -1, seqlen, -1).to(dtype=q.dtype)) + # NotImplementedError: No operator found for `memory_efficient_attention_backward` 
with inputs: + # `flshattB@v2.3.6` is not supported because: + # attn_bias type is + # `cutlassB` is not supported because: + # attn_bias type is + attn_bias_xops = attn_bias_xops.materialize((batch_size, nheads, seqlen, seqlen), dtype=q.dtype, device=device) + else: + attn_bias_xops = attn_bias.to(dtype=q.dtype) + f, b = time_fwd_bwd( + xops.memory_efficient_attention, + q, k, v, + attn_bias_xops, + dropout_p, + repeats=repeats, + verbose=False + ) + time_f[config, "xformers"] = f + time_b[config, "xformers"] = b + + q = q.detach().requires_grad_(True) + k = k.detach().requires_grad_(True) + v = v.detach().requires_grad_(True) + cos, sin = generate_cos_sin(seqlen, headdim, device, dtype) + f, b = time_fwd_bwd( + flash_rotary, + q, k, v, + cos, sin, + causal, + repeats=repeats, + verbose=False + ) + time_f[config, "fa2_rotary"] = f + time_b[config, "fa2_rotary"] = b + + print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + csv_output = "" + csv_output += f"{causal},{headdim},{batch_size},{seqlen}," + for method in methods: + time_f_b[config, method] = time_f[config, method] + time_b[config, method] + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + speed_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"), + time_b[config, method] + ) + speed_f_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"), + time_f_b[config, method] + ) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, " + f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, " + f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s" + ) + csv_output += f"{speed_f[config, method]:.2f},{speed_b[config, method]:.2f},{speed_f_b[config, method]:.2f}," + print(csv_output) diff --git a/benchmark_attn.py b/benchmark_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..74d2ce3c264cdb73536c791958ba3c180b9dd5be --- /dev/null +++ b/benchmark_attn.py @@ -0,0 +1,314 @@ +from functools import partial +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +import time + +try: + import cudnn +except ImportError: + cudnn = None + + +from einops import rearrange, repeat + +# from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.flash_attn_interface import flash_attn_func +from flash_attn_interface import flash_attn_func as flash_attn_func_v3, flash_attn_varlen_func as flash_attn_varlen_func_v3 + +# Need to install triton nightly: +# pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly + +try: + from triton_fused_attention import attention as triton_attention +except ImportError: + triton_attention = None + +def flops(batch, nheads, seqlen_q, seqlen_k, headdim, causal=False, mode='fwd'): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + + +def convert_to_cudnn_type(torch_type): + if torch_type == torch.float16: + return cudnn.data_type.HALF + elif torch_type == torch.bfloat16: + return 
cudnn.data_type.BFLOAT16 + elif torch_type == torch.float32: + return cudnn.data_type.FLOAT + elif torch_type == torch.int32: + return cudnn.data_type.INT32 + elif torch_type == torch.int64: + return cudnn.data_type.INT64 + else: + raise ValueError("Unsupported tensor data type.") + + +def cudnn_sdpa_setup(q, k, v, grad, o, stats, causal=False, varlen=False, seqlens=None): + b, nheads, seqlen_q, headdim = q.shape + _, nheads_kv, seqlen_k, _ = k.shape + assert v.shape == (b, nheads_kv, seqlen_k, headdim) + assert cudnn is not None, 'CUDNN is not available' + q_gpu, k_gpu, v_gpu = q, k, v + o_gpu, stats_gpu = o, stats + graph_forward = cudnn.pygraph( + io_data_type=convert_to_cudnn_type(q.dtype), + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + q_forward = graph_forward.tensor_like(q_gpu.detach()) + k_forward = graph_forward.tensor_like(k_gpu.detach()) + v_forward = graph_forward.tensor_like(v_gpu.detach()) + + seqlens_reshaped = seqlens if varlen else None + seq_len_q = graph_forward.tensor_like(seqlens_reshaped.detach()) if varlen else None + seq_len_kv = graph_forward.tensor_like(seqlens_reshaped.detach()) if varlen else None + + o_forward, stats_forward = graph_forward.sdpa( + name="sdpa", + q=q_forward, + k=k_forward, + v=v_forward, + is_inference=False, + attn_scale=1.0 / math.sqrt(headdim), + use_causal_mask=causal, + use_padding_mask=varlen, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + ) + + o_forward.set_output(True).set_dim(o_gpu.shape).set_stride(o_gpu.stride()) + stats_forward.set_output(True).set_data_type(cudnn.data_type.FLOAT) + + graph_forward.validate() + graph_forward.build_operation_graph() + graph_forward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph_forward.check_support() + graph_forward.build_plans() + + variant_pack_forward = { + q_forward: q_gpu, + k_forward: k_gpu, + v_forward: v_gpu, + o_forward: o_gpu, + stats_forward: stats_gpu, + seq_len_q: seqlens_reshaped, + seq_len_kv: seqlens_reshaped, + } + + dQ_gpu = torch.empty_like(q_gpu) + dK_gpu = torch.empty_like(k_gpu) + dV_gpu = torch.empty_like(v_gpu) + dO_gpu = grad + + graph_backward = cudnn.pygraph( + io_data_type=cudnn.data_type.HALF, + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + + q_backward = graph_backward.tensor_like(q_gpu.detach()) + k_backward = graph_backward.tensor_like(k_gpu.detach()) + v_backward = graph_backward.tensor_like(v_gpu.detach()) + o_backward = graph_backward.tensor_like(o_gpu.detach()) + dO_backward = graph_backward.tensor_like(dO_gpu.detach()) + stats_backward = graph_backward.tensor_like(stats_gpu.detach()) + seq_len_q = graph_backward.tensor_like(seqlens_reshaped.detach()) if varlen else None + seq_len_kv = graph_backward.tensor_like(seqlens_reshaped.detach()) if varlen else None + + dQ_backward, dK_backward, dV_backward = graph_backward.sdpa_backward( + name="sdpa_backward", + q=q_backward, + k=k_backward, + v=v_backward, + o=o_backward, + dO=dO_backward, + stats=stats_backward, + attn_scale=1.0 / math.sqrt(headdim), + use_causal_mask=causal, + use_padding_mask=varlen, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + ) + + dQ_backward.set_output(True).set_dim(dQ_gpu.size()).set_stride(dQ_gpu.stride()) + dK_backward.set_output(True).set_dim(dK_gpu.size()).set_stride(dK_gpu.stride()) + dV_backward.set_output(True).set_dim(dV_gpu.size()).set_stride(dV_gpu.stride()) + + graph_backward.validate() + graph_backward.build_operation_graph() + 
graph_backward.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph_backward.check_support() + graph_backward.build_plans() + + variant_pack_backward = { + q_backward: q_gpu, + k_backward: k_gpu, + v_backward: v_gpu, + o_backward: o_gpu, + dO_backward: dO_gpu, + stats_backward: stats_gpu, + dQ_backward: dQ_gpu, + dK_backward: dK_gpu, + dV_backward: dV_gpu, + seq_len_q: seqlens_reshaped, + seq_len_kv: seqlens_reshaped, + } + + workspace = torch.empty( + max(graph_forward.get_workspace_size(), graph_backward.get_workspace_size()), + device="cuda", dtype=torch.uint8 + ) + + def run_fwd(*args, **kwargs): + graph_forward.execute(variant_pack_forward, workspace) + return o_gpu, stats_gpu + + def run_bwd(*args, **kwargs): + graph_backward.execute(variant_pack_backward, workspace) + return dQ_gpu, dK_gpu, dV_gpu + + return run_fwd, run_bwd + + +torch.manual_seed(0) +repeats = 100 +dropout_p = 0.0 +causal = False +dtype = torch.float16 +device = 'cuda' +verbose = False +batch_size = 2 +# seqlen = 2048 +seqlen = 8192 +# seqlen = 4096 +# seqlen = 2047 +dim = 2048 +# headdim = 128 +# headdim = 64 +headdim = 256 + +for mode in ['fwd', 'bwd']: +# for mode in ['bwd']: + for headdim in [64, 128, 256]: + # for headdim in [128]: + for seqlen in [1024, 2048, 4096, 8192, 16384, 32768]: + # for seqlen in [8192]: + nheads = dim // headdim + # nheads = 24 + # headdim = 64 + # batch_size = 64 + # seqlen = 512 + # nheads = 8 + # headdim = 128 + # nheads = 16 + # headdim = 128 + nheads_kv = nheads + # nheads_kv = 1 + + qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen, nheads_kv, headdim, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen, nheads_kv, headdim, device=device, dtype=dtype, requires_grad=True) + q_t = q.transpose(1, 2).contiguous().detach().requires_grad_() + k_t = k.transpose(1, 2).contiguous().detach().requires_grad_() + v_t = k.transpose(1, 2).contiguous().detach().requires_grad_() + grad = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype) + grad_t = grad.transpose(1, 2).contiguous() + o_t = torch.empty_like(q.transpose(1, 2)) + stats = torch.empty(batch_size, nheads, seqlen, 1, dtype=torch.float32, device=q.device) + + bench_fn = benchmark_forward if mode == 'fwd' else partial(benchmark_backward, grad=grad) + + for causal in [False, True]: + # for causal in [True]: + print(f"\n### {mode = }, {batch_size = }, {headdim = }, {seqlen = }, {causal = } ###") + # For var-seq-len + lens = torch.full([q.shape[0]], seqlen, dtype=torch.int32) + seqlens_cudnn = lens.reshape(batch_size, 1, 1, 1).contiguous().cuda() + cu_seqlens = torch.cat([torch.tensor([0], dtype=torch.int32), torch.cumsum(lens, dim=0, dtype=torch.int32)]).cuda() + if headdim <= 128 and cudnn is not None: + cudnn_sdpa_fwd, cudnn_sdpa_bwd = cudnn_sdpa_setup(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), grad.transpose(1, 2), o_t, stats, causal=causal) + cudnn_sdpa_fwd_varlen, cudnn_sdpa_bwd_varlen = cudnn_sdpa_setup(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), grad.transpose(1, 2), o_t, stats, causal=causal, varlen=True, seqlens=seqlens_cudnn) + f = flops(batch_size, nheads, seqlen, seqlen, headdim, causal=causal, mode=mode) + ref_o = flash_attn_func(q, k, v, dropout_p, causal=causal) + _, m0 = bench_fn(flash_attn_func, q, k, v, dropout_p, 
causal=causal, repeats=repeats, verbose=verbose, desc='Fav2') + if mode == 'bwd': + ref_dv, v.grad = v.grad.clone(), None + ref_dk, k.grad = k.grad.clone(), None + ref_dq, q.grad = q.grad.clone(), None + # pytorch_profiler(flash_attn_func, q, k, v, dropout_p, causal=causal, backward=False) + if headdim <= 128: + if triton_attention is not None and nheads_kv == nheads: + if mode == 'fwd': + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + _, m3 = benchmark_forward(triton_attention, q_t, k_t, v_t, causal, 1 / math.sqrt(headdim), repeats=repeats, verbose=verbose, desc='Triton') + # TODO: fix Triton numeric errors. + # if mode == 'bwd': + # dv, v_t.grad = v_t.grad.clone(), None + # dk, k_t.grad = k_t.grad.clone(), None + # dq, q_t.grad = q_t.grad.clone(), None + # torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05) + # torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05) + # torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05) + if cudnn is not None: + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + if mode == 'fwd': + _, m2 = benchmark_forward(cudnn_sdpa_fwd, repeats=repeats, verbose=verbose, desc='CuDNN') + _, m2_var = benchmark_forward(cudnn_sdpa_fwd_varlen, repeats=repeats, verbose=verbose, desc='CuDNN') + cudnn_sdpa_fwd() + torch.testing.assert_close(ref_o, o_t.transpose(1, 2), atol=0.05, rtol=0.05) + cudnn_sdpa_fwd_varlen() + torch.testing.assert_close(ref_o, o_t.transpose(1, 2), atol=0.05, rtol=0.05) + else: + cudnn_sdpa_fwd() + _, m2 = benchmark_forward(cudnn_sdpa_bwd, repeats=repeats, verbose=verbose, desc='CuDNN') + _, m2_var = benchmark_forward(cudnn_sdpa_bwd_varlen, repeats=repeats, verbose=verbose, desc='CuDNN') + dq, dk, dv = cudnn_sdpa_bwd() + torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05) + dq, dk, dv = cudnn_sdpa_bwd_varlen() + torch.testing.assert_close(ref_dv, dv.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dk, dk.transpose(1, 2), atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dq, dq.transpose(1, 2), atol=0.05, rtol=0.05) + # pytorch_profiler(cudnn_sdpa, backward=False) + + if headdim <= 128 or mode == 'fwd': + time.sleep(1) + _, m1 = bench_fn(flash_attn_func_v3, q, k, v, causal=causal, repeats=repeats, verbose=verbose, desc='Fav3') + q_var = q.reshape(-1, q.shape[-2], q.shape[-1]) + k_var = k.reshape(-1, k.shape[-2], k.shape[-1]) + v_var = v.reshape(-1, v.shape[-2], v.shape[-1]) + time.sleep(1) + if mode == 'bwd': + dv, v.grad = v.grad.clone(), None + dk, k.grad = k.grad.clone(), None + dq, q.grad = q.grad.clone(), None + torch.testing.assert_close(ref_dv, dv, atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dk, dk, atol=0.05, rtol=0.05) + torch.testing.assert_close(ref_dq, dq, atol=0.05, rtol=0.05) + + bench_var_fn = bench_fn + if mode == 'bwd': + grad_var = grad.reshape(-1, grad.shape[-2], grad.shape[-1]) + bench_var_fn = partial(benchmark_backward, grad=grad_var) + _, m1_var = bench_var_fn(flash_attn_varlen_func_v3, q_var, k_var, v_var, cu_seqlens, cu_seqlens, seqlen, seqlen, causal=causal, repeats=repeats, verbose=verbose, desc='Fav3 var len') + + # pytorch_profiler(flash_attn_func_v3, q, k, v, causal=causal, backward=False) + print(f'Fav2: {m0.mean * 1e3:.3f}ms, {(f / m0.mean * 1e-12):.1f} 
TFLOPS') + if headdim <= 128: + if mode == 'fwd' and triton_attention is not None and nheads_kv == nheads: + print(f'Triton: {m3.mean * 1e3:.3f}ms, {(f / m3.mean * 1e-12):.1f} TFLOPS') + if cudnn is not None: + print(f'CuDNN: {m2.mean * 1e3:.3f}ms, {(f / m2.mean * 1e-12):.1f} TFLOPS') + print(f'CuDNN varlen: {m2_var.mean * 1e3:.3f}ms, {(f / m2_var.mean * 1e-12):.1f} TFLOPS') + if headdim <= 128 or mode == 'fwd': + print(f'Fav3: {m1.mean * 1e3:.3f}ms, {(f / m1.mean * 1e-12):.1f} TFLOPS') + print(f'Fav3 varlen: {m1_var.mean * 1e3:.3f}ms, {(f / m1_var.mean * 1e-12):.1f} TFLOPS') + \ No newline at end of file diff --git a/benchmark_causal.py b/benchmark_causal.py new file mode 100644 index 0000000000000000000000000000000000000000..6c4797c83e0cc5c100d991a9d847f1e9b4351002 --- /dev/null +++ b/benchmark_causal.py @@ -0,0 +1,225 @@ +from functools import partial +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +# from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.utils.benchmark import benchmark_forward, benchmark_backward, benchmark_combined, benchmark_all, benchmark_fwd_bwd, pytorch_profiler +from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func +# # from flash_attn.triton.fused_attention import attention as attention +# from flash_attn.flash_attn_triton import flash_attn_qkvpacked_func +# from flash_attn.flash_attn_triton_og import attention as attention_og + +# from triton.ops.flash_attention import attention as attention_triton + +from flash_attn import flash_attn_qkvpacked_func, flash_attn_kvpacked_func + +try: + from flash_attn.fused_softmax import scaled_upper_triang_masked_softmax +except ImportError: + scaled_upper_triang_masked_softmax = None + + +def attention_pytorch(qkv, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=qkv.dtype) + + +def attention_megatron(qkv): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size 
* nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + attention = scaled_upper_triang_masked_softmax(scores, None, scale=1.0) + output = torch.einsum('bhts,bshd->bthd', attention, v) + return output.to(dtype=qkv.dtype) + + +torch.manual_seed(0) +repeats = 30 +batch_size = 8 +seqlen = 2048 +nheads = 12 +headdim = 128 +# nheads = 24 +# headdim = 64 +# batch_size = 64 +# seqlen = 512 +# nheads = 8 +# headdim = 128 +dropout_p = 0.0 +causal = True +dtype = torch.float16 +device = 'cuda' + +qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) +cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, + device=qkv.device) + +qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True) +# benchmark_all(flash_attn_varlen_qkvpacked_func, qkv_unpad, +# cu_seqlens, seqlen, dropout_p, causal=causal, repeats=repeats, desc='FlashAttention') +# pytorch_profiler(flash_attn_varlen_qkvpacked_func, qkv_unpad, +# cu_seqlens, seqlen, dropout_p, causal=causal, backward=True) +benchmark_forward(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, desc='Fav2') +pytorch_profiler(flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, backward=False) + +# for dropout_p in [0.1, 0.0]: +# for causal in [False, True]: +# print(f"### {dropout_p = }, {causal = } ###") +# pytorch_profiler(fav2_qkvpacked_func, qkv, dropout_p, causal=causal, backward=True) + + +# nheads_k = 2 +# q = torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, requires_grad=True) +# kv = torch.randn(batch_size, seqlen, 2, nheads_k, headdim, device=device, dtype=dtype, +# requires_grad=True) +# if fav2_kvpacked_func is not None: +# benchmark_all(fav2_kvpacked_func, q, kv, dropout_p, causal=causal, repeats=repeats, desc='Fav2') +# pytorch_profiler(fav2_kvpacked_func, q, kv, dropout_p, causal=causal, backward=True) + +# dropout_p = 0.0 +# causal = False +# benchmark_all(attention_pytorch, qkv, dropout_p, causal=causal, +# repeats=repeats, desc='PyTorch Attention') + +# benchmark_all(flash_attn_qkvpacked_func, qkv, None, causal, repeats=repeats, desc='FlashAttention Triton') +# pytorch_profiler(flash_attn_qkvpacked_func, qkv, None, causal, backward=True) + +# q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype, +# requires_grad=True) for _ in range(3)] +# benchmark_all(attention_og, q, k, v, 1.0, repeats=repeats, desc='FlashAttention Triton OG') +# # pytorch_profiler(attention, q, k, v, 1.0, backward=True) + +# if scaled_upper_triang_masked_softmax is not None: +# benchmark_all(attention_megatron, qkv, repeats=repeats, desc='Megatron Attention') + +# from src.ops.fftconv import fftconv_func + +# dim = nheads * headdim +# u = torch.randn(batch_size, dim, seqlen, device=device, dtype=dtype, requires_grad=True) +# k = torch.randn(dim, seqlen, device=device, requires_grad=True) +# D = torch.randn(dim, device=device, requires_grad=True) +# benchmark_all(fftconv_func, u, k, D, repeats=repeats, desc='FFTConv') +# pytorch_profiler(fftconv_func, u, k, D, backward=True) +# pytorch_profiler(torch.fft.rfft, u.float()) + +flops = 4 * batch_size * seqlen ** 2 * nheads * headdim +ideal_a100_time = flops / 312 / 1e9 +print(f"Ideal A100 fwd time: {ideal_a100_time:.3f}ms, bwd time: {ideal_a100_time * 2.5:.3f}ms") +exit(0) + + +def time_fwd_bwd(func, *args, 
**kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [64, 128] +dim = 2048 +dropout_p = 0.0 + +time_f = {} +time_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + nheads = dim // headdim + qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, + device=qkv.device) + qkv_unpad = rearrange(qkv, 'b s ... -> (b s) ...').detach().requires_grad_(True) + f, b = time_fwd_bwd( + flash_attn_varlen_qkvpacked_func, qkv_unpad, cu_seqlens, seqlen, dropout_p, + causal=causal, repeats=repeats, verbose=False + ) + time_f[(causal, headdim, batch_size, seqlen), "Flash"] = f + time_b[(causal, headdim, batch_size, seqlen), "Flash"] = b + + qkv = qkv.detach().requires_grad_(True) + f, b = time_fwd_bwd( + fav2_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[(causal, headdim, batch_size, seqlen), "Flash2"] = f + time_b[(causal, headdim, batch_size, seqlen), "Flash2"] = b + + # q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype, + # requires_grad=True) for _ in range(3)] + # # Try both values of sequence_parallel and pick the faster one + # f, b = time_fwd_bwd( + # attention_triton, q, k, v, causal, headdim**(-0.5), + # False, repeats=repeats, verbose=False + # ) + # _, b0 = time_fwd_bwd( + # attention_triton, q, k, v, causal, headdim**(-0.5), + # True, repeats=repeats, verbose=False + # ) + # time_f[(causal, headdim, batch_size, seqlen), "Triton"] = f + # time_b[(causal, headdim, batch_size, seqlen), "Triton"] = min(b, b0) + + if seqlen <= 8 * 1024: + qkv = qkv.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + else: + f, b = float('nan'), float('nan') + time_f[(causal, headdim, batch_size, seqlen), "Pytorch"] = f + time_b[(causal, headdim, batch_size, seqlen), "Pytorch"] = b + + # q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + # requires_grad=True) for _ in range(3)] + # import xformers.ops as xops + # f, b = time_fwd_bwd( + # xops.memory_efficient_attention, q, k, v, + # attn_bias=xops.LowerTriangularMask() if causal else None, + # op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp) + # ) + # time_f[(causal, headdim, batch_size, seqlen), "xformers"] = f + # time_b[(causal, headdim, batch_size, seqlen), "xformers"] = b + + +import pickle +with open('flash2_attn_time_h100.plk', 'wb') as fp: + pickle.dump((time_f, time_b), fp, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/benchmark_flash_attention.py b/benchmark_flash_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..341ae4b213901d8b5406143bf6c8abb50bc422bd --- /dev/null +++ b/benchmark_flash_attention.py @@ -0,0 +1,180 @@ +# Install the newest triton version with +# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python" +import pickle +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import 
benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func + +try: + from triton.ops.flash_attention import attention as attention_triton +except ImportError: + attention_triton = None + +try: + import xformers.ops as xops +except ImportError: + xops = None + + +def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + + +def attention_pytorch(qkv, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=qkv.dtype) + + +def time_fwd_bwd(func, *args, **kwargs): + time_f, time_b = benchmark_fwd_bwd(func, *args, **kwargs) + return time_f[1].mean, time_b[1].mean + + +repeats = 30 +device = 'cuda' +dtype = torch.float16 + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 16384)] +causal_vals = [False, True] +headdim_vals = [64, 128] +dim = 2048 +dropout_p = 0.0 + +methods = (["Flash2", "Pytorch"] + + (["Triton"] if attention_triton is not None else []) + + (["xformers.c"] if xops is not None else []) + + (["xformers.f"] if xops is not None else [])) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + qkv = torch.randn(batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) + f, b = time_fwd_bwd( + flash_attn_qkvpacked_func, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + time_f[config, "Flash2"] = f + time_b[config, "Flash2"] = b + + try: + qkv = qkv.detach().requires_grad_(True) + f, b = time_fwd_bwd( + attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False + ) + except: # Skip if OOM + f, b = float('nan'), float('nan') + time_f[config, "Pytorch"] = f + time_b[config, "Pytorch"] = b + + if attention_triton is not None: + q, k, v = [torch.randn(batch_size, nheads, seqlen, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + # Try both values of sequence_parallel and pick the faster one + try: + f, b = time_fwd_bwd( + attention_triton, 
q, k, v, causal, headdim**(-0.5), + False, repeats=repeats, verbose=False + ) + except: + f, b = float('nan'), float('inf') + try: + _, b0 = time_fwd_bwd( + attention_triton, q, k, v, causal, headdim**(-0.5), + True, repeats=repeats, verbose=False + ) + except: + b0 = float('inf') + time_f[config, "Triton"] = f + time_b[config, "Triton"] = min(b, b0) if min(b, b0) < float('inf') else float('nan') + + if xops is not None: + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + f, b = time_fwd_bwd( + xops.memory_efficient_attention, q, k, v, + attn_bias=xops.LowerTriangularMask() if causal else None, + op=(xops.fmha.cutlass.FwOp, xops.fmha.cutlass.BwOp) + ) + time_f[config, "xformers.c"] = f + time_b[config, "xformers.c"] = b + + if xops is not None: + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=dtype, + requires_grad=True) for _ in range(3)] + f, b = time_fwd_bwd( + xops.memory_efficient_attention, q, k, v, + attn_bias=xops.LowerTriangularMask() if causal else None, + op=(xops.fmha.flash.FwOp, xops.fmha.flash.BwOp) + ) + time_f[config, "xformers.f"] = f + time_b[config, "xformers.f"] = b + + print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + for method in methods: + time_f_b[config, method] = time_f[config, method] + time_b[config, method] + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + speed_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="bwd"), + time_b[config, method] + ) + speed_f_b[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd_bwd"), + time_f_b[config, method] + ) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, " + f"bwd: {speed_b[config, method]:.2f} TFLOPs/s, " + f"fwd + bwd: {speed_f_b[config, method]:.2f} TFLOPs/s" + ) + + +# with open('flash2_attn_time.plk', 'wb') as fp: +# pickle.dump((speed_f, speed_b, speed_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/benchmark_flash_attention_fp8.py b/benchmark_flash_attention_fp8.py new file mode 100644 index 0000000000000000000000000000000000000000..6f700a31b005a13cceaa6873d27b6fef8aba908a --- /dev/null +++ b/benchmark_flash_attention_fp8.py @@ -0,0 +1,333 @@ +# Install the newest triton version with +# pip install "git+https://github.com/openai/triton.git#egg=triton&subdirectory=python" +import pickle +import math +import time +import torch +import torch.nn as nn +import torch.nn.functional as F + +from einops import rearrange, repeat + +from flash_attn.utils.benchmark import benchmark_all, benchmark_forward, benchmark_backward +from flash_attn.utils.benchmark import benchmark_fwd_bwd, benchmark_combined + +from flash_attn import flash_attn_qkvpacked_func +from flash_attn_interface import flash_attn_func + +try: + from triton_fused_attention import attention as attention_triton +except ImportError: + attention_triton = None + +try: + import xformers.ops as xops +except ImportError: + xops = None + +try: + import cudnn +except ImportError: + cudnn = None + + +def convert_to_cudnn_type(torch_type): + if torch_type == torch.float16: + return cudnn.data_type.HALF + elif torch_type == torch.bfloat16: + return cudnn.data_type.BFLOAT16 + elif torch_type == torch.float32: + return cudnn.data_type.FLOAT + elif torch_type == torch.int32: + return cudnn.data_type.INT32 + elif torch_type 
== torch.int64: + return cudnn.data_type.INT64 + elif torch_type == torch.float8_e4m3fn: + return cudnn.data_type.FP8_E4M3 + elif torch_type == torch.float8_e4m3fn: + return cudnn.data_type.FP8_E5M2 + else: + raise ValueError("Unsupported tensor data type.") + +def cudnn_spda_setup(qkv, seqlen_q, seqlen_k, causal=False): + b, _, _, nheads, headdim = qkv.shape + assert cudnn is not None, 'CUDNN is not available' + o_gpu = torch.zeros(b, seqlen_q, nheads, headdim, dtype=qkv.dtype, device=qkv.device) + o_gpu_transposed = torch.as_strided( + o_gpu, + [b, nheads, seqlen_q, headdim], + [nheads * seqlen_q * headdim, headdim, nheads * headdim, 1], + ) + stats_gpu = torch.empty(b, nheads, seqlen_q, 1, dtype=torch.float32, device=qkv.device) + amax_s_gpu = torch.empty(1, 1, 1, 1, dtype=torch.float32, device=qkv.device) + amax_o_gpu = torch.empty(1, 1, 1, 1, dtype=torch.float32, device=qkv.device) + graph = cudnn.pygraph( + io_data_type=convert_to_cudnn_type(qkv.dtype), + intermediate_data_type=cudnn.data_type.FLOAT, + compute_data_type=cudnn.data_type.FLOAT, + ) + new_q = torch.as_strided( + qkv, + [b, nheads, seqlen_q, headdim], + [seqlen_q * nheads * headdim * 3, headdim, headdim * nheads * 3, 1], + storage_offset=0, + ) + q = graph.tensor( + name = "Q", + dim = list(new_q.shape), + stride = list(new_q.stride()), + data_type=convert_to_cudnn_type(qkv.dtype) + ) + new_k = torch.as_strided( + qkv, + [b, nheads, seqlen_k, headdim], + [seqlen_k * nheads * headdim * 3, headdim, headdim * nheads * 3, 1], + storage_offset=nheads * headdim, + ) + k = graph.tensor( + name = "K", + dim = list(new_k.shape), + stride = list(new_k.stride()), + data_type=convert_to_cudnn_type(qkv.dtype) + ) + new_v = torch.as_strided( + qkv, + [b, nheads, seqlen_k, headdim], + [seqlen_k * nheads * headdim * 3, headdim, headdim * nheads * 3, 1], + storage_offset=nheads * headdim * 2, + ) + v = graph.tensor( + name = "V", + dim = list(new_v.shape), + stride = list(new_v.stride()), + data_type=convert_to_cudnn_type(qkv.dtype) + ) + + def get_default_scale_tensor(): + return graph.tensor( + dim = [1, 1, 1, 1], + stride = [1, 1, 1, 1], + data_type=cudnn.data_type.FLOAT + ) + + default_scale_gpu = torch.ones(1, 1, 1, 1, dtype=torch.float32, device="cuda") + descale_q = get_default_scale_tensor() + descale_k = get_default_scale_tensor() + descale_v = get_default_scale_tensor() + descale_s = get_default_scale_tensor() + scale_s = get_default_scale_tensor() + scale_o = get_default_scale_tensor() + + o, _, amax_s, amax_o = graph.sdpa_fp8( + q=q, + k=k, + v=v, + descale_q=descale_q, + descale_k=descale_k, + descale_v=descale_v, + descale_s=descale_s, + scale_s=scale_s, + scale_o=scale_o, + is_inference=True, + attn_scale=1.0 / math.sqrt(headdim), + use_causal_mask=causal, + name="sdpa", + ) + + o.set_output(True).set_dim(o_gpu_transposed.shape).set_stride(o_gpu_transposed.stride()) + + amax_s.set_output(False).set_dim(amax_s_gpu.shape).set_stride(amax_s_gpu.stride()) + amax_o.set_output(False).set_dim(amax_o_gpu.shape).set_stride(amax_o_gpu.stride()) + # stats.set_output(True).set_data_type(cudnn.data_type.FLOAT) + + graph.validate() + graph.build_operation_graph() + graph.create_execution_plans([cudnn.heur_mode.A, cudnn.heur_mode.FALLBACK]) + graph.check_support() + graph.build_plans() + + variant_pack = { + q: new_q, + k: new_k, + v: new_v, + descale_q: default_scale_gpu, + descale_k: default_scale_gpu, + descale_v: default_scale_gpu, + descale_s: default_scale_gpu, + scale_s: default_scale_gpu, + scale_o: default_scale_gpu, + o: 
o_gpu_transposed, + amax_s: amax_s_gpu, + amax_o: amax_o_gpu, + } + + workspace = torch.empty(graph.get_workspace_size(), device="cuda", dtype=torch.uint8) + + def run(*args, **kwargs): + graph.execute(variant_pack, workspace) + return o_gpu, amax_o_gpu + + return run + + +def attention_pytorch(qkv, dropout_p=0.0, causal=True): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + dropout_p: float + Output: + output: (batch_size, seqlen, nheads, head_dim) + """ + batch_size, seqlen, _, nheads, d = qkv.shape + q, k, v = qkv.unbind(dim=2) + q = rearrange(q, 'b t h d -> (b h) t d') + k = rearrange(k, 'b s h d -> (b h) d s') + softmax_scale = 1.0 / math.sqrt(d) + # Preallocate attn_weights for `baddbmm` + scores = torch.empty(batch_size * nheads, seqlen, seqlen, dtype=qkv.dtype, device=qkv.device) + scores = rearrange(torch.baddbmm(scores, q, k, beta=0, alpha=softmax_scale), + '(b h) t s -> b h t s', h=nheads) + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1) + attention_drop = F.dropout(attention, dropout_p) + output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + return output.to(dtype=qkv.dtype) + +def flops(batch, seqlen, headdim, nheads, causal, mode="fwd"): + assert mode in ["fwd", "bwd", "fwd_bwd"] + f = 4 * batch * seqlen**2 * nheads * headdim // (2 if causal else 1) + return f if mode == "fwd" else (2.5 * f if mode == "bwd" else 3.5 * f) + +def efficiency(flop, time): + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + +def time_fwd(func, *args, **kwargs): + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + time_f = benchmark_forward(func, *args, **kwargs) + return time_f[1].mean + + +torch.manual_seed(0) + +repeats = 30 +device = 'cuda' +# dtype = torch.float16 +dtype = torch.float8_e4m3fn + +bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4224), (2, 8448), (1, 8448 * 2)] +# bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048), (4, 4096), (2, 8192), (1, 8192 * 2)] +# bs_seqlen_vals = [(4, 4096), (2, 8192), (1, 8192 * 2), (4, 4224), (2, 8448), (1, 8448 * 2)] +# bs_seqlen_vals = [(32, 512), (16, 1024), (8, 2048)] +causal_vals = [False, True] +headdim_vals = [128] +dim = 2048 +# dim = 256 +dropout_p = 0.0 + +methods = (["Pytorch", "Flash3", "cuDNN"] + # + (["Triton"] if attention_triton is not None else []) + # + (["xformers.c"] if xops is not None else []) + # + (["xformers.f"] if xops is not None else []) + ) + +time_f = {} +time_b = {} +time_f_b = {} +speed_f = {} +speed_b = {} +speed_f_b = {} +for causal in causal_vals: + for headdim in headdim_vals: + for batch_size, seqlen in bs_seqlen_vals: + torch.cuda.empty_cache() + config = (causal, headdim, batch_size, seqlen) + nheads = dim // headdim + q, k, v = [torch.randn(batch_size, seqlen, nheads, headdim, device=device, dtype=torch.float16, requires_grad=False) for _ in range(3)] + + qkv = torch.stack([q, k, v], dim=2) + qkv = qkv.to(torch.float16) + f = time_fwd(attention_pytorch, qkv, dropout_p, causal=causal, repeats=repeats, verbose=False) + time_f[config, "Pytorch"] = f + res_baseline = attention_pytorch(qkv, dropout_p, causal=causal) + + if attention_triton is not None: + q_transposed = 
q.transpose(1, 2).contiguous().to(torch.float8_e4m3fn) + k_transposed = k.transpose(1, 2).contiguous().to(torch.float8_e4m3fn) + v_transposed = v.transpose(1, 2).contiguous().permute(0, 1, 3, 2).to(torch.float8_e4m3fn) + scale = 1 / math.sqrt(headdim) + f = time_fwd( + attention_triton, q_transposed, k_transposed, v_transposed, + causal, scale, repeats=5, verbose=False, desc='Triton' + ) + f = time_fwd( + attention_triton, q_transposed, k_transposed, v_transposed, + causal, scale, repeats=repeats, verbose=False, desc='Triton' + ) + time_f[config, "Triton"] = f + res = attention_triton( + q_transposed, k_transposed, v_transposed.permute(0, 1, 3, 2), + causal, scale + ).half().transpose(1, 2) + torch.testing.assert_close(res, res_baseline, atol=0.5, rtol=0.5) + + # out = torch.empty_like(q) + q, k, v = q.to(dtype), k.to(dtype), v.to(dtype) + f = time_fwd(flash_attn_func, q, k, v, causal=causal, repeats=repeats, verbose=False) + + # res = flash_attn_func(q, k, v, causal=causal) + # torch.testing.assert_close(res.half(), res_baseline, atol=0.05, rtol=0.05) + + time_f[config, "Flash3"] = f + + if cudnn is not None: + qkv_fp8 = qkv.to(dtype) + time.sleep(1) # Sleep to avoid residual power throttling from the previous benchmark + f = time_fwd( + cudnn_spda_setup( + qkv_fp8, seqlen, seqlen, + causal=causal + ), + repeats=repeats, verbose=False + ) + time_f[config, "cuDNN"] = f + # res, amax_o = cudnn_spda_setup( + # qkv_fp8, seqlen, seqlen, + # causal=causal + # )() + # res = res.half() + # TODO: CUDNN has numerics issues when + # num_heads=16, dim=128, seq_len=1024, batch_size=2 + # or larger sizes. + # res_cpu = res.cpu().reshape(-1) + # res_baseline_cpu = res_baseline.cpu().reshape(-1) + # print(amax_o) + # print(res) + # print(res_baseline) + # for i in range(len(res_cpu)): + # item = res_cpu[i] + # item_baseline = res_baseline_cpu[i] + # if abs(item - item_baseline) > 0.5: + # print(i) + # print(item) + # print(item_baseline) + # torch.testing.assert_close(res, res_baseline, atol=0.05, rtol=0.05) + + print(f"### causal={causal}, headdim={headdim}, batch_size={batch_size}, seqlen={seqlen} ###") + for method in methods: + speed_f[config, method] = efficiency( + flops(batch_size, seqlen, headdim, nheads, causal, mode="fwd"), + time_f[config, method] + ) + #print (time_f[config,method]) + print( + f"{method} fwd: {speed_f[config, method]:.2f} TFLOPs/s, {time_f[config, method] * 1e3} ms, " + ) + + +# with open('flash3_attn_time.plk', 'wb') as fp: +# pickle.dump((time_f, time_b, time_f_b), fp, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/benchmark_gemm.py b/benchmark_gemm.py new file mode 100644 index 0000000000000000000000000000000000000000..6a7dc7bd79170c07a30ee3bf1266d4f8dc61b430 --- /dev/null +++ b/benchmark_gemm.py @@ -0,0 +1,43 @@ +import time +import torch +import torch.utils.benchmark as benchmark + +from triton.testing import do_bench + + +def benchmark_forward(fn, *inputs, repeats=10, desc='', verbose=True, **kwinputs): + """Use Pytorch Benchmark on the forward pass of an arbitrary function.""" + if verbose: + print(desc, '- Forward pass') + t = benchmark.Timer( + stmt='fn(*inputs, **kwinputs)', + globals={'fn': fn, 'inputs': inputs, 'kwinputs': kwinputs}, + num_threads=torch.get_num_threads(), + ) + m = t.timeit(repeats) + if verbose: + print(m) + return t, m + + +torch.manual_seed(0) +repeats = 30 +dtype = torch.float16 +device = 'cuda' +verbose = False +m, n = 8192, 8192 + +tflops_matmul = {} +tflops_matmul1 = {} +for k in [512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120, 
5632, 6144, 6656, 7168, 7680, 8192]: + a = torch.randn(m, k, device=device, dtype=dtype) + b = torch.randn(n, k, device=device, dtype=dtype).transpose(-1, -2) + nFLOPS_matmul = 2 * m * n * k + time.sleep(2) # to reduce power throttling + timing = benchmark_forward(torch.matmul, a, b, desc='cuBLAS', verbose=verbose, repeats=repeats)[1] + tflops_matmul[k] = nFLOPS_matmul / timing.mean * 1e-12 + print(f'[torch.utils.benchmark] cuBLAS, {m = }, {n = }, {k = }: {timing.mean * 1e3:.3f}ms, {tflops_matmul[k]:.1f} TFLOPS') + time.sleep(2) # to reduce power throttling + ms = do_bench(lambda: torch.matmul(a, b), warmup=10, rep=repeats) + tflops_matmul1[k] = nFLOPS_matmul / ms * 1e-9 + print(f'[triton.test.do_bench] cuBLAS, {m = }, {n = }, {k = }: {ms:.3f}ms, {tflops_matmul1[k]:.1f} TFLOPS') diff --git a/bert.py b/bert.py new file mode 100644 index 0000000000000000000000000000000000000000..33d6935202a1b99393ef34d56a6b4fa0e188ab57 --- /dev/null +++ b/bert.py @@ -0,0 +1,764 @@ +# Copyright (c) 2022, Tri Dao. +# This BERT implementation is based on our MLPerf 2.0 and MLPerf 2.1 BERT implementation. +# https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py +# https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py + +# Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py + +import logging +import re +from collections import OrderedDict +from collections.abc import Sequence +from functools import partial +from typing import Any, Mapping + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import BertConfig, PretrainedConfig +from transformers.models.bert.modeling_bert import ( + BaseModelOutputWithPoolingAndCrossAttentions, + BertForPreTrainingOutput, +) + +from flash_attn.bert_padding import ( + index_first_axis, + index_first_axis_residual, + pad_input, + unpad_input, +) +from flash_attn.modules.block import Block +from flash_attn.modules.embedding import BertEmbeddings +from flash_attn.modules.mha import MHA +from flash_attn.modules.mlp import FusedMLP, Mlp +from flash_attn.utils.pretrained import state_dict_from_pretrained + +try: + from flash_attn.ops.fused_dense import FusedDense +except ImportError: + FusedDense = None + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn +except ImportError: + layer_norm_fn = None + + +try: + from flash_attn.losses.cross_entropy import CrossEntropyLoss +except ImportError: + CrossEntropyLoss = None + + +logger = logging.getLogger(__name__) + + +def create_mixer_cls(config, cross_attn=False, return_residual=False): + use_flash_attn = getattr(config, "use_flash_attn", False) + fused_bias_fc = getattr(config, "fused_bias_fc", False) + rotary_kwargs = {} + if config.position_embedding_type == "rotary": + rotary_kwargs["rotary_emb_dim"] = getattr(config, "rotary_emb_dim", config.hidden_size) + rotary_kwargs["rotary_emb_base"] = getattr(config, "rotary_emb_base", 10000.0) + rotary_kwargs["rotary_emb_scale_base"] = getattr(config, "rotary_emb_scale_base", None) + rotary_kwargs["rotary_emb_interleaved"] = getattr(config, "rotary_emb_interleaved", False) + mixer_cls = partial( + MHA, + num_heads=config.num_attention_heads, + cross_attn=cross_attn, + dropout=config.attention_probs_dropout_prob, + causal=False, + fused_bias_fc=fused_bias_fc, + use_flash_attn=use_flash_attn, + 
return_residual=return_residual, + **rotary_kwargs, + ) + return mixer_cls + + +def create_mlp_cls(config, layer_idx=None, return_residual=False): + inner_dim = config.intermediate_size + fused_mlp = getattr(config, "fused_mlp", False) + if fused_mlp: + assert config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"], ( + "fused_mlp only " "supports approximate gelu" + ) + if not fused_mlp: + approximate = ( + "tanh" + if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"] + else "none" + ) + mlp_cls = partial( + Mlp, + hidden_features=inner_dim, + activation=partial(F.gelu, approximate=approximate), + return_residual=return_residual, + ) + else: + if FusedMLP is None: + raise ImportError("fused_dense is not installed") + mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0) + # mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer + if isinstance(mlp_checkpoint_lvl, Sequence): + assert layer_idx is not None + mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx] + mlp_cls = partial( + FusedMLP, + hidden_features=inner_dim, + checkpoint_lvl=mlp_checkpoint_lvl, + return_residual=return_residual, + ) + return mlp_cls + + +def create_block(config, layer_idx=None): + last_layer_subset = getattr(config, "last_layer_subset", False) + cross_attn = last_layer_subset and layer_idx == config.num_hidden_layers - 1 + # TD [2022-12-19]: For cross attention (last layer), we actually want to return the + # residual x_kv, not residual x. But it's annoying to change the API (and it only affects + # one layer) so we just choose not to return residual in this case. + return_residual = not cross_attn + mixer_cls = create_mixer_cls(config, cross_attn, return_residual=return_residual) + mlp_cls = create_mlp_cls(config, layer_idx, return_residual=return_residual) + norm_cls = partial(nn.LayerNorm, eps=config.layer_norm_eps) + block = Block( + config.hidden_size, + mixer_cls, + mlp_cls, + norm_cls=norm_cls, + prenorm=False, + resid_dropout1=config.hidden_dropout_prob, + resid_dropout2=config.hidden_dropout_prob, + fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False), + return_residual=return_residual, + ) + return block + + +# https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748 +def _init_weights(module, initializer_range=0.02): + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, std=initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + if module.padding_idx is not None: + nn.init.zeros_(module.weight[module.padding_idx]) + + +class BertEncoder(nn.Module): + def __init__(self, config: BertConfig): + super().__init__() + self.use_flash_attn = getattr(config, "use_flash_attn", False) + self.layers = nn.ModuleList( + [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)] + ) + + def forward(self, hidden_states, key_padding_mask=None, subset_mask=None): + """If subset_mask is not None, we only want output for the subset of the sequence. + This means that we only compute the last layer output for these tokens. 
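+        In the padded (non-FlashAttention) path the subset is simply selected from the
+        full last-layer output; in the unpadded FlashAttention path, every layer except
+        the last runs on the full sequence, and the last layer runs as cross-attention
+        with only the subset tokens as queries and the full sequence as keys/values.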
+ subset_mask: (batch, seqlen), dtype=torch.bool + """ + if key_padding_mask is None or not self.use_flash_attn: + mixer_kwargs = ( + {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None + ) + for layer in self.layers: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + if subset_mask is not None: + hidden_states = hidden_states[subset_mask] + else: + batch, seqlen = hidden_states.shape[:2] + hidden_states, indices, cu_seqlens, max_seqlen_in_batch = unpad_input( + hidden_states, key_padding_mask + ) + mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch} + if subset_mask is None: + for layer in self.layers: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + hidden_states = pad_input(hidden_states, indices, batch, seqlen) + else: + for layer in self.layers[:-1]: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + if key_padding_mask is not None: + subset_idx = torch.nonzero( + subset_mask[key_padding_mask], as_tuple=False + ).flatten() + subset_seqlens = (subset_mask & key_padding_mask).sum(dim=-1, dtype=torch.int32) + subset_cu_seqlens = F.pad( + torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0) + ) + else: + subset_idx = torch.nonzero(subset_mask, as_tuple=False).flatten() + subset_seqlens = subset_mask.sum(dim=-1, dtype=torch.int32) + subset_cu_seqlens = F.pad( + torch.cumsum(subset_seqlens, dim=0, dtype=torch.torch.int32), (1, 0) + ) + hidden_states_subset, hidden_states = index_first_axis_residual( + hidden_states, subset_idx + ) + # It's ok to set max_seqlen_q to be much larger + mixer_kwargs = { + "x_kv": hidden_states, + "cu_seqlens": subset_cu_seqlens, + "max_seqlen": max_seqlen_in_batch, + "cu_seqlens_k": cu_seqlens, + "max_seqlen_k": max_seqlen_in_batch, + } + hidden_states = self.layers[-1](hidden_states_subset, mixer_kwargs=mixer_kwargs) + return hidden_states + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + self.dense = linear_cls(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states, pool=True): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
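+        # When pool=False the caller is expected to have already gathered exactly the
+        # rows to pool (e.g. the CLS positions selected in BertModel.forward's
+        # masked-token path), so no slicing is applied here.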
+ first_token_tensor = hidden_states[:, 0] if pool else hidden_states + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False) + if self.fused_dropout_add_ln and layer_norm_fn is None: + raise ImportError("Triton is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + self.dense = linear_cls(config.hidden_size, config.hidden_size) + approximate = ( + "tanh" + if config.hidden_act in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"] + else "none" + ) + self.transform_act_fn = nn.GELU(approximate=approximate) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + if not self.fused_dropout_add_ln: + hidden_states = self.layer_norm(hidden_states) + else: + hidden_states = layer_norm_fn( + hidden_states, self.layer_norm.weight, self.layer_norm.bias, eps=self.layer_norm.eps + ) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = linear_cls(config.hidden_size, config.vocab_size, bias=True) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(nn.Module): + """An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + @classmethod + def from_pretrained(cls, model_name, config, *inputs, **kwargs): + """ + Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name_or_path: either: + - a path or url to a pretrained model archive containing: + . 
`bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + # Instantiate model. + model = cls(config, *inputs, **kwargs) + load_return = model.load_state_dict( + remap_state_dict(state_dict_from_pretrained(model_name), config), strict=False + ) + logger.info(load_return) + return model + + +class BertModel(BertPreTrainedModel): + def __init__(self, config: BertConfig, add_pooling_layer=True): + super().__init__(config) + self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + if config.vocab_size % self.pad_vocab_size_multiple != 0: + config.vocab_size += self.pad_vocab_size_multiple - ( + config.vocab_size % self.pad_vocab_size_multiple + ) + self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False) + if self.fused_dropout_add_ln and layer_norm_fn is None: + raise ImportError("Triton is not installed") + assert config.hidden_act in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh"] + + self.embeddings = BertEmbeddings( + config.hidden_size, + config.vocab_size, + config.max_position_embeddings, + config.type_vocab_size, + padding_idx=config.pad_token_id, + ) + self.emb_drop = nn.Dropout(config.hidden_dropout_prob) + self.emb_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.apply(partial(_init_weights, initializer_range=config.initializer_range)) + + def forward( + self, + input_ids, + position_ids=None, + token_type_ids=None, + attention_mask=None, + masked_tokens_mask=None, + ): + """If masked_tokens_mask is not None (i.e. last_layer_subset == True in BertForPreTraining), + we only want the output for the masked tokens. This means that we only compute the last + layer output for these tokens. + masked_tokens_mask: (batch, seqlen), dtype=torch.bool + """ + hidden_states = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids + ) + # TD [2022-12:18]: Don't need to force residual in fp32 + # BERT puts embedding LayerNorm before embedding dropout. + if not self.fused_dropout_add_ln: + hidden_states = self.emb_ln(hidden_states) + else: + hidden_states = layer_norm_fn( + hidden_states, self.emb_ln.weight, self.emb_ln.bias, eps=self.emb_ln.eps + ) + hidden_states = self.emb_drop(hidden_states) + + if masked_tokens_mask is not None: + batch_size, seqlen = input_ids.shape[:2] + # We also need the first column for the CLS token + first_col_mask = torch.zeros( + batch_size, seqlen, dtype=torch.bool, device=input_ids.device + ) + first_col_mask[:, 0] = True + subset_mask = masked_tokens_mask | first_col_mask + else: + subset_mask = None + + sequence_output = self.encoder( + hidden_states, key_padding_mask=attention_mask, subset_mask=subset_mask + ) + + if masked_tokens_mask is None: + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + else: + # TD [2022-03-01]: the indexing here is very tricky. 
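+            # At this point sequence_output only holds the rows selected by subset_mask
+            # (the masked tokens plus the first/CLS column). subset_mask[attention_mask]
+            # re-expresses that selection over the unpadded tokens, and indexing
+            # first_col_mask / masked_tokens_mask with it splits the subset rows back
+            # into the CLS rows (fed to the pooler) and the masked-token rows (fed to
+            # the MLM head).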
+ if attention_mask is not None: + subset_idx = subset_mask[attention_mask] + pool_input = sequence_output[first_col_mask[attention_mask][subset_idx]] + sequence_output = sequence_output[masked_tokens_mask[attention_mask][subset_idx]] + else: + pool_input = sequence_output[first_col_mask[subset_mask]] + sequence_output = sequence_output[masked_tokens_mask[subset_mask]] + pooled_output = self.pooler(pool_input, pool=False) if self.pooler is not None else None + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + +class BertForPreTraining(BertPreTrainedModel): + def __init__(self, config: BertConfig): + super().__init__(config) + # If dense_seq_output, we only need to pass the hidden states for the masked out tokens + # (around 15%) to the classifier heads. + self.dense_seq_output = getattr(config, "dense_seq_output", False) + # If last_layer_subset, we only need the compute the last layer for a subset of tokens + # (e.g., the tokens we need to compute the masked LM loss and the next-sentence prediction). + self.last_layer_subset = getattr(config, "last_layer_subset", False) + if self.last_layer_subset: + assert self.dense_seq_output, "last_layer_subset requires dense_seq_output" + use_xentropy = getattr(config, "use_xentropy", False) + if use_xentropy and CrossEntropyLoss is None: + raise ImportError("xentropy_cuda is not installed") + loss_cls = ( + nn.CrossEntropyLoss + if not use_xentropy + else partial(CrossEntropyLoss, inplace_backward=True) + ) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + self.mlm_loss = loss_cls(ignore_index=0) + self.nsp_loss = loss_cls(ignore_index=-1) + + # Initialize weights and apply final processing + self.apply(partial(_init_weights, initializer_range=config.initializer_range)) + self.tie_weights() + + def tie_weights(self): + self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight + + def forward( + self, + input_ids, + position_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + next_sentence_label=None, + ): + """ + If labels are provided, they must be 0 for masked out tokens (as specified in the attention + mask). + Outputs: + if `labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. 
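+        Note: when config.dense_seq_output is set and labels are provided, the masked
+        language modeling logits are returned only for the masked positions (flattened),
+        not for the full [batch_size, sequence_length] grid.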
+ + """ + masked_tokens_mask = labels > 0 if (self.last_layer_subset and labels is not None) else None + outputs = self.bert( + input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask.bool() if attention_mask is not None else None, + masked_tokens_mask=masked_tokens_mask, + ) + sequence_output, pooled_output = outputs.last_hidden_state, outputs.pooler_output + if self.dense_seq_output and labels is not None: + masked_token_idx = torch.nonzero(labels.flatten() > 0, as_tuple=False).flatten() + if not self.last_layer_subset: + sequence_output = index_first_axis( + rearrange(sequence_output, "b s d -> (b s) d"), masked_token_idx + ) + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + if ( + self.dense_seq_output and labels is not None + ): # prediction_scores are already flattened + masked_lm_loss = self.mlm_loss( + prediction_scores, labels.flatten()[masked_token_idx] + ) + else: + masked_lm_loss = self.mlm_loss( + rearrange(prediction_scores, "... v -> (...) v"), + rearrange(labels, "... -> (...)"), + ) + next_sentence_loss = self.nsp_loss( + rearrange(seq_relationship_score, "... t -> (...) t"), + rearrange(next_sentence_label, "... -> (...)"), + ) + total_loss = masked_lm_loss.float() + next_sentence_loss.float() + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + ) + + +def remap_state_dict(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a Huggingface BERT model to be flash_attn compatible. + """ + + # LayerNorm + def key_mapping_ln_gamma_beta(key): + key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key) + key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key) + return key + + state_dict = OrderedDict((key_mapping_ln_gamma_beta(k), v) for k, v in state_dict.items()) + + # Layers + def key_mapping_layers(key): + return re.sub(r"^bert.encoder.layer.", "bert.encoder.layers.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^bert.embeddings.LayerNorm.", "bert.emb_ln.", key) + key = re.sub( + r"^bert.encoder.layers.(\d+).attention.output.LayerNorm.(weight|bias)", + r"bert.encoder.layers.\1.norm1.\2", + key, + ) + key = re.sub( + r"^bert.encoder.layers.(\d+).output.LayerNorm.(weight|bias)", + r"bert.encoder.layers.\1.norm2.\2", + key, + ) + key = re.sub( + r"^cls.predictions.transform.LayerNorm.(weight|bias)", + r"cls.predictions.transform.layer_norm.\1", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^bert.encoder.layers.(\d+).intermediate.dense.(weight|bias)", + r"bert.encoder.layers.\1.mlp.fc1.\2", + key, + ) + key = re.sub( + r"^bert.encoder.layers.(\d+).output.dense.(weight|bias)", + r"bert.encoder.layers.\1.mlp.fc2.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + last_layer_subset = getattr(config, "last_layer_subset", False) + for d in range(config.num_hidden_layers): + Wq = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.weight") + Wk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.weight") + Wv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.weight") + bq 
= state_dict.pop(f"bert.encoder.layers.{d}.attention.self.query.bias") + bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias") + bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias") + if not (last_layer_subset and d == config.num_hidden_layers - 1): + state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.weight"] = torch.cat( + [Wq, Wk, Wv], dim=0 + ) + state_dict[f"bert.encoder.layers.{d}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0) + else: + state_dict[f"bert.encoder.layers.{d}.mixer.Wq.weight"] = Wq + state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.weight"] = torch.cat([Wk, Wv], dim=0) + state_dict[f"bert.encoder.layers.{d}.mixer.Wq.bias"] = bq + state_dict[f"bert.encoder.layers.{d}.mixer.Wkv.bias"] = torch.cat([bk, bv], dim=0) + + def key_mapping_attn(key): + return re.sub( + r"^bert.encoder.layers.(\d+).attention.output.dense.(weight|bias)", + r"bert.encoder.layers.\1.mixer.out_proj.\2", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + def key_mapping_decoder_bias(key): + return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key) + + state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items()) + + # Word embedding + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + if pad_vocab_size_multiple > 1: + word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"] + state_dict["bert.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, config.vocab_size - word_embeddings.shape[0]) + ) + decoder_weight = state_dict["cls.predictions.decoder.weight"] + state_dict["cls.predictions.decoder.weight"] = F.pad( + decoder_weight, (0, 0, 0, config.vocab_size - decoder_weight.shape[0]) + ) + # If the vocab was padded, we want to set the decoder bias for those padded indices to be + # strongly negative (i.e. the decoder shouldn't predict those indices). + # TD [2022-05-09]: I don't think it affects the MLPerf training. + decoder_bias = state_dict["cls.predictions.decoder.bias"] + state_dict["cls.predictions.decoder.bias"] = F.pad( + decoder_bias, (0, config.vocab_size - decoder_bias.shape[0]), value=-100.0 + ) + + return state_dict + + +def inv_remap_state_dict(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a flash_attn model to be Huggingface BERT compatible. + + This function is meant to be the inverse of remap_state_dict. 
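+    Note: when pad_vocab_size_multiple > 1 this assumes config.orig_vocab_size holds the
+    original (pre-padding) vocabulary size, which is used to un-pad the embedding and
+    decoder weights.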
+ """ + # Word embedding + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + if pad_vocab_size_multiple > 1: + word_embeddings = state_dict["bert.embeddings.word_embeddings.weight"] + decoder_weight = state_dict["cls.predictions.decoder.weight"] + decoder_bias = state_dict["cls.predictions.decoder.bias"] + # unpad embeddings + state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings[ + : config.orig_vocab_size, : + ] + state_dict["cls.predictions.decoder.weight"] = decoder_weight[: config.orig_vocab_size, :] + state_dict["cls.predictions.decoder.bias"] = decoder_bias[: config.orig_vocab_size] + + for d in range(config.num_hidden_layers): + last_layer_subset = getattr(config, "last_layer_subset", False) + if not last_layer_subset or d != (config.num_hidden_layers - 1): + Wqkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.weight") + Wqkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wqkv.bias") + state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wqkv_weights[ + : Wqkv_weights.shape[0] // 3, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wqkv_weights[ + Wqkv_weights.shape[0] // 3 : 2 * Wqkv_weights.shape[0] // 3, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wqkv_weights[ + 2 * Wqkv_weights.shape[0] // 3 :, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wqkv_biases[ + : Wqkv_biases.shape[0] // 3 + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wqkv_biases[ + Wqkv_biases.shape[0] // 3 : 2 * Wqkv_biases.shape[0] // 3 + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wqkv_biases[ + 2 * Wqkv_biases.shape[0] // 3 : + ] + else: + Wq_weight = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.weight") + Wkv_weights = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.weight") + Wq_bias = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wq.bias") + Wkv_biases = state_dict.pop(f"bert.encoder.layers.{d}.mixer.Wkv.bias") + state_dict[f"bert.encoder.layers.{d}.attention.self.query.weight"] = Wq_weight + state_dict[f"bert.encoder.layers.{d}.attention.self.key.weight"] = Wkv_weights[ + : Wkv_weights.shape[0] // 2, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.weight"] = Wkv_weights[ + Wkv_weights.shape[0] // 2 :, : + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.query.bias"] = Wq_bias + state_dict[f"bert.encoder.layers.{d}.attention.self.key.bias"] = Wkv_biases[ + : Wkv_biases.shape[0] // 2 + ] + state_dict[f"bert.encoder.layers.{d}.attention.self.value.bias"] = Wkv_biases[ + Wkv_biases.shape[0] // 2 : + ] + + def inv_key_mapping_ln(key): + key = re.sub(r"bert.emb_ln.", "bert.embeddings.LayerNorm.", key) + key = re.sub( + r"bert.encoder.layers.(\d+).norm1.(weight|bias)", + r"bert.encoder.layers.\1.attention.output.LayerNorm.\2", + key, + ) + key = re.sub( + r"bert.encoder.layers.(\d+).norm2.(weight|bias)", + r"bert.encoder.layers.\1.output.LayerNorm.\2", + key, + ) + key = re.sub( + r"cls.predictions.transform.layer_norm.(weight|bias)", + r"cls.predictions.transform.LayerNorm.\1", + key, + ) + return key + + def inv_key_mapping_ln_gamma_beta(key): + key = re.sub(r"LayerNorm.weight$", "LayerNorm.gamma", key) + key = re.sub(r"LayerNorm.bias$", "LayerNorm.beta", key) + return key + + def inv_key_mapping_layers(key): + return re.sub(r"bert.encoder.layers.", "bert.encoder.layer.", key) + + def inv_key_mapping_mlp(key): + key = re.sub( + 
r"bert.encoder.layer.(\d+).mlp.fc1.(weight|bias)", + r"bert.encoder.layer.\1.intermediate.dense.\2", + key, + ) + key = re.sub( + r"bert.encoder.layer.(\d+).mlp.fc2.(weight|bias)", + r"bert.encoder.layer.\1.output.dense.\2", + key, + ) + return key + + def inv_key_mapping_attn(key): + return re.sub( + r"bert.encoder.layer.(\d+).mixer.out_proj.(weight|bias)", + r"bert.encoder.layer.\1.attention.output.dense.\2", + key, + ) + + def inv_key_mapping_decoder_bias(key): + return re.sub(r"cls.predictions.decoder.bias", "cls.predictions.bias", key) + + state_dict = OrderedDict((inv_key_mapping_ln(key), value) for key, value in state_dict.items()) + state_dict = OrderedDict( + (inv_key_mapping_ln_gamma_beta(key), value) for key, value in state_dict.items() + ) + state_dict = OrderedDict( + (inv_key_mapping_layers(key), value) for key, value in state_dict.items() + ) + state_dict = OrderedDict((inv_key_mapping_mlp(key), value) for key, value in state_dict.items()) + state_dict = OrderedDict( + (inv_key_mapping_attn(key), value) for key, value in state_dict.items() + ) + state_dict = OrderedDict( + (inv_key_mapping_decoder_bias(key), value) for key, value in state_dict.items() + ) + + return state_dict diff --git a/bert_padding.py b/bert_padding.py new file mode 100644 index 0000000000000000000000000000000000000000..1d447d3f660e1a6ddd7e7f6fb7d1ae4241bfec73 --- /dev/null +++ b/bert_padding.py @@ -0,0 +1,213 @@ +# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat + + +class IndexFirstAxis(torch.autograd.Function): + @staticmethod + def forward(ctx, input, indices): + ctx.save_for_backward(indices) + assert input.ndim >= 2 + ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] + second_dim = other_shape.numel() + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. + # return input[indices] + return torch.gather( + rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim) + ).reshape(-1, *other_shape) + + @staticmethod + def backward(ctx, grad_output): + (indices,) = ctx.saved_tensors + assert grad_output.ndim >= 2 + other_shape = grad_output.shape[1:] + grad_output = rearrange(grad_output, "b ... -> b (...)") + grad_input = torch.zeros( + [ctx.first_axis_dim, grad_output.shape[1]], + device=grad_output.device, + dtype=grad_output.dtype, + ) + # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. + # grad_input[indices] = grad_output + grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output) + return grad_input.reshape(ctx.first_axis_dim, *other_shape), None + + +index_first_axis = IndexFirstAxis.apply + + +class IndexPutFirstAxis(torch.autograd.Function): + @staticmethod + def forward(ctx, values, indices, first_axis_dim): + ctx.save_for_backward(indices) + assert indices.ndim == 1 + assert values.ndim >= 2 + output = torch.zeros( + first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype + ) + # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing. + output[indices] = values + # output.scatter_(0, repeat(indices, 'z -> z d', d=values.shape[1]), values) + return output + + @staticmethod + def backward(ctx, grad_output): + (indices,) = ctx.saved_tensors + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. 
+ grad_values = grad_output[indices] + # grad_values = torch.gather(grad_output, 0, repeat(indices, 'z -> z d', d=grad_output.shape[1])) + return grad_values, None, None + + +index_put_first_axis = IndexPutFirstAxis.apply + + +class IndexFirstAxisResidual(torch.autograd.Function): + @staticmethod + def forward(ctx, input, indices): + ctx.save_for_backward(indices) + assert input.ndim >= 2 + ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:] + second_dim = other_shape.numel() + # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing. + output = input[indices] + # We don't want to reshape input (b ... -> b (...)) since it could change the channel_last + # memory format to channel_first. In other words, input might not be contiguous. + # If we don't detach, Pytorch complains about output being a view and is being modified inplace + return output, input.detach() + + @staticmethod + def backward(ctx, grad_output, grad_residual): + (indices,) = ctx.saved_tensors + assert grad_output.ndim >= 2 + other_shape = grad_output.shape[1:] + assert grad_residual.shape[1:] == other_shape + grad_input = grad_residual + # grad_input[indices] += grad_output + indices = indices.reshape(indices.shape[0], *((1,) * (grad_output.ndim - 1))) + indices = indices.expand_as(grad_output) + grad_input.scatter_add_(0, indices, grad_output) + return grad_input.reshape(ctx.first_axis_dim, *other_shape), None + + +index_first_axis_residual = IndexFirstAxisResidual.apply + + +def unpad_input(hidden_states, attention_mask): + """ + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid. + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. + max_seqlen_in_batch: int + """ + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the + # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim + # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to + # index with integer indices. Moreover, torch's index is a bit slower than it needs to be, + # so we write custom forward and backward to make it a bit faster. + return ( + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def unpad_input_for_concatenated_sequences(hidden_states, attention_mask_in_length): + """ + Supports concatenating short samples in one sequence. The attention_mask_in_length is utilized to mask other short samples. It helps efficient training of variant lengths-based samples (e.g., the supervised fine-tuning task in large language model). + The motivation for this function is explained [here](https://github.com/Dao-AILab/flash-attention/issues/432#issuecomment-1668822286). 
+ + For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is: + ``` + [ + [2, 3, 0, 0, 0, 0], + [3, 2, 0, 0, 0, 0], + [6, 0, 0, 0, 0, 0] + ] + ``` + , which refers to the 3D-attention mask: + ``` + [ + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 0, 0], + [0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1] + ] + ] + ```. + + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none. + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices of non-masked tokens from the flattened input sequence. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. + max_seqlen_in_batch: int + """ + length = attention_mask_in_length.sum(dim=-1) + seqlen = attention_mask_in_length.size(-1) + attention_mask_2d = torch.arange(seqlen, device=length.device, dtype=length.dtype).expand(len(length), seqlen) < length.unsqueeze(1) + real_indices_idx = torch.nonzero(attention_mask_in_length.flatten(), as_tuple=False).flatten() + seqlens_in_batch = attention_mask_in_length.flatten()[real_indices_idx] + indices = torch.nonzero(attention_mask_2d.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the + # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim + # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to + # index with integer indices. Moreover, torch's index is a bit slower than it needs to be, + # so we write custom forward and backward to make it a bit faster. + return ( + index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def pad_input(hidden_states, indices, batch, seqlen): + """ + Arguments: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence. + batch: int, batch size for the padded sequence. + seqlen: int, maximum sequence length for the padded sequence. + Return: + hidden_states: (batch, seqlen, ...) + """ + dim = hidden_states.shape[-1] + # output = torch.zeros((batch * seqlen), dim, device=hidden_states.device, dtype=hidden_states.dtype) + # output[indices] = hidden_states + output = index_put_first_axis(hidden_states, indices, batch * seqlen) + return rearrange(output, "(b s) ... 
-> b s ...", b=batch) diff --git a/bigcode.py b/bigcode.py new file mode 100644 index 0000000000000000000000000000000000000000..234944d4d6907fb3e1b0c2c3c315a2bee29d7775 --- /dev/null +++ b/bigcode.py @@ -0,0 +1,233 @@ +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from transformers import GPT2Config, GPTBigCodeConfig, PretrainedConfig + + +def remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a Huggingface BigCode model to be flash_attn compatible. + """ + + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key) + + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.wte.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub( + r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", + r"transformer.layers.\1.norm\2.\3", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.h.(\d+).mlp.c_fc.weight", + r"transformer.layers.\1.mlp.fc1.weight", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).mlp.c_proj.weight", + r"transformer.layers.\1.mlp.fc2.weight", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).mlp.c_fc.bias", + r"transformer.layers.\1.mlp.fc1.bias", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).mlp.c_proj.bias", + r"transformer.layers.\1.mlp.fc2.bias", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # TODO: add support for multi-head attention + assert config.multi_query, "Only multi-query attention is supported" + + # Attention + for d in range(config.num_hidden_layers): + embed_dim = config.n_embd + head_dim = embed_dim // config.n_head + + c_attn_weight = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight") + # with multi-query attention, the weights have shape (embed_dim, embed_dim + head_dim + head_dim) + # see https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py#L112 + # see also https://github.com/ggerganov/ggml/blob/dd1d575956e54c5bdc07632f25506b3b1884dbd2/examples/starcoder/convert-hf-to-ggml.py#L183 + # ((n_head + 2) * head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim) + q, k, v = torch.split(c_attn_weight, [embed_dim, head_dim, head_dim], dim=0) + # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) + k = torch.tile(k, (config.n_head, 1)) + v = torch.tile(v, (config.n_head, 1)) + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = torch.cat((q, k, v), dim=0) + + # same deal with the bias + c_attn_bias = state_dict.pop(f"transformer.h.{d}.attn.c_attn.bias") + # ((n_head + 2) 
* head_dim, embed_dim) -> (3 * n_heads * head_dim, hidden_dim) + q, k, v = torch.split(c_attn_bias, [embed_dim, head_dim, head_dim], dim=0) + # duplicate k, v along the first axis (head_dim, hidden_dim) -> (n_heads * head_dim, hidden_dim) + k = torch.tile(k, (config.n_head,)) + v = torch.tile(v, (config.n_head,)) + state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = torch.cat((q, k, v), dim=0) + + def key_mapping_attn(key): + key = re.sub( + r"^transformer.h.(\d+).attn.c_proj.weight", + r"transformer.layers.\1.mixer.out_proj.weight", + key, + ) + key = re.sub( + r"^transformer.h.(\d+).attn.c_proj.bias", + r"transformer.layers.\1.mixer.out_proj.bias", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def inv_remap_state_dict_hf_bigcode(state_dict, config: PretrainedConfig): + """ + Map the state_dict of a flash_attn model to be Huggingface BigCode compatible. + + This function is meant to be the inverse of remap_state_dict_hf_bigcode. + """ + + # Word embedding and position embeddings + def inv_key_mapping_pos_emb(key): + return re.sub(r"^transformer.embeddings.position_embeddings.", "transformer.wpe.", key) + + state_dict = OrderedDict((inv_key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + + word_embeddings = word_embeddings[:, : config.vocab_size] + state_dict["transformer.wte.weight"] = word_embeddings + state_dict["lm_head.weight"] = word_embeddings + + # LayerNorm + def inv_key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub( + r"^transformer.layers.(\d+).norm(1|2).(weight|bias)", + r"transformer.h.\1.ln_\2.\3", + key, + ) + return key + + state_dict = OrderedDict((inv_key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLPs + def inv_key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc1.weight", + r"transformer.h.\1.mlp.c_fc.weight", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc2.weight", + r"transformer.h.\1.mlp.c_proj.weight", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc1.bias", + r"transformer.h.\1.mlp.c_fc.bias", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc2.bias", + r"transformer.h.\1.mlp.c_proj.bias", + key, + ) + return key + + state_dict = OrderedDict((inv_key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for d in range(config.num_hidden_layers): + embed_dim = config.n_embd + head_dim = embed_dim // config.n_head + + Wqkv_weight = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight") + q, k, v = torch.split( + Wqkv_weight, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0 + ) + c_attn_weight = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0) + state_dict[f"transformer.h.{d}.attn.c_attn.weight"] = c_attn_weight + + # Same deal with the bias + Wqkv_bias = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias") + q, k, v = torch.split( + Wqkv_bias, [embed_dim, head_dim * config.n_head, head_dim * config.n_head], dim=0 + ) + c_attn_bias = torch.cat((q, k[:head_dim], v[:head_dim]), dim=0) + state_dict[f"transformer.h.{d}.attn.c_attn.bias"] = c_attn_bias + + def inv_key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).mixer.out_proj.weight", + r"transformer.h.\1.attn.c_proj.weight", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mixer.out_proj.bias", + 
r"transformer.h.\1.attn.c_proj.bias", + key, + ) + return key + + state_dict = OrderedDict((inv_key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def bigcode_config_to_gpt2_config(bigcode_config: GPTBigCodeConfig) -> GPT2Config: + return GPT2Config( + activation_function=bigcode_config.activation_function, + attn_pdrop=bigcode_config.attn_pdrop, + bos_token_id=bigcode_config.bos_token_id, + embd_pdrop=bigcode_config.embd_pdrop, + eos_token_id=bigcode_config.eos_token_id, + initializer_range=bigcode_config.initializer_range, + layer_norm_epsilon=bigcode_config.layer_norm_epsilon, + max_batch_size=bigcode_config.max_batch_size, + max_sequence_length=bigcode_config.max_sequence_length, + model_type=bigcode_config.model_type, + multi_query=bigcode_config.multi_query, + n_embd=bigcode_config.n_embd, + n_head=bigcode_config.n_head, + n_inner=bigcode_config.n_inner, + n_layer=bigcode_config.n_layer, + n_positions=bigcode_config.n_positions, + resid_pdrop=bigcode_config.resid_pdrop, + scale_attn_weights=bigcode_config.scale_attn_weights, + summary_activation=bigcode_config.summary_activation, + summary_first_dropout=bigcode_config.summary_first_dropout, + summary_proj_to_labels=bigcode_config.summary_proj_to_labels, + summary_type=bigcode_config.summary_type, + summary_use_proj=bigcode_config.summary_use_proj, + use_cache=bigcode_config.use_cache, + vocab_size=bigcode_config.vocab_size, + ) diff --git a/block.py b/block.py new file mode 100644 index 0000000000000000000000000000000000000000..be8e8b864b600220068c2ec16aba5e2f1a81c121 --- /dev/null +++ b/block.py @@ -0,0 +1,397 @@ +# Copyright (c) 2024, Tri Dao. + +from functools import partial +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torchvision.ops import StochasticDepth + +from flash_attn.modules.mha import MHA +from flash_attn.modules.mlp import Mlp + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm +except ImportError: + layer_norm_fn, RMSNorm = None, None + + +class Block(nn.Module): + def __init__( + self, + dim, + mixer_cls=None, + mlp_cls=None, + norm_cls=nn.LayerNorm, + dropout_cls=nn.Dropout, + prenorm=True, + resid_dropout1=0.0, + resid_dropout2=0.0, + drop_path1=0.0, + drop_path2=0.0, + fused_dropout_add_ln=False, + return_residual=False, + residual_in_fp32=False, + sequence_parallel=False, + mark_shared_params=False, + ): + """ + For prenorm=True, this Block has a slightly different structure compared to a regular + prenorm Transformer block. + The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add. + [Ref: https://arxiv.org/abs/2002.04745] + Here we have: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, returning both + the hidden_states (output of the MLP) and the residual. + This is for performance reasons, as we can fuse the dropout, add and LayerNorm. + The residual needs to be provided (except for the very first block). + + For prenorm=False, this Block has the same structure as a regular postnorm Transformer + block: MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add -> LN. + + return_residual: whether each of the sub-layers (mixer and mlp) will return the residual. + This is for performance reason: for post-norm architecture, returning the input allows us + to fuse the backward of nn.Linear with the residual connection. 
+ """ + super().__init__() + self.prenorm = prenorm + self.fused_dropout_add_ln = fused_dropout_add_ln + self.return_residual = return_residual + self.residual_in_fp32 = residual_in_fp32 + if self.residual_in_fp32: + assert self.prenorm, "residual_in_fp32 is only compatible with prenorm=True" + if mixer_cls is None: + mixer_cls = partial(MHA, num_heads=dim // 64) + if mlp_cls is None: + mlp_cls = partial(Mlp, hidden_features=4 * dim) + self.mixer = mixer_cls(dim) + self.dropout1 = dropout_cls(resid_dropout1) + self.drop_path1 = StochasticDepth(drop_path1, mode="row") + self.norm1 = norm_cls(dim) + self.mlp = mlp_cls(dim) + if not isinstance(self.mlp, nn.Identity): + self.dropout2 = dropout_cls(resid_dropout2) + self.drop_path2 = StochasticDepth(drop_path2, mode="row") + self.norm2 = norm_cls(dim) + + if self.fused_dropout_add_ln: + assert layer_norm_fn is not None, "Triton is not installed" + assert isinstance(self.norm1, (nn.LayerNorm, RMSNorm)) and isinstance( + self.dropout1, nn.Dropout + ) + + # TD [2023-01-07]: TODO: During training, if sequence_parallel is False and dropout != 0.0, + # then the input to each worker in the tensor parallel group will be different. + # This would produce wrong outputs? Somehow we'd need to sync the RNG state across workers. + # For now this is not an issue because we always use sequence_parallel=True during training + # and only use sequence_parallel=False during inference. + + # Mark the norm parameters as "sequence_parallel" so that we run all-reduce on their grads. + if sequence_parallel: + for p in self.norm1.parameters(): + p._sequence_parallel = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._sequence_parallel = True + # Mark the norm parameters as "shared_params" so that we sync their values at init. + if mark_shared_params: + for p in self.norm1.parameters(): + p._shared_params = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._shared_params = True + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + + def forward( + self, + hidden_states: Tensor, + residual: Optional[Tensor] = None, + mixer_subset=None, + mixer_kwargs=None, + ): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). + residual: if postnorm, residual=None, If prenorm, hidden_states = Attn/MLP(LN(residual)) + mixer_subset: for cross-attention only. If not None, will take a subset of x + before applying the query projection. Useful for e.g., ViT where we only care + about the CLS token in the last layer. 
+ """ + if self.prenorm: + if not self.fused_dropout_add_ln: + dropped = self.drop_path1(self.dropout1(hidden_states)) + residual = (dropped + residual) if residual is not None else dropped + hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + if self.drop_path1.p == 0 or not self.training: + rowscale1 = None + else: + rowscale1 = self.drop_path1( + torch.ones( + hidden_states.shape[:-1], + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + ) + hidden_states, residual = layer_norm_fn( + hidden_states, + self.norm1.weight, + self.norm1.bias, + residual=residual, + eps=self.norm1.eps, + dropout_p=self.dropout1.p if self.training else 0.0, + rowscale=rowscale1, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + is_rms_norm=isinstance(self.norm1, RMSNorm) + ) + if mixer_kwargs is None: + mixer_kwargs = {} + if mixer_subset is not None: + mixer_kwargs["mixer_subset"] = mixer_subset + hidden_states = self.mixer(hidden_states, **mixer_kwargs) + if mixer_subset is not None: + residual = residual[:, mixer_subset] + if not isinstance(self.mlp, nn.Identity): + if not self.fused_dropout_add_ln: + dropped = self.drop_path2(self.dropout2(hidden_states)) + residual = (dropped + residual) if residual is not None else dropped + hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype)) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + if self.drop_path2.p == 0 or not self.training: + rowscale2 = None + else: + rowscale2 = self.drop_path2( + torch.ones( + hidden_states.shape[:-1], + device=hidden_states.device, + dtype=hidden_states.dtype, + ) + ) + hidden_states, residual = layer_norm_fn( + hidden_states, + self.norm2.weight, + self.norm2.bias, + residual=residual, + eps=self.norm2.eps, + dropout_p=self.dropout2.p if self.training else 0.0, + rowscale=rowscale2, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + is_rms_norm=isinstance(self.norm2, RMSNorm) + ) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + else: + assert residual is None + mixer_out = self.mixer( + hidden_states, **(mixer_kwargs if mixer_kwargs is not None else {}) + ) + if self.return_residual: # mixer out is actually a pair here + mixer_out, hidden_states = mixer_out + if not self.fused_dropout_add_ln: + hidden_states = self.norm1( + (self.drop_path1(self.dropout1(mixer_out)) + hidden_states).to( + dtype=self.norm1.weight.dtype + ) + ) + else: + if self.drop_path1.p == 0 or not self.training: + rowscale1 = None + else: + rowscale1 = self.drop_path1( + torch.ones( + mixer_out.shape[:-1], device=mixer_out.device, dtype=mixer_out.dtype + ) + ) + hidden_states = layer_norm_fn( + mixer_out, + self.norm1.weight, + self.norm1.bias, + residual=hidden_states, + eps=self.norm1.eps, + dropout_p=self.dropout1.p if self.training else 0.0, + rowscale=rowscale1, + prenorm=False, + is_rms_norm=isinstance(self.norm1, RMSNorm) + ) + if not isinstance(self.mlp, nn.Identity): + mlp_out = self.mlp(hidden_states) + if self.return_residual: # mlp out is actually a pair here + mlp_out, hidden_states = mlp_out + if not self.fused_dropout_add_ln: + hidden_states = self.norm2( + (self.drop_path2(self.dropout2(mlp_out)) + hidden_states).to( + dtype=self.norm2.weight.dtype + ) + ) + else: + if self.drop_path2.p == 0 or not self.training: + rowscale2 = None + else: + rowscale2 = self.drop_path2( + torch.ones( + mlp_out.shape[:-1], device=mlp_out.device, dtype=mlp_out.dtype 
+ ) + ) + hidden_states = layer_norm_fn( + mlp_out, + self.norm2.weight, + self.norm2.bias, + residual=hidden_states, + eps=self.norm2.eps, + dropout_p=self.dropout2.p if self.training else 0.0, + rowscale=rowscale2, + prenorm=False, + is_rms_norm=isinstance(self.norm2, RMSNorm) + ) + return hidden_states + + +class ParallelBlock(nn.Module): + """The attention (mixer) and MLP blocks are done in parallel, similar to GPT-J, GPT-NeoX, + and PaLM. + """ + + def __init__( + self, + dim, + mixer_cls=None, + mlp_cls=None, + norm_cls=nn.LayerNorm, + dropout_cls=nn.Dropout, + resid_dropout1=0.0, + resid_dropout2=0.0, + tied_norm=False, + fused_dropout_add_ln=False, + residual_in_fp32=False, + sequence_parallel=False, + mark_shared_params=False, + ): + """ + This Block has a slightly different structure compared to a regular + prenorm Transformer block. + The standard block is: LN -> MHA / MLP -> Dropout -> Add. + [Ref: https://arxiv.org/abs/2002.04745] + Here we have: Dropout -> Add -> LN -> MHA / MLP, returning both + the hidden_states (output1 of the MHA / MLP) and the residual. + This is for performance reasons, as we can fuse the dropout, add and LayerNorm. + The residual needs to be provided (except for the very first block). + """ + super().__init__() + self.tied_norm = tied_norm + self.fused_dropout_add_ln = fused_dropout_add_ln + self.residual_in_fp32 = residual_in_fp32 + if mixer_cls is None: + mixer_cls = partial(MHA, num_heads=dim // 64) + if mlp_cls is None: + mlp_cls = partial(Mlp, hidden_features=4 * dim) + self.mixer = mixer_cls(dim) + self.dropout1 = dropout_cls(resid_dropout1) + self.norm1 = norm_cls(dim) + self.mlp = mlp_cls(dim) + self.dropout2 = dropout_cls(resid_dropout2) + if not self.tied_norm: + self.norm2 = norm_cls(dim) + + if self.fused_dropout_add_ln: + assert layer_norm_fn is not None, "Triton is not installed" + assert isinstance(self.norm1, (nn.LayerNorm, RMSNorm)) and isinstance( + self.dropout1, nn.Dropout + ) + + # TD [2023-01-07]: TODO: During training, if sequence_parallel is False and dropout != 0.0, + # then the input to each worker in the tensor parallel group will be different. + # This would produce wrong outputs? Somehow we'd need to sync the RNG state across workers. + # For now this is not an issue because we always use sequence_parallel=True during training + # and only use sequence_parallel=False during inference. + + # Mark the norm parameters as "sequence_parallel" so that we run all-reduce on their grads. + if sequence_parallel: + for p in self.norm1.parameters(): + p._sequence_parallel = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._sequence_parallel = True + # Mark the norm parameters as "shared_params" so that we sync their values at init. + if mark_shared_params: + for p in self.norm1.parameters(): + p._shared_params = True + if hasattr(self, "norm2"): + for p in self.norm2.parameters(): + p._shared_params = True + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + + def forward( + self, + hidden_states1: Tensor, + hidden_states2: Optional[Tensor] = None, + residual: Optional[Tensor] = None, + mixer_kwargs=None, + ): + r"""Pass the input through the encoder layer. + + Args: + hidden_states1: the output of the previous attention (mixer) or embedding layer. + hidden_states2: the output of the previous MLP layer (if None, will use hidden_states1). + residual. 
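        Dataflow of the non-fused path in plain PyTorch (tied_norm and the fused
        dropout + add + LayerNorm path omitted; a sketch of the structure, not this class's code):

            def parallel_block(hidden_states1, hidden_states2, residual,
                               norm1, norm2, mixer, mlp, dropout1, dropout2):
                dropped = dropout1(hidden_states1)
                if hidden_states2 is not None:
                    dropped = dropped + dropout2(hidden_states2)
                residual = dropped + residual if residual is not None else dropped
                return mixer(norm1(residual)), mlp(norm2(residual)), residual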
+ """ + # TODO: Ideally we should only do the allgather / allreduce once for + # the Linear to MLP & Attention + if not self.fused_dropout_add_ln: + dropped1 = self.dropout1(hidden_states1) + # For the very 1st block, we only want 1 dropout, not two different dropouts + if hidden_states2 is not None: + dropped2 = self.dropout2(hidden_states2) + residual = ( + (residual + dropped1 + dropped2) + if residual is not None + else dropped1 + dropped2 + ) + else: + residual = (residual + dropped1) if residual is not None else dropped1 + hidden_states1 = self.norm1(residual.to(dtype=self.norm1.weight.dtype)) + hidden_states2 = ( + self.norm2(residual.to(dtype=self.norm2.weight.dtype)) + if not self.tied_norm + else hidden_states1 + ) + if self.residual_in_fp32: + residual = residual.to(torch.float32) + else: + weight2, bias2 = ( + (self.norm2.weight, self.norm2.bias) if not self.tied_norm else (None, None) + ) + hidden_states1, *rest, residual = layer_norm_fn( + hidden_states1, + self.norm1.weight, + self.norm1.bias, + residual=residual, + x1=hidden_states2, + weight1=weight2, + bias1=bias2, + eps=self.norm1.eps, + dropout_p=self.dropout1.p if self.training else 0.0, + prenorm=True, + residual_in_fp32=self.residual_in_fp32, + is_rms_norm=isinstance(self.norm1, RMSNorm) + ) + if self.tied_norm: + hidden_states2 = hidden_states1 + else: + hidden_states2, = rest + if mixer_kwargs is None: + mixer_kwargs = {} + hidden_states1 = self.mixer(hidden_states1, **mixer_kwargs) + hidden_states2 = self.mlp(hidden_states2) + return hidden_states1, hidden_states2, residual diff --git a/block_info.h b/block_info.h new file mode 100644 index 0000000000000000000000000000000000000000..3a23a1e1f26da48dae1032b6fe36b208b43da2ab --- /dev/null +++ b/block_info.h @@ -0,0 +1,46 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +namespace flash { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BlockInfo { + + template + __device__ BlockInfo(const Params ¶ms, const int bidb) + : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb]) + , sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb]) + , actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q) + // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]. + // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K. + , seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb])) + , actual_seqlen_k(params.seqused_k ? params.seqused_k[bidb] : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew)) + { + } + + template + __forceinline__ __device__ index_t q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const { + return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride; + } + + template + __forceinline__ __device__ index_t k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const { + return sum_s_k == -1 ? 
bidb * batch_stride : uint32_t(sum_s_k) * row_stride; + } + + const int sum_s_q; + const int sum_s_k; + const int actual_seqlen_q; + // We have to have seqlen_k_cache declared before actual_seqlen_k, otherwise actual_seqlen_k is set to 0. + const int seqlen_k_cache; + const int actual_seqlen_k; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/btlm.py b/btlm.py new file mode 100644 index 0000000000000000000000000000000000000000..295e12062320be819dd835de4a866607650431b2 --- /dev/null +++ b/btlm.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023, Tri Dao. + +import math +import json +import re +from pathlib import Path + +from collections import OrderedDict + +import torch +import torch.nn.functional as F + +from einops import rearrange +from transformers import GPT2Config, AutoConfig, PretrainedConfig + + +def remap_state_dict_hf_btlm(state_dict, config): + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^transformer.wpe.", "transformer.embeddings.position_embeddings.", key) + + if "transformer.wpe.weight" in state_dict: + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.wte.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub(r"^transformer.h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for d in range(config.num_hidden_layers): + W1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.weight") + W3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.weight") + state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = torch.cat([W1.t(), W3.t()], dim=0) + b1 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc.bias") + b3 = state_dict.pop(f"transformer.h.{d}.mlp.c_fc2.bias") + state_dict[f"transformer.layers.{d}.mlp.fc1.bias"] = torch.cat([b1, b3], dim=0) + W2 = state_dict.pop(f"transformer.h.{d}.mlp.c_proj.weight") + state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t() + + def key_mapping_mlp(key): + key = re.sub(r"^transformer.h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for d in range(config.num_hidden_layers): + Wqkv = state_dict.pop(f"transformer.h.{d}.attn.c_attn.weight") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t() + Wout = state_dict.pop(f"transformer.h.{d}.attn.c_proj.weight") + state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t() + state_dict.pop(f"transformer.relative_pe.slopes") # We don't store the Alibi slopes + + def key_mapping_attn(key): + key = re.sub(r"^transformer.h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key) + key = re.sub( + 
r"^transformer.h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def btlm_config_to_gpt2_config(btlm_config: PretrainedConfig) -> GPT2Config: + return GPT2Config( + vocab_size=btlm_config.vocab_size, + n_positions=0 if btlm_config.position_embedding_type == "alibi" else btlm_config.n_positions, + n_embd=btlm_config.hidden_size, + n_layer=btlm_config.num_hidden_layers, + n_head=btlm_config.num_attention_heads, + n_inner=btlm_config.n_inner, + activation_function=btlm_config.activation_function, + resid_pdrop=btlm_config.resid_pdrop, + embd_pdrop=btlm_config.embd_pdrop, + attn_pdrop=btlm_config.attn_pdrop, + layer_norm_epsilon=btlm_config.layer_norm_epsilon, + initializer_range=btlm_config.initializer_range, + bos_token_id=btlm_config.bos_token_id, + eos_token_id=btlm_config.eos_token_id, + # These are new arguments not in the original GPT2Config + use_alibi=btlm_config.position_embedding_type == "alibi", + use_flash_attn=btlm_config.position_embedding_type == "alibi", # Alibi code path requires flash_attn + mup_width_scale=btlm_config.mup_width_scale, + mup_embeddings_multiplier=btlm_config.mup_embeddings_scale, + mup_output_multiplier=btlm_config.mup_output_alpha, + mup_scale_qk_dot_by_d=btlm_config.mup_scale_qk_dot_by_d, + mlp_multiple_of=1, + ) diff --git a/causality-monitor.yaml b/causality-monitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fbac5b68e91761f1384876ca6ceb6cfa04981cca --- /dev/null +++ b/causality-monitor.yaml @@ -0,0 +1,2 @@ +causality-monitor: + _target_: src.callbacks.causality_monitor.CausalityMonitor \ No newline at end of file diff --git a/comet.yaml b/comet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ac99f46ca6d2d81e1bd70c230648ec5547c0df0 --- /dev/null +++ b/comet.yaml @@ -0,0 +1,7 @@ +# https://www.comet.ml + +comet: + _target_: pytorch_lightning.loggers.comet.CometLogger + api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable + project_name: "template-tests" + experiment_name: ${name} diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f7c8f510f6ef919dee609792bcd457b249d17104 --- /dev/null +++ b/config.yaml @@ -0,0 +1,50 @@ +# @package _global_ + +# specify here default training configuration +defaults: + - _self_ + - trainer: default + - optimizer: adamw + - scheduler: null + - task: sequence-model + - model: null + - datamodule: null + - callbacks: default # set this to null if you don't want to use callbacks + - metrics: null + - logger: null # set logger here or use command line (e.g. 
`python run.py logger=wandb`) + + - mode: default + + - experiment: null + - hparams_search: null + + # enable color logging + - override hydra/hydra_logging: colorlog + - override hydra/job_logging: colorlog + +# path to original working directory +# hydra hijacks working directory by changing it to the current log directory, +# so it's useful to have this path as a special variable +# https://hydra.cc/docs/next/tutorials/basic/running_your_app/working_directory +work_dir: ${hydra:runtime.cwd} + +# path to folder with data +data_dir: ${work_dir}/data/ + +# pretty print config at the start of the run using Rich library +print_config: True + +# disable python warnings if they annoy you +ignore_warnings: True + +# check performance on test set, using the best model achieved during training +# lightning chooses best model based on metric specified in checkpoint callback +test_after_training: True + +resume: False + +# seed for random number generators in pytorch, numpy and python.random +seed: null + +# name of the run, accessed by loggers +name: null diff --git a/cosine-warmup-timm.yaml b/cosine-warmup-timm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2bbbec01f0f13fd102b92196e52b34898b081da --- /dev/null +++ b/cosine-warmup-timm.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: src.optim.timm_lr_scheduler.TimmCosineLRScheduler diff --git a/cosine-warmup.yaml b/cosine-warmup.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afaf0f618ecc41c9182c7df34717c973aa4af974 --- /dev/null +++ b/cosine-warmup.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: transformers.get_cosine_schedule_with_warmup diff --git a/cross_entropy.py b/cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..1782338132ab7e797f6d4998aa5a78f84a217504 --- /dev/null +++ b/cross_entropy.py @@ -0,0 +1,318 @@ +# Copyright (c) 2023, Tri Dao. + +from typing import Tuple, Optional, Union + +import torch + +import triton +import triton.language as tl + +# `all_gather_into_tensor` and `reduce_scatter_tensor` are new placeholders for +# `_all_gather_base` and `_reduce_scatter_base`. They require the most recent +# version of PyTorch. The following 2 lines are for backward compatibility with +# older PyTorch. +if "all_gather_into_tensor" not in dir(torch.distributed): + torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base + + +@triton.heuristics( + { + "HAS_SMOOTHING": lambda args: args["smoothing"] > 0.0, + } +) +@triton.jit +def cross_entropy_fwd_kernel( + loss_ptr, # data ptrs + lse_ptr, + z_loss_ptr, + logits_ptr, + labels_ptr, + smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, # Useful for tensor parallel when each rank only has a subset of classes + n_cols, # shapes + n_rows, + logits_row_stride, # strides + BLOCK_SIZE: tl.constexpr, + HAS_SMOOTHING: tl.constexpr, + # if SPLIT (e.g. 
tensor parallel), don't include the LSE in the loss since it's not the final LSE + SPLIT: tl.constexpr, +): + row_idx = tl.program_id(0) + col_block_idx = tl.program_id(1) + logits_ptr = logits_ptr + row_idx * logits_row_stride.to(tl.int64) + col_offsets = col_block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + label_idx = tl.load(labels_ptr + row_idx) + logits = tl.load(logits_ptr + col_offsets, mask=col_offsets < n_cols, other=-float("inf")).to( + tl.float32 + ) * logit_scale + max_logits = tl.max(logits, 0) + if HAS_SMOOTHING: + sum_logits = tl.sum(tl.where(col_offsets < n_cols, logits, 0.0), 0) + lse = tl.log(tl.sum(tl.exp(logits - max_logits), 0)) + max_logits + tl.store(lse_ptr + col_block_idx * n_rows + row_idx, lse) + if label_idx == ignore_index: + loss = 0.0 + z_loss = 0.0 + else: + label_idx -= class_start_idx + if label_idx >= col_block_idx * BLOCK_SIZE and label_idx < min( + n_cols, (col_block_idx + 1) * BLOCK_SIZE + ): + logits_label = tl.load(logits_ptr + label_idx) * logit_scale + if HAS_SMOOTHING: + loss = ( + (lse if not SPLIT else 0.0) + - smoothing * sum_logits / total_classes + - (1 - smoothing) * logits_label + ) + else: + loss = (lse if not SPLIT else 0.0) - logits_label + else: + # If label is out of bounds, we set the CE loss to 0.0. But we still want the smoothing loss + if HAS_SMOOTHING: + loss = smoothing * ((lse if not SPLIT else 0.0) - sum_logits / total_classes) + else: + loss = 0.0 + if not SPLIT: + z_loss = lse_square_scale * lse * lse + loss += z_loss + else: + z_loss = 0.0 + tl.store(loss_ptr + col_block_idx * n_rows + row_idx, loss) + if not SPLIT: + tl.store(z_loss_ptr + col_block_idx * n_rows + row_idx, z_loss) + + +@triton.heuristics( + { + "HAS_SMOOTHING": lambda args: args["smoothing"] > 0.0, + } +) +@triton.jit +def cross_entropy_bwd_kernel( + dlogits_ptr, # data ptrs + dloss_ptr, + logits_ptr, + lse_ptr, + labels_ptr, + smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, # Useful for tensor parallel when each rank only has a subset of classes + n_cols, # shapes + logits_row_stride, # strides + dlogits_row_stride, + dloss_row_stride, + BLOCK_SIZE: tl.constexpr, + HAS_SMOOTHING: tl.constexpr, +): + row_idx = tl.program_id(0) + col_block_idx = tl.program_id(1) + logits_ptr = logits_ptr + row_idx * logits_row_stride.to(tl.int64) + dlogits_ptr = dlogits_ptr + row_idx * dlogits_row_stride.to(tl.int64) + col_offsets = col_block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + label_idx = tl.load(labels_ptr + row_idx) + if label_idx != ignore_index: + dloss = tl.load(dloss_ptr + row_idx * dloss_row_stride) + else: + dloss = 0.0 + logits = tl.load(logits_ptr + col_offsets, mask=col_offsets < n_cols, other=-float("inf")).to( + tl.float32 + ) * logit_scale + lse = tl.load(lse_ptr + row_idx) + probs = tl.exp(logits - lse) + probs += 2.0 * lse_square_scale * lse * probs + label_idx -= class_start_idx + if HAS_SMOOTHING: + smooth_positive = 1.0 - smoothing + smooth_negative = smoothing / total_classes + probs = tl.where(col_offsets == label_idx, probs - (1 - smoothing), probs) - smooth_negative + else: + probs = tl.where(col_offsets == label_idx, probs - 1.0, probs) + tl.store(dlogits_ptr + col_offsets, (dloss * logit_scale) * probs, mask=col_offsets < n_cols) + + +class CrossEntropyLoss(torch.autograd.Function): + + @staticmethod + def forward( + ctx, + logits, + labels, + smoothing=0.0, + logit_scale=1.0, + lse_square_scale=0.0, + ignore_index=-100, + inplace_backward=False, + process_group=None, + ): + n_rows, 
n_cols = logits.shape + assert labels.shape == (n_rows,) + world_size = 1 if process_group is None else torch.distributed.get_world_size(process_group) + total_classes = world_size * n_cols + rank = 0 if process_group is None else torch.distributed.get_rank(process_group) + class_start_idx = rank * n_cols + + if logits.stride(-1) != 1: + logits = logits.contiguous() + # Set these similar to https://github.com/openai/triton/blob/main/python/tutorials/02-fused-softmax.py + MAX_BLOCK_SIZE = 64 * 1024 + BLOCK_SIZE = min(triton.next_power_of_2(n_cols), MAX_BLOCK_SIZE) + num_warps = ( + 4 + if BLOCK_SIZE < 2048 + else (8 if BLOCK_SIZE < 8192 else (16 if BLOCK_SIZE < 128 * 1024 else 32)) + ) + # We may split the lse computation across multiple blocks, then do a reduction + # lse(local_lse) to get the final LSE. This is faster for large n_cols (e.g., > 64k) + # where having just one thread block processing more than 64k elements is slow. + split = world_size > 1 or n_cols > MAX_BLOCK_SIZE + n_splits = (n_cols + BLOCK_SIZE - 1) // BLOCK_SIZE + loss_shape = (n_splits, n_rows) if n_splits > 1 else (n_rows,) + losses = torch.empty(*loss_shape, dtype=torch.float, device=logits.device) + lse = torch.empty(*loss_shape, dtype=torch.float, device=logits.device) + z_losses = torch.empty(*loss_shape, dtype=torch.float, device=logits.device) + # Need this, otherwise Triton tries to launch from cuda:0 and we get + # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?) + with torch.cuda.device(logits.device.index): + cross_entropy_fwd_kernel[(n_rows, n_splits)]( + losses, # data ptrs + lse, + z_losses, + logits, + labels, + smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, + n_cols, # shapes + n_rows, + logits.stride(0), # strides + BLOCK_SIZE=BLOCK_SIZE, # constants + num_warps=num_warps, + SPLIT=split, + ) + + if split: + # If there's no smoothing, if labels are in the vocab of this partition, losses contains + # - predicted logit, and 0 otherwise. + # If there's smoothing=0.1, for labels in the vocab of this partition, losses contains + # -0.9 * predicted logit - 0.1 * sum logit / total_classes. + # For labels not in the vocab of this partition, losses contains + # -0.1 * sum logit / total_classes. + if n_splits > 1: + lse = torch.logsumexp(lse, dim=0) + losses = losses.sum(dim=0) + if world_size > 1: + lse_allgather = torch.empty(world_size, n_rows, dtype=lse.dtype, device=lse.device) + torch.distributed.all_gather_into_tensor(lse_allgather, lse, group=process_group) + handle_losses = torch.distributed.all_reduce( + losses, op=torch.distributed.ReduceOp.SUM, group=process_group, async_op=True + ) + lse = torch.logsumexp(lse_allgather, dim=0) + handle_losses.wait() + # After the allreduce, if there's no smoothing, the total losses are - predicted_logit, + # we just have to add the (global) lse. + # If there's smoothing=0.1, the total losses are + # -0.9 * predicted_logit - 0.1 * sum logit / total_classes. + # Again, we just have to add the (global) lse. 
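            # i.e. with no smoothing, losses becomes lse - logits[label] (up to logit_scale):
            # the usual -log_softmax cross entropy, over the full (possibly sharded) vocabulary.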
+ losses += lse + if lse_square_scale != 0.0: + z_losses = lse_square_scale * lse.square() + z_losses.masked_fill_(labels == ignore_index, 0.0) + losses += z_losses + else: + z_losses = torch.zeros_like(losses) + losses.masked_fill_(labels == ignore_index, 0.0) + + ctx.save_for_backward(logits, lse, labels) + ctx.mark_non_differentiable(z_losses) + ctx.smoothing = smoothing + ctx.logit_scale = logit_scale + ctx.lse_square_scale = lse_square_scale + ctx.ignore_index = ignore_index + ctx.total_classes = total_classes + ctx.class_start_idx = class_start_idx + ctx.inplace_backward = inplace_backward + + return losses, z_losses + + @staticmethod + def backward(ctx, grad_losses, grad_z_losses): + del grad_z_losses # z_losses are only for logging. + + logits, lse, labels = ctx.saved_tensors + dlogits = logits if ctx.inplace_backward else torch.empty_like(logits) + n_rows, n_cols = logits.shape + BLOCK_SIZE = min(triton.next_power_of_2(n_cols), 4 * 1024) + num_warps = 4 if BLOCK_SIZE < 2048 else (8 if BLOCK_SIZE < 8192 else 16) + grid = lambda META: (n_rows, triton.cdiv(n_cols, META["BLOCK_SIZE"])) # noqa + # Need this, otherwise Triton tries to launch from cuda:0 and we get + # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?) + with torch.cuda.device(logits.device.index): + cross_entropy_bwd_kernel[grid]( + dlogits, # data ptrs + grad_losses, + logits, + lse, + labels, + ctx.smoothing, + ctx.logit_scale, + ctx.lse_square_scale, + ctx.ignore_index, + ctx.total_classes, + ctx.class_start_idx, + n_cols, # shapes + logits.stride(0), # strides + dlogits.stride(0), + grad_losses.stride(0), + BLOCK_SIZE=BLOCK_SIZE, # constants + num_warps=num_warps, + ) + return dlogits, None, None, None, None, None, None, None, None + +def cross_entropy_loss( + logits: torch.Tensor, + labels: torch.Tensor, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + lse_square_scale: float = 0.0, + ignore_index=-100, + inplace_backward: bool = False, + process_group=None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Arguments: + logits: (batch, vocab_size) + labels: (batch,) + label_smoothing: float + logit_scale: float. Multiply logits by this scale before calculating the loss. + lse_square_scale: float. If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss. + This is also referred to as "z-loss". + ignore_index: int. If labels == ignore_index, the loss is set to 0.0. + inplace_backward: bool. If True, we do the backward pass in-place by modifying the logits. + This saves memory. + process_group: if not None, we're doing Tensor Parallel: each process is responsible for + one part of the vocab. The loss will be aggregated across processes. + Returns: + losses: (batch,), float + z_losses: (batch,), float + """ + return CrossEntropyLoss.apply( + logits, + labels, + label_smoothing, + logit_scale, + lse_square_scale, + ignore_index, + inplace_backward, + process_group, + ) diff --git a/csv.yaml b/csv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0f917e89ca1902e8ef89b524a9f2affb8bc402e3 --- /dev/null +++ b/csv.yaml @@ -0,0 +1,8 @@ +# csv logger built in lightning + +csv: + _target_: pytorch_lightning.loggers.csv_logs.CSVLogger + save_dir: "." 
+ name: "csv/" + version: ${name} + prefix: "" diff --git a/cuda_bf16_fallbacks.cuh b/cuda_bf16_fallbacks.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f5641f61609172090da1c8e77e43f9f4694ccca0 --- /dev/null +++ b/cuda_bf16_fallbacks.cuh @@ -0,0 +1,257 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_fallbacks.cuh +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuda_bf16_wrapper.h" +#include + +namespace fastertransformer { + +#ifdef ENABLE_BF16 +inline __device__ float2 bf1622float2(const __nv_bfloat162 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float2 f_val; + f_val.x = __low2float(val); + f_val.y = __high2float(val); + return f_val; +#else + return __bfloat1622float2(val); +#endif +} + +inline __device__ int16_t bf1622int16(__nv_bfloat162 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float2 f_val; + f_val.x = max(min(__low2float(val), 127.f), -128.f); + f_val.y = max(min(__high2float(val), 127.f), -128.f); + union { int8_t int8[2]; int16_t int16; }; + int8[0] = static_cast(static_cast(f_val.x)); + int8[1] = static_cast(static_cast(f_val.y)); + return int16; +#else + val = __hmin2(val, make_bfloat162(127., 127.)); + val = __hmax2(val, make_bfloat162(-128., -128.)); + union { int8_t int8[2]; int16_t int16; }; + int8[0] = static_cast(static_cast(val.x)); + int8[1] = static_cast(static_cast(val.y)); + return int16; +#endif +} + +inline __device__ __nv_bfloat162 float22bf162(const float2 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __floats2bfloat162_rn(val.x, val.y); +#else + return __float22bfloat162_rn(val); +#endif +} + +inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + __nv_bfloat162 val2; + val2.x = val; + val2.y = val; + return val2; +#else + return __bfloat162bfloat162(val); +#endif +} + +inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl + fyl, fxh + fyh); +#else + return __hadd2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) ); +#else + return __hadd(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl - fyl, 
fxh - fyh); +#else + return __hsub2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) ); +#else + return __hsub(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + return __floats2bfloat162_rn(fxl * fyl, fxh * fyh); +#else + return __hmul2(x, y); +#endif +} + +inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) ); +#else + return __hmul(x, y); +#endif +} + +inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh, fyl, fyh, fzl, fzh; + fxl = __low2float(x); + fxh = __high2float(x); + fyl = __low2float(y); + fyh = __high2float(y); + fzl = __low2float(z); + fzh = __high2float(z); + return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh); +#else + return __hfma2(x, y, z); +#endif +} + +inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z)); +#else + return __hfma(x, y, z); +#endif +} + +inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fxl, fxh; + fxl = __low2float(x); + fxh = __high2float(x);; + return __floats2bfloat162_rn(expf(fxl), expf(fxh)); +#else + return h2exp(x); +#endif +} + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); }; +inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); }; + +inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} + +#endif + +inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c)); +#else + return a + b + c; +#endif +} + +inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d)); +#else + return (__nv_bfloat16)((float)a + (float)b + (float)c + (float)d); +#endif +} + +inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch); +#else + return a + b + c; +#endif +} + +inline __device__ 
__nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c)); +#else + return a * b * c; +#endif +} + +inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch); +#else + return a * b * c; +#endif +} + +inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + float fal, fah, fbl, fbh, fcl, fch, fdl, fdh; + fal = __low2float(a); + fah = __high2float(a); + fbl = __low2float(b); + fbh = __high2float(b); + fcl = __low2float(c); + fch = __high2float(c); + fdl = __low2float(d); + fdh = __high2float(d); + return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh); +#else + return a * b * c + d; +#endif +} + +#endif // ENABLE_BF16 + +} // namespace fastertransformer diff --git a/cuda_bf16_wrapper.h b/cuda_bf16_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..efb6e798730879bc2cd16088b2091991862a6074 --- /dev/null +++ b/cuda_bf16_wrapper.h @@ -0,0 +1,23 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/utils/cuda_bf16_wrapper.h +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifdef ENABLE_BF16 +#include +#endif diff --git a/ddp.yaml b/ddp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c9544407298d1e839b0c7851cc2cce4a0f01cc3 --- /dev/null +++ b/ddp.yaml @@ -0,0 +1,6 @@ +defaults: + - default.yaml + +accelerator: gpu +devices: 4 +strategy: ddp diff --git a/debug.yaml b/debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2335c981ab1217d3e8a9f489c0d3c1af168ec47 --- /dev/null +++ b/debug.yaml @@ -0,0 +1,27 @@ +# @package _global_ + +# run in debug mode with: +# `python run.py mode=debug` + +defaults: + - override /trainer: debug.yaml + +debug_mode: True + +hydra: + # sets level of all command line loggers to 'DEBUG' + verbose: True + + # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ + # sets level of only chosen command line loggers to 'DEBUG' + # verbose: [src.train, src.utils.utils] + + # sets output paths for all file logs to 'logs/debug/' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/multirun_${now:%Y-%m-%d_%H-%M-%S} + subdir: ${hydra.job.num} + +# disable rich config printing, since it will be already printed by hydra when `verbose: True` +print_config: False diff --git a/decoder_masked_multihead_attention.cu b/decoder_masked_multihead_attention.cu new file mode 100644 index 0000000000000000000000000000000000000000..13306f76868e7c46321998513b3a49634edf9e2c --- /dev/null +++ b/decoder_masked_multihead_attention.cu @@ -0,0 +1,149 @@ +// Adapted from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "decoder_masked_multihead_attention.h" +#include "decoder_masked_multihead_attention_utils.h" +#include "cuda_bf16_wrapper.h" +#include +#include +#include + +#include "decoder_masked_multihead_attention_template.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, DO_CROSS_ATTENTION, stream) \ + size_t smem_sz = mmha::smem_size_in_bytes(params, THDS_PER_VALUE, THDS_PER_BLOCK); \ + auto kernel = mmha::masked_multihead_attention_kernel; \ + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_sz); \ + dim3 grid(params.nnz_head_idx == nullptr ? params.num_heads : params.nnz_heads, params.batch_size); \ + kernel<<>>(params) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// !!! 
Specialize the launcher for Cross attention +template +void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream) +{ + constexpr int THREADS_PER_VALUE = Dh_MAX * sizeof(T) / 16; + constexpr bool DO_CROSS_ATTENTION = std::is_same>::value; + int tlength = (DO_CROSS_ATTENTION) ? params.memory_max_len : params.timestep; + // printf("tlength, CROSS_ATTENTION = %d, %d\n", tlength, DO_CROSS_ATTENTION); + if (tlength < 32) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, DO_CROSS_ATTENTION, stream); + } + else if (tlength < 2048) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, DO_CROSS_ATTENTION, stream); + } + else { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, DO_CROSS_ATTENTION, stream); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#undef MMHA_LAUNCH_KERNEL + +template +void multihead_attention_(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream) +{ + switch (params.hidden_size_per_head) { + case 32: + mmha_launch_kernel(params, stream); + break; + case 48: + mmha_launch_kernel(params, stream); + break; + case 64: + mmha_launch_kernel(params, stream); + break; + case 80: + mmha_launch_kernel(params, stream); + break; + case 96: + mmha_launch_kernel(params, stream); + break; + case 128: + mmha_launch_kernel(params, stream); + break; + case 160: + mmha_launch_kernel(params, stream); + break; + case 192: + mmha_launch_kernel(params, stream); + break; + case 224: + mmha_launch_kernel(params, stream); + break; + case 256: + mmha_launch_kernel(params, stream); + break; + default: + assert(false); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream) +{ + multihead_attention_<__nv_bfloat16, Masked_multihead_attention_params<__nv_bfloat16>>(params, stream); +} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream) +{ + multihead_attention_>(params, stream); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream) +{ + multihead_attention_<__nv_bfloat16, Cross_multihead_attention_params<__nv_bfloat16>>(params, stream); +} +#endif + 
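For orientation, here is a minimal host-side sketch of how these entry points are typically driven for one decoding step. The struct fields and the dispatch behaviour come from decoder_masked_multihead_attention.h below; the wrapper name run_single_step_attention, the buffer arguments, and the chosen defaults (no beam search, no biases, no rotary embedding, fp32 element type) are illustrative assumptions rather than code from this commit.

#include <cmath>
#include <cuda_runtime.h>
#include "decoder_masked_multihead_attention.h"

// Hedged sketch: q/k/v hold the current step's projections (batch_size x num_heads x head_dim
// elements each), k_cache/v_cache are the persistent caches (at least
// batch_size * memory_max_len * num_heads * head_dim elements), out receives this step's output.
void run_single_step_attention(const float* q, const float* k, const float* v,
                               float* k_cache, float* v_cache, float* out,
                               int batch_size, int num_heads, int head_dim,
                               int memory_max_len, int timestep, cudaStream_t stream)
{
    Masked_multihead_attention_params<float> params;  // self-attention variant; unset fields default to 0/nullptr
    params.q = q;
    params.k = k;
    params.v = v;
    params.k_cache = k_cache;
    params.v_cache = v_cache;
    params.out = out;
    params.batch_size = batch_size;
    params.beam_width = 1;                            // no beam search in this sketch
    params.num_heads = num_heads;
    params.num_heads_kv = num_heads;                  // plain MHA; lower this for MQA/GQA
    params.num_heads_q_kv_ratio = 1;
    params.hidden_size_per_head = head_dim;           // must match a case in multihead_attention_ (32..256)
    params.memory_max_len = memory_max_len;
    params.timestep = timestep;
    params.inv_sqrt_dh = 1.0f / std::sqrt(static_cast<float>(head_dim));
    masked_multihead_attention(params, stream);       // dispatches on hidden_size_per_head, then block size on sequence length
}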
+//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/decoder_masked_multihead_attention.h b/decoder_masked_multihead_attention.h new file mode 100644 index 0000000000000000000000000000000000000000..3c79f88b856efbc6dd50bfb61747675727de402b --- /dev/null +++ b/decoder_masked_multihead_attention.h @@ -0,0 +1,192 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention.h +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuda_bf16_wrapper.h" +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \ + exit(1); \ + } \ + } while (0) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// The structure of parameters for the masked multihead attention kernel. +// +// We use the following terminology to describe the different dimensions. +// +// B: Batch size (number of sequences), +// L: Sequence length, +// D: Hidden dimension, +// H: Number of heads, +// Dh: Hidden dimension per head - Dh = D / H. + +template +struct Multihead_attention_params_base { + + // The output buffer. Dimensions B x D. + T* out = nullptr; + + // The input Qs and the associated bias. Dimensions B x D and D, resp. + const T *q = nullptr, *q_bias = nullptr; + // The input Ks and the associated bias. Dimensions B x D and D, resp. + const T *k = nullptr, *k_bias = nullptr; + // The input Vs and the associated bias. Dimensions B x D and D, resp. + const T *v = nullptr, *v_bias = nullptr; + + // The cache for the Ks. The size must be at least B x L x D. + T* k_cache = nullptr; + // The cache for the Vs. The size must be at least B x L x D. + T* v_cache = nullptr; + // The indirections to use for cache when beam sampling. + const int* cache_indir = nullptr; + + // Stride to handle the case when KQV is a single buffer + int stride_q = 0; + int stride_k = 0; + int stride_v = 0; + + // The batch size. + int batch_size = 0; + // The beam width + int beam_width = 0; + // The sequence length. + int memory_max_len = 0; + // The number of heads (H). + int num_heads = 0; + int num_heads_kv = 0; + int num_heads_q_kv_ratio = 0; + // The hidden dimension per head (Dh). + int hidden_size_per_head = 0; + // The per-head latent space reserved for rotary embeddings. + int rotary_embedding_dim = 0; + bool neox_rotary_style = false; + float rotary_base = 0.0f; + // The maximum length of input sentences. + int max_input_length = 0; + // The current timestep. 
TODO(bhsueh) Check that do we only this param in cross attention? + int timestep = 0; + // The current timestep of each sentences (support different timestep for different sentences) + + // The 1.f / sqrt(Dh). Computed on the host. + float inv_sqrt_dh = 0.0f; + + // Used when we have some input context like gpt + const int* total_padding_tokens = nullptr; + + const bool* masked_tokens = nullptr; + const int* prefix_prompt_lengths = nullptr; + int max_prefix_prompt_length = 0; + + const T* relative_attention_bias = nullptr; + int relative_attention_bias_stride = 0; + // The slope per head of linear position bias to attention score (H). + const T* linear_bias_slopes = nullptr; + + const T* ia3_key_weights = nullptr; + const T* ia3_value_weights = nullptr; + const int* ia3_tasks = nullptr; + + const float* qkv_scale_out = nullptr; + const float* attention_out_scale = nullptr; + int int8_mode = 0; + + const T *rotary_cos = nullptr; + const T *rotary_sin = nullptr; + + const int *nnz_head_idx = nullptr; + int nnz_heads = 0; +}; + +template +struct Multihead_attention_params: public Multihead_attention_params_base { + // output cross attentions + float* cross_attention_out = nullptr; + int max_decoder_seq_len = 0; + bool is_return_cross_attentions = false; + + // allows to exist attention eary + bool* finished = nullptr; + + // required in case of cross attention + // will need it here till if constexpr in c++17 + int* memory_length_per_sample = nullptr; + + // required in case of masked attention with different length + const int* length_per_sample = nullptr; +}; + +template +struct Multihead_attention_params: public Multihead_attention_params_base { + // output cross attentions + float* cross_attention_out = nullptr; + int max_decoder_seq_len = 0; + bool is_return_cross_attentions = false; + + // allows to exist attention eary + bool* finished = nullptr; + + // required in case of cross attention + int* memory_length_per_sample = nullptr; + + // required in case of masked attention with different length + const int* length_per_sample = nullptr; +}; + +template +using Masked_multihead_attention_params = Multihead_attention_params; + +template +using Cross_multihead_attention_params = Multihead_attention_params; + +template +struct outputCrossAttentionParam { + // max decoder output length + int max_decoder_seq_len = 0; + T* cross_attention_out = nullptr; + bool is_return_cross_attentions = false; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream); +void masked_multihead_attention(const Masked_multihead_attention_params& params, const cudaStream_t& stream); +#ifdef ENABLE_BF16 +void masked_multihead_attention(const Masked_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream); +#endif +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream); +void cross_multihead_attention(const Cross_multihead_attention_params& params, const cudaStream_t& stream); +#ifdef ENABLE_BF16 +void cross_multihead_attention(const Cross_multihead_attention_params<__nv_bfloat16>& params, + const cudaStream_t& stream); +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/decoder_masked_multihead_attention_template.hpp b/decoder_masked_multihead_attention_template.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..2ae1b2425b87c89b8b38f2b1d5c6707229495037 --- /dev/null +++ b/decoder_masked_multihead_attention_template.hpp @@ -0,0 +1,1619 @@ +// Downloaded from from FasterTransformer v5.2.1 +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "decoder_masked_multihead_attention.h" +#include "decoder_masked_multihead_attention_utils.h" +#include "cuda_bf16_wrapper.h" +#include "cuda_bf16_fallbacks.cuh" +#include +#include +#include + +// #define MMHA_USE_HMMA_FOR_REDUCTION + +// Below are knobs to extend FP32 accumulation for higher FP16 accuracy + +// Does not seem to affect the accuracy that much +#define MMHA_USE_FP32_ACUM_FOR_FMA + +// Seems to slightly improve the accuracy +#define MMHA_USE_FP32_ACUM_FOR_OUT + +#if 0 && defined(MMHA_USE_FP32_ACUM_FOR_OUT) + // Does not seem to improve the accuracy + //#define MMHA_USE_FP32_ACUM_FOR_LOGITS +#endif + +namespace mmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// We use the following terminology to describe the different dimensions. +// +// B: Batch size (number of sequences), +// L: Sequence length, +// D: Hidden dimension, +// H: Number of heads, +// Dh: Hidden dimension per head - Dh = D / H. +// +// The different kernels assign a threadblock for B x H pair. The grid has size (1, B, H). We use +// 64, 128 and 256 threads per block. +// +// Each threadblock loads Dh values from Q and its associated bias. The kernels run a loop to +// compute Q * K^T where K is loaded from a cache buffer -- except for the current timestep. The +// cache buffer helps with memory accesses and contains keys with bias. +// +// The layout of the cache buffer for the keys is [B, H, Dh/x, L, x] where x == 8 for FP16 and +// x == 4 for FP32 where the fastest moving dimension (contiguous data) is the rightmost one. The +// values for x are chosen to create chunks of 16 bytes. +// +// The different kernels use 1, 2 or 4 threads per key (THREADS_PER_KEY). The size of the LDGs +// depends on the number of threads per key. Each thread sums Dh / THREADS_PER_KEY elements. At +// the end of each iteration of the Q * K^T loop, we perform a reduction between lanes using an +// HMMA instruction (Tensor Core). Each Q * K^T valuey is stored in shared memory in FP32. +// +// After that loop, a parallel softmax is computed across the different Q * K^T values stored in +// shared memory. +// +// The kernel ends with a loop over the values in V. We use THREADS_PER_VALUE to control how many +// timesteps are computed by loop iteration. As with the keys, the values are read from a cache +// except for the current timestep. 
The layout of the cache buffer for the values is much simpler +// as it is [B, H, L, Dh]. +// + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qk_vec_ { +}; + +template<> +struct Qk_vec_ { + using Type = float; +}; +template<> +struct Qk_vec_ { + using Type = float2; +}; +template<> +struct Qk_vec_ { + using Type = float4; +}; +template<> +struct Qk_vec_ { + using Type = float4; +}; +template<> +struct Qk_vec_ { + using Type = uint32_t; +}; +template<> +struct Qk_vec_ { + using Type = uint32_t; +}; +template<> +struct Qk_vec_ { + using Type = uint2; +}; +template<> +struct Qk_vec_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct Qk_vec_<__nv_bfloat16, 32> { + using Type = __nv_bfloat162; +}; +template<> +struct Qk_vec_<__nv_bfloat16, 64> { + using Type = __nv_bfloat162; +}; +template<> +struct Qk_vec_<__nv_bfloat16, 128> { + using Type = bf16_4_t; +}; +template<> +struct Qk_vec_<__nv_bfloat16, 256> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct K_vec_ { +}; + +template<> +struct K_vec_ { + using Type = float; +}; +template<> +struct K_vec_ { + using Type = float2; +}; +template<> +struct K_vec_ { + using Type = float4; +}; +template<> +struct K_vec_ { + using Type = uint32_t; +}; +template<> +struct K_vec_ { + using Type = uint2; +}; +template<> +struct K_vec_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct K_vec_<__nv_bfloat16, 4> { + using Type = __nv_bfloat162; +}; +template<> +struct K_vec_<__nv_bfloat16, 2> { + using Type = bf16_4_t; +}; +template<> +struct K_vec_<__nv_bfloat16, 1> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct V_vec_ { +}; + +template<> +struct V_vec_ { + using Type = float; +}; +template<> +struct V_vec_ { + using Type = float2; +}; +template<> +struct V_vec_ { + using Type = float4; +}; +template<> +struct V_vec_ { + using Type = uint32_t; +}; +template<> +struct V_vec_ { + using Type = uint2; +}; +template<> +struct V_vec_ { + using Type = uint4; +}; +#ifdef ENABLE_BF16 +template<> +struct V_vec_<__nv_bfloat16, 2> { + using Type = __nv_bfloat162; +}; +template<> +struct V_vec_<__nv_bfloat16, 4> { + using Type = bf16_4_t; +}; +template<> +struct V_vec_<__nv_bfloat16, 8> { + using Type = bf16_8_t; +}; +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA +template +struct Qk_vec_acum_fp32_ { +}; + +template<> +struct Qk_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float4; +}; +// template<> struct Qk_vec_acum_fp32_ { using Type = float; }; +template<> +struct Qk_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat16> { + using Type = float; +}; +template<> +struct Qk_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct Qk_vec_acum_fp32_ { + using Type = Float8_; +}; + 
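+// Orientation note on the *_vec_ traits above: each one maps an element type plus a width
+// parameter to a packed vector type so that Q, K and V are loaded several elements at a time
+// with wide, aligned accesses (for K_vec_, the THREADS_PER_KEY threads cooperating on one key
+// together cover a 16-byte chunk per load). The kernel further below consumes them roughly as
+//     using Qk_vec = typename Qk_vec_<T, Dh_MAX>::Type;          // current-timestep Q/K loads
+//     using K_vec  = typename K_vec_<T, THREADS_PER_KEY>::Type;  // key-loop loads
+//     using V_vec  = typename V_vec_<T, V_VEC_SIZE>::Type;       // value-loop loads
+// while the Qk_vec_acum_fp32_ specializations above (and the K/V analogues below) widen those
+// packed types into float containers when the MMHA_USE_FP32_ACUM_FOR_* knobs are enabled.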
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct K_vec_acum_fp32_ { +}; + +template<> +struct K_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float4; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float8_; +}; +template<> +struct K_vec_acum_fp32_<__nv_bfloat16> { + using Type = float; +}; +template<> +struct K_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct K_vec_acum_fp32_ { + using Type = Float8_; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT +template +struct V_vec_acum_fp32_ { +}; + +template<> +struct V_vec_acum_fp32_ { + using Type = float; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float4; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float8_; +}; +#ifdef ENABLE_BF16 +template<> +struct V_vec_acum_fp32_<__nv_bfloat162> { + using Type = float2; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float4_; +}; +template<> +struct V_vec_acum_fp32_ { + using Type = Float8_; +}; +#endif // ENABLE_BF16 +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec (&k)[N]) +{ +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using K_vec_acum = typename K_vec_acum_fp32_::Type; +#else + using K_vec_acum = K_vec; +#endif + // Compute the parallel products for Q*K^T (treat vector lanes separately). + K_vec_acum qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = fma(q[ii], k[ii], qk_vec); + } + + // Finalize the reduction across lanes. 
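+// sum(qk_vec) below first collapses the vector lanes of the accumulator into a single float;
+// the shfl_xor butterfly then adds the partial sums of the THREADS_PER_KEY threads that share
+// this key, so each of those threads returns the complete Q*K^T value for the timestep.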
+ float qk = sum(qk_vec); +#pragma unroll + for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Qk_dot { + template + static inline __device__ float dot(const K_vec (&q)[N], const K_vec (&k)[N]) + { + return qk_dot_(q, k); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 hmma_fp32(const uint2& a, uint32_t b) +{ + float4 c; + float zero = 0.f; + asm volatile("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6}, \n" + " {%7, %7, %7, %7}; \n" + + : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w) + : "r"(a.x) "r"(a.y), "r"(b), "f"(zero)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float qk_hmma_dot_(const uint32_t (&q)[N], const uint32_t (&k)[N]) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750 +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using K_vec_acum = typename K_vec_acum_fp32_::Type; +#else + using K_vec_acum = uint32_t; +#endif + K_vec_acum qk_vec = mul(q[0], k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + qk_vec = fma(q[ii], k[ii], qk_vec); + } +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + uint32_t qk_vec_ = float2_to_half2(qk_vec); + return hmma_fp32(make_uint2(qk_vec_, 0u), 0x3c003c00u).x; +#else + return hmma_fp32(make_uint2(qk_vec, 0u), 0x3c003c00u).x; +#endif +#else + return 0.f; +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Qk_dot { + template + static inline __device__ float dot(const uint32_t (&q)[N], const uint32_t (&k)[N]) + { +#if __CUDA_ARCH__ >= 750 && defined(MMHA_USE_HMMA_FOR_REDUCTION) + return qk_hmma_dot_(q, k); +#else + return qk_dot_<4>(q, k); +#endif // defined MMHA_USE_HMMA_FOR_REDUCTION + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float block_sum(float* red_smem, float sum) +{ + + // Decompose the thread index into warp / lane. + int warp = threadIdx.x / WARP_SIZE; + int lane = threadIdx.x % WARP_SIZE; + +// Compute the sum per warp. +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Warp leaders store the data to shared memory. + if (lane == 0) { + red_smem[warp] = sum; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The warps compute the final sums. + if (lane < WARPS_PER_BLOCK) { + sum = red_smem[lane]; + } + +// Parallel reduction inside the warp. +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + // Broadcast to other threads. 
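+// After the two reduction phases, lane 0 of every warp holds the block-wide total; the
+// __shfl_sync below broadcasts lane 0's value so every thread in the block returns the same sum.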
+ return __shfl_sync(uint32_t(-1), sum, 0); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(float& dst, float src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint16_t& dst, float src) +{ + dst = float_to_half(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint32_t& dst, float2 src) +{ + dst = float2_to_half2(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ void convert_from_float(__nv_bfloat16& dst, float src) +{ + dst = __float2bfloat16(src); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(__nv_bfloat162& dst, float2 src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst = __float22bfloat162_rn(src); +#else + dst = __floats2bfloat162_rn(src.x, src.y); +#endif +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint2& dst, Float4_ src) +{ + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint2& dst, float4 src) +{ + convert_from_float(dst, Float4_{make_float2(src.x, src.y), make_float2(src.z, src.w)}); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(uint4& dst, Float8_ src) +{ + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); + dst.z = float2_to_half2(src.z); + dst.w = float2_to_half2(src.w); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ void convert_from_float(bf16_4_t& dst, Float4_ src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); +#else + dst.x = __floats2bfloat162_rn(src.x.x, src.x.y); + dst.y = __floats2bfloat162_rn(src.y.x, src.y.y); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(bf16_4_t& dst, float4 src) +{ + convert_from_float(dst, Float4_{make_float2(src.x, src.y), make_float2(src.z, src.w)}); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(bf16_8_t& dst, Float8_ src) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + dst.x = __float22bfloat162_rn(src.x); + dst.y = __float22bfloat162_rn(src.y); + dst.z = __float22bfloat162_rn(src.z); + dst.w = __float22bfloat162_rn(src.w); +#else + dst.x = __floats2bfloat162_rn(src.x.x, src.x.y); + dst.y = __floats2bfloat162_rn(src.y.x, src.y.y); + dst.z = __floats2bfloat162_rn(src.z.x, src.z.y); + dst.w = __floats2bfloat162_rn(src.w.x, src.w.y); +#endif +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + 
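The Float4_ and Float8_ containers consumed by the converters above are the FP32 accumulator types, presumably supplied by the included decoder_masked_multihead_attention_utils.h rather than defined here. The member accesses in those converters (src.x passed to float2_to_half2, src.x.x / src.w.y in the bfloat16 paths) imply a layout along the lines of this sketch:

// Layout sketch only, to make the conversions above readable; not a definition from this commit.
struct Float4_ { float2 x, y; };        // four floats as two float2 halves -> uint2 / bf16_4_t
struct Float8_ { float2 x, y, z, w; };  // eight floats as four float2 halves -> uint4 / bf16_8_t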
+inline __device__ void convert_from_float(float2& dst, float2 src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void convert_from_float(float4& dst, float4 src) +{ + dst = src; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float convert_to_float(float4 u) +{ + return u.x; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float convert_to_float(uint4 u) +{ + float2 tmp = half2_to_float2(u.x); + return tmp.x; +} + +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float cast_to_float(float u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(float2 u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 cast_to_float(float4 u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ cast_to_float(Float4_ u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ cast_to_float(Float8_ u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 cast_to_float(uint32_t u) +{ + return half2_to_float2(u); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ cast_to_float(uint2 u) +{ + Float4_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + return tmp; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ cast_to_float(uint4 u) +{ + Float8_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + tmp.z = half2_to_float2(u.z); + tmp.w = half2_to_float2(u.w); + return tmp; +} + +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float float_from_int8(int8_t u) +{ + return u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 float_from_int8(int16_t u) +{ + union { + int16_t int16; + int8_t int8[2]; + }; + int16 = u; + return make_float2(int8[0], int8[1]); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 float_from_int8(int32_t u) +{ + union { + int32_t int32; + int8_t int8[4]; + }; + int32 = u; + return make_float4(int8[0], int8[1], int8[2], int8[3]); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// clang-format off +inline __device__ Float8_ float_from_int8(int64_t u) +{ + union { + int64_t int64; + int16_t int16[4]; + }; + int64 = u; + return Float8_ {float_from_int8(int16[0]), + float_from_int8(int16[1]), + float_from_int8(int16[2]), + float_from_int8(int16[3])}; +} +// clang-format on + 
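To make the int8 path concrete: the kernel's int8_mode == 2 branches further below widen the quantized Q/K/V with float_from_int8 and rescale with the per-tensor factors params.qkv_scale_out[0..2], e.g. convert_from_float(q, mul(q_scaling, float_from_int8(q_quant))). A hedged sketch of that dequantization step, using a hypothetical helper name:

// Hypothetical helper (not part of this file): widen four packed int8 lanes and apply the
// per-tensor scale, mirroring the int8_mode == 2 loads in the kernel below.
inline __device__ float4 dequant_int8x4(int32_t packed, float scale)
{
    float4 f = float_from_int8(packed);  // defined just above: unpacks 4 x int8 into a float4
    f.x *= scale;
    f.y *= scale;
    f.z *= scale;
    f.w *= scale;
    return f;
}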
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int8_t cast_to_int8(float val) +{ + union { + int8_t int8[2]; + int16_t int16; + }; + asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val)); + return int8[0]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int32_t cast_to_int8(float4 val) +{ + union { + int8_t int8[4]; + int32_t int32; + }; + int8[0] = cast_to_int8(val.x); + int8[1] = cast_to_int8(val.y); + int8[2] = cast_to_int8(val.z); + int8[3] = cast_to_int8(val.w); + return int32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ int64_t cast_to_int8(Float8_ val) +{ + union { + int8_t int8[8]; + int64_t int64; + }; + int8[0] = cast_to_int8(val.x.x); + int8[1] = cast_to_int8(val.x.y); + int8[2] = cast_to_int8(val.y.x); + int8[3] = cast_to_int8(val.y.y); + int8[4] = cast_to_int8(val.z.x); + int8[5] = cast_to_int8(val.z.y); + int8[6] = cast_to_int8(val.w.x); + int8[7] = cast_to_int8(val.w.y); + return int64; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ __host__ T div_up(T m, T n) +{ + return (m + n - 1) / n; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline size_t smem_size_in_bytes(const Multihead_attention_params& params, + int threads_per_value, + int threads_per_block) +{ + // The amount of shared memory needed to store the Q*K^T values in float. + const int max_timesteps = min(params.timestep, params.memory_max_len); + size_t qk_sz = (DO_CROSS_ATTENTION) ? div_up(params.memory_max_len + 1, 4) * 16 : div_up(max_timesteps + 1, 4) * 16; + + // The extra memory needed if we are not using floats for the final logits. + size_t logits_sz = 0; +#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS + if (sizeof(T) != 4) { + // TDOD + logits_sz = (DO_CROSS_ATTENTION) ? div_up(params.memory_max_len + 1, 4) * 4 * sizeof(T) : + div_up(max_timesteps + 1, 4) * 4 * sizeof(T); + } +#endif + + // The total size needed during softmax. + size_t softmax_sz = qk_sz + logits_sz; + + // The number of partial rows to reduce in the final reduction. + int rows_per_red = threads_per_block / threads_per_value; + // The amount of storage needed to finalize the outputs. + size_t red_sz = rows_per_red * params.hidden_size_per_head * sizeof(T) / 2; + + size_t transpose_rotary_size = 0; + if (params.rotary_embedding_dim > 0 && params.neox_rotary_style) { + transpose_rotary_size = 2 * params.rotary_embedding_dim * sizeof(T); + } + + // The max. + return max(max(softmax_sz, red_sz), transpose_rotary_size); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ constexpr uint32_t shfl_mask(int threads) +{ + return threads == 32 ? uint32_t(-1) : (1u << threads) - 1u; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template< + // The type of the inputs. Supported types: float and half. + typename T, + // The hidden dimension per head. + int Dh, + int Dh_MAX, + // The number of threads per key. + int THREADS_PER_KEY, + // The number of threads per value. + int THREADS_PER_VALUE, + // The number of threads in a threadblock. 
+ int THREADS_PER_BLOCK, + bool DO_CROSS_ATTENTION> +__global__ void masked_multihead_attention_kernel(Multihead_attention_params params) +{ + + // Make sure the hidden dimension per head is a multiple of the number of threads per key. + static_assert(Dh_MAX % THREADS_PER_KEY == 0, ""); + // Make sure the hidden dimension per head is a multiple of the number of threads per value. + static_assert(Dh_MAX % THREADS_PER_VALUE == 0, ""); + + // The size of a warp. + constexpr int WARP_SIZE = 32; + // The number of warps in a threadblock. + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // Use smem_size_in_bytes (above) to determine the amount of shared memory. + extern __shared__ char smem_[]; + + // The shared memory for the Q*K^T values and partial logits in softmax. + float* qk_smem = reinterpret_cast(smem_); + + // The shared memory for the logits. For FP32, that's the same buffer as qk_smem. + char* logits_smem_ = smem_; +#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS + if (sizeof(T) != 4) { + // TODO - change to tlength + const int max_timesteps = min(params.timestep, params.memory_max_len); + logits_smem_ += + (DO_CROSS_ATTENTION) ? div_up(params.memory_max_len + 1, 4) * 16 : div_up(max_timesteps + 1, 4) * 16; + } + T* logits_smem = reinterpret_cast(logits_smem_); +#else + float* logits_smem = reinterpret_cast(logits_smem_); +#endif + + // The shared memory to do the final reduction for the output values. Reuse qk_smem. + T* out_smem = reinterpret_cast(smem_); + + // The shared memory buffers for the block-wide reductions. One for max, one for sum. + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + // A vector of Q or K elements for the current timestep. + using Qk_vec = typename Qk_vec_::Type; + + // Use alignment for safely casting the shared buffers as Qk_vec. + // Shared memory to store Q inputs. + __shared__ __align__(sizeof(Qk_vec)) T q_smem[Dh_MAX]; + + // This is one of the reasons we should have a separate kernel for cross attention + __shared__ __align__(sizeof(Qk_vec)) T bias_smem[DO_CROSS_ATTENTION ? Dh_MAX : 1]; + + // A vector of Q or K elements for the current timestep. + using Qk_vec = typename Qk_vec_::Type; + // The number of elements per vector. + constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); + // Make sure the hidden size per head is a multiple of the vector size. + static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); + // We will use block wide reduction if needed + // static_assert(Dh_MAX / QK_VEC_SIZE <= WARP_SIZE, ""); + // The number of vectors per warp. + constexpr int QK_VECS_PER_WARP = Dh_MAX / QK_VEC_SIZE; + + // The layout of the cache is [B, H, Dh/x, L, x] with x == 4/8 for FP32/FP16. Since each thread + // owns x elements, we have to decompose the linear index into chunks of x values and the posi- + // tion of the thread in that chunk. + + // The number of elements in a chunk of 16B (that's the x in the above formula). + constexpr int QK_ELTS_IN_16B = 16 / sizeof(T); + // The number of K vectors in 16B. + constexpr int QK_VECS_IN_16B = 16 / sizeof(Qk_vec); + + // The batch/beam idx + const int bi = blockIdx.y; + if (params.finished != nullptr && params.finished[bi] == true) { + return; + } + // The beam idx + const int beami = bi % params.beam_width; + // The "beam-aware" batch idx + const int bbi = bi / params.beam_width; + // The head. + // const int hi = blockIdx.x; + const int hi = params.nnz_head_idx == nullptr ? 
blockIdx.x : params.nnz_head_idx[blockIdx.x]; + const int hi_kv = hi / params.num_heads_q_kv_ratio; + // Combine the batch and the head indices. + const int bhi = bi * params.num_heads + hi; + const int bhi_kv = bi * params.num_heads_kv + hi_kv; + // Combine the "beam-aware" batch idx and the head indices. + const int bbhi = bbi * params.beam_width * params.num_heads_kv + hi_kv; + // The thread in the block. + const int tidx = threadIdx.x; + + const bool handle_kv = !DO_CROSS_ATTENTION || (DO_CROSS_ATTENTION && params.timestep == 0); + + // While doing the product Q*K^T for the different keys we track the max. + float qk_max = -FLT_MAX; + + float qk = 0.0F; + + int q_base_offset = (params.stride_q == 0) ? bhi * Dh : bi * params.stride_q + hi * Dh; + int k_base_offset = (params.stride_k == 0) ? bhi_kv * Dh : bi * params.stride_k + hi_kv * Dh; + int v_base_offset = (params.stride_v == 0) ? bhi_kv * Dh : bi * params.stride_v + hi_kv * Dh; + + const size_t bi_seq_len_offset = bi * params.memory_max_len; + + // int tlength = (DO_CROSS_ATTENTION)? params.memory_length_per_sample[bi] - 1 : params.timestep; + int tlength = (DO_CROSS_ATTENTION) ? params.memory_length_per_sample[bi] - 1 : + (params.length_per_sample == nullptr) ? + params.timestep : + params.length_per_sample[bi] + params.max_prefix_prompt_length; + const int first_step = max(0, tlength + 1 - params.memory_max_len); + const int tlength_circ = tlength % params.memory_max_len; + + // First QK_VECS_PER_WARP load Q and K + the bias values for the current timestep. + const bool is_masked = tidx >= QK_VECS_PER_WARP; + + // The offset in the Q and K buffer also accounts for the batch. + int q_offset = q_base_offset + tidx * QK_VEC_SIZE; + int k_offset = k_base_offset + tidx * QK_VEC_SIZE; + // The offset in the bias buffer. + int q_bias_offset = hi * Dh + tidx * QK_VEC_SIZE; + int k_bias_offset = hi_kv * Dh + tidx * QK_VEC_SIZE; + + const bool do_ia3 = handle_kv && params.ia3_tasks != nullptr; + const int ia3_task_id = do_ia3 ? params.ia3_tasks[bbi] : 0; + + // Trigger the loads from the Q and K buffers. + Qk_vec q; + zero(q); + if (!is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh)) { + if (params.int8_mode == 2) { + using Packed_Int8_t = typename packed_type::value>::type; + using Packed_Float_t = typename packed_type::value>::type; + const auto q_scaling = params.qkv_scale_out[0]; + const auto q_quant = + *reinterpret_cast(&reinterpret_cast(params.q)[q_offset]); + + convert_from_float(q, mul(q_scaling, float_from_int8(q_quant))); + } + else { + q = *reinterpret_cast(¶ms.q[q_offset]); + } + } + + Qk_vec k; + zero(k); + if (DO_CROSS_ATTENTION) { + // The 16B chunk written by the thread. + int co = tidx / QK_VECS_IN_16B; + // The position of the thread in that 16B chunk. + int ci = tidx % QK_VECS_IN_16B * QK_VEC_SIZE; + + // Two chunks are separated by L * x elements. A thread write QK_VEC_SIZE elements. + int offset = bhi_kv * params.memory_max_len * Dh + co * params.memory_max_len * QK_ELTS_IN_16B + + // params.timestep*QK_ELTS_IN_16B + + tlength * QK_ELTS_IN_16B + ci; + k = !is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) ? 
+ *reinterpret_cast(¶ms.k_cache[offset]) : + k; + } + else { + if (!is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh)) { + if (params.int8_mode == 2) { + using Packed_Int8_t = typename packed_type::value>::type; + using Packed_Float_t = typename packed_type::value>::type; + const auto k_scaling = params.qkv_scale_out[1]; + const auto k_quant = + *reinterpret_cast(&reinterpret_cast(params.k)[k_offset]); + + convert_from_float(k, mul(k_scaling, float_from_int8(k_quant))); + } + else { + k = *reinterpret_cast(¶ms.k[k_offset]); + } + } + } + + // Trigger the loads from the Q and K bias buffers. + Qk_vec q_bias; + zero(q_bias); + q_bias = (!is_masked && Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) && params.q_bias != nullptr ? + *reinterpret_cast(¶ms.q_bias[q_bias_offset]) : + q_bias; + + Qk_vec k_bias; + zero(k_bias); + if (handle_kv) { + k_bias = !is_masked && (Dh == Dh_MAX || tidx * QK_VEC_SIZE < Dh) && params.k_bias != nullptr ? + *reinterpret_cast(¶ms.k_bias[k_bias_offset]) : + k_bias; + } + + // Computes the Q/K values with bias. + q = add(q, q_bias); + if (handle_kv) { + k = add(k, k_bias); + } + if (do_ia3 && !is_masked) { + k = mul( + k, + *reinterpret_cast( + ¶ms.ia3_key_weights[(ia3_task_id * params.num_heads + hi) * Dh + tidx * QK_VEC_SIZE])); + } + + // Padded len + const int padd_len = (params.total_padding_tokens == nullptr) ? 0 : params.total_padding_tokens[bi]; + if (params.rotary_embedding_dim > 0 && !params.neox_rotary_style) { + if (handle_kv) { + if (params.rotary_cos == nullptr) { + apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, tlength - padd_len, params.rotary_base); + } else { + apply_rotary_embedding(q, k, tidx, params.rotary_embedding_dim, tlength - padd_len, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + } + else { + if (params.rotary_cos == nullptr) { + apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, tlength - padd_len, params.rotary_base); + } else { + apply_rotary_embedding(q, tidx, params.rotary_embedding_dim, tlength - padd_len, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + } + } + else if (params.rotary_embedding_dim > 0 && params.neox_rotary_style) { + const bool do_rotary = !is_masked && QK_VEC_SIZE * tidx < params.rotary_embedding_dim; + + T* q_smem = reinterpret_cast(smem_); + T* k_smem = q_smem + params.rotary_embedding_dim; + + const int half_rotary_dim = params.rotary_embedding_dim / 2; + const int half_idx = (tidx * QK_VEC_SIZE) / half_rotary_dim; + const int intra_half_idx = (tidx * QK_VEC_SIZE) % half_rotary_dim; + const int smem_pitch = half_rotary_dim; // TODO: adjust for bank conflicts + + assert(half_rotary_dim % QK_VEC_SIZE == 0); + + if (do_rotary) { + *reinterpret_cast(q_smem + half_idx * smem_pitch + intra_half_idx) = q; + + if (handle_kv) { + *reinterpret_cast(k_smem + half_idx * smem_pitch + intra_half_idx) = k; + } + } + + __syncthreads(); + + const int transpose_idx = half_idx * (half_rotary_dim / 2) + intra_half_idx / 2; + constexpr int tidx_factor = (QK_VEC_SIZE > 1) ? 
QK_VEC_SIZE / 2 : 1; + if (do_rotary) { + mmha::vec_from_smem_transpose(q, q_smem, transpose_idx, smem_pitch); + + if (handle_kv) { + mmha::vec_from_smem_transpose(k, k_smem, transpose_idx, smem_pitch); + + if (params.rotary_cos == nullptr) { + mmha::apply_rotary_embedding( + q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength - padd_len, params.rotary_base); + } else { + mmha::apply_rotary_embedding( + q, k, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength - padd_len, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + + mmha::write_smem_transpose(k, k_smem, transpose_idx, smem_pitch); + } + else { + if (params.rotary_cos == nullptr) { + mmha::apply_rotary_embedding( + q, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength, params.rotary_base); + } else { + mmha::apply_rotary_embedding( + q, transpose_idx / tidx_factor, params.rotary_embedding_dim, tlength, + params.rotary_cos + bi * params.rotary_embedding_dim / 2, + params.rotary_sin + bi * params.rotary_embedding_dim / 2); + } + } + mmha::write_smem_transpose(q, q_smem, transpose_idx, smem_pitch); + } + + __syncthreads(); + + if (do_rotary) { + q = *reinterpret_cast(q_smem + half_idx * smem_pitch + intra_half_idx); + if (handle_kv) { + k = *reinterpret_cast(k_smem + half_idx * smem_pitch + intra_half_idx); + } + } + + __syncthreads(); + } + + if (!is_masked) { + // Store the Q values to shared memory. + *reinterpret_cast(&q_smem[tidx * QK_VEC_SIZE]) = q; + + // Store Dh values of k_bias into smem, since will need to add later + // if params.timestep == 0 + if (DO_CROSS_ATTENTION && params.timestep == 0) { + *reinterpret_cast(&bias_smem[tidx * QK_VEC_SIZE]) = k_bias; + } + + // Write the K values to the global memory cache. + // + // NOTE: The stores are uncoalesced as we have multiple chunks of 16B spread across the memory + // system. We designed it this way as it allows much better memory loads (and there are many + // more loads) + the stores are really "write and forget" since we won't need the ack before + // the end of the kernel. There's plenty of time for the transactions to complete. + + // The 16B chunk written by the thread. + int co = tidx / QK_VECS_IN_16B; + // The position of the thread in that 16B chunk. + int ci = tidx % QK_VECS_IN_16B * QK_VEC_SIZE; + + // Two chunks are separated by L * x elements. A thread write QK_VEC_SIZE elements. + int offset = bhi_kv * params.memory_max_len * Dh + co * params.memory_max_len * QK_ELTS_IN_16B + + // params.timestep*QK_ELTS_IN_16B + + tlength_circ * QK_ELTS_IN_16B + ci; + + if (handle_kv && hi % params.num_heads_q_kv_ratio == 0) { + // Trigger the stores to global memory. + if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { + *reinterpret_cast(¶ms.k_cache[offset]) = k; + } + } + + // Compute \sum_i Q[i] * K^T[i] for the current timestep. +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using Qk_vec_acum = typename Qk_vec_acum_fp32_::Type; +#else + using Qk_vec_acum = Qk_vec; +#endif + qk = dot(q, k); + if (QK_VECS_PER_WARP <= WARP_SIZE) { +#pragma unroll + for (int mask = QK_VECS_PER_WARP / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(shfl_mask(QK_VECS_PER_WARP), qk, mask); + } + } + } + + if (QK_VECS_PER_WARP > WARP_SIZE) { + constexpr int WARPS_PER_RED = (QK_VECS_PER_WARP + WARP_SIZE - 1) / WARP_SIZE; + qk = block_sum(&red_smem[WARPS_PER_RED], qk); + } + + // Store that value in shared memory. Keep the Q*K^T value in register for softmax. 
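+// After the preceding shuffles every participating thread holds the same qk for the current
+// timestep, so a single representative (tidx == 0) applies the 1/sqrt(Dh) scaling and the
+// optional relative-attention bias, then seeds qk_max and writes qk_smem[tlength - first_step].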
+ if (tidx == 0) { + // Normalize qk. + qk *= params.inv_sqrt_dh; + if (params.relative_attention_bias != nullptr) { + qk = add(qk, + params.relative_attention_bias[hi * params.relative_attention_bias_stride + * params.relative_attention_bias_stride + + (tlength - padd_len) * params.relative_attention_bias_stride + + (tlength - padd_len)]); + } + // We don't need to apply the linear position bias here since qi - ki = 0 yields the position bias 0. + + qk_max = qk; + qk_smem[tlength - first_step] = qk; + // qk_smem[params.timestep] = qk; + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The type of queries and keys for the math in the Q*K^T product. + using K_vec = typename K_vec_::Type; + // The number of elements per vector. + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(T); + // Make sure the hidden size per head is a multiple of the vector size. + static_assert(Dh_MAX % K_VEC_SIZE == 0, ""); + // The number of elements per thread. + constexpr int K_ELTS_PER_THREAD = Dh_MAX / THREADS_PER_KEY; + // The number of vectors per thread. + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + + // The position the first key loaded by each thread from the cache buffer (for this B * H). + int ko = tidx / THREADS_PER_KEY; + // The position of the thread in the chunk of keys. + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + + static_assert(Dh_MAX == THREADS_PER_KEY * K_VEC_SIZE * K_VECS_PER_THREAD); + + // Load the Q values from shared memory. The values are reused during the loop on K. + K_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vec[ii] = *reinterpret_cast(&q_smem[ki + ii * THREADS_PER_KEY * K_VEC_SIZE]); + } + + K_vec k_bias_vec[DO_CROSS_ATTENTION ? K_VECS_PER_THREAD : 1]; + if (DO_CROSS_ATTENTION && params.timestep == 0) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + k_bias_vec[ii] = *reinterpret_cast(&bias_smem[ki + ii * THREADS_PER_KEY * K_VEC_SIZE]); + } + } + + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + // The base pointer for the key in the cache buffer. + T* k_cache = ¶ms.k_cache[bhi_kv * params.memory_max_len * Dh + ki]; + // Base pointer for the beam's batch, before offsetting with indirection buffer + T* k_cache_batch = ¶ms.k_cache[bbhi * params.memory_max_len * Dh + ki]; + + // Pick a number of keys to make sure all the threads of a warp enter (due to shfl_sync). + // int ti_end = div_up(params.timestep, K_PER_WARP) * K_PER_WARP; + int ti_end = div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + // prefix prompt length if has + const int prefix_prompt_length = (params.prefix_prompt_lengths == nullptr) ? 0 : params.prefix_prompt_lengths[bi]; + + // Iterate over the keys/timesteps to compute the various (Q*K^T)_{ti} values. + const bool has_beams = params.cache_indir != nullptr; + const int* beam_indices = has_beams ? ¶ms.cache_indir[bi_seq_len_offset] : nullptr; + + for (int ti = first_step + ko; ti < ti_end; ti += K_PER_ITER) { + const int ti_circ = ti % params.memory_max_len; + + // The keys loaded from the key cache. 
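+// Each thread gathers K_VECS_PER_THREAD vectors of the key at timestep ti, reading circular
+// cache slot ti_circ through the beam-aware base pointer (offset by the indirection buffer when
+// beam search is active); chunks that fall outside Dh are replaced by k_vec_zero.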
+ K_vec k[K_VECS_PER_THREAD]; + K_vec k_vec_zero; + zero(k_vec_zero); +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.memory_max_len + ti_circ; + // if( ti < params.timestep ) { + const bool within_bounds = (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.memory_max_len); + if (ti < tlength) { + if (!within_bounds) { + k[ii] = k_vec_zero; + } + else { + if (has_beams) { + const int beam_offset = beam_indices[ti_circ] * params.num_heads * params.memory_max_len * Dh; + k[ii] = *reinterpret_cast(&k_cache_batch[beam_offset + jj * QK_ELTS_IN_16B]); + } + else { + k[ii] = *reinterpret_cast(&k_cache_batch[jj * QK_ELTS_IN_16B]); + } + } + // add bias and update k_cache + if (DO_CROSS_ATTENTION && params.timestep == 0) { + k[ii] = add(k[ii], k_bias_vec[ii]); + + if (do_ia3) { + k[ii] = mul( + k[ii], + *reinterpret_cast( + ¶ms.ia3_key_weights[(ia3_task_id * params.num_heads + hi) * Dh + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE])); + } + + if (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.memory_max_len) { + *reinterpret_cast(&k_cache[jj * QK_ELTS_IN_16B]) = k[ii]; + } + } + } + } + + // Perform the dot product and normalize qk. + // + // WARNING: ALL THE THREADS OF A WARP MUST ENTER!!! + float qk = Qk_dot::dot(q_vec, k) * params.inv_sqrt_dh; + bool is_mask = (params.masked_tokens != nullptr) && params.masked_tokens[bi_seq_len_offset + ti]; + + // Store the product to shared memory. There's one qk value per timestep. Update the max. + // if( ti < params.timestep && tidx % THREADS_PER_KEY == 0 ) { + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + if (params.relative_attention_bias != nullptr) { + qk = add(qk, + params.relative_attention_bias[hi * params.relative_attention_bias_stride + * params.relative_attention_bias_stride + + tlength * params.relative_attention_bias_stride + ti]); + } + if (params.linear_bias_slopes != nullptr) { + // Apply the linear position bias: (ki - qi) * slope[hi]. + // The padding token locates between the input context and the generated tokens. + // We need to remove the number of padding tokens in the distance computation. + // ti : 0 1 2 3 4 5 6 7 8 9(tlength) + // token: i i i i p p p o o o where i=input, p=pad, o=output. + // e.g. ti = 2, dist = (9 - 3) - 2 = 4. + int max_context_length = params.max_prefix_prompt_length + params.max_input_length; + float dist = (ti < max_context_length ? ti + padd_len : ti) - tlength; + + qk += mul(params.linear_bias_slopes[hi], dist); + } + qk_max = is_mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = qk; + } + } + +// Perform the final reduction to compute the max inside each warp. +// +// NOTE: In a group of THREADS_PER_KEY threads, the leader already has the max value for the +// group so it's not needed to run the reduction inside the group (again). +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + const int warp = tidx / WARP_SIZE; + const int lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? 
red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + + // Compute the logits and start the sum. + float sum = 0.f; + // for( int ti = tidx; ti <= params.timestep; ti += THREADS_PER_BLOCK ) { + for (int ti = first_step + tidx; ti <= tlength; ti += THREADS_PER_BLOCK) { + bool is_mask = (params.masked_tokens != nullptr) && params.masked_tokens[bi_seq_len_offset + ti]; + float logit = is_mask ? 0.f : __expf(qk_smem[ti - first_step] - qk_max); + sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + sum = block_sum(&red_smem[WARPS_PER_BLOCK], sum); + + // Normalize the logits. + float inv_sum = __fdividef(1.f, sum + 1.e-6f); + // for( int ti = tidx; ti <= params.timestep; ti += THREADS_PER_BLOCK ) { + const size_t cross_attention_out_offset = + params.is_return_cross_attentions ? + bhi * params.max_decoder_seq_len * params.memory_max_len + params.timestep * params.memory_max_len : + 0; + for (int ti = first_step + tidx; ti <= tlength; ti += THREADS_PER_BLOCK) { + float logit = qk_smem[ti - first_step] * inv_sum; + if (params.is_return_cross_attentions) { + params.cross_attention_out[cross_attention_out_offset + ti] = logit; + } + convert_from_float(logits_smem[ti - first_step], logit); + } + + // Put Values part below so we leverage __syncthreads + // from the previous step + + // The number of elements per vector. + constexpr int V_VEC_SIZE = Dh_MAX / THREADS_PER_VALUE; + // A vector of V elements for the current timestep. + using V_vec = typename V_vec_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + + // The base pointer for the value in the cache buffer. + T* v_cache = ¶ms.v_cache[bhi_kv * params.memory_max_len * Dh + vi]; + // Base pointer for the beam's batch, before offsetting with indirection buffer + T* v_cache_batch = ¶ms.v_cache[bbhi * params.memory_max_len * Dh + vi]; + + // The number of values processed per iteration of the loop. + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + // One group of threads computes the product(s) for the current timestep. + V_vec v_bias; + zero(v_bias); + // if( vo == params.timestep % V_PER_ITER ) { + if (Dh == Dh_MAX || vi < Dh) { + if (handle_kv) { + if (vo == tlength % V_PER_ITER) { + // Trigger the loads from the V bias buffer. + if (params.v_bias != nullptr) { + v_bias = *reinterpret_cast(¶ms.v_bias[hi_kv * Dh + vi]); + } + if (DO_CROSS_ATTENTION) { + *reinterpret_cast(&bias_smem[vi]) = v_bias; + } + } + } + } + + // From previous, before values, step + // Also make sure the logits are in shared memory. + __syncthreads(); + + // Values continued +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + using V_vec_acum = typename V_vec_acum_fp32_::Type; +#else + using V_vec_acum = V_vec; +#endif + // The partial outputs computed by each thread. + V_vec_acum out; + zero(out); + + // Loop over the timesteps to compute the partial outputs. 
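    // Each group of THREADS_PER_VALUE threads owns the V_VEC_SIZE hidden dimensions starting at vi
    // and walks the timesteps with stride V_PER_ITER (vo selects its first timestep), so the block
    // accumulates out[vi] = sum_ti softmax(qk)[ti] * V[ti][vi] as per-group partial sums that are
    // merged across groups in the reduction further below.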
+ // for( int ti = vo; ti < params.timestep; ti += V_PER_ITER ) { + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + const int ti_circ = ti % params.memory_max_len; + + // Fetch offset based on cache_indir when beam sampling + const int beam_src = (params.cache_indir != nullptr) ? params.cache_indir[bi_seq_len_offset + ti_circ] : 0; + const int beam_offset = beam_src * params.num_heads * params.memory_max_len * Dh; + // Load the values from the cache. + V_vec v = *reinterpret_cast(&v_cache_batch[beam_offset + ti_circ * Dh]); + if (DO_CROSS_ATTENTION && params.timestep == 0) { + v = add(v, *reinterpret_cast(&bias_smem[vi])); + if (do_ia3) { + v = mul( + v, + *reinterpret_cast( + ¶ms.ia3_value_weights[(ia3_task_id * params.num_heads + hi) * Dh + vi])); + } + *reinterpret_cast(&v_cache[ti * Dh]) = v; + } + // Load the logits from shared memory. +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + float logit = logits_smem[ti - first_step]; + out = fma(logit, cast_to_float(v), out); +#else + T logit = logits_smem[ti - first_step]; + + // Update the partial sums. + out = fma(logit, v, out); +#endif + } + } + + // One group of threads computes the product(s) for the current timestep. + // if( vo == params.timestep % V_PER_ITER ) { + if (vo == tlength % V_PER_ITER && (Dh == Dh_MAX || vi < Dh)) { + + V_vec v; + if (DO_CROSS_ATTENTION) { + v = *reinterpret_cast(&v_cache[tlength * Dh]); + } + else { + // Trigger the loads from the V buffer. + const auto v_offset = v_base_offset + vi; + if (params.int8_mode == 2) { + using Packed_Int8_t = typename packed_type::value>::type; + using Packed_Float_t = typename packed_type::value>::type; + const auto v_scaling = params.qkv_scale_out[2]; + const auto v_quant = + *reinterpret_cast(&reinterpret_cast(params.v)[v_offset]); + + convert_from_float(v, mul(v_scaling, float_from_int8(v_quant))); + } + else { + v = *reinterpret_cast(¶ms.v[v_offset]); + } + // Trigger the loads from the V bias buffer. + // V_vec v_bias = *reinterpret_cast(¶ms.v_bias[hi*Dh + vi]); + } + + // Compute the V values with bias. + if (handle_kv) { + v = add(v, v_bias); + + if (do_ia3) { + v = mul( + v, + *reinterpret_cast( + ¶ms.ia3_value_weights[(ia3_task_id * params.num_heads + hi) * Dh + vi])); + } + + // Store the values with bias back to global memory in the cache for V. + if (hi % params.num_heads_q_kv_ratio == 0) { + //*reinterpret_cast(&v_cache[params.timestep*Dh]) = v; + *reinterpret_cast(&v_cache[tlength_circ * Dh]) = v; + } + } + + // Initialize the output value with the current timestep. +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + // out = fma(logits_smem[params.timestep], cast_to_float(v), out); + out = fma(logits_smem[tlength - first_step], cast_to_float(v), out); +#else + // out = fma(logits_smem[params.timestep], v, out); + out = fma(logits_smem[tlength - first_step], v, out); +#endif + } + + // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. 
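            // (Tree reduction: at each step the upper half of the remaining groups, vo in
            // [midpoint, active_groups), parks its partial sum in out_smem; after the barrier the
            // lower half adds it to its own partial. After log2(V_PER_ITER) halvings, group
            // vo == 0 holds the fully reduced output vector for this batch/head.)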
+        if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) {
+#ifdef MMHA_USE_FP32_ACUM_FOR_OUT
+            convert_from_float(*reinterpret_cast<V_vec*>(&out_smem[(vo - midpoint) * Dh + vi]), out);
+#else
+            *reinterpret_cast<V_vec*>(&out_smem[(vo - midpoint) * Dh + vi]) = out;
+#endif
+        }
+        __syncthreads();
+
+        // The bottom warps update their values.
+        if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) {
+            out = add(*reinterpret_cast<V_vec*>(&out_smem[vo * Dh + vi]), out);
+        }
+        __syncthreads();
+        }
+    }
+
+    // Output the final values.
+    if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) {
+#ifdef MMHA_USE_FP32_ACUM_FOR_OUT
+        if (params.int8_mode == 2) {
+            using Packed_Int8_t = typename packed_type<int8_t, num_elems<V_vec_acum>::value>::type;
+            out = mul<V_vec_acum, float, V_vec_acum>(*params.attention_out_scale, out);
+            *reinterpret_cast<Packed_Int8_t*>(&(reinterpret_cast<int8_t*>(params.out)[bhi * Dh + vi])) =
+                cast_to_int8(out);
+        }
+        else {
+            convert_from_float(*reinterpret_cast<V_vec*>(&params.out[bhi * Dh + vi]), out);
+        }
+#else
+        // TODO: support int8_mode?
+        *reinterpret_cast<V_vec*>(&params.out[bhi * Dh + vi]) = out;
+#endif
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace mmha
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, typename KERNEL_PARAMS_TYPE>
+void mmha_launch_kernel(const KERNEL_PARAMS_TYPE& params, const cudaStream_t& stream);
diff --git a/decoder_masked_multihead_attention_utils.h b/decoder_masked_multihead_attention_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..98875aba9b8c42e53b9a107e28cb785c44f534e5
--- /dev/null
+++ b/decoder_masked_multihead_attention_utils.h
@@ -0,0 +1,2017 @@
+// Downloaded from FasterTransformer v5.2.1
+// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.2.1_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+/*
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include "cuda_bf16_wrapper.h" +#include "cuda_bf16_fallbacks.cuh" +#include + +using namespace fastertransformer; + +namespace mmha { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Float8_ { + float2 x; + float2 y; + float2 z; + float2 w; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Float4_ { + float2 x; + float2 y; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +struct bf16_4_t { + __nv_bfloat162 x; + __nv_bfloat162 y; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct bf16_8_t { + __nv_bfloat162 x; + __nv_bfloat162 y; + __nv_bfloat162 z; + __nv_bfloat162 w; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct num_elems; +template<> +struct num_elems { + static constexpr int value = 1; +}; +template<> +struct num_elems { + static constexpr int value = 2; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 8; +}; + +template<> +struct num_elems { + static constexpr int value = 2; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 8; +}; + +#ifdef ENABLE_BF16 +template<> +struct num_elems<__nv_bfloat162> { + static constexpr int value = 2; +}; +template<> +struct num_elems { + static constexpr int value = 4; +}; +template<> +struct num_elems { + static constexpr int value = 8; +}; +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct packed_type; +template +struct packed_type { + using type = T; +}; +template<> +struct packed_type { + using type = int16_t; +}; +template<> +struct packed_type { + using type = int32_t; +}; +template<> +struct packed_type { + using type = int64_t; +}; + +template<> +struct packed_type { + using type = float2; +}; +template<> +struct packed_type { + using type = float4; +}; +template<> +struct packed_type { + using type = Float8_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float add(float a, float b) +{ + return a + b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 add(float2 a, float2 b) +{ + float2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 add(float4 a, float4 b) +{ + float4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) +{ + return a + b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) +{ + return bf16hadd2(a, b); +} + 
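The `num_elems`/`packed_type` specializations near the top of this header (and most `reinterpret_cast`/`mul` calls in this dump) have had their explicit template arguments stripped by the flattening. A minimal host-side sketch of the trait idiom they implement, with specializations assumed from the visible `value` constants rather than copied verbatim:

```cpp
// Minimal host-side sketch of the num_elems / packed_type trait idiom used above.
// The concrete specializations here are assumptions inferred from the visible
// values (e.g. "value = 2" for a 2-lane vector), not a verbatim copy of the header.
#include <stdint.h>

struct float2_ { float x, y; };          // stand-ins for the CUDA vector types
struct float4_ { float x, y, z, w; };

template<typename T> struct num_elems          { static constexpr int value = 1; };
template<>           struct num_elems<float2_> { static constexpr int value = 2; };
template<>           struct num_elems<float4_> { static constexpr int value = 4; };

// packed_type<T, N>::type is "N lanes of T packed into one register-sized value".
template<typename T, int N> struct packed_type;
template<typename T> struct packed_type<T, 1>      { using type = T; };
template<>           struct packed_type<int8_t, 2> { using type = int16_t; };
template<>           struct packed_type<int8_t, 4> { using type = int32_t; };
template<>           struct packed_type<float, 2>  { using type = float2_; };
template<>           struct packed_type<float, 4>  { using type = float4_; };

static_assert(num_elems<float4_>::value == 4, "four float lanes");
static_assert(sizeof(packed_type<int8_t, 4>::type) == 4, "four int8 lanes fit one int32_t");

int main() { return 0; }
```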
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t add(bf16_4_t a, bf16_4_t b) +{ + bf16_4_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t add(bf16_8_t a, bf16_8_t b) +{ + bf16_8_t c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint16_t add(uint16_t a, uint16_t b) +{ + uint16_t c; + asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t add(uint32_t a, uint32_t b) +{ + uint32_t c; + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint2 add(uint2 a, uint2 b) +{ + uint2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint4 add(uint4 a, uint4 b) +{ + uint4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint16_t float_to_half(float f) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#if 0 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 // Is it better? 
+ float zero = 0.f; + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(zero), "f"(f)); +#else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f)); +#endif + return tmp.u16[0]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t float2_to_half2(float2 f) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x)); +#else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); +#endif + return tmp.u32; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float half_to_float(uint16_t h) +{ + float f; + asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); + return f; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 half2_to_float2(uint32_t v) +{ + uint16_t lo, hi; + asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); + return make_float2(half_to_float(lo), half_to_float(hi)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float add(float a, uint16_t b) +{ + return a + half_to_float(b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ float add(float a, __nv_bfloat16 b) +{ + return a + __bfloat162float(b); +} +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 add(uint32_t a, float2 fb) +{ + float2 fa = half2_to_float2(a); + return add(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ add(uint2 a, Float4_ fb) +{ + Float4_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ add(uint4 a, Float8_ fb) +{ + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t h0_h0(uint16_t a) +{ + uint32_t b; + asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); + return b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(float a, float b, float c) +{ + return a * b + c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(float2 a, float2 b, float2 c) +{ + float2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(float a, float2 b, float2 c) +{ + float2 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ 
float4 fma(float4 a, float4 b, float4 c) +{ + float4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float4 fma(float a, float4 b, float4 c) +{ + float4 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) +{ + Float4_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) +{ + Float8_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ float2 add(__nv_bfloat162 a, float2 fb) +{ + float2 fa = bf1622float2(a); + return add(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ add(bf16_4_t a, Float4_ fb) +{ + Float4_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) +{ + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} +#endif // ENABLE_BF16 + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) +{ + uint32_t d; + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c)); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) +{ + return fma(h0_h0(a), b, c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) +{ + uint2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) +{ + uint32_t s = h0_h0(a); + uint2 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) +{ + uint4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) +{ + uint32_t s = h0_h0(a); + uint4 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + 
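The `uint16_t`/`uint32_t`/`uint2`/`uint4` overloads above treat their arguments as packed fp16 payloads and lean on inline PTX (`fma.rn.f16x2`, with `h0_h0` broadcasting a scalar half into both lanes). A sketch of the same scalar-times-packed-pair FMA written with `cuda_fp16.h` intrinsics instead, purely for readability (the helper name is hypothetical and not part of this header):

```cuda
#include <cuda_fp16.h>

// Sketch only: the header keeps halves in uint16_t/uint32_t and uses raw PTX; this
// hypothetical helper expresses the same operation with standard intrinsics.
__device__ __half2 fma_scalar_half2(__half a, __half2 b, __half2 c)
{
    // __half2half2(a) duplicates a into both lanes, like h0_h0(); __hfma2 is the
    // per-lane fused multiply-add, like the fma.rn.f16x2 instruction above.
    return __hfma2(__half2half2(a), b, c);
}
```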
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(uint16_t a, uint16_t b, float fc) +{ + float fa = half_to_float(a); + float fb = half_to_float(b); + return fa * fb + fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(uint32_t a, uint32_t b, float2 fc) +{ + float2 fa = half2_to_float2(a); + float2 fb = half2_to_float2(b); + return fma(fa, fb, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(uint16_t a, uint32_t b, float2 fc) +{ + return fma(h0_h0(a), b, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(uint2 a, uint2 b, Float4_ fc) +{ + Float4_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(uint16_t a, uint2 b, Float4_ fc) +{ + uint32_t s = h0_h0(a); + Float4_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(uint4 a, uint4 b, Float8_ fc) +{ + Float8_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + fd.z = fma(a.z, b.z, fc.z); + fd.w = fma(a.w, b.w, fc.w); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) +{ + uint32_t s = h0_h0(a); + Float8_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + fd.z = fma(s, b.z, fc.z); + fd.w = fma(s, b.w, fc.w); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) +{ + return bf16hfma2(a, b, c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) +{ + return bf16hfma2(bf162bf162(a), b, c); +} +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t fma(bf16_4_t a, bf16_4_t b, bf16_4_t c) +{ + bf16_4_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_4_t fma(__nv_bfloat16 a, bf16_4_t b, bf16_4_t c) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t fma(bf16_8_t a, bf16_8_t b, bf16_8_t c) +{ + bf16_8_t d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ bf16_8_t fma(__nv_bfloat16 a, bf16_8_t b, bf16_8_t c) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t d; + d.x = 
fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float fma(__nv_bfloat16 a, __nv_bfloat16 b, float fc) +{ + return __bfloat162float(a) * __bfloat162float(b) + fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(__nv_bfloat162 a, __nv_bfloat162 b, float2 fc) +{ + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return fma(fa, fb, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 fma(__nv_bfloat16 a, __nv_bfloat162 b, float2 fc) +{ + return fma(bf162bf162(a), b, fc); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(bf16_4_t a, bf16_4_t b, Float4_ fc) +{ + Float4_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float4_ fma(__nv_bfloat16 a, bf16_4_t b, Float4_ fc) +{ + __nv_bfloat162 s = bf162bf162(a); + Float4_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(bf16_8_t a, bf16_8_t b, Float8_ fc) +{ + Float8_ fd; + fd.x = fma(a.x, b.x, fc.x); + fd.y = fma(a.y, b.y, fc.y); + fd.z = fma(a.z, b.z, fc.z); + fd.w = fma(a.w, b.w, fc.w); + return fd; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) +{ + __nv_bfloat162 s = bf162bf162(a); + Float8_ fd; + fd.x = fma(s, b.x, fc.x); + fd.y = fma(s, b.y, fc.y); + fd.z = fma(s, b.z, fc.z); + fd.w = fma(s, b.w, fc.w); + return fd; +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ Acc mul(A a, B b) +{ + return a * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(float a, float b) +{ + return a * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(float2 a, float2 b) +{ + float2 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(float a, float2 b) +{ + float2 c; + c.x = a * b.x; + c.y = a * b.y; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float4 mul(float4 a, float4 b) +{ + float4 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float4 mul(float a, float4 b) +{ + float4 c; + c.x = a * b.x; + c.y = a * b.y; + c.z = a * b.z; + c.w = a * b.w; + return c; +} + 
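Throughout this dump the explicit template arguments of `mul<...>` were stripped; upstream, the first template parameter selects the accumulator/return type so callers can request a widened result. A small host-side sketch of that calling convention, using stand-in types (an illustration of the pattern, not the header's exact code):

```cpp
// The first template argument of mul<Acc, A, B> picks the accumulator/return type,
// e.g. a float2 product of a scalar and a packed pair. Stand-in types and the
// explicit <...> arguments are assumptions for illustration.
#include <cstdio>

struct float2_ { float x, y; };   // host stand-in for CUDA's float2

template<typename Acc, typename A, typename B>
Acc mul(A a, B b);

template<>
float2_ mul<float2_, float, float2_>(float a, float2_ b)
{
    return {a * b.x, a * b.y};    // lane-wise scale
}

int main()
{
    float2_ r = mul<float2_, float, float2_>(0.5f, {2.0f, 4.0f});
    std::printf("%g %g\n", r.x, r.y);   // prints: 1 2
    return 0;
}
```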
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(float a, Float8_ b) +{ + Float8_ c; + c.x = make_float2(a * b.x.x, a * b.x.y); + c.y = make_float2(a * b.y.x, a * b.y.y); + c.z = make_float2(a * b.z.x, a * b.z.y); + c.w = make_float2(a * b.w.x, a * b.w.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint16_t mul(uint16_t a, uint16_t b) +{ + uint16_t c; + asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint32_t mul(uint32_t a, uint32_t b) +{ + uint32_t c; + asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint32_t mul(uint16_t a, uint32_t b) +{ + return mul(h0_h0(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint2 mul(uint2 a, uint2 b) +{ + uint2 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint2 mul(uint16_t a, uint2 b) +{ + uint32_t s = h0_h0(a); + uint2 c; + c.x = mul(s, b.x); + c.y = mul(s, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint4 mul(uint4 a, uint4 b) +{ + uint4 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + c.z = mul(a.z, b.z); + c.w = mul(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ uint4 mul(uint16_t a, uint4 b) +{ + uint32_t s = h0_h0(a); + uint4 c; + c.x = mul(s, b.x); + c.y = mul(s, b.y); + c.z = mul(s, b.z); + c.w = mul(s, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(uint16_t a, uint16_t b) +{ + float fa = half_to_float(a); + float fb = half_to_float(b); + return fa * fb; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(uint16_t a, float b) +{ + return half_to_float(a) * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(uint32_t a, uint32_t b) +{ + float2 fa = half2_to_float2(a); + float2 fb = half2_to_float2(b); + return mul(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(uint16_t a, uint32_t b) +{ + return mul(h0_h0(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(uint2 a, uint2 b) +{ + Float4_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ 
mul(uint16_t a, uint2 b) +{ + uint32_t s = h0_h0(a); + Float4_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(uint4 a, uint4 b) +{ + Float8_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + fc.z = mul(a.z, b.z); + fc.w = mul(a.w, b.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(uint16_t a, uint4 b) +{ + uint32_t s = h0_h0(a); + Float8_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + fc.z = mul(s, b.z); + fc.w = mul(s, b.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +template<> +inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __hmul(a, b); +#else + return bf16hmul(a, b); +#endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) +{ + return bf16hmul2(a, b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) +{ + return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) +{ + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_4_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) +{ + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.y, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.z, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.w, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + bf16_8_t c; + c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.x); + c.y = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.y); + c.z = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.z); + c.w = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(s, b.w); + return c; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) +{ + float fa = (float)a; + float fb = (float)b; + return 
fa * fb; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float mul(__nv_bfloat16 a, float b) +{ + return __bfloat162float(a) * b; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) +{ + float2 fa = bf1622float2(a); + float2 fb = bf1622float2(b); + return mul(fa, fb); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) +{ + return mul(bf162bf162(a), b); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) +{ + Float4_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + Float4_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) +{ + Float8_ fc; + fc.x = mul(a.x, b.x); + fc.y = mul(a.y, b.y); + fc.z = mul(a.z, b.z); + fc.w = mul(a.w, b.w); + return fc; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) +{ + __nv_bfloat162 s = bf162bf162(a); + Float8_ fc; + fc.x = mul(s, b.x); + fc.y = mul(s, b.y); + fc.z = mul(s, b.z); + fc.w = mul(s, b.w); + return fc; +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float v) +{ + return v; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float2 v) +{ + return v.x + v.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(float4 v) +{ + return v.x + v.y + v.z + v.w; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef ENABLE_BF16 +inline __device__ float sum(__nv_bfloat162 v) +{ + float2 vf = bf1622float2(v); + return vf.x + vf.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(bf16_4_t v) +{ + return sum(v.x) + sum(v.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(bf16_8_t v) +{ + return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w); +} +#endif // ENABLE_BF16 +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint16_t v) +{ + return half_to_float(v); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint32_t v) +{ + float2 tmp = half2_to_float2(v); + return tmp.x + tmp.y; +} + 
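The `sum()` overloads above and the `dot()` templates defined just below compose as dot(a, b) = sum(mul(a, b)): a lane-wise product followed by a horizontal reduction to a single float, which is how each thread's piece of the Q*K^T product is formed. A tiny host-side sketch of that composition (stand-in names, illustration only):

```cpp
// dot(a, b) == sum(mul(a, b)): multiply lane-wise, then reduce to one float.
// float4_ and the helper names are stand-ins for illustration.
#include <cstdio>

struct float4_ { float x, y, z, w; };

static float4_ mul4(float4_ a, float4_ b) { return {a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w}; }
static float   sum4(float4_ v)            { return v.x + v.y + v.z + v.w; }
static float   dot4(float4_ a, float4_ b) { return sum4(mul4(a, b)); }

int main()
{
    float4_ q{1.f, 2.f, 3.f, 4.f}, k{4.f, 3.f, 2.f, 1.f};
    std::printf("%g\n", dot4(q, k));   // 1*4 + 2*3 + 3*2 + 4*1 = 20
    return 0;
}
```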
+//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint2 v) +{ + uint32_t c = add(v.x, v.y); + return sum(c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(uint4 v) +{ +#if 1 + uint32_t c = add(v.x, v.y); + c = add(c, v.z); + c = add(c, v.w); +#else + uint32_t c = add(v.x, v.y); + uint32_t d = add(v.z, v.w); + c = add(c, d); +#endif + return sum(c); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(Float4_ v) +{ + return v.x.x + v.x.y + v.y.x + v.y.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float sum(Float8_ v) +{ + return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float dot(T a, T b) +{ + return sum(mul(a, b)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ float dot(T a, T b) +{ + return sum(mul(a, b)); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void zero(uint16_t& dst) +{ + dst = uint16_t(0); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void zero(T& dst) +{ + constexpr int WORDS = sizeof(T) / 4; + union { + T raw; + uint32_t words[WORDS]; + } tmp; +#pragma unroll + for (int ii = 0; ii < WORDS; ++ii) { + tmp.words[ii] = 0u; + } + dst = tmp.raw; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 rotary_embedding_coefficient(const int zid, const int rot_embed_dim, const int t_step, const float base) +{ + const float pos_idx_inv_freq = t_step / pow(base, zid / (float)rot_embed_dim); + return {cos(pos_idx_inv_freq), sin(pos_idx_inv_freq)}; +} + +inline __device__ float2 rotary_embedding_transform(const float2 v, const float2 coef) +{ + float2 rot_v; + rot_v.x = coef.x * v.x - coef.y * v.y; + rot_v.y = coef.x * v.y + coef.y * v.x; + return rot_v; +} + +inline __device__ uint32_t rotary_embedding_transform(const uint32_t v, const float2 coef) +{ + float2 fv = half2_to_float2(v); + float2 rot_fv = rotary_embedding_transform(fv, coef); + return float2_to_half2(rot_fv); +} + +#ifdef ENABLE_BF16 +inline __device__ __nv_bfloat162 rotary_embedding_transform(const __nv_bfloat162 v, const float2 coef) +{ + float2 fv = bf1622float2(v); + float2 rot_fv = rotary_embedding_transform(fv, coef); + return __floats2bfloat162_rn(rot_fv.x, rot_fv.y); +} +#endif + +inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float& q, float& k, int zid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); +} + +inline 
__device__ void apply_rotary_embedding(float2& q, float2& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q_.x = rotary_embedding_transform(q_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q_.y = rotary_embedding_transform(q_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + Float4_& k_ = *reinterpret_cast(&k); + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q_.x = rotary_embedding_transform(q_.x, coef0); + k_.x = rotary_embedding_transform(k_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q_.y = rotary_embedding_transform(q_.y, coef1); + k_.y = rotary_embedding_transform(k_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint2& q, uint2& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = 
rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} + +#ifdef ENABLE_BF16 +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, rot_embed_dim, t_step, base); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, 
coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, int t_step, const float base=10000.0f) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, rot_embed_dim, t_step, base); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, rot_embed_dim, t_step, base); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, rot_embed_dim, t_step, base); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, rot_embed_dim, t_step, base); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} +#endif // ENABLE_BF16 + +template +inline __device__ float2 rotary_embedding_coefficient(const int zid, const int t_step, const T* rotary_cos, const T* rotary_sin) +{ + // zid is the index of the dimension (0, 2, 4, ..., rotary_dim). + // rotary_cos/sin stores those at index 0, 1, 2, ..., rotary_dim / 2. + return {float(rotary_cos[zid / 2]), float(rotary_sin[zid / 2])}; +} + +// fp16 is special because we use uint16_t for reading the data, for backward compatibility. +template <> +inline __device__ float2 rotary_embedding_coefficient(const int zid, const int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + // zid is the index of the dimension (0, 2, 4, ..., rotary_dim). + // rotary_cos/sin stores those at index 0, 1, 2, ..., rotary_dim / 2. 
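    // (The on-the-fly overloads above rotate each (even, odd) pair of the head dimension by
    //  theta = t_step * base^(-zid / rot_embed_dim), i.e. x' = x*cos(theta) - y*sin(theta),
    //  y' = y*cos(theta) + x*sin(theta). The table-based overloads below are identical except
    //  that cos(theta) and sin(theta) are read from the precomputed rotary_cos / rotary_sin
    //  buffers instead of being recomputed from pow/cos/sin.)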
+ return {float(reinterpret_cast(rotary_cos)[zid / 2]), + float(reinterpret_cast(rotary_sin)[zid / 2])}; +} + +inline __device__ void apply_rotary_embedding(float& q, int zid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float& q, float& k, int zid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + return; +} + +inline __device__ void apply_rotary_embedding(float2& q, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(float2& q, float2& k, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(float4& q, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q_.x = rotary_embedding_transform(q_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q_.y = rotary_embedding_transform(q_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(float4& q, float4& k, int tid, int rot_embed_dim, int t_step, const float* rotary_cos, const float* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + + Float4_& q_ = *reinterpret_cast(&q); + Float4_& k_ = *reinterpret_cast(&k); + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q_.x = rotary_embedding_transform(q_.x, coef0); + k_.x = rotary_embedding_transform(k_.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q_.y = rotary_embedding_transform(q_.y, coef1); + k_.y = rotary_embedding_transform(k_.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(uint32_t& q, uint32_t& k, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(uint2& q, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, 
t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint2& q, uint2& k, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(uint4& q, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(uint4& q, uint4& k, int tid, int rot_embed_dim, int t_step, const uint16_t* rotary_cos, const uint16_t* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} + +#ifdef ENABLE_BF16 +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); +} + +inline __device__ void apply_rotary_embedding(__nv_bfloat162& q, __nv_bfloat162& k, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (2 * tid >= rot_embed_dim) { + return; + } + const auto coef = rotary_embedding_coefficient(2 * tid, t_step, rotary_cos, rotary_sin); + q = rotary_embedding_transform(q, coef); + k = rotary_embedding_transform(k, coef); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, 
rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_4_t& q, bf16_4_t& k, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (4 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(4 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(4 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); +} + +inline __device__ void apply_rotary_embedding(bf16_8_t& q, bf16_8_t& k, int tid, int rot_embed_dim, int t_step, const __nv_bfloat16* rotary_cos, const __nv_bfloat16* rotary_sin) +{ + if (8 * tid >= rot_embed_dim) { + return; + } + const auto coef0 = rotary_embedding_coefficient(8 * tid, t_step, rotary_cos, rotary_sin); + q.x = rotary_embedding_transform(q.x, coef0); + k.x = rotary_embedding_transform(k.x, coef0); + const auto coef1 = rotary_embedding_coefficient(8 * tid + 2, t_step, rotary_cos, rotary_sin); + q.y = rotary_embedding_transform(q.y, coef1); + k.y = rotary_embedding_transform(k.y, coef1); + const auto coef2 = rotary_embedding_coefficient(8 * tid + 4, t_step, rotary_cos, rotary_sin); + q.z = rotary_embedding_transform(q.z, coef2); + k.z = rotary_embedding_transform(k.z, coef2); + const auto coef3 = rotary_embedding_coefficient(8 * tid + 6, t_step, rotary_cos, rotary_sin); + q.w = rotary_embedding_transform(q.w, coef3); + k.w = rotary_embedding_transform(k.w, coef3); +} +#endif // ENABLE_BF16 + +template +__device__ __inline__ void vec_from_smem_transpose(Vec_T& vec, T* smem, int transpose_idx, int smem_pitch); + +template<> +__device__ __inline__ void vec_from_smem_transpose(float& vec, float* smem, int transpose_idx, int smem_pitch) +{ + return; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint32_t& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u16[0] = smem[transpose_idx]; + tmp.u16[1] = smem[smem_pitch + transpose_idx]; + + vec = tmp.u32; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint2& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp_1, tmp_2; + tmp_1.u32 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u32 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + union { + uint2 u32x2; + uint16_t 
u16[4]; + } tmp_3; + tmp_3.u16[0] = tmp_1.u16[0]; + tmp_3.u16[1] = tmp_2.u16[0]; + tmp_3.u16[2] = tmp_1.u16[1]; + tmp_3.u16[3] = tmp_2.u16[1]; + + vec = tmp_3.u32x2; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint4& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint64_t u64; + uint16_t u16[4]; + } tmp_1, tmp_2; + tmp_1.u64 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u64 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + union { + uint4 u32x4; + uint16_t u16[8]; + } tmp_3; + tmp_3.u16[0] = tmp_1.u16[0]; + tmp_3.u16[1] = tmp_2.u16[0]; + tmp_3.u16[2] = tmp_1.u16[1]; + tmp_3.u16[3] = tmp_2.u16[1]; + tmp_3.u16[4] = tmp_1.u16[2]; + tmp_3.u16[5] = tmp_2.u16[2]; + tmp_3.u16[6] = tmp_1.u16[3]; + tmp_3.u16[7] = tmp_2.u16[3]; + + vec = tmp_3.u32x4; +} + +#ifdef ENABLE_BF16 +template<> +__device__ __inline__ void +vec_from_smem_transpose(bf16_4_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + __nv_bfloat16 bf16[2]; + } tmp_1, tmp_2; + tmp_1.u32 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u32 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + vec.x = __nv_bfloat162{tmp_1.bf16[0], tmp_2.bf16[0]}; + vec.y = __nv_bfloat162{tmp_1.bf16[1], tmp_2.bf16[1]}; +} + +template<> +__device__ __inline__ void +vec_from_smem_transpose(bf16_8_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + union { + uint64_t u64; + __nv_bfloat16 bf16[4]; + } tmp_1, tmp_2; + tmp_1.u64 = *reinterpret_cast(&smem[transpose_idx]); + tmp_2.u64 = *reinterpret_cast(&smem[smem_pitch + transpose_idx]); + + vec.x = __nv_bfloat162{tmp_1.bf16[0], tmp_2.bf16[0]}; + vec.y = __nv_bfloat162{tmp_1.bf16[1], tmp_2.bf16[1]}; + vec.z = __nv_bfloat162{tmp_1.bf16[2], tmp_2.bf16[2]}; + vec.w = __nv_bfloat162{tmp_1.bf16[3], tmp_2.bf16[3]}; +} +#endif // ENABLE_BF16 + +template<> +__device__ __inline__ void vec_from_smem_transpose(float4& vec, float* smem, int transpose_idx, int smem_pitch) +{ + vec.x = smem[transpose_idx]; + vec.z = smem[transpose_idx + 1]; + vec.y = smem[smem_pitch + transpose_idx]; + vec.w = smem[smem_pitch + transpose_idx + 1]; +} + +template<> +__device__ __inline__ void vec_from_smem_transpose(uint32_t& vec, half* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + half u16[2]; + } tmp; + tmp.u16[0] = smem[transpose_idx]; + tmp.u16[1] = smem[smem_pitch + transpose_idx]; + + vec = tmp.u32; +} + +#ifdef ENABLE_BF16 +template<> +__device__ __inline__ void +vec_from_smem_transpose(__nv_bfloat162& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + vec.x = smem[transpose_idx]; + vec.y = smem[smem_pitch + transpose_idx]; +} +#endif + +template<> +__device__ __inline__ void vec_from_smem_transpose(float2& vec, float* smem, int transpose_idx, int smem_pitch) +{ + vec.x = smem[transpose_idx]; + vec.y = smem[smem_pitch + transpose_idx]; +} + +template +__device__ __inline__ void write_smem_transpose(const Vec_T& vec, T* smem, int transpose_idx, int smem_pitch); + +template<> +__device__ __inline__ void write_smem_transpose(const float& vec, float* smem, int transpose_idx, int smem_pitch) +{ + return; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint4& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint64_t u64; + uint16_t u16[4]; + } tmp_1, tmp_2; + + union { + uint4 u32x4; + uint16_t u16[8]; + } tmp_3; + tmp_3.u32x4 = vec; + tmp_1.u16[0] = tmp_3.u16[0]; + tmp_2.u16[0] = tmp_3.u16[1]; + tmp_1.u16[1] = tmp_3.u16[2]; + 
tmp_2.u16[1] = tmp_3.u16[3]; + tmp_1.u16[2] = tmp_3.u16[4]; + tmp_2.u16[2] = tmp_3.u16[5]; + tmp_1.u16[3] = tmp_3.u16[6]; + tmp_2.u16[3] = tmp_3.u16[7]; + + *reinterpret_cast(&smem[transpose_idx]) = tmp_1.u64; + *reinterpret_cast(&smem[smem_pitch + transpose_idx]) = tmp_2.u64; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint2& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp_1, tmp_2; + + union { + uint2 u32x2; + uint16_t u16[4]; + } tmp_3; + tmp_3.u32x2 = vec; + tmp_1.u16[0] = tmp_3.u16[0]; + tmp_2.u16[0] = tmp_3.u16[1]; + tmp_1.u16[1] = tmp_3.u16[2]; + tmp_2.u16[1] = tmp_3.u16[3]; + + *reinterpret_cast(&smem[transpose_idx]) = tmp_1.u32; + *reinterpret_cast(&smem[smem_pitch + transpose_idx]) = tmp_2.u32; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint32_t& vec, uint16_t* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; + tmp.u32 = vec; + + smem[transpose_idx] = tmp.u16[0]; + smem[smem_pitch + transpose_idx] = tmp.u16[1]; +} + +template<> +__device__ __inline__ void write_smem_transpose(const float4& vec, float* smem, int transpose_idx, int smem_pitch) +{ + smem[transpose_idx] = vec.x; + smem[transpose_idx + 1] = vec.z; + smem[smem_pitch + transpose_idx] = vec.y; + smem[smem_pitch + transpose_idx + 1] = vec.w; +} + +template<> +__device__ __inline__ void write_smem_transpose(const uint32_t& vec, half* smem, int transpose_idx, int smem_pitch) +{ + union { + uint32_t u32; + half u16[2]; + } tmp; + + tmp.u32 = vec; + smem[transpose_idx] = tmp.u16[0]; + smem[smem_pitch + transpose_idx] = tmp.u16[1]; +} + +#ifdef ENABLE_BF16 +template<> +__device__ __inline__ void +write_smem_transpose(const __nv_bfloat162& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + smem[transpose_idx] = vec.x; + smem[smem_pitch + transpose_idx] = vec.y; +} + +template<> +__device__ __inline__ void +write_smem_transpose(const bf16_4_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + write_smem_transpose(reinterpret_cast(vec), reinterpret_cast(smem), transpose_idx, smem_pitch); +} + +template<> +__device__ __inline__ void +write_smem_transpose(const bf16_8_t& vec, __nv_bfloat16* smem, int transpose_idx, int smem_pitch) +{ + write_smem_transpose(reinterpret_cast(vec), reinterpret_cast(smem), transpose_idx, smem_pitch); +} +#endif + +template<> +__device__ __inline__ void write_smem_transpose(const float2& vec, float* smem, int transpose_idx, int smem_pitch) +{ + smem[transpose_idx] = vec.x; + smem[smem_pitch + transpose_idx] = vec.y; +} + +} // namespace mmha diff --git a/default.yaml b/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09b2aebd54031b6e080638cbed3605d0ef3fe83e --- /dev/null +++ b/default.yaml @@ -0,0 +1,45 @@ +# rich_progress_bar: +# _target_: pytorch_lightning.callbacks.RichProgressBar + +rich_model_summary: + _target_: pytorch_lightning.callbacks.RichModelSummary + +model_checkpoint: + _target_: pytorch_lightning.callbacks.ModelCheckpoint + monitor: "val/acc" # name of the logged metric which determines when model is improving + mode: "max" # can be "max" or "min" + save_top_k: 1 # save k best models (determined by above metric) + save_last: True # additionally always save model from last epoch + verbose: False + dirpath: ${oc.env:CHECKPOINT_DIR,checkpoints}/${oc.select:name,''} + filename: "epoch_{epoch:03d}" + auto_insert_metric_name: False + +early_stopping: + _target_: 
pytorch_lightning.callbacks.EarlyStopping + monitor: "val/acc" # name of the logged metric which determines when model is improving + mode: "max" # can be "max" or "min" + patience: 100 # how many epochs of not improving until training stops + min_delta: 0 # minimum change in the monitored metric needed to qualify as an improvement + +learning_rate_monitor: + _target_: pytorch_lightning.callbacks.LearningRateMonitor + logging_interval: step + +speed_monitor: + _target_: src.callbacks.speed_monitor.SpeedMonitor + intra_step_time: True + inter_step_time: True + epoch_time: True + +loss_scale_monitor: + _target_: src.callbacks.loss_scale_monitor.LossScaleMonitor + +params_log: + _target_: src.callbacks.params_log.ParamsLog + total_params_log: True + trainable_params_log: True + non_trainable_params_log: True + +gpu_affinity: + _target_: src.callbacks.gpu_affinity.GpuAffinity diff --git a/distributed.py b/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..74c55279645cd0fd687584bc1b7374c8c3c73e56 --- /dev/null +++ b/distributed.py @@ -0,0 +1,144 @@ +from typing import Optional + +import torch +from torch import Tensor +from torch.distributed import ProcessGroup + +# `all_gather_into_tensor` and `reduce_scatter_tensor` are new placeholders for +# `_all_gather_base` and `_reduce_scatter_base`. They require the most recent +# version of PyTorch. The following 4 lines are for backward compatibility with +# older PyTorch. +if "all_gather_into_tensor" not in dir(torch.distributed): + torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base +if "reduce_scatter_tensor" not in dir(torch.distributed): + torch.distributed.reduce_scatter_tensor = torch.distributed._reduce_scatter_base + + +# Raw operation, does not support autograd, but does support async +def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + output = torch.empty( + world_size * input_.shape[0], *input_.shape[1:], dtype=input_.dtype, device=input_.device + ) + handle = torch.distributed.all_gather_into_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) + return output, handle + + +# Raw operation, does not support autograd, but does support async +def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + world_size = torch.distributed.get_world_size(process_group) + assert input_.shape[0] % world_size == 0 + output = torch.empty( + input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device + ) + handle = torch.distributed.reduce_scatter_tensor( + output, input_.contiguous(), group=process_group, async_op=async_op + ) + return output, handle + + +# Raw operation, does not support autograd, but does support async +def all_reduce_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False): + input_ = input_.contiguous() + handle = torch.distributed.all_reduce(input_, group=process_group, async_op=async_op) + return input_, handle + + +class AllGatherFunc(torch.autograd.Function): + """Gather the input from sequence parallel region and concatenate.""" + + @staticmethod + def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor: + ctx.process_group = process_group + output, _ = all_gather_raw(input_, process_group) + return output + + @staticmethod + def backward(ctx, grad_output: Tensor): + grad_input, _ = reduce_scatter_raw(grad_output, ctx.process_group) + return 
grad_input, None + + +# Supports autograd, but does not support async +all_gather = AllGatherFunc.apply + + +class ReduceScatterFunc(torch.autograd.Function): + """Reduce scatter the input from the sequence parallel region and concatenate.""" + + @staticmethod + def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor: + ctx.process_group = process_group + output, _ = reduce_scatter_raw(input_, process_group) + return output + + @staticmethod + def backward(ctx, grad_output: Tensor): + grad_input, _ = all_gather_raw(grad_output, ctx.process_group) + return grad_input, None + + +# Supports autograd, but does not support async +reduce_scatter = ReduceScatterFunc.apply + + +class AllReduceFunc(torch.autograd.Function): + """Gather the input from sequence parallel region and concatenate.""" + + @staticmethod + def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor: + ctx.process_group = process_group + output, _ = all_reduce_raw(input_, process_group) + return output + + @staticmethod + def backward(ctx, grad_output: Tensor): + return grad_output, None + + +# Supports autograd, but does not support async +all_reduce = AllReduceFunc.apply + + +def sync_shared_params(model: torch.nn.Module, process_group: ProcessGroup): + # We want to iterate over parameters with _shared_params=True in the same order, + # as different ranks might have different number of parameters (e.g., only rank 0 has bias). + pamams_shared = { + name: p for name, p in model.named_parameters() if getattr(p, "_shared_params", False) + } + for _, p in sorted(pamams_shared.items()): + with torch.no_grad(): + # Broadcast needs src to be global rank, not group rank + torch.distributed.broadcast( + p, src=torch.distributed.get_global_rank(process_group, 0), group=process_group + ) + + +# Ref: https://github.com/NVIDIA/Megatron-LM/blob/52e636888cccc41e931251c417a7181fc36de926/megatron/optimizer/optimizer.py#L256 +def allreduce_sequence_parallel_grad(model: torch.nn.Module, process_group: ProcessGroup): + # We want to iterate over parameters with _sequence_parallel=True in the same order, + # as different ranks might have different number of parameters (e.g., only rank 0 has bias). + params_seqparallel = { + name: p for name, p in model.named_parameters() if getattr(p, "_sequence_parallel", False) + } + grads = [p.grad for _, p in sorted(params_seqparallel.items())] + if grads: + with torch.no_grad(): + coalesced = torch._utils._flatten_dense_tensors(grads) + torch.distributed.all_reduce(coalesced, group=process_group) + for buf, synced in zip(grads, torch._utils._unflatten_dense_tensors(coalesced, grads)): + buf.copy_(synced) + + +def get_dim_for_local_rank(dim: int, world_size: int, local_rank: int, multiple_of: int = 1) -> int: + """Get the dim for the local rank derived from splitting dim on world_size processes. + + The split may not be even across the world_size processes. + """ + multiple = dim // multiple_of + div = multiple // world_size + mod = multiple % world_size + local_multiple = div + int(local_rank < mod) + return local_multiple * multiple_of diff --git a/dropout.h b/dropout.h new file mode 100644 index 0000000000000000000000000000000000000000..4882f97d93832debd4fa51bf3a1d3f4eae916fc5 --- /dev/null +++ b/dropout.h @@ -0,0 +1,94 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "philox.cuh" +#include "utils.h" + +namespace flash { + +struct Dropout { + + const unsigned long long seed, offset; + const uint8_t p_dropout_in_uint8_t; + + __forceinline__ __device__ Dropout(const unsigned long long seed, const unsigned long long offset, + const uint8_t p_dropout_in_uint8_t, + const int bid, const int hid, const int tid, const int nheads) + : seed(seed) + , offset(offset + (bid * nheads + hid) * 32 + tid % 32) + , p_dropout_in_uint8_t(p_dropout_in_uint8_t) { + } + + template + __forceinline__ __device__ void apply_dropout(Tensor &tensor_, + int block_row_start, int block_col_start, int block_row_stride) { + // convert shape from (4, MMA_M, MMA_N) to (8, MMA_M, MMA_N / 2) + Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_dropout(tensor_.layout())); + using T = typename Engine::value_type; + auto encode_dropout = [](bool keep, T val) { + return keep ? val : (encode_dropout_in_sign_bit ? -val : T(0)); + }; + static_assert(decltype(size<2>(tensor))::value % 2 == 0); + const uint16_t p_dropout_8bit_in_uint16_t = uint16_t(p_dropout_in_uint8_t); + const uint32_t p_dropout_8bit_in_uint32_t = (uint32_t(p_dropout_8bit_in_uint16_t) << 16) | uint32_t(p_dropout_8bit_in_uint16_t); + // if (cute::thread0()) { printf("threshold2 = 0x%x\n", p_dropout_8bit_in_uint32_t); } + #pragma unroll + for (int m = 0; m < size<1>(tensor); ++m, block_row_start += block_row_stride) { + uint2 rowcol = make_uint2(block_row_start, block_col_start); + #pragma unroll + for (int n = 0; n < size<2>(tensor) / 2; ++n, ++rowcol.y) { + // if (cute::thread(32, 0)) { printf("m = %d, n = %d, row = %d, col = %d\n", m, n, int(rowcol.x), int(rowcol.y));} + uint4 random_uint4 = flash::philox(seed, reinterpret_cast(rowcol), offset); + // if (cute::thread0()) { printf("philox = %u, %d, %d, %d\n", random_uint4.x, random_uint4.y, random_uint4.z, random_uint4.w);} + uint8_t (&rnd_8)[16] = reinterpret_cast(random_uint4); + // Special implementation for 16-bit types: we duplicate the threshold to the + // low and high 16 bits of a 32-bit value, then use the f16x2 comparison instruction + // to get a mask. The low 16 bits of the mask will be either 0xffff or 0x0000, + // and the high 16 bits will be either 0xffff or 0x0000, depending on whether + // the random value is less than the threshold. + // We then do a bit-wise AND between the mask and the original value (in 32-bit). + // We're exploiting the fact that floating point comparison is equivalent to integer + // comparison, since we're comparing unsigned integers whose top 8-bits are zero. 
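+            // Added commentary (hedged, not in the original source): as a concrete example,
+            // suppose the keep-probability is encoded as the byte 204 (~0.8 * 255). A random
+            // byte 37 widens to the bit pattern 0x0025; "set.le.u32.f16x2" against the
+            // duplicated threshold 0x00CC yields a 0xFFFF half-mask, so the AND below keeps
+            // that element. A byte like 240 (0x00F0 > 0x00CC) yields 0x0000 and the element
+            // is dropped. The generic path further down compares the raw bytes directly.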
+ if (!encode_dropout_in_sign_bit + && (std::is_same::value || std::is_same::value)) { + uint16_t rnd_16[16]; + #pragma unroll + for (int i = 0; i < 16; i++) { rnd_16[i] = uint16_t(rnd_8[i]); } + uint32_t (&rnd_32)[8] = reinterpret_cast(rnd_16); + #pragma unroll + for (int j = 0; j < 2; j++) { + Tensor tensor_uint32 = recast(tensor(_, m, n * 2 + j)); + // if (cute::thread0()) { printf("random = 0x%x, 0x%x, 0x%x, 0x%x\n", rnd_32[j * 4 + 0], rnd_32[j * 4 + 1], rnd_32[j * 4 + 2], rnd_32[j * 4 + 3]); } + // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); } + #pragma unroll + for (int i = 0; i < 4; i++) { + uint32_t mask; + asm volatile("set.le.u32.f16x2 %0, %1, %2;\n" : "=r"(mask) : "r"(rnd_32[j * 4 + i]), "r"(p_dropout_8bit_in_uint32_t)); + tensor_uint32(i) &= mask; + } + // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); } + } + } else { + #pragma unroll + for (int j = 0; j < 2; j++) { + #pragma unroll + for (int i = 0; i < 8; i++) { + tensor(i, m, n * 2 + j) = encode_dropout(rnd_8[j * 8 + i] <= p_dropout_in_uint8_t, tensor(i, m, n * 2 + j)); + } + Tensor tensor_uint32 = recast(tensor(_, m, n * 2 + j)); + // if (cute::thread0()) { printf("tensor_uint32 = 0x%x, 0x%x, 0x%x, 0x%x\n", tensor_uint32(0), tensor_uint32(1), tensor_uint32(2), tensor_uint32(3)); } + } + } + // // if ((threadIdx.x == 0) && (blockIdx.x == 0) && (blockIdx.y == 0)) { + // // printf("n = %d, ph Philox: %u, %u, %u, %u\n", n, rnd_8.x, rnd_8.y, rnd_8.z, rnd_8.w); + // // } + } + } + } + +}; + +} // namespace flash diff --git a/ema.yaml b/ema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5586db26b4ff8e8484dafc89fd47716f75ca4c4 --- /dev/null +++ b/ema.yaml @@ -0,0 +1,4 @@ +ema: + _target_: src.callbacks.ema.EMACallback + decay: ??? + use_num_updates: False diff --git a/embedding.py b/embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..33587d09413dbab5edccfa3806fca829a6f9f9da --- /dev/null +++ b/embedding.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022, Tri Dao. 
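+#
+# Hedged usage sketch (added commentary; the concrete sizes are assumptions, only the
+# class and argument names come from this file):
+#
+#     emb = GPT2Embeddings(embed_dim=768, vocab_size=50257, max_position_embeddings=1024)
+#     hidden_states = emb(input_ids)        # (batch, seqlen, 768)
+#
+# When word_embed_proj_dim is given (OPT-350m style), tokens are first embedded to that
+# smaller dimension and then projected up to embed_dim by a bias-free nn.Linear.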
+ +import torch +import torch.nn as nn +from einops import rearrange +from torch import Tensor + +from flash_attn.utils.distributed import all_reduce, reduce_scatter + + +class GPT2Embeddings(nn.Module): + def __init__( + self, + embed_dim, + vocab_size, + max_position_embeddings, + padding_idx=None, + word_embed_proj_dim=None, + device=None, + dtype=None, + ): + """ + If max_position_embeddings <= 0, there's no position embeddings + If word_embe_proj_dim is not None (e.g., OPT-350m), we embed to that dimension + the project up to embed_dim + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + if word_embed_proj_dim is None: + self.word_embeddings = nn.Embedding( + vocab_size, embed_dim, padding_idx=padding_idx, **factory_kwargs + ) + self.project_in = None + else: + self.word_embeddings = nn.Embedding( + vocab_size, word_embed_proj_dim, padding_idx=padding_idx, **factory_kwargs + ) + self.project_in = nn.Linear( + word_embed_proj_dim, embed_dim, bias=False, **factory_kwargs + ) + self.max_position_embeddings = max_position_embeddings + if self.max_position_embeddings > 0: + self.position_embeddings = nn.Embedding( + max_position_embeddings, embed_dim, **factory_kwargs + ) + + def forward(self, input_ids, position_ids=None): + """ + input_ids: (batch, seqlen) + position_ids: (batch, seqlen) + """ + batch_size, seqlen = input_ids.shape + embeddings = self.word_embeddings(input_ids) + if self.project_in is not None: + embeddings = self.project_in(embeddings) + if self.max_position_embeddings > 0: + if position_ids is None: + position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + return embeddings + + +class BertEmbeddings(nn.Module): + def __init__( + self, + embed_dim, + vocab_size, + max_position_embeddings, + type_vocab_size, + padding_idx=None, + device=None, + dtype=None, + ): + """ + If max_position_embeddings <= 0, there's no position embeddings + If type_vocab_size <= 0, there's no token type embeddings + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.word_embeddings = nn.Embedding( + vocab_size, embed_dim, padding_idx=padding_idx, **factory_kwargs + ) + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + if self.max_position_embeddings > 0: + self.position_embeddings = nn.Embedding( + max_position_embeddings, embed_dim, **factory_kwargs + ) + if self.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(type_vocab_size, embed_dim, **factory_kwargs) + + def forward(self, input_ids, position_ids=None, token_type_ids=None): + """ + input_ids: (batch, seqlen) + position_ids: (batch, seqlen) + token_type_ids: (batch, seqlen) + """ + batch_size, seqlen = input_ids.shape + embeddings = self.word_embeddings(input_ids) + if self.max_position_embeddings > 0: + if position_ids is None: + position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + if self.type_vocab_size > 0: + if token_type_ids is None: + token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = embeddings + token_type_embeddings + return embeddings + + +class VocabParallelEmbedding(nn.Embedding): + def __init__(self, num_embeddings, 
*args, process_group=None, padding_idx=None, **kwargs): + self.process_group = process_group + if process_group is not None: + world_size = torch.distributed.get_world_size(process_group) + if num_embeddings % world_size != 0: + raise ValueError( + f"num_embeddings ({num_embeddings}) must be divisible by " + f"world_size ({world_size})" + ) + if world_size > 1 and padding_idx is not None: + raise RuntimeError("ParallelEmbedding does not support padding_idx") + else: + world_size = 1 + super().__init__(num_embeddings // world_size, *args, padding_idx=padding_idx, **kwargs) + + def forward(self, input: Tensor) -> Tensor: + if self.process_group is None: + return super().forward(input) + else: + rank = torch.distributed.get_rank(self.process_group) + vocab_size = self.num_embeddings + vocab_start_index, vocab_end_index = rank * vocab_size, (rank + 1) * vocab_size + # Create a mask of valid vocab ids (1 means it needs to be masked). + input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index) + input = input - vocab_start_index + input[input_ids_mask] = 0 + embeddings = super().forward(input) + embeddings[input_ids_mask] = 0.0 + return embeddings + + +class ColumnParallelEmbedding(nn.Embedding): + def __init__(self, num_embeddings, embedding_dim, *args, process_group=None, **kwargs): + self.process_group = process_group + if process_group is not None: + world_size = torch.distributed.get_world_size(process_group) + if embedding_dim % world_size != 0: + raise ValueError( + f"embedding_dim ({embedding_dim}) must be divisible by " + f"world_size ({world_size})" + ) + else: + world_size = 1 + super().__init__(num_embeddings, embedding_dim // world_size, *args, **kwargs) + + +class ParallelGPT2Embeddings(nn.Module): + def __init__( + self, + embed_dim, + vocab_size, + max_position_embeddings, + process_group, + padding_idx=None, + sequence_parallel=True, + device=None, + dtype=None, + ): + """ + If max_position_embeddings <= 0, there's no position embeddings + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.process_group = process_group + self.sequence_parallel = sequence_parallel + self.word_embeddings = VocabParallelEmbedding( + vocab_size, + embed_dim, + padding_idx=padding_idx, + process_group=process_group, + **factory_kwargs, + ) + self.max_position_embeddings = max_position_embeddings + if self.max_position_embeddings > 0: + self.position_embeddings = ColumnParallelEmbedding( + max_position_embeddings, embed_dim, process_group=process_group, **factory_kwargs + ) + + def forward(self, input_ids, position_ids=None, combine_batch_seqlen_dim=False): + """ + input_ids: (batch, seqlen) + position_ids: (batch, seqlen) + """ + batch_size, seqlen = input_ids.shape + world_size = torch.distributed.get_world_size(self.process_group) + embeddings = self.word_embeddings(input_ids) + if self.max_position_embeddings > 0: + if position_ids is None: + position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) + position_embeddings = self.position_embeddings(position_ids) + if world_size <= 1: + embeddings = embeddings + position_embeddings + else: + partition_dim = self.position_embeddings.embedding_dim + rank = torch.distributed.get_rank(self.process_group) + embeddings[ + ..., rank * partition_dim : (rank + 1) * partition_dim + ] += position_embeddings + if combine_batch_seqlen_dim: + embeddings = rearrange(embeddings, "b s d -> (b s) d") + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return embeddings if 
world_size <= 1 else reduce_fn(embeddings, self.process_group) diff --git a/epilogue_bwd_sm90_tma.hpp b/epilogue_bwd_sm90_tma.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b6741120ef5c41bb4f2bb389245d200d07923a69 --- /dev/null +++ b/epilogue_bwd_sm90_tma.hpp @@ -0,0 +1,270 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +struct CollectiveEpilogueBwd { + + using TileShape_MNK = TileShape_MNK_; + using Element = Element_; + static constexpr int NumEpilogueThreads = NumEpilogueThreads_; + static constexpr bool Varlen = Varlen_; + + using GmemTiledCopydKVTMA = cute::SM90_TMA_STORE; + + // These are for storing the output tensor without TMA (e.g., for setting output to zero) + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(get<2>(TileShape_MNK{}) % kGmemElemsPerLoad == 0, "Headdim must be a multiple of kGmemElemsPerLoad"); + static constexpr int kHeadDim = get<2>(TileShape_MNK{}); + static constexpr int kGmemThreadsPerRow = cutlass::gcd(kHeadDim / kGmemElemsPerLoad, NumEpilogueThreads); + static_assert(NumEpilogueThreads % kGmemThreadsPerRow == 0, "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopydKV = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>>{})); // Val layout, 8 or 16 vals per store + + using SmemLayoutAtomdKVTMA = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutdKVTMA = decltype(tile_to_shape(SmemLayoutAtomdKVTMA{}, select<1, 2>(TileShape_MNK{}))); + + // If we don't use TMA + static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : (kHeadDim % 32 == 0 ? 32 : 16); + static constexpr int kSwizzle = kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 
2 : 1); + using SmemLayoutAtomdKVSTG = + decltype(composition(Swizzle{}, + Layout, Int>, + Stride, _1>>{})); + + using SmemLayoutAtomdKV = std::conditional_t; + using SmemLayoutdKV = decltype(tile_to_shape(SmemLayoutAtomdKV{}, select<1, 2>(TileShape_MNK{}))); + + using SmemCopyAtomdKV = Copy_Atom; + + struct TensorStorage : cute::aligned_struct<128> { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + + using ShapedKV = cute::Shape; // (seqlen_q, d, head, batch) + using StridedKV = cute::Stride; + using LayoutdKV = cute::Layout; + + using TMA_dKV = decltype(make_tma_copy( + GmemTiledCopydKVTMA{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapedKV{}, StridedKV{}), + SmemLayoutdKVTMA{}, + select<1, 2>(TileShape_MNK{}), + _1{})); // no mcast for dKV + + // Host side kernel arguments + struct Arguments { + Element* ptr_dK; + ShapedKV const shape_dK; + StridedKV const stride_dK; + Element* ptr_dV; + StridedKV const stride_dV; + int const* cu_seqlens = nullptr; + }; + + // Device side kernel params + struct Params { + Element* ptr_dK; + ShapedKV const shape_dK; + StridedKV const stride_dK; + Element* ptr_dV; + StridedKV const stride_dV; + TMA_dKV tma_store_dK, tma_store_dV; + int const* cu_seqlens = nullptr; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + if constexpr (Varlen) { + assert (args.cu_seqlens != nullptr); + } + Tensor mdK = make_tensor(make_gmem_ptr(args.ptr_dK), args.shape_dK, args.stride_dK); + Tensor mdV = make_tensor(make_gmem_ptr(args.ptr_dV), args.shape_dK, args.stride_dV); + TMA_dKV tma_store_dK = make_tma_copy( + GmemTiledCopydKVTMA{}, + mdK, + SmemLayoutdKVTMA{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for dKV + TMA_dKV tma_store_dV = make_tma_copy( + GmemTiledCopydKVTMA{}, + mdV, + SmemLayoutdKVTMA{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for dKV + return {args.ptr_dK, args.shape_dK, args.stride_dK, args.ptr_dV, args.stride_dV, + tma_store_dK, tma_store_dV, args.cu_seqlens}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& params) { + if constexpr (!Varlen) { + cute::prefetch_tma_descriptor(params.tma_store_dK.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_store_dV.get_tma_descriptor()); + } + } + + template + CUTLASS_DEVICE void + store(Params const& params, + FrgTensorO const& tdKrdK, + FrgTensorO const& tdVrdV, + SharedStorage& shared_storage, + TiledMma tiled_mma, + int thread_idx, + cute::tuple const& block_coord + ) { + + auto [n_block, bidh, bidb] = block_coord; + Tensor sdK = make_tensor(make_smem_ptr(shared_storage.epilogue.smem_dk.data()), SmemLayoutdKV{}); + Tensor sdV = make_tensor(make_smem_ptr(shared_storage.epilogue.smem_dv.data()), SmemLayoutdKV{}); + auto smem_tiled_copy_dKV = make_tiled_copy_C(SmemCopyAtomdKV{}, tiled_mma); + auto smem_thr_copy_dKV = smem_tiled_copy_dKV.get_thread_slice(thread_idx); + + Tensor tdVrdV_out = flash::convert_type(tdVrdV); + Tensor tdKrdK_out = flash::convert_type(tdKrdK); + Tensor taccdKrdK = smem_thr_copy_dKV.retile_S(tdKrdK_out); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(tdVrdV_out); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(sdK); // ((Atom,AtomNum),PIPE_M,PIPE_N) + Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(sdV); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Make sure all WGs have finished reading K and V + + 
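// Added commentary (hedged): the KVEmpty named barrier below ensures every MMA warpgroup
+      // is done reading the K/V tiles before dK/dV are written to shared memory (the comment
+      // above suggests that smem space is reused). The async fence plus EpilogueBarrier then
+      // hand the tiles to a single warp that issues the TMA stores on the non-varlen path;
+      // the varlen path instead copies smem -> registers -> gmem with predicated stores.
+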
cutlass::arch::NamedBarrier::sync(NumEpilogueThreads, static_cast(BwdNamedBarriers::KVEmpty) /*id*/); + cute::copy(smem_tiled_copy_dKV, taccdVrdV, taccdVsdV); + cute::copy(smem_tiled_copy_dKV, taccdKrdK, taccdKsdK); + if constexpr (!Varlen) { + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumEpilogueThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + + Tensor mdK = params.tma_store_dK.get_tma_tensor(params.shape_dK); + Tensor mdV = params.tma_store_dV.get_tma_tensor(params.shape_dK); + Tensor gdK = local_tile(mdK(_, _, bidh, bidb), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + Tensor gdV = local_tile(mdV(_, _, bidh, bidb), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + auto block_tma_dK = params.tma_store_dK.get_slice(_0{}); + auto block_tma_dV = params.tma_store_dV.get_slice(_0{}); + Tensor tdKgdK = block_tma_dK.partition_D(gdK); // (TMA, TMA_M, TMA_K) + Tensor tdKsdK = block_tma_dK.partition_S(sdK); // (TMA, TMA_M, TMA_K) + Tensor tdVgdV = block_tma_dV.partition_D(gdV); // (TMA, TMA_M, TMA_K) + Tensor tdVsdV = block_tma_dV.partition_S(sdV); // (TMA, TMA_M, TMA_K) + int warp_idx_sync = __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0); + if (warp_idx_sync == NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) { + cutlass::arch::NamedBarrier::sync(NumEpilogueThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + int const lane_predicate = cute::elect_one_sync(); + if (lane_predicate) { + cute::copy(params.tma_store_dV, tdVsdV, tdVgdV); + cute::copy(params.tma_store_dK, tdKsdK, tdKgdK); + tma_store_arrive(); + } + } + + } else { + cutlass::arch::NamedBarrier::sync(NumEpilogueThreads, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + bool const is_varlen = params.cu_seqlens != nullptr; + int const offset = !is_varlen ? 0 : params.cu_seqlens[bidb]; + int const seqlen = !is_varlen ? get<0>(params.shape_dK) : params.cu_seqlens[bidb + 1] - params.cu_seqlens[bidb]; + + Tensor mdK = make_tensor(make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdK = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdK), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + Tensor mdV = make_tensor(make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdV = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdV), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + + GmemTiledCopydKV gmem_tiled_copy_dKV; + auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx); + Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV); + Tensor tdKVsdV = gmem_thr_copy_dKV.partition_S(sdV); // (TMA, TMA_M, TMA_K) + Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK); + Tensor tdKVsdK = gmem_thr_copy_dKV.partition_S(sdK); // (TMA, TMA_M, TMA_K) + Tensor tdKVrdV = make_fragment_like(tdKVgdV); + Tensor tdKVrdK = make_fragment_like(tdKVgdK); + cute::copy(gmem_tiled_copy_dKV, tdKVsdV, tdKVrdV); + cute::copy(gmem_tiled_copy_dKV, tdKVsdK, tdKVrdK); + // Construct identity layout for gdKV + Tensor cdKV = cute::make_identity_tensor(select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV); + Tensor tdKVpdKV = make_tensor(make_shape(size<2>(tdKVgdV))); + #pragma unroll + for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK); } + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_dKV, tdKVrdV, tdKVgdV, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + flash::copy( + gmem_tiled_copy_dKV, tdKVrdK, tdKVgdK, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + } + } + + CUTLASS_DEVICE void + store_tail() { + if constexpr (!Varlen) { tma_store_wait<0>(); } + } + + // Write 0 to dK and dV + CUTLASS_DEVICE void + store_zero( + Params const& params, + int thread_idx, + cute::tuple const& block_coord + ) { + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + auto [n_block, bidh, bidb] = block_coord; + bool const is_varlen = Varlen && params.cu_seqlens != nullptr; + int const offset = !is_varlen ? 0 : params.cu_seqlens[bidb]; + int const seqlen = !is_varlen ? get<0>(params.shape_dK) : params.cu_seqlens[bidb + 1] - offset; + + Tensor mdK = make_tensor(make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdK = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdK), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + Tensor mdV = make_tensor(make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdV = local_tile(cute::domain_offset(make_coord(offset, _0{}), mdV), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (M, K) + + GmemTiledCopydKV gmem_tiled_copy_dKV; + auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx); + Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK); + Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV); + Tensor tdKVrdKV = make_fragment_like(tdKVgdK); + clear(tdKVrdKV); + // Construct identity layout for gdKV + Tensor cdKV = cute::make_identity_tensor(select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV); + Tensor tdKVpdKV = make_tensor(make_shape(size<2>(tdKVgdK))); + #pragma unroll + for (int k = 0; k < size(tdKVpdKV); ++k) { tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK); } + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_dKV, tdKVrdKV, tdKVgdK, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + flash::copy( + gmem_tiled_copy_dKV, tdKVrdKV, tdKVgdV, tdKVcdKV, tdKVpdKV, seqlen - n_block * kBlockN + ); + } + +}; + +} // namespace flash diff --git a/epilogue_fwd_sm90_tma.hpp b/epilogue_fwd_sm90_tma.hpp new file mode 100644 index 0000000000000000000000000000000000000000..5133c55fc1e7e93629bd13833ff6235921cec239 --- /dev/null +++ b/epilogue_fwd_sm90_tma.hpp @@ -0,0 +1,296 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "utils.h" + +namespace flash { + +using namespace cute; + +// template +template +struct CollectiveEpilogueFwd { + + using Element = typename Ktraits::OutputType; + static constexpr int kBlockM = Ktraits::kBlockM; + static constexpr int kBlockN = Ktraits::kBlockN; + static constexpr int kHeadDim = Ktraits::kHeadDim; + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kNWarps = Ktraits::kNWarps; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr bool Is_WS = kNWarps >= 12; + + static constexpr int NumCopyThreads = !Is_WS ? 
0 : cutlass::NumThreadsPerWarpGroup; + static constexpr int NumMmaThreads = kNThreads - NumCopyThreads; + + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + + using SmemCopyAtomO = Copy_Atom; + using SharedStorage = cute::array_aligned>; + + using GmemTiledCopyOTMA = cute::SM90_TMA_STORE; + using TMA_O = decltype(make_tma_copy( + GmemTiledCopyOTMA{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + typename Seqlen_traits::ShapeT{}, + typename Seqlen_traits::StrideT{} + ), + SmemLayoutO{}, + select<0, 2>(TileShape_MNK{}), + _1{})); // no mcast for O + + // These are for storing the output tensor without TMA (e.g., for setting output to zero and var-seq-len) + static constexpr int kNumVecElem = ceil_div(128, sizeof_bits_v); + static_assert(kHeadDim % kNumVecElem == 0); + static constexpr int kNumThreadsPerRow = kHeadDim / kNumVecElem; + static_assert(NumMmaThreads % kNumThreadsPerRow == 0); + static constexpr int kNumRows = NumMmaThreads / kNumThreadsPerRow; + using TiledCopyOAtom = cute::Copy_Atom, Element>; + using TiledCopyOThrLayout = decltype(cute::make_layout( + cute::make_shape(Int{}, Int{}), + LayoutRight{})); + using TiledCopyOValLayout = decltype(cute::make_layout( + cute::make_shape(_1{}, Int{}), + LayoutRight{})); + using TiledCopyO = decltype(make_tiled_copy( + TiledCopyOAtom{}, + TiledCopyOThrLayout{}, // Thr layout + TiledCopyOValLayout{} // Val layout + )); + + // used for rmem -> smem O copy in fp8 kernel to undo column permutation + using ThreadLayoutrO = Layout, _4, _1>, + Stride<_4, _32, _1, _0>>; + using ValueLayoutrO = Layout, Int>, + Stride<_0, _2, Stride<_4, _1>, _8>>; + using TiledCopyrO = decltype(make_tiled_copy(Copy_Atom, Element>{}, + ThreadLayoutrO{}, ValueLayoutrO{})); + using TiledCopyShaperO = Shape<_8, Int, _16, Int>; + using SmemLayoutrO = decltype(composition(SmemLayoutO{}, Layout{})); + + // Host side kernel arguments + struct Arguments { + Element* ptr_O; + typename Seqlen_traits::LayoutT const layout_O; + float* ptr_LSE; + typename Seqlen_traits::LayoutLseT const layout_LSE; + }; + + // Device side kernel params + struct Params { + Element* ptr_O; + typename Seqlen_traits::LayoutT const layout_O; + float* ptr_LSE; + typename Seqlen_traits::LayoutLseT const layout_LSE; + TMA_O tma_store_O; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + Tensor mO = make_tensor(make_gmem_ptr(args.ptr_O), args.layout_O); + TMA_O tma_store_O = make_tma_copy( + GmemTiledCopyOTMA{}, + mO, + SmemLayoutO{}, + select<0, 2>(TileShape_MNK{}), + _1{}); // no mcast for O + return {args.ptr_O, args.layout_O, args.ptr_LSE, args.layout_LSE, tma_store_O}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& epilogue_params) { + if constexpr (!Seqlen_traits::kUseVarSeqLen) { + cute::prefetch_tma_descriptor(epilogue_params.tma_store_O.get_tma_descriptor()); + } + } + + template + CUTLASS_DEVICE void + store(Params const& epilogue_params, + FrgTensorO const& tOrO, + FrgTensorLSE const& lse, + SharedStorage& shared_storage, + TiledMma tiled_mma, + int thread_idx, + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q + ) { + + auto [m_block, bidh, bidb] = block_coord; + Tensor sO = 
make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{}); + auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma); + auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor tOrO_out = flash::convert_type(tOrO); + Tensor taccOrO = smem_thr_copy_O.retile_S(tOrO_out); // ((Atom,AtomNum), MMA_M, MMA_N) + Tensor taccOsO = smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + // Make sure all WGs have finished reading V + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); + cute::copy(smem_tiled_copy_O, taccOrO, taccOsO); + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb)(_, m_block); + Tensor caccO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor taccOcO = thread_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) + static_assert(decltype(size<0, 0>(taccOcO))::value == 2); + static_assert(decltype(size<0, 1>(taccOcO))::value == 2); + // taccOcO has shape ((2, 2, V), MMA_M, MMA_K), we only take only the row indices. + Tensor taccOcO_row = taccOcO(make_coord(_0{}, _, _0{}), _, _0{}); + CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M + if (get<1>(taccOcO_row(_0{})) == 0) { + #pragma unroll + for (int mi = 0; mi < size(lse); ++mi) { + const int row = get<0>(taccOcO_row(mi)); + if (row < seqlen_traits_q.actual_seq_len - m_block * kBlockM) { gLSE(row) = lse(mi); } + } + } + + int write_warp_idx = kNWarps - 1; + if (cutlass::canonical_warp_idx_sync() == write_warp_idx) { + cutlass::arch::NamedBarrier::sync( + NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier + ); + } + TiledCopyO gmem_tiled_copy_O; + flash::write_O( + epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, + epilogue_params.layout_O, select<0, 2>(TileShape_MNK{}), sO, + m_block, bidh, bidb, seqlen_traits_q, write_warp_idx + ); + } + + template + CUTLASS_DEVICE void + store_fp8(Params const& epilogue_params, + FrgTensorO const& tOrO, + FrgTensorLSE const& lse, + SharedStorage& shared_storage, + TiledMma tiled_mma, + int thread_idx, + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q + ) { + // using SmemLayoutrO = typename Ktraits::SmemLayoutrO; + // using TiledCopyrO = typename Ktraits::TiledCopyrO; + auto [m_block, bidh, bidb] = block_coord; + + TiledCopyrO rmem_tiled_copy_O; + Tensor sOacc = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutrO{}); + auto rmem_thr_copy_O = rmem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor taccOsO = rmem_thr_copy_O.partition_D(sOacc); + Tensor tOrO_out = flash::convert_type(tOrO); // Element is Ktraits::OutputType + Tensor taccOrO = make_tensor(tOrO_out.data(), shape(taccOsO)); + + // Make sure all WGs have finished reading V + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::ValueEmpty) /*id*/); + cute::copy(rmem_tiled_copy_O, taccOrO, taccOsO); + cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + 
cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb)(_, m_block); + Tensor caccO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); + auto thread_mma = tiled_mma.get_thread_slice(thread_idx); + Tensor taccOcO = thread_mma.partition_C(caccO); // (MMA,MMA_M,MMA_K) + static_assert(decltype(size<0, 0>(taccOcO))::value == 2); + static_assert(decltype(size<0, 1>(taccOcO))::value == 2); + // taccOcO has shape ((2, 2, V), MMA_M, MMA_K), we only take only the row indices. + Tensor taccOcO_row = taccOcO(make_coord(_0{}, _, _0{}), _, _0{}); + CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M + int const seqlen_q = [&] { + if constexpr(Seqlen_traits::kUseVarSeqLen) { return seqlen_traits_q.actual_seq_len; } + else { return shape<2>(epilogue_params.layout_LSE); } + }(); + if (get<1>(taccOcO_row(_0{})) == 0) { + #pragma unroll + for (int mi = 0; mi < size(lse); ++mi) { + const int row = get<0>(taccOcO_row(mi)); + if (row < seqlen_q - m_block * kBlockM) { gLSE(row) = lse(mi); } + } + } + + int write_warp_idx = kNWarps - 1; + if (cutlass::canonical_warp_idx_sync() == write_warp_idx) { + cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp, + cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); + } + TiledCopyO gmem_tiled_copy_O; + Tensor sO = make_tensor(make_smem_ptr(shared_storage.smem_o.data()), SmemLayoutO{}); + flash::write_O( + epilogue_params.ptr_O, epilogue_params.tma_store_O, gmem_tiled_copy_O, + epilogue_params.layout_O, select<0, 2>(TileShape_MNK{}), sO, + m_block, bidh, bidb, seqlen_traits_q, write_warp_idx + ); + } + + CUTLASS_DEVICE void + store_tail() { + tma_store_wait<0>(); + } + + // Write 0 to output and -inf to LSE + template + CUTLASS_DEVICE void + store_zero( + Params const& epilogue_params, + SharedStorage& shared_storage, + int thread_idx, + cute::tuple const& block_coord, + const Seqlen_traits& seqlen_traits_q + ) { + auto [m_block, bidh, bidb] = block_coord; + Tensor mO = make_tensor(make_gmem_ptr(epilogue_params.ptr_O), epilogue_params.layout_O); + Tensor gO = seqlen_traits_q.get_local_tile_tensor( + mO, select<0, 2>(TileShape_MNK{}), bidh, bidb + )(_, _, m_block); // (M, K) + Tensor mLSE = make_tensor(make_gmem_ptr(epilogue_params.ptr_LSE), epilogue_params.layout_LSE); + Tensor gLSE = seqlen_traits_q.get_lse_local_tile_tensor( + mLSE, Shape>{}, bidh, bidb)(_, m_block); + + TiledCopyO gmem_tiled_copy_O; + auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx); + Tensor tOgO = gmem_thr_copy_O.partition_D(gO); + Tensor tOrO = make_fragment_like(tOgO); + clear(tOrO); + // Construct identity layout for sO + Tensor cO = cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tOcO = gmem_thr_copy_O.partition_D(cO); + Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); + #pragma unroll + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(epilogue_params.layout_O.shape()); } + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_O, tOrO, tOgO, tOcO, tOpO, get<0>(epilogue_params.layout_O.shape()) - m_block * kBlockM + ); + static_assert(kBlockM <= NumMmaThreads); + if (thread_idx < get<0>(epilogue_params.layout_LSE.shape()) 
- m_block * kBlockM) { gLSE(thread_idx) = -INFINITY; } + } + +}; + +} // namespace flash diff --git a/exp.yaml b/exp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..032aaa9432899c83d41d292a07c0eb3c3edb7c7e --- /dev/null +++ b/exp.yaml @@ -0,0 +1,17 @@ +# @package _global_ + +# run in experiment mode with: +# `python run.py mode=exp name=experiment_name` + +experiment_mode: True + +# allows for custom naming of the experiment +name: ??? + +hydra: + # sets output paths for all file logs to `logs/experiment/name' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/experiments/${name} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/experiments/${name} + subdir: ${hydra.job.num} diff --git a/falcon.py b/falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..4b02ec7727740eaa9ca70a7f0ca64df94fff4c3a --- /dev/null +++ b/falcon.py @@ -0,0 +1,143 @@ +# Copyright (c) 2023, Tri Dao. + +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from einops import rearrange +from transformers import FalconConfig, GPT2Config + + +def remap_state_dict_hf_falcon(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^transformer.h.", "transformer.layers.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + # Word embedding + def key_mapping_emb(key): + return re.sub( + r"^transformer.word_embeddings.", "transformer.embeddings.word_embeddings.", key + ) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
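+        # Added commentary (hedged, illustrative numbers): with vocab_size=50257 and
+        # pad_vocab_size_multiple=8 the padded size is 50264, so 7 zero rows are appended
+        # to lm_head.weight (and 7 zeros to lm_head.bias below) so it matches the padded
+        # word-embedding matrix.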
+ state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + output_embeddings_bias = state_dict.pop("lm_head.bias") + state_dict["lm_head.bias"] = F.pad( + output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + key = re.sub(r"^transformer.layers.(\d+).ln_attn.", r"transformer.layers.\1.norm1.", key) + key = re.sub(r"^transformer.layers.(\d+).ln_mlp.", r"transformer.layers.\1.norm2.", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).self_attention.query_key_value.", + r"transformer.layers.\1.mixer.Wqkv.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attention.dense.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", 1) + headdim = config.hidden_size // n_head + for l in range(config.n_layer): + # The weights are stored in a different layout compared to our implementation + Wqkv = rearrange( + state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight"), + "(group ratio headdim) ... -> group ratio headdim ...", + ratio=n_head // n_head_kv + 2, + headdim=headdim, + ) + Wq = rearrange(Wqkv[:, :-2], "group ratio headdim ... -> (group ratio headdim) ...") + Wk = rearrange(Wqkv[:, [-2]], "group ratio headdim ... -> (group ratio headdim) ...") + Wv = rearrange(Wqkv[:, [-1]], "group ratio headdim ... -> (group ratio headdim) ...") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + + return state_dict + + +def falcon_config_to_gpt2_config(falcon_config: FalconConfig) -> GPT2Config: + # The 40b config uses "n_head_kv" instead of "num_kv_heads" + n_head_kv = getattr( + falcon_config, + "n_head_kv", + 1 if getattr(falcon_config, "multi_query", False) else falcon_config.n_head, + ) + # HACK: the 40b config has 2 LN per layer instead of 1, but that's not reflected in the config. 
+ # So we have to infer it from the number of heads in the key/value block + parallel_block_tied_norm = n_head_kv == 1 + return GPT2Config( + vocab_size=falcon_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=falcon_config.hidden_size, + n_layer=falcon_config.n_layer, + n_head=falcon_config.n_head, + n_inner=falcon_config.hidden_size * 4, + activation_function="gelu", + resid_pdrop=falcon_config.hidden_dropout, + embd_pdrop=0.0, # There doesn't seem to be any embedding dropout + attn_pdrop=falcon_config.attention_dropout, + layer_norm_epsilon=falcon_config.layer_norm_epsilon, + initializer_range=falcon_config.initializer_range, + bos_token_id=falcon_config.bos_token_id, + eos_token_id=falcon_config.eos_token_id, + # These are new arguments not in the original GPT2Config + parallel_block=falcon_config.parallel_attn, + n_head_kv=n_head_kv, + parallel_block_tied_norm=parallel_block_tied_norm, + rotary_emb_fraction=1.0, + rotary_emb_interleaved=False, + tie_word_embeddings=True, + qkv_proj_bias=falcon_config.bias, + out_proj_bias=falcon_config.bias, + mlp_fc1_bias=falcon_config.bias, + mlp_fc2_bias=falcon_config.bias, + lm_head_bias=False, + ) diff --git a/flash.h b/flash.h new file mode 100644 index 0000000000000000000000000000000000000000..c8ba8f22cf2b335c13f7a665d7f62a0621ef9426 --- /dev/null +++ b/flash.h @@ -0,0 +1,184 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include + +#include "cutlass/fast_math.h" // For cutlass::FastDivmod + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Qkv_params { + using index_t = int64_t; + // The QKV matrices. + void *__restrict__ q_ptr; + void *__restrict__ k_ptr; + void *__restrict__ v_ptr; + + // The stride between rows of the Q, K and V matrices. + index_t q_batch_stride; + index_t k_batch_stride; + index_t v_batch_stride; + index_t q_row_stride; + index_t k_row_stride; + index_t v_row_stride; + index_t q_head_stride; + index_t k_head_stride; + index_t v_head_stride; + + // The number of heads. + int h, h_k; + // In the case of multi-query and grouped-query attention (MQA/GQA), nheads_k could be + // different from nheads (query). + int h_h_k_ratio; // precompute h / h_k, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Flash_fwd_params : public Qkv_params { + + // The O matrix (output). + void * __restrict__ o_ptr; + void * __restrict__ oaccum_ptr; + + // The stride between rows of O. + index_t o_batch_stride; + index_t o_row_stride; + index_t o_head_stride; + + // The pointer to the P matrix. + void * __restrict__ p_ptr; + + // The pointer to the softmax sum. + void * __restrict__ softmax_lse_ptr; + void * __restrict__ softmax_lseaccum_ptr; + + // The dimensions. + int b, seqlen_q, seqlen_k, seqlen_knew, d, seqlen_q_rounded, seqlen_k_rounded, d_rounded, rotary_dim, total_q, total_k; + + // The scaling factors for the kernel. + float scale_softmax; + float scale_softmax_log2; + uint32_t scale_softmax_log2_half2; + + // array of length b+1 holding starting offset of each sequence. + int * __restrict__ cu_seqlens_q; + int * __restrict__ cu_seqlens_k; + + // If provided, the actual length of each k sequence. + int * __restrict__ seqused_k; + + int *__restrict__ blockmask; + + // The K_new and V_new matrices. 
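+    // (Note, inferred from the surrounding fields rather than from the kernels: these hold new
+    //  key/value tokens to be appended to an existing KV cache; their strides below are in elements
+    //  and mirror the Q/K/V strides above, so element (b, s, h, d) of K_new would sit at offset
+    //  b * knew_batch_stride + s * knew_row_stride + h * knew_head_stride + d from knew_ptr.)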
+ void * __restrict__ knew_ptr; + void * __restrict__ vnew_ptr; + + // The stride between rows of the Q, K and V matrices. + index_t knew_batch_stride; + index_t vnew_batch_stride; + index_t knew_row_stride; + index_t vnew_row_stride; + index_t knew_head_stride; + index_t vnew_head_stride; + + // The cos and sin matrices for rotary embedding. + void * __restrict__ rotary_cos_ptr; + void * __restrict__ rotary_sin_ptr; + + // The indices to index into the KV cache. + int * __restrict__ cache_batch_idx; + + // Paged KV cache + int * __restrict__ block_table; + index_t block_table_batch_stride; + int page_block_size; + + // The dropout probability (probability of keeping an activation). + float p_dropout; + // uint32_t p_dropout_in_uint; + // uint16_t p_dropout_in_uint16_t; + uint8_t p_dropout_in_uint8_t; + + // Scale factor of 1 / (1 - p_dropout). + float rp_dropout; + float scale_softmax_rp_dropout; + + // Local window size + int window_size_left, window_size_right; + + // Pointer to the RNG seed (idx 0) and offset (idx 1). + uint64_t * rng_state; + + bool is_bf16; + bool is_e4m3; + bool is_causal; + + // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]. + // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K. + bool is_seqlens_k_cumulative; + + bool is_rotary_interleaved; + + int num_splits; // For split-KV version + + void * __restrict__ alibi_slopes_ptr; + index_t alibi_slopes_batch_stride; + + bool unpadded_lse; // For varlen paths: LSE is in [nheads, total_seqlen_q] format instead of [b, nheads, seqlen_q]. + + int * __restrict__ tile_count_semaphore; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct Flash_bwd_params : public Flash_fwd_params { + + // The dO and dQKV matrices. + void *__restrict__ do_ptr; + void *__restrict__ dq_ptr; + void *__restrict__ dk_ptr; + void *__restrict__ dv_ptr; + + // To accumulate dQ + void *__restrict__ dq_accum_ptr; + void *__restrict__ dk_accum_ptr; + void *__restrict__ dv_accum_ptr; + + // // To accumulate dK and dV in case we're splitting the bwd along seqlen_q + // dimension void *__restrict__ dk_accum_ptr; void *__restrict__ + // dv_accum_ptr; + + // The stride between rows of the dO, dQ, dK and dV matrices. + // TD [2022-04-16]: We're using 32-bit indexing to save registers. + // The code probably won't work for arrays larger than 2GB. + index_t do_batch_stride; + index_t do_row_stride; + index_t do_head_stride; + index_t dq_batch_stride; + index_t dk_batch_stride; + index_t dv_batch_stride; + index_t dq_row_stride; + index_t dk_row_stride; + index_t dv_row_stride; + index_t dq_head_stride; + index_t dk_head_stride; + index_t dv_head_stride; + + // The pointer to the softmax d sum. 
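+    // (As in the FlashAttention backward derivation: this stores D = rowsum(dO * O) per query row,
+    //  which enters the softmax gradient as dS = P * (dP - D) elementwise, with dP = dO V^T.)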
+    void *__restrict__ dsoftmax_sum;
+    void *__restrict__ softmax_lse_log2_ptr;
+
+    int *__restrict__ dq_semaphore;
+
+    bool deterministic;
+    index_t dq_accum_split_stride;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, int Headdim> void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
+template<typename T, int Headdim> void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
diff --git a/flash2_a100_fwd_bwd_benchmark.png b/flash2_a100_fwd_bwd_benchmark.png
new file mode 100644
index 0000000000000000000000000000000000000000..f529197bec8d885219f5cb44533c7ee2db50d667
Binary files /dev/null and b/flash2_a100_fwd_bwd_benchmark.png differ
diff --git a/flash2_h100_fwd_bwd_benchmark.png b/flash2_h100_fwd_bwd_benchmark.png
new file mode 100644
index 0000000000000000000000000000000000000000..41779e0afd3144bfed055b29aaddcc305d6e94c8
Binary files /dev/null and b/flash2_h100_fwd_bwd_benchmark.png differ
diff --git a/flash3_fp16_fwd.png b/flash3_fp16_fwd.png
new file mode 100644
index 0000000000000000000000000000000000000000..403d13944d1b884890d8d37c45f946f8d89848e6
Binary files /dev/null and b/flash3_fp16_fwd.png differ
diff --git a/flash_api.cpp b/flash_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b12e93505d83bbec32a4ec270429901e61e735e
--- /dev/null
+++ b/flash_api.cpp
@@ -0,0 +1,952 @@
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+// Include these 2 headers instead of torch/extension.h since we don't need all of the torch headers.
+#include <torch/python.h>
+#include <torch/nn/functional.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cutlass/numeric_types.h>
+
+#include "flash.h"
+#include "static_switch.h"
+
+#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+
+void set_params_fprop(Flash_fwd_params &params,
+                      // sizes
+                      const size_t b,
+                      const size_t seqlen_q,
+                      const size_t seqlen_k,
+                      const size_t seqlen_q_rounded,
+                      const size_t seqlen_k_rounded,
+                      const size_t h,
+                      const size_t h_k,
+                      const size_t d,
+                      const size_t d_rounded,
+                      // device pointers
+                      const at::Tensor q,
+                      const at::Tensor k,
+                      const at::Tensor v,
+                      at::Tensor out,
+                      void *cu_seqlens_q_d,
+                      void *cu_seqlens_k_d,
+                      void *seqused_k,
+                      void *p_d,
+                      void *softmax_lse_d,
+                      float p_dropout,
+                      float softmax_scale,
+                      int window_size_left,
+                      int window_size_right,
+                      bool seqlenq_ngroups_swapped=false,
+                      bool unpadded_lse=false) {
+
+    // Reset the parameters
+    params = {};
+
+    params.is_bf16 = q.dtype() == torch::kBFloat16;
+    params.is_e4m3 = q.dtype() == torch::kFloat8_e4m3fn;
+
+    // Set the pointers and strides.
+    params.q_ptr = q.data_ptr();
+    params.k_ptr = k.data_ptr();
+    params.v_ptr = v.data_ptr();
+    // All strides are in elements, not bytes.
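+    // (e.g. for a contiguous (batch, seqlen, nheads, headdim) tensor, stride(-3) == nheads * headdim,
+    //  stride(-2) == headdim and stride(-1) == 1, which is why the callers below require the last
+    //  dimension to be contiguous.)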
+ params.q_row_stride = q.stride(-3); + params.k_row_stride = k.stride(-3); + params.v_row_stride = v.stride(-3); + params.q_head_stride = q.stride(-2); + params.k_head_stride = k.stride(-2); + params.v_head_stride = v.stride(-2); + params.o_ptr = out.data_ptr(); + params.o_row_stride = out.stride(-3); + params.o_head_stride = out.stride(-2); + + if (cu_seqlens_q_d == nullptr) { + params.q_batch_stride = q.stride(0); + params.k_batch_stride = k.stride(0); + params.v_batch_stride = v.stride(0); + params.o_batch_stride = out.stride(0); + if (seqlenq_ngroups_swapped) { + params.q_batch_stride *= seqlen_q; + params.o_batch_stride *= seqlen_q; + } + } + + params.cu_seqlens_q = static_cast(cu_seqlens_q_d); + params.cu_seqlens_k = static_cast(cu_seqlens_k_d); + params.seqused_k = static_cast(seqused_k); + + TORCH_CHECK( + bool(params.cu_seqlens_q) == bool(params.cu_seqlens_k), + "cu_seqlens_q and cu_seqlens_k must be both null or non-null" + ); + + // P = softmax(QK^T) + params.p_ptr = p_d; + + // Softmax sum + params.softmax_lse_ptr = softmax_lse_d; + + // Set the dimensions. + params.b = b; + params.h = h; + params.h_k = h_k; + params.h_h_k_ratio = h / h_k; + params.seqlen_q = seqlen_q; + params.seqlen_k = seqlen_k; + params.seqlen_q_rounded = seqlen_q_rounded; + params.seqlen_k_rounded = seqlen_k_rounded; + params.d = d; + params.d_rounded = d_rounded; + + // Set the different scale values. + params.scale_softmax = softmax_scale; + params.scale_softmax_log2 = softmax_scale * M_LOG2E; + __half scale_softmax_log2_half = __float2half(params.scale_softmax_log2); + __half2 scale_softmax_log2_half2 = __half2(scale_softmax_log2_half, scale_softmax_log2_half); + params.scale_softmax_log2_half2 = reinterpret_cast(scale_softmax_log2_half2); + + // Set this to probability of keeping an element to simplify things. + params.p_dropout = 1.f - p_dropout; + // Convert p from float to int so we don't have to convert the random uint to float to compare. + // [Minor] We want to round down since when we do the comparison we use <= instead of < + // params.p_dropout_in_uint = uint32_t(std::floor(params.p_dropout * 4294967295.0)); + // params.p_dropout_in_uint16_t = uint16_t(std::floor(params.p_dropout * 65535.0)); + params.p_dropout_in_uint8_t = uint8_t(std::floor(params.p_dropout * 255.0)); + params.rp_dropout = 1.f / params.p_dropout; + params.scale_softmax_rp_dropout = params.rp_dropout * params.scale_softmax; + TORCH_CHECK(p_dropout < 1.f); + #ifdef FLASHATTENTION_DISABLE_DROPOUT + TORCH_CHECK(p_dropout == 0.0f, "This flash attention build does not support dropout."); + #endif + + // Causal is the special case where window_size_right == 0 and window_size_left < 0. + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
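+    // (Illustrative: (-1, -1) is full attention, (-1, 0) is causal, and (256, 0) is a causal sliding
+    //  window where each query attends to itself and at most the 256 previous keys; a negative value
+    //  means "unlimited", and when only one side is negative it is replaced by seqlen_k just below.)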
+ params.is_causal = window_size_left < 0 && window_size_right == 0; + + if (window_size_left < 0 && window_size_right >= 0) { window_size_left = seqlen_k; } + if (window_size_left >= 0 && window_size_right < 0) { window_size_right = seqlen_k; } + params.window_size_left = window_size_left; + params.window_size_right = window_size_right; + + #ifdef FLASHATTENTION_DISABLE_LOCAL + TORCH_CHECK(params.is_causal || (window_size_left < 0 && window_size_right < 0), + "This flash attention build does not support local attention."); + #endif + + params.is_seqlens_k_cumulative = true; + + #ifdef FLASHATTENTION_DISABLE_UNEVEN_K + TORCH_CHECK(d == d_rounded, "This flash attention build does not support headdim not being a multiple of 32."); + #endif + + params.unpadded_lse = unpadded_lse; +} + +void set_params_dgrad(Flash_bwd_params ¶ms, + // sizes + const size_t b, + const size_t seqlen_q, + const size_t seqlen_k, + const size_t seqlen_q_rounded, + const size_t seqlen_k_rounded, + const size_t h, + const size_t h_k, + const size_t d, + const size_t d_rounded, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + const at::Tensor out, + const at::Tensor dout, + at::Tensor dq, + at::Tensor dk, + at::Tensor dv, + void *cu_seqlens_q_d, + void *cu_seqlens_k_d, + void *dq_accum_d, + void *dk_accum_d, + void *dv_accum_d, + void *softmax_lse_d, + void *dsoftmax_sum_d, + float p_dropout, + float softmax_scale, + int window_size_left, + int window_size_right, + bool deterministic) { + + set_params_fprop(params, + b, seqlen_q, seqlen_k, seqlen_q_rounded, seqlen_k_rounded, h, h_k, d, d_rounded, + q, k, v, out, + cu_seqlens_q_d, + cu_seqlens_k_d, + nullptr, + nullptr, + softmax_lse_d, + p_dropout, + softmax_scale, + window_size_left, + window_size_right); + + // Set the pointers and strides. 
+ params.do_ptr = dout.data_ptr(); + params.do_row_stride = dout.stride(-3); + params.do_head_stride = dout.stride(-2); + params.dq_ptr = dq.data_ptr(); + params.dk_ptr = dk.data_ptr(); + params.dv_ptr = dv.data_ptr(); + params.dq_row_stride = dq.stride(-3); + params.dk_row_stride = dk.stride(-3); + params.dv_row_stride = dv.stride(-3); + params.dq_head_stride = dq.stride(-2); + params.dk_head_stride = dk.stride(-2); + params.dv_head_stride = dv.stride(-2); + + if (cu_seqlens_q_d == nullptr) { + params.do_batch_stride = dout.stride(0); + params.dq_batch_stride = dq.stride(0); + params.dk_batch_stride = dk.stride(0); + params.dv_batch_stride = dv.stride(0); + } + + params.dq_accum_ptr = dq_accum_d; + params.dk_accum_ptr = dk_accum_d; + params.dv_accum_ptr = dv_accum_d; + + // Softmax sum + params.dsoftmax_sum = dsoftmax_sum_d; + + params.deterministic = deterministic; +} + +void run_mha_fwd(Flash_fwd_params ¶ms, cudaStream_t stream, bool force_split_kernel=false) { + // HEADDIM_SWITCH(params.d, [&] { + // run_mha_fwd_(params, stream); + // }); + if (!params.is_e4m3) { + if (params.is_bf16) { + if (params.d == 64) { + run_mha_fwd_(params, stream); + } else if (params.d == 128) { + run_mha_fwd_(params, stream); + } else { + run_mha_fwd_(params, stream); + } + } else { + if (params.d == 64) { + run_mha_fwd_(params, stream); + } else if (params.d == 128) { + run_mha_fwd_(params, stream); + } else { + run_mha_fwd_(params, stream); + } + } + } else { + if (params.d == 64) { + run_mha_fwd_(params, stream); + } else if (params.d == 128) { + run_mha_fwd_(params, stream); + } else if (params.d == 256) { + run_mha_fwd_(params, stream); + } + } +} + +std::vector +mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + const float softmax_scale, + bool is_causal) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90, "FlashAttention only supports Hopper GPUs or newer."); + + auto q_dtype = q.dtype(); + // TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + // "FlashAttention only support fp16 and bf16 data type for now"); + // TODO: will add e4m3 later + // TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kFloat8_e4m3fn, + // "FlashAttention only support fp16 and bf16 data type"); + // "FlashAttention only support fp16 and fp8 (e4m3) data type for now"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + int seqlen_q = sizes[1]; + int num_heads = sizes[2]; + const int head_size_og = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 
0, "Number of heads in key/value must divide number of heads in query"); + + TORCH_CHECK(head_size_og == 64 || head_size_og == 128 || head_size_og == 256, "Only support head size 64, 128, and 256 for now"); + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_og); + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + // TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + TORCH_CHECK(q_dtype == at::ScalarType::Float8_e4m3fn + ? (out.dtype() == at::kHalf) + : (out.dtype() == q_dtype), + "Output must have the same dtype as input dtype if dtype is " + "not fp8, or fp16 for fp8 input."); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_og); + if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + } else { + if (q_dtype == at::ScalarType::Float8_e4m3fn) + out = torch::empty_like(q_padded, at::kHalf); + else + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = round_multiple(head_size, 32); + const int seqlen_q_rounded = round_multiple(seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + + auto softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); + at::Tensor p; + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q_padded, k_padded, v_padded, out, + /*cu_seqlens_q_d=*/nullptr, + /*cu_seqlens_k_d=*/nullptr, + /*seqused_k=*/nullptr, + nullptr, + softmax_lse.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + /*window_size_left=*/-1, + /*window_size_right=*/is_causal ? 0 : -1); + + auto tile_count_semaphore = is_causal ? torch::zeros({1}, opts.dtype(torch::kInt32)) : torch::empty({1}, opts.dtype(torch::kInt32)); + params.tile_count_semaphore = tile_count_semaphore.data_ptr(); + + if (seqlen_k > 0) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + run_mha_fwd(params, stream); + } else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. 
+ out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p}; +} + +std::vector +mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. + int max_seqlen_q, + const int max_seqlen_k, + const float softmax_scale, + bool is_causal) { + + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm90 = dprops->major == 9 && dprops->minor == 0; + TORCH_CHECK(is_sm90, "FlashAttention only supports Hopper GPUs or newer."); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(cu_seqlens_q); + CHECK_DEVICE(cu_seqlens_k); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int batch_size = cu_seqlens_q.numel() - 1; + int num_heads = sizes[1]; + const int head_size_og = sizes[2]; + const int num_heads_k = k.size(1); + + int window_size_left = -1; + int window_size_right = -1; + if (is_causal) { window_size_right = 0; } + + void *cu_seqlens_q_d = cu_seqlens_q.data_ptr(); + + const int total_q = q.sizes()[0]; + + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (window_size_left >= max_seqlen_k) { window_size_left = -1; } + if (window_size_right >= max_seqlen_k) { window_size_right = -1; } + + CHECK_SHAPE(q, total_q, num_heads, head_size_og); + const int total_k = k.size(0); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); + + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + if (seqused_k.has_value()){ + auto seqused_k_ = seqused_k.value(); + TORCH_CHECK(seqused_k_.dtype() == torch::kInt32, 
"seqused_k must have dtype int32"); + TORCH_CHECK(seqused_k_.is_cuda(), "seqused_k must be on CUDA device"); + TORCH_CHECK(seqused_k_.is_contiguous(), "seqused_k must be contiguous"); + CHECK_SHAPE(seqused_k_, batch_size); + } + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, sizes[0], sizes[1], head_size_og); + if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + } else { + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size = round_multiple(head_size_og, 8); + const int head_size_rounded = round_multiple(head_size, 32); + const int seqlen_q_rounded = round_multiple(max_seqlen_q, 128); + const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_lse = torch::empty({num_heads, total_q}, opts.dtype(at::kFloat)); + + Flash_fwd_params params; + set_params_fprop(params, + batch_size, + max_seqlen_q, max_seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q_padded, k_padded, v_padded, out, + cu_seqlens_q_d, + cu_seqlens_k.data_ptr(), + seqused_k.has_value() ? seqused_k.value().data_ptr() : nullptr, + /*p_d=*/nullptr, + softmax_lse.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + window_size_left, + window_size_right, + /*seqlenq_ngroups_swapped=*/false, + /*unpadded_lse=*/true); + params.total_q = total_q; + params.total_k = total_k; + + if (max_seqlen_k > 0) { + auto stream = at::cuda::getCurrentCUDAStream().stream(); + run_mha_fwd(params, stream); + } else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. 
+ out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse}; +} + +void run_mha_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { + // FP16_SWITCH(!params.is_bf16, [&] { + // HEADDIM_SWITCH(params.d, [&] { + // run_mha_bwd_(params, stream); + // }); + // }); + if (!params.is_bf16) { + if (params.d <= 64) { + run_mha_bwd_(params, stream); + } else if (params.d <= 96) { + run_mha_bwd_(params, stream); + } else { + run_mha_bwd_(params, stream); + } + } else { + if (params.d <= 64) { + run_mha_bwd_(params, stream); + } else if (params.d <= 96) { + run_mha_bwd_(params, stream); + } else { + run_mha_bwd_(params, stream); + } + } +} + +std::vector +mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + const float softmax_scale, + const bool is_causal, + const bool deterministic) { + + #ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); + #endif + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm9x = dprops->major == 9 && dprops->minor >= 0; + TORCH_CHECK(is_sm9x, "FlashAttentionHopper only supports Hopper GPUs or newer."); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + const int seqlen_q = sizes[1]; + const int num_heads = sizes[2]; + const int head_size_og = dout.size(3); + const int head_size = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size 
<= 128, "FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_rounded = head_size <= 64 ? 64 : round_multiple(head_size, 32); + // This should match the kernel configs + const int kBlockM = head_size <= 64 ? 128 : (head_size < 256 ? 64 : 32); + const int seqlen_q_rounded = round_multiple(seqlen_q, kBlockM); + const int seqlen_k_rounded = round_multiple(seqlen_k, 128); + + TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8"); + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size); + CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, batch_size, seqlen_q, num_heads, head_size); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, batch_size, seqlen_k, num_heads_k, head_size); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + // Need softmax_d to have seqlen_q_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64 + auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat)); + auto softmax_lse_log2 = torch::empty({batch_size, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat)); + at::Tensor dq_accum; + at::Tensor dk_accum, dv_accum; + dq_accum = torch::empty({batch_size, num_heads, seqlen_q_rounded, head_size_rounded}, opts.dtype(at::kFloat)); + // dk_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + // dv_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + Flash_bwd_params params; + + 
set_params_dgrad(params, + batch_size, + seqlen_q, seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q, k, v, out, + dout_padded, dq, dk_expanded, dv_expanded, + nullptr, + nullptr, + dq_accum.data_ptr(), + // loop ? dk_accum.data_ptr() : nullptr, + // loop ? dv_accum.data_ptr() : nullptr, + nullptr, + nullptr, + softmax_lse.data_ptr(), + softmax_d.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + /*window_size_left=*/-1, + /*window_size_right=*/is_causal ? 0 : -1, + deterministic); + params.softmax_lse_log2_ptr = softmax_lse_log2.data_ptr(); + + // Will be zero'ed out in the backward preprocess kernel + at::Tensor dq_semaphore = torch::empty({(seqlen_q + kBlockM - 1) / kBlockM, batch_size, num_heads}, opts.dtype(torch::kInt32)); + params.dq_semaphore = dq_semaphore.data_ptr(); + // printf("dq_semaphore: %p, [%d, %d, %d]\n", params.dq_semaphore, (seqlen_q + 64 - 1) / 64, batch_size, num_heads); + + if (seqlen_q > 0) { + run_mha_bwd(params, stream); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size}), {3}); + } + + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d, dq_accum}; +} + +std::vector +mha_varlen_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float softmax_scale, + const bool is_causal, + const bool deterministic) { + + #ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); + #endif + auto dprops = at::cuda::getCurrentDeviceProperties(); + bool is_sm9x = dprops->major == 9 && dprops->minor >= 0; + TORCH_CHECK(is_sm9x, "FlashAttentionHopper only supports Hopper GPUs or newer."); + + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() 
== q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int total_q = sizes[0]; + const int batch_size = cu_seqlens_q.numel() - 1; + const int num_heads = sizes[1]; + const int head_size_og = dout.size(2); + const int head_size = sizes[2]; + const int total_k = k.size(0); + const int num_heads_k = k.size(1); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size <= 128, "FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_rounded = head_size <= 64 ? 64 : round_multiple(head_size, 32); + // This should match the kernel configs + const int kBlockM = head_size <= 64 ? 128 : (head_size < 256 ? 
64 : 32); + const int seqlen_q_rounded = round_multiple(max_seqlen_q, kBlockM); + const int seqlen_k_rounded = round_multiple(max_seqlen_k, 128); + int const total_q_padded_rounded = round_multiple(total_q + batch_size * 128, 128); + + TORCH_CHECK(head_size == round_multiple(head_size_og, 8), "head_size must be head_size_og rounded to a multiple of 8"); + + CHECK_SHAPE(q, total_q, num_heads, head_size_og); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(out, total_q, num_heads, head_size); + CHECK_SHAPE(dout, total_q, num_heads, head_size_og); + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, total_q, num_heads, head_size); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, total_k, num_heads_k, head_size); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, total_k, num_heads_k, head_size); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + // Need softmax_d to have total_q_padded_rounded since we want its address to be aligned by 16/8 bytes for TMA / LDG.64 + auto softmax_d = torch::empty({num_heads, total_q_padded_rounded}, opts.dtype(at::kFloat)); + auto softmax_lse_log2 = torch::empty({num_heads, total_q_padded_rounded}, opts.dtype(at::kFloat)); + at::Tensor dq_accum; + at::Tensor dk_accum, dv_accum; + dq_accum = torch::empty({num_heads, total_q_padded_rounded, head_size_rounded}, opts.dtype(at::kFloat)); + // dk_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + // dv_accum = torch::zeros({batch_size, seqlen_k_rounded, num_heads_k, head_size_rounded}, opts.dtype(at::kFloat)); + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({total_k, num_heads, head_size}, opts); + dv_expanded = torch::empty({total_k, num_heads, head_size}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + Flash_bwd_params params; + + set_params_dgrad(params, + batch_size, + max_seqlen_q, max_seqlen_k, + seqlen_q_rounded, seqlen_k_rounded, + num_heads, num_heads_k, + head_size, head_size_rounded, + q, k, v, out, + dout_padded, dq, dk_expanded, dv_expanded, + cu_seqlens_q.data_ptr(), + cu_seqlens_k.data_ptr(), + dq_accum.data_ptr(), + // loop ? dk_accum.data_ptr() : nullptr, + // loop ? 
dv_accum.data_ptr() : nullptr, + nullptr, + nullptr, + softmax_lse.data_ptr(), + softmax_d.data_ptr(), + /*p_dropout=*/0.f, + softmax_scale, + /*window_size_left=*/-1, + /*window_size_right=*/is_causal ? 0 : -1, + deterministic); + params.total_q = total_q; + params.total_k = total_k; + params.softmax_lse_log2_ptr = softmax_lse_log2.data_ptr(); + + // Will be zero'ed out in the backward preprocess kernel + at::Tensor dq_semaphore = torch::empty({(max_seqlen_q + kBlockM - 1) / kBlockM, batch_size, num_heads}, opts.dtype(torch::kInt32)); + params.dq_semaphore = dq_semaphore.data_ptr(); + + if (max_seqlen_q > 0) { + run_mha_bwd(params, stream); + } else { + // If max_seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size}), {2}); + } + + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d, dq_accum, softmax_lse_log2 }; +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "FlashAttention"; + m.def("fwd", &mha_fwd, "Forward pass"); + m.def("bwd", &mha_bwd, "Backward pass"); + m.def("varlen_fwd", &mha_varlen_fwd, "Forward pass (variable length)"); + m.def("varlen_bwd", &mha_varlen_bwd, "Varlen backward pass"); +} diff --git a/flash_attn_interface.py b/flash_attn_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..13ddff4bbe54e4caf2780276c9b909c20044e97f --- /dev/null +++ b/flash_attn_interface.py @@ -0,0 +1,383 @@ +# Copyright (c) 2023, Tri Dao. 
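+# Example usage of the interface defined below (illustrative sketch only; it assumes an SM90 GPU,
+# a successful build of flashattn_hopper_cuda, and a head size supported by this build):
+#
+#   q = torch.randn(2, 1024, 16, 128, dtype=torch.bfloat16, device="cuda")
+#   k = torch.randn(2, 1024, 4, 128, dtype=torch.bfloat16, device="cuda")  # GQA: 16 % 4 == 0
+#   v = torch.randn_like(k)
+#   out, softmax_lse = flash_attn_func(q, k, v, causal=True)
+#   assert out.shape == q.shape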
+ +from typing import Optional, Union + +import torch +import torch.nn as nn + +# isort: off +# We need to import the CUDA kernels after importing torch +import flashattn_hopper_cuda + +# isort: on + +def maybe_contiguous(x): + return x.contiguous() if x is not None and x.stride(-1) != 1 else x + +def _flash_attn_forward(q, k, v, softmax_scale, causal): + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, q, k, v, out_padded, softmax_lse, S_dmask = flashattn_hopper_cuda.fwd( + q, + k, + v, + None, + softmax_scale, + causal, + ) + return out, q, k, v, out_padded, softmax_lse, S_dmask + + +def _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + softmax_scale, + causal, + deterministic=False +): + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + dq, dk, dv, softmax_d, *rest = flashattn_hopper_cuda.bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + softmax_scale, + causal, + deterministic, + ) + return dq, dk, dv, softmax_d + +def _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, +): + maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x + q, k, v = [maybe_contiguous(x) for x in (q, k, v)] + out, q, k, v, out_padded, softmax_lse = flashattn_hopper_cuda.varlen_fwd( + q, + k, + v, + None, + cu_seqlens_q, + cu_seqlens_k, + None, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + ) + # if out.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + return out, q, k, v, out_padded, softmax_lse + + +def _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic=False, +): + maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x + # dq, dk, dv are allocated by us so they should already be contiguous + dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] + ( + dq, + dk, + dv, + softmax_d, + *rest, + ) = flashattn_hopper_cuda.varlen_bwd( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic, + ) + # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dq, dk, dv, softmax_d + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + softmax_scale, + causal, + deterministic=False, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse, S_dmask = _flash_attn_forward( + q, + k, + v, + softmax_scale, + causal + ) + ctx.save_for_backward(q, k, v, out_padded, softmax_lse) + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.deterministic = deterministic + return out, softmax_lse + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse = ctx.saved_tensors + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + ctx.softmax_scale, + ctx.causal, + ctx.deterministic, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dk = dk[..., : dout.shape[-1]] + dv = dv[..., : dout.shape[-1]] + return dq, dk, dv, None, None, None + + 
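+# (Illustrative note on the "varlen" layout used below: sequences are packed along the first dimension,
+#  e.g. three sequences of lengths 3, 5 and 2 are stored as q of shape (10, nheads, headdim) with
+#  cu_seqlens_q = torch.tensor([0, 3, 8, 10], dtype=torch.int32) and max_seqlen_q = 5; k and v follow
+#  the same convention with cu_seqlens_k.)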
+class FlashAttnVarlenFunc(torch.autograd.Function): + @staticmethod + def forward( + ctx, + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic=False, + ): + if softmax_scale is None: + softmax_scale = q.shape[-1] ** (-0.5) + out, q, k, v, out_padded, softmax_lse = _flash_attn_varlen_forward( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal=causal, + ) + ctx.save_for_backward( + q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k + ) + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.softmax_scale = softmax_scale + ctx.causal = causal + ctx.deterministic = deterministic + return out, softmax_lse + + @staticmethod + def backward(ctx, dout, *args): + q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors + dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) + _flash_attn_varlen_backward( + dout, + q, + k, + v, + out, + softmax_lse, + dq, + dk, + dv, + cu_seqlens_q, + cu_seqlens_k, + ctx.max_seqlen_q, + ctx.max_seqlen_k, + ctx.softmax_scale, + ctx.causal, + ctx.deterministic, + ) + dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension + dk = dk[..., : dout.shape[-1]] + dv = dv[..., : dout.shape[-1]] + return dq, dk, dv, None, None, None, None, None, None, None + + +def flash_attn_func( + q, + k, + v, + softmax_scale=None, + causal=False, + deterministic=False +): + """dropout_p should be set to 0.0 during evaluation + Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + + If window_size != (-1, -1), implements sliding window local attention. Query at position i + will only attend to keys between + [i + seqlen_k - seqlen_q - window_size[0], i + seqlen_k - seqlen_q + window_size[1]] inclusive. + + Arguments: + q: (batch_size, seqlen, nheads, headdim) + k: (batch_size, seqlen, nheads_k, headdim) + v: (batch_size, seqlen, nheads_k, headdim) + dropout_p: float. Dropout probability. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + window_size: (left, right). If not (-1, -1), implements sliding window local attention. + alibi_slopes: (nheads,) or (batch_size, nheads), fp32. A bias of + (-alibi_slope * |i + seqlen_k - seqlen_q - j|) + is added to the attention score of query i and key j. + deterministic: bool. Whether to use the deterministic implementation of the backward pass, + which is slightly slower and uses more memory. The forward pass is always deterministic. + return_attn_probs: bool. Whether to return the attention probabilities. This option is for + testing only. The returned probabilities are not guaranteed to be correct + (they might not have the right scaling). 
+ Return: + out: (batch_size, seqlen, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). + The output of softmax (possibly with different scaling). It also encodes the dropout + pattern (negative means that location was dropped, nonnegative means it was kept). + """ + return FlashAttnFunc.apply( + q, + k, + v, + softmax_scale, + causal, + deterministic, + ) + + +def flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale=None, + causal=False, + deterministic=False, +): + """ + Supports multi-query and grouped-query attention (MQA/GQA) by passing in K, V with fewer heads + than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. + For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head + 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. + If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. + For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: + 1 1 1 1 0 + 1 1 1 1 1 + If seqlen_q = 5 and seqlen_k = 2, the causal mask is: + 0 0 + 0 0 + 0 0 + 1 0 + 1 1 + If the row of the mask is all zero, the output will be zero. + Arguments: + q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. + k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. + v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. + cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_q: int. Maximum query sequence length in the batch. + max_seqlen_k: int. Maximum key sequence length in the batch. + softmax_scale: float. The scaling of QK^T before applying softmax. + Default to 1 / sqrt(headdim). + causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). + Return: + out: (total, nheads, headdim). + softmax_lse [optional, if return_attn_probs=True]: (nheads, total_q_seqlen). The + logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax + normalization factor). + """ + return FlashAttnVarlenFunc.apply( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + softmax_scale, + causal, + deterministic, + ) diff --git a/flash_attn_triton.py b/flash_attn_triton.py new file mode 100644 index 0000000000000000000000000000000000000000..30420c057adf1916e16403d3f0d02d0e26c8b7a3 --- /dev/null +++ b/flash_attn_triton.py @@ -0,0 +1,1160 @@ +""" +*Experimental* implementation of FlashAttention in Triton. +Tested with triton==2.0.0.dev20221202. +Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions +other than 64: +https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207 +We'll update this implementation with the new Triton backend once this is fixed. + +We use the FlashAttention implementation from Phil Tillet a starting point. 
+https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py + +Changes: +- Implement both causal and non-causal attention. +- Implement both self-attention and cross-attention. +- Support arbitrary seqlens (not just multiples of 128), for both forward and backward. +- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward. +- Support attention bias. +- Speed up the forward pass a bit, and only store the LSE instead of m and l. +- Make the backward for d=128 much faster by reducing register spilling. +- Optionally parallelize the backward pass across seqlen_k, to deal with the case of +small batch size * nheads. + +Caution: +- This is an *experimental* implementation. The forward pass should be quite robust but +I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler). +- This implementation has only been tested on A100. +- If you plan to use headdim other than 64 and 128, you should test for race conditions +(due to the Triton compiler), as done in tests/test_flash_attn.py +"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions +for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident +that there are none left for other head dimensions. + +Differences between this Triton version and the CUDA version: +- Triton version doesn't support dropout. +- Triton forward is generally faster than CUDA forward, while Triton backward is +generally slower than CUDA backward. Overall Triton forward + backward is slightly slower +than CUDA forward + backward. +- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor). +- Triton version supports attention bias, while CUDA version doesn't. +""" + +import math + +import torch +import triton +import triton.language as tl + + +# Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128 +# @triton.autotune( +# configs=[ +# triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1), +# # This config has a race condition when EVEN_M == False, disabling it for now. 
+# # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1), +# ], +# key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'] +# ) +@triton.heuristics( + { + "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, + "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, + "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], + } +) +@triton.jit +def _fwd_kernel( + Q, + K, + V, + Bias, + Out, + Lse, + TMP, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug + softmax_scale, + stride_qb, + stride_qh, + stride_qm, + stride_kb, + stride_kh, + stride_kn, + stride_vb, + stride_vh, + stride_vn, + stride_bb, + stride_bh, + stride_bm, + stride_ob, + stride_oh, + stride_om, + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + headdim, + CACHE_KEY_SEQLEN_Q, + CACHE_KEY_SEQLEN_K, + BIAS_TYPE: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + start_m = tl.program_id(0) + off_hb = tl.program_id(1) + off_b = off_hb // nheads + off_h = off_hb % nheads + # off_b = tl.program_id(1) + # off_h = tl.program_id(2) + # off_hb = off_b * nheads + off_h + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_HEADDIM) + # Initialize pointers to Q, K, V + # Adding parenthesis around indexing might use int32 math instead of int64 math? + # https://github.com/openai/triton/issues/741 + # I'm seeing a tiny bit of difference (5-7us) + q_ptrs = ( + Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :]) + ) + k_ptrs = ( + K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :]) + ) + v_ptrs = ( + V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :]) + ) + if BIAS_TYPE == "vector": + b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n + elif BIAS_TYPE == "matrix": + b_ptrs = ( + Bias + + off_b * stride_bb + + off_h * stride_bh + + (offs_m[:, None] * stride_bm + offs_n[None, :]) + ) + # initialize pointer to m and l + t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m + lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32) + # load q: it will stay in SRAM throughout + # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call + # tl.load(q_ptrs), we get the wrong output! 
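+    # The load below branches on the EVEN_* flags so that fully in-bounds tiles take the
+    # unmasked fast path, while ragged tiles mask out-of-range query rows (offs_m >= seqlen_q)
+    # and head-dim columns (offs_d >= headdim) with 0.0, so the padding contributes nothing
+    # to the dot products downstream.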
+ if EVEN_M & EVEN_N: + if EVEN_HEADDIM: + q = tl.load(q_ptrs) + else: + q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0) + else: + q = tl.load( + q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0 + ) + # loop over k, v and update accumulator + end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k) + for start_n in range(0, end_n, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition + if EVEN_HEADDIM: + k = tl.load(k_ptrs + start_n * stride_kn) + else: + k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + k = tl.load( + k_ptrs + start_n * stride_kn, + mask=(start_n + offs_n)[:, None] < seqlen_k, + other=0.0, + ) + else: + k = tl.load( + k_ptrs + start_n * stride_kn, + mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), + other=0.0, + ) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k, trans_b=True) + # Trying to combine the two masks seem to make the result wrong + if not EVEN_N: # Need to mask out otherwise the softmax is wrong + qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf")) + if IS_CAUSAL: + qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf")) + if BIAS_TYPE != "none": + if BIAS_TYPE == "vector": + if EVEN_N: + bias = tl.load(b_ptrs + start_n).to(tl.float32) + else: + bias = tl.load( + b_ptrs + start_n, mask=(start_n + offs_n) < seqlen_k, other=0.0 + ).to(tl.float32) + bias = bias[None, :] + elif BIAS_TYPE == "matrix": + if EVEN_M & EVEN_N: + bias = tl.load(b_ptrs + start_n).to(tl.float32) + else: + bias = tl.load( + b_ptrs + start_n, + mask=(offs_m[:, None] < seqlen_q) + & ((start_n + offs_n)[None, :] < seqlen_k), + other=0.0, + ).to(tl.float32) + # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler + # can then fuse the mult and add into an fma instruction. But if we have bias we need to + # to multiply with softmax_scale here. 
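+            # Both the bias and no-bias branches below apply the same streaming-softmax update
+            # for this K/V block:
+            #   m_ij  = max(rowmax(scaled scores), lse_i)        (new running max)
+            #   p     = exp(scaled scores - m_ij)                (unnormalized probabilities)
+            #   acc_o = acc_o * exp(m_i - m_ij) + p @ V          (rescaled partial output)
+            #   lse_i = m_ij + log(exp(lse_i - m_ij) + sum(p))   (running logsumexp)
+            # The final normalization by exp(m_i - lse_i) is applied once after the loop.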
+ qk = qk * softmax_scale + bias + m_ij = tl.maximum(tl.max(qk, 1), lse_i) + p = tl.exp(qk - m_ij[:, None]) + else: + m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i) + p = tl.exp(qk * softmax_scale - m_ij[:, None]) + l_ij = tl.sum(p, 1) + + # scale acc_o + acc_o_scale = tl.exp(m_i - m_ij) + + # # -- update output accumulator -- + # BUG: have to store and immediately load + tl.store(t_ptrs, acc_o_scale) + acc_o_scale = tl.load(t_ptrs) + acc_o = acc_o * acc_o_scale[:, None] + # update acc_o + if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition + if EVEN_HEADDIM: + v = tl.load(v_ptrs + start_n * stride_vn) + else: + v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + v = tl.load( + v_ptrs + start_n * stride_vn, + mask=(start_n + offs_n)[:, None] < seqlen_k, + other=0.0, + ) + else: + v = tl.load( + v_ptrs + start_n * stride_vn, + mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), + other=0.0, + ) + p = p.to(v.dtype) + acc_o += tl.dot(p, v) + + # -- update statistics + m_i = m_ij + l_i_new = tl.exp(lse_i - m_ij) + l_ij + lse_i = m_ij + tl.log(l_i_new) + + o_scale = tl.exp(m_i - lse_i) + # BUG: have to store and immediately load + tl.store(t_ptrs, o_scale) + o_scale = tl.load(t_ptrs) + acc_o = acc_o * o_scale[:, None] + # rematerialize offsets to save registers + start_m = tl.program_id(0) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + # write back l and m + lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m + tl.store(lse_ptrs, lse_i) + # initialize pointers to output + offs_d = tl.arange(0, BLOCK_HEADDIM) + out_ptrs = ( + Out + + off_b * stride_ob + + off_h * stride_oh + + (offs_m[:, None] * stride_om + offs_d[None, :]) + ) + if EVEN_M: + if EVEN_HEADDIM: + tl.store(out_ptrs, acc_o) + else: + tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim) + else: + if EVEN_HEADDIM: + tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q) + else: + tl.store( + out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim) + ) + + +@triton.jit +def _bwd_preprocess_do_o_dot( + Out, + DO, + Delta, + stride_ob, + stride_oh, + stride_om, + stride_dob, + stride_doh, + stride_dom, + nheads, + seqlen_q, + seqlen_q_rounded, + headdim, + BLOCK_M: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, +): + start_m = tl.program_id(0) + off_hb = tl.program_id(1) + off_b = off_hb // nheads + off_h = off_hb % nheads + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, BLOCK_HEADDIM) + # load + o = tl.load( + Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], + mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ).to(tl.float32) + do = tl.load( + DO + + off_b * stride_dob + + off_h * stride_doh + + offs_m[:, None] * stride_dom + + offs_d[None, :], + mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ).to(tl.float32) + delta = tl.sum(o * do, axis=1) + # write-back + tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta) + + +@triton.jit +def _bwd_store_dk_dv( + dk_ptrs, + dv_ptrs, + dk, + dv, + offs_n, + offs_d, + seqlen_k, + headdim, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, +): + # [2022-11-01] TD: Same bug. 
In the case of EVEN_N=True and EVEN_M=False, + # if we just call tl.store(dv_ptrs), there's a race condition + if EVEN_N & EVEN_M: + if EVEN_HEADDIM: + tl.store(dv_ptrs, dv) + tl.store(dk_ptrs, dk) + else: + tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim) + tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim) + else: + if EVEN_HEADDIM: + tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k) + tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k) + else: + tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) + tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) + + +@triton.jit +def _bwd_kernel_one_col_block( + start_n, + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qm, + stride_kn, + stride_vn, + stride_bm, + stride_dom, + stride_dqm, + stride_dkn, + stride_dvn, + seqlen_q, + seqlen_k, + headdim, + ATOMIC_ADD: tl.constexpr, + BIAS_TYPE: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + # We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N) + begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M + # initialize row/col offsets + offs_qm = begin_m + tl.arange(0, BLOCK_M) + offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N) + offs_m = tl.arange(0, BLOCK_M) + offs_d = tl.arange(0, BLOCK_HEADDIM) + # initialize pointers to value-like data + q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :]) + k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :]) + v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :]) + do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :]) + dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :]) + if BIAS_TYPE == "vector": + b_ptrs = Bias + offs_n + elif BIAS_TYPE == "matrix": + b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :]) + # initialize dv and dk + dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) + dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) + # There seems to be some problem with Triton pipelining that makes results wrong for + # headdim=64, seqlen=(113, 255), bias_type='matrix'. In this case the for loop + # may have zero step, and pipelining with the bias matrix could screw it up. + # So we just exit early. + if begin_m >= seqlen_q: + dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) + dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) + _bwd_store_dk_dv( + dk_ptrs, + dv_ptrs, + dk, + dv, + offs_n, + offs_d, + seqlen_k, + headdim, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + ) + return + # k and v stay in SRAM throughout + # [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False, + # if we just call tl.load(k_ptrs), we get the wrong output! 
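+    # This routine handles one column block of K/V: K, V and their dK, dV accumulators stay
+    # on-chip while we sweep over the query row blocks, recomputing p = exp(scores - lse) from
+    # the saved logsumexp, then accumulating dV += p^T @ dO and dK += ds^T @ Q, and updating dQ
+    # either via read-modify-write (ATOMIC_ADD=False) or tl.atomic_add (ATOMIC_ADD=True, the
+    # sequence-parallel path).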
+ if EVEN_N & EVEN_M: + if EVEN_HEADDIM: + k = tl.load(k_ptrs) + v = tl.load(v_ptrs) + else: + k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + else: + if EVEN_HEADDIM: + k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) + v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) + else: + k = tl.load( + k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0 + ) + v = tl.load( + v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0 + ) + # loop over rows + num_block_m = tl.cdiv(seqlen_q, BLOCK_M) + for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M): + start_m = tl.multiple_of(start_m, BLOCK_M) + offs_m_curr = start_m + offs_m + # load q, k, v, do on-chip + # Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117) + if EVEN_M & EVEN_HEADDIM: + q = tl.load(q_ptrs) + else: + if EVEN_HEADDIM: + q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0) + else: + q = tl.load( + q_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ) + # recompute p = softmax(qk, dim=-1).T + qk = tl.dot(q, k, trans_b=True) + # Trying to combine the two masks seem to make the result wrong + if not EVEN_N: # Need to mask out otherwise the softmax is wrong + qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf")) + if IS_CAUSAL: + qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) + if BIAS_TYPE != "none": + tl.debug_barrier() # Race condition otherwise + if BIAS_TYPE == "vector": + if EVEN_N: + bias = tl.load(b_ptrs).to(tl.float32) + else: + bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32) + bias = bias[None, :] + elif BIAS_TYPE == "matrix": + if EVEN_M & EVEN_N: + bias = tl.load(b_ptrs).to(tl.float32) + else: + bias = tl.load( + b_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), + other=0.0, + ).to(tl.float32) + qk = qk * softmax_scale + bias + # There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong. + # Also wrong for headdim=64. + if not (EVEN_M & EVEN_HEADDIM): + tl.debug_barrier() + lse_i = tl.load(LSE + offs_m_curr) + if BIAS_TYPE == "none": + p = tl.exp(qk * softmax_scale - lse_i[:, None]) + else: + p = tl.exp(qk - lse_i[:, None]) + # compute dv + # [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call + # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs + # in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512, + # the output is correct. + if EVEN_M & EVEN_HEADDIM: + do = tl.load(do_ptrs) + else: + # [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask. + do = tl.load( + do_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + ) + # if EVEN_M: + # if EVEN_HEADDIM: + # do = tl.load(do_ptrs) + # else: + # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0) + # else: + # if EVEN_HEADDIM: + # do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0) + # else: + # do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) + # & (offs_d[None, :] < headdim), other=0.0) + dv += tl.dot(p.to(do.dtype), do, trans_a=True) + # compute dp = dot(v, do) + # There seems to be a race condition when headdim=48/96, and dq, dk are wrong. 
+ # Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True + # Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False + if not (EVEN_M & EVEN_HEADDIM): + tl.debug_barrier() + dp = tl.dot(do, v, trans_b=True) + # There's a race condition for headdim=48 + if not EVEN_HEADDIM: + tl.debug_barrier() + # compute ds = p * (dp - delta[:, None]) + # Putting the subtraction after the dp matmul (instead of before) is slightly faster + Di = tl.load(D + offs_m_curr) + # Converting ds to q.dtype here reduces register pressure and makes it much faster + # for BLOCK_HEADDIM=128 + ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype) + # compute dk = dot(ds.T, q) + dk += tl.dot(ds, q, trans_a=True) + # compute dq + if not ( + EVEN_M & EVEN_HEADDIM + ): # Otherewise there's a race condition when BIAS_TYPE='matrix' + tl.debug_barrier() + if not ATOMIC_ADD: + if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M + dq = tl.load(dq_ptrs, eviction_policy="evict_last") + dq += tl.dot(ds, k) + tl.store(dq_ptrs, dq, eviction_policy="evict_last") + else: + if EVEN_HEADDIM: + dq = tl.load( + dq_ptrs, + mask=offs_m_curr[:, None] < seqlen_q, + other=0.0, + eviction_policy="evict_last", + ) + dq += tl.dot(ds, k) + tl.store( + dq_ptrs, + dq, + mask=offs_m_curr[:, None] < seqlen_q, + eviction_policy="evict_last", + ) + else: + dq = tl.load( + dq_ptrs, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + other=0.0, + eviction_policy="evict_last", + ) + dq += tl.dot(ds, k) + tl.store( + dq_ptrs, + dq, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + eviction_policy="evict_last", + ) + else: # If we're parallelizing across the seqlen_k dimension + dq = tl.dot(ds, k) + if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M + tl.atomic_add(dq_ptrs, dq) + else: + if EVEN_HEADDIM: + tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q) + else: + tl.atomic_add( + dq_ptrs, + dq, + mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), + ) + # increment pointers + dq_ptrs += BLOCK_M * stride_dqm + q_ptrs += BLOCK_M * stride_qm + do_ptrs += BLOCK_M * stride_dom + if BIAS_TYPE == "matrix": + b_ptrs += BLOCK_M * stride_bm + # write-back + dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) + dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) + _bwd_store_dk_dv( + dk_ptrs, + dv_ptrs, + dk, + dv, + offs_n, + offs_d, + seqlen_k, + headdim, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + ) + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False}, + num_warps=8, + num_stages=1, + pre_hook=init_to_zero("DQ"), + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True}, + num_warps=8, + num_stages=1, + pre_hook=init_to_zero("DQ"), + ), + # Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now + # # Kernel is buggy (give wrong result) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4* + # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), + # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), + # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')), + # 
triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')), + ], + key=["CACHE_KEY_SEQLEN_Q", "CACHE_KEY_SEQLEN_K", "BIAS_TYPE", "IS_CAUSAL", "BLOCK_HEADDIM"], +) +@triton.heuristics( + { + "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, + "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, + "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], + } +) +@triton.jit +def _bwd_kernel( + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qb, + stride_qh, + stride_qm, + stride_kb, + stride_kh, + stride_kn, + stride_vb, + stride_vh, + stride_vn, + stride_bb, + stride_bh, + stride_bm, + stride_dob, + stride_doh, + stride_dom, + stride_dqb, + stride_dqh, + stride_dqm, + stride_dkb, + stride_dkh, + stride_dkn, + stride_dvb, + stride_dvh, + stride_dvn, + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + headdim, + CACHE_KEY_SEQLEN_Q, + CACHE_KEY_SEQLEN_K, + BIAS_TYPE: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_HEADDIM: tl.constexpr, + SEQUENCE_PARALLEL: tl.constexpr, + EVEN_M: tl.constexpr, + EVEN_N: tl.constexpr, + EVEN_HEADDIM: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, +): + off_hb = tl.program_id(1) + off_b = off_hb // nheads + off_h = off_hb % nheads + # offset pointers for batch/head + Q += off_b * stride_qb + off_h * stride_qh + K += off_b * stride_kb + off_h * stride_kh + V += off_b * stride_vb + off_h * stride_vh + DO += off_b * stride_dob + off_h * stride_doh + DQ += off_b * stride_dqb + off_h * stride_dqh + DK += off_b * stride_dkb + off_h * stride_dkh + DV += off_b * stride_dvb + off_h * stride_dvh + if BIAS_TYPE != "none": + Bias += off_b * stride_bb + off_h * stride_bh + # pointer to row-wise quantities in value-like data + D += off_hb * seqlen_q_rounded + LSE += off_hb * seqlen_q_rounded + if not SEQUENCE_PARALLEL: + num_block_n = tl.cdiv(seqlen_k, BLOCK_N) + for start_n in range(0, num_block_n): + _bwd_kernel_one_col_block( + start_n, + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qm, + stride_kn, + stride_vn, + stride_bm, + stride_dom, + stride_dqm, + stride_dkn, + stride_dvn, + seqlen_q, + seqlen_k, + headdim, + ATOMIC_ADD=False, + BIAS_TYPE=BIAS_TYPE, + IS_CAUSAL=IS_CAUSAL, + BLOCK_HEADDIM=BLOCK_HEADDIM, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + ) + else: + start_n = tl.program_id(0) + _bwd_kernel_one_col_block( + start_n, + Q, + K, + V, + Bias, + DO, + DQ, + DK, + DV, + LSE, + D, + softmax_scale, + stride_qm, + stride_kn, + stride_vn, + stride_bm, + stride_dom, + stride_dqm, + stride_dkn, + stride_dvn, + seqlen_q, + seqlen_k, + headdim, + ATOMIC_ADD=True, + BIAS_TYPE=BIAS_TYPE, + IS_CAUSAL=IS_CAUSAL, + BLOCK_HEADDIM=BLOCK_HEADDIM, + EVEN_M=EVEN_M, + EVEN_N=EVEN_N, + EVEN_HEADDIM=EVEN_HEADDIM, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + ) + + +def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None): + # shape constraints + batch, seqlen_q, nheads, d = q.shape + _, seqlen_k, _, _ = k.shape + assert k.shape == (batch, seqlen_k, nheads, d) + assert v.shape == (batch, seqlen_k, nheads, d) + assert d <= 128, "FlashAttention only support head dimensions up to 128" + assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type" + assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16" + assert q.is_cuda and k.is_cuda and v.is_cuda + softmax_scale = 
softmax_scale or 1.0 / math.sqrt(d) + + has_bias = bias is not None + bias_type = "none" + if has_bias: + assert bias.dtype in [q.dtype, torch.float] + assert bias.is_cuda + assert bias.dim() == 4 + if bias.stride(-1) != 1: + bias = bias.contiguous() + if bias.shape[2:] == (1, seqlen_k): + bias_type = "vector" + elif bias.shape[2:] == (seqlen_q, seqlen_k): + bias_type = "matrix" + else: + raise RuntimeError( + "Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)" + ) + bias = bias.expand(batch, nheads, seqlen_q, seqlen_k) + bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0) + + seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128 + lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) + tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) + o = torch.empty_like(q) + + BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16) + BLOCK = 128 + num_warps = 4 if d <= 64 else 8 + grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads) + _fwd_kernel[grid]( + q, + k, + v, + bias, + o, + lse, + tmp, + softmax_scale, + q.stride(0), + q.stride(2), + q.stride(1), + k.stride(0), + k.stride(2), + k.stride(1), + v.stride(0), + v.stride(2), + v.stride(1), + *bias_strides, + o.stride(0), + o.stride(2), + o.stride(1), + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + d, + seqlen_q // 32, + seqlen_k // 32, # key for triton cache (limit number of compilations) + # Can't use kwargs here because triton autotune expects key to be args, not kwargs + # IS_CAUSAL=causal, BLOCK_HEADDIM=d, + bias_type, + causal, + BLOCK_HEADDIM, + BLOCK_M=BLOCK, + BLOCK_N=BLOCK, + num_warps=num_warps, + num_stages=1, + ) + return o, lse, softmax_scale # softmax_scale could have been updated + + +def _flash_attn_backward( + do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None +): + # Make sure that the last dimension is contiguous + if do.stride(-1) != 1: + do = do.contiguous() + batch, seqlen_q, nheads, d = q.shape + _, seqlen_k, _, _ = k.shape + # assert d in {16, 32, 64, 128} + assert d <= 128 + seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128 + assert lse.shape == (batch, nheads, seqlen_q_rounded) + assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1 + assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1 + softmax_scale = softmax_scale or 1.0 / math.sqrt(d) + # dq_accum = torch.zeros_like(q, dtype=torch.float32) + dq_accum = torch.empty_like(q, dtype=torch.float32) + delta = torch.empty_like(lse) + # delta = torch.zeros_like(lse) + + BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16) + grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads) + _bwd_preprocess_do_o_dot[grid]( + o, + do, + delta, + o.stride(0), + o.stride(2), + o.stride(1), + do.stride(0), + do.stride(2), + do.stride(1), + nheads, + seqlen_q, + seqlen_q_rounded, + d, + BLOCK_M=128, + BLOCK_HEADDIM=BLOCK_HEADDIM, + ) + + has_bias = bias is not None + bias_type = "none" + if has_bias: + assert bias.dtype in [q.dtype, torch.float] + assert bias.is_cuda + assert bias.dim() == 4 + assert bias.stride(-1) == 1 + if bias.shape[2:] == (1, seqlen_k): + bias_type = "vector" + elif bias.shape[2:] == (seqlen_q, seqlen_k): + bias_type = "matrix" + else: + raise RuntimeError( + "Last 2 dimensions of bias must be (1, seqlen_k)" " or (seqlen_q, seqlen_k)" + ) + bias = bias.expand(batch, nheads, seqlen_q, seqlen_k) + bias_strides = (bias.stride(0), 
bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0) + + # BLOCK_M = 128 + # BLOCK_N = 64 + # num_warps = 4 + grid = lambda META: ( + triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1, + batch * nheads, + ) + _bwd_kernel[grid]( + q, + k, + v, + bias, + do, + dq_accum, + dk, + dv, + lse, + delta, + softmax_scale, + q.stride(0), + q.stride(2), + q.stride(1), + k.stride(0), + k.stride(2), + k.stride(1), + v.stride(0), + v.stride(2), + v.stride(1), + *bias_strides, + do.stride(0), + do.stride(2), + do.stride(1), + dq_accum.stride(0), + dq_accum.stride(2), + dq_accum.stride(1), + dk.stride(0), + dk.stride(2), + dk.stride(1), + dv.stride(0), + dv.stride(2), + dv.stride(1), + nheads, + seqlen_q, + seqlen_k, + seqlen_q_rounded, + d, + seqlen_q // 32, + seqlen_k // 32, # key for triton cache (limit number of compilations) + # Can't use kwargs here because triton autotune expects key to be args, not kwargs + # IS_CAUSAL=causal, BLOCK_HEADDIM=d, + bias_type, + causal, + BLOCK_HEADDIM, + # SEQUENCE_PARALLEL=False, + # BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, + # num_warps=num_warps, + # num_stages=1, + ) + dq.copy_(dq_accum) + + +class FlashAttnQKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None): + """ + qkv: (batch, seqlen, 3, nheads, headdim) + bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen). + For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen). + ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen) + """ + # Make sure that the last dimension is contiguous + if qkv.stride(-1) != 1: + qkv = qkv.contiguous() + o, lse, ctx.softmax_scale = _flash_attn_forward( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + bias=bias, + causal=causal, + softmax_scale=softmax_scale, + ) + ctx.save_for_backward(qkv, o, lse, bias) + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + qkv, o, lse, bias = ctx.saved_tensors + assert not ctx.needs_input_grad[1], "FlashAttention does not support bias gradient yet" + # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd + # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. + with torch.inference_mode(): + dqkv = torch.empty_like(qkv) + _flash_attn_backward( + do, + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + o, + lse, + dqkv[:, :, 0], + dqkv[:, :, 1], + dqkv[:, :, 2], + bias=bias, + causal=ctx.causal, + softmax_scale=ctx.softmax_scale, + ) + return dqkv, None, None, None + + +flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply + + +class FlashAttnKVPackedFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None): + """ + q: (batch, seqlen_q, nheads, headdim) + kv: (batch, seqlen_k, 2, nheads, headdim) + bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k). + For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k). 
+ ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k) + """ + # Make sure that the last dimension is contiguous + q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]] + o, lse, ctx.softmax_scale = _flash_attn_forward( + q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale + ) + ctx.save_for_backward(q, kv, o, lse, bias) + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + q, kv, o, lse, bias = ctx.saved_tensors + if len(ctx.needs_input_grad) >= 3: + assert not ctx.needs_input_grad[2], "FlashAttention does not support bias gradient yet" + # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd + # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. + with torch.inference_mode(): + dq = torch.empty_like(q) + dkv = torch.empty_like(kv) + _flash_attn_backward( + do, + q, + kv[:, :, 0], + kv[:, :, 1], + o, + lse, + dq, + dkv[:, :, 0], + dkv[:, :, 1], + bias=bias, + causal=ctx.causal, + softmax_scale=ctx.softmax_scale, + ) + return dq, dkv, None, None, None + + +flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply + + +class FlashAttnFunc(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None): + """ + q: (batch_size, seqlen_q, nheads, headdim) + k, v: (batch_size, seqlen_k, nheads, headdim) + bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k). + For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k). + ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k) + """ + # Make sure that the last dimension is contiguous + q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]] + o, lse, ctx.softmax_scale = _flash_attn_forward( + q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale + ) + ctx.save_for_backward(q, k, v, o, lse, bias) + ctx.causal = causal + return o + + @staticmethod + def backward(ctx, do): + q, k, v, o, lse, bias = ctx.saved_tensors + assert not ctx.needs_input_grad[3], "FlashAttention does not support bias gradient yet" + # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd + # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. + with torch.inference_mode(): + dq = torch.empty_like(q) + dk = torch.empty_like(k) + dv = torch.empty_like(v) + _flash_attn_backward( + do, + q, + k, + v, + o, + lse, + dq, + dk, + dv, + bias=bias, + causal=ctx.causal, + softmax_scale=ctx.softmax_scale, + ) + return dq, dk, dv, None, None, None + + +flash_attn_func = FlashAttnFunc.apply diff --git a/flash_attn_triton_og.py b/flash_attn_triton_og.py new file mode 100644 index 0000000000000000000000000000000000000000..f2ddb99487b4f162745e2f6dd3d1744946dc3fb2 --- /dev/null +++ b/flash_attn_triton_og.py @@ -0,0 +1,365 @@ +# [2022-10-23] Downloaded from https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py +# for benchmarking. 
+# We fixed a few dtype cast to make it work for bf16 + +""" +Fused Attention +=============== +This is a Triton implementation of the Flash Attention algorithm +(see: Dao et al., https://arxiv.org/pdf/2205.14135v2.pdf; Rabe and Staats https://arxiv.org/pdf/2112.05682v2.pdf) +""" + +import pytest +import torch +import triton +import triton.language as tl + + +@triton.jit +def _fwd_kernel( + Q, + K, + V, + sm_scale, + TMP, + L, + M, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug + Out, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + stride_oz, + stride_oh, + stride_om, + stride_on, + Z, + H, + N_CTX, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + start_m = tl.program_id(0) + off_hz = tl.program_id(1) + # initialize offsets + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + off_q = off_hz * stride_qh + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk + off_k = off_hz * stride_qh + offs_n[:, None] * stride_kn + offs_d[None, :] * stride_kk + off_v = off_hz * stride_qh + offs_n[:, None] * stride_qm + offs_d[None, :] * stride_qk + # Initialize pointers to Q, K, V + q_ptrs = Q + off_q + k_ptrs = K + off_k + v_ptrs = V + off_v + # initialize pointer to m and l + t_ptrs = TMP + off_hz * N_CTX + offs_m + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # load q: it will stay in SRAM throughout + q = tl.load(q_ptrs) + # loop over k, v and update accumulator + for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load(k_ptrs + start_n * stride_kn) + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.dot(q, k, trans_b=True) + qk *= sm_scale + qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf")) + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + p = tl.exp(qk - m_ij[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + p_scale = beta / l_i_new + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new * alpha + tl.store(t_ptrs, acc_scale) + acc_scale = tl.load(t_ptrs) # BUG: have to store and immediately load + acc = acc * acc_scale[:, None] + # update acc + v = tl.load(v_ptrs + start_n * stride_vk) + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + # rematerialize offsets to save registers + start_m = tl.program_id(0) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + # write back l and m + l_ptrs = L + off_hz * N_CTX + offs_m + m_ptrs = M + off_hz * N_CTX + offs_m + tl.store(l_ptrs, l_i) + tl.store(m_ptrs, m_i) + # initialize pointers to output + offs_n = tl.arange(0, BLOCK_DMODEL) + off_o = off_hz * stride_oh + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on + out_ptrs = Out + off_o + tl.store(out_ptrs, acc) + + +@triton.jit +def _bwd_preprocess( + Out, + DO, + L, + NewDO, + Delta, + BLOCK_M: tl.constexpr, + D_HEAD: tl.constexpr, +): + off_m = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M) + off_n = tl.arange(0, D_HEAD) + # load + o = tl.load(Out + 
off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) + do = tl.load(DO + off_m[:, None] * D_HEAD + off_n[None, :]).to(tl.float32) + denom = tl.load(L + off_m).to(tl.float32) + # compute + do = do / denom[:, None] + delta = tl.sum(o * do, axis=1) + # write-back + tl.store(NewDO + off_m[:, None] * D_HEAD + off_n[None, :], do) + tl.store(Delta + off_m, delta) + + +@triton.jit +def _bwd_kernel( + Q, + K, + V, + sm_scale, + Out, + DO, + DQ, + DK, + DV, + L, + M, + D, + stride_qz, + stride_qh, + stride_qm, + stride_qk, + stride_kz, + stride_kh, + stride_kn, + stride_kk, + stride_vz, + stride_vh, + stride_vk, + stride_vn, + Z, + H, + N_CTX, + num_block, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, +): + off_hz = tl.program_id(0) + off_z = off_hz // H + off_h = off_hz % H + # offset pointers for batch/head + Q += off_z * stride_qz + off_h * stride_qh + K += off_z * stride_qz + off_h * stride_qh + V += off_z * stride_qz + off_h * stride_qh + DO += off_z * stride_qz + off_h * stride_qh + DQ += off_z * stride_qz + off_h * stride_qh + DK += off_z * stride_qz + off_h * stride_qh + DV += off_z * stride_qz + off_h * stride_qh + for start_n in range(0, num_block): + lo = start_n * BLOCK_M + # initialize row/col offsets + offs_qm = lo + tl.arange(0, BLOCK_M) + offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M) + offs_m = tl.arange(0, BLOCK_N) + offs_k = tl.arange(0, BLOCK_DMODEL) + # initialize pointers to value-like data + q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) + v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) + do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk) + # pointer to row-wise quantities in value-like data + D_ptrs = D + off_hz * N_CTX + m_ptrs = M + off_hz * N_CTX + # initialize dv amd dk + dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # k and v stay in SRAM throughout + k = tl.load(k_ptrs) + v = tl.load(v_ptrs) + # loop over rows + for start_m in range(lo, num_block * BLOCK_M, BLOCK_M): + offs_m_curr = start_m + offs_m + # load q, k, v, do on-chip + q = tl.load(q_ptrs) + # recompute p = softmax(qk, dim=-1).T + # NOTE: `do` is pre-divided by `l`; no normalization here + qk = tl.dot(q, k, trans_b=True) + qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) + m = tl.load(m_ptrs + offs_m_curr) + p = tl.exp(qk * sm_scale - m[:, None]) + # compute dv + do = tl.load(do_ptrs) + dv += tl.dot(p.to(do.dtype), do, trans_a=True) + # compute dp = dot(v, do) + Di = tl.load(D_ptrs + offs_m_curr) + dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] + dp += tl.dot(do, v, trans_b=True) + # compute ds = p * (dp - delta[:, None]) + ds = p * dp * sm_scale + # compute dk = dot(ds.T, q) + dk += tl.dot(ds.to(q.dtype), q, trans_a=True) + # # compute dq + dq = tl.load(dq_ptrs, eviction_policy="evict_last") + dq += tl.dot(ds.to(k.dtype), k) + tl.store(dq_ptrs, dq, eviction_policy="evict_last") + # # increment pointers + dq_ptrs += BLOCK_M * stride_qm + q_ptrs += BLOCK_M * stride_qm + do_ptrs += BLOCK_M * stride_qm + # write-back + dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk) + dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) + tl.store(dv_ptrs, dv) + tl.store(dk_ptrs, dk) + + +class 
_attention(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, sm_scale): + BLOCK = 128 + # shape constraints + Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] + assert Lq == Lk and Lk == Lv + assert Lk in {16, 32, 64, 128} + o = torch.empty_like(q) + grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1]) + tmp = torch.empty( + (q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32 + ) + L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) + m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) + num_warps = 4 if Lk <= 64 else 8 + + _fwd_kernel[grid]( + q, + k, + v, + sm_scale, + tmp, + L, + m, + o, + q.stride(0), + q.stride(1), + q.stride(2), + q.stride(3), + k.stride(0), + k.stride(1), + k.stride(2), + k.stride(3), + v.stride(0), + v.stride(1), + v.stride(2), + v.stride(3), + o.stride(0), + o.stride(1), + o.stride(2), + o.stride(3), + q.shape[0], + q.shape[1], + q.shape[2], + BLOCK_M=BLOCK, + BLOCK_N=BLOCK, + BLOCK_DMODEL=Lk, + num_warps=num_warps, + num_stages=1, + ) + ctx.save_for_backward(q, k, v, o, L, m) + ctx.BLOCK = BLOCK + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = Lk + return o + + @staticmethod + def backward(ctx, do): + q, k, v, o, l, m = ctx.saved_tensors + do = do.contiguous() + dq = torch.zeros_like(q, dtype=torch.float32) + dk = torch.empty_like(k) + dv = torch.empty_like(v) + do_scaled = torch.empty_like(do) + delta = torch.empty_like(l) + _bwd_preprocess[(ctx.grid[0] * ctx.grid[1],)]( + o, + do, + l, + do_scaled, + delta, + BLOCK_M=ctx.BLOCK, + D_HEAD=ctx.BLOCK_DMODEL, + ) + + # NOTE: kernel currently buggy for other values of `num_warps` + num_warps = 8 + _bwd_kernel[(ctx.grid[1],)]( + q, + k, + v, + ctx.sm_scale, + o, + do_scaled, + dq, + dk, + dv, + l, + m, + delta, + q.stride(0), + q.stride(1), + q.stride(2), + q.stride(3), + k.stride(0), + k.stride(1), + k.stride(2), + k.stride(3), + v.stride(0), + v.stride(1), + v.stride(2), + v.stride(3), + q.shape[0], + q.shape[1], + q.shape[2], + ctx.grid[0], + BLOCK_M=ctx.BLOCK, + BLOCK_N=ctx.BLOCK, + BLOCK_DMODEL=ctx.BLOCK_DMODEL, + num_warps=num_warps, + num_stages=1, + ) + return dq.to(q.dtype), dk, dv, None + + +attention = _attention.apply diff --git a/flash_blocksparse_attention.py b/flash_blocksparse_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..03798d16ffbb3cbf1806296d5b33f81360717315 --- /dev/null +++ b/flash_blocksparse_attention.py @@ -0,0 +1,197 @@ +import math + +import hydra +import torch +import torch.nn as nn +from einops import rearrange + +from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input +from flash_attn.flash_blocksparse_attn_interface import ( + convert_blockmask, + flash_blocksparse_attn_func, +) + + +class FlashBlocksparseAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_temp: The temperature to use for the softmax attention. 
+ (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.1) + """ + + def __init__( + self, + sparsity_config, + softmax_temp=None, + attention_dropout=0.0, + max_seq_length=2048, + device=None, + dtype=None, + ): + super().__init__() + self.sparsity_config = hydra.utils.instantiate(sparsity_config) + self.softmax_temp = softmax_temp + self.dropout_p = attention_dropout + + # initialize sparse layout and register as buffer + max_seq_length = ((max_seq_length + 256 - 1) // 256) * 256 + layout = self.sparsity_config.make_layout(max_seq_length) + self.register_buffer("layout", layout) + blockmask_converted = convert_blockmask(self.layout, causal=False) + self.register_buffer("blockmask_converted", blockmask_converted) + # logger.info(f'Attention class {self.__class__}: saving={self.layout.float().mean()}') + + def forward( + self, + qkv, + attn_mask=None, + key_padding_mask=None, + causal=False, + cu_seqlens=None, + max_s=None, + need_weights=False, + convert_mask=True, + ): + """Implements the multihead softmax attention. + Arguments + --------- + qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None + attn_mask: An implementation of BaseMask that encodes where each + query can attend to + key_padding_mask: An implementation of BaseMask that encodes how + many query each sequence in the batch consists of + """ + assert not need_weights + assert attn_mask is None + assert qkv.dtype == torch.float16 + assert qkv.is_cuda + + if cu_seqlens is None: + batch_size = qkv.shape[0] + seqlen = qkv.shape[1] + # Convert mask to take a subset + seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256 + assert seqlen_rounded // 16 <= self.layout.shape[0], ( + seqlen_rounded // 256 <= self.layout.shape[1] + ) + blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256] + if key_padding_mask is None: + qkv = rearrange(qkv, "b s ... -> (b s) ...") + max_s = seqlen + cu_seqlens = torch.arange( + 0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=qkv.device + ) + output = flash_blocksparse_attn_func( + qkv, + cu_seqlens, + blockmask, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + ) + output = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) + else: + key_padding_mask_bool = key_padding_mask.bool_matrix + nheads = qkv.shape[-2] + x = rearrange(qkv, "b s three h d -> b s (three h d)") + x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask_bool) + x_unpad = rearrange(x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads) + output_unpad = flash_blocksparse_attn_func( + x_unpad, + cu_seqlens, + blockmask, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + ) + output = rearrange( + pad_input( + rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, batch_size, seqlen + ), + "b s (h d) -> b s h d", + h=nheads, + ) + else: + assert max_s is not None + seqlen = max_s + # Convert mask to take a subset + seqlen_rounded = ((seqlen + 256 - 1) // 256) * 256 + assert seqlen_rounded // 16 <= self.layout.shape[0], ( + seqlen_rounded // 256 <= self.layout.shape[1] + ) + blockmask = self.layout[: seqlen_rounded // 16, : seqlen_rounded // 256] + if convert_mask: + output = flash_blocksparse_attn_func( + qkv, + cu_seqlens, + blockmask, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + ) + else: + output = flash_blocksparse_attn_func( + qkv, + cu_seqlens, + self.blockmask_converted, + self.dropout_p if self.training else 0.0, + max_s, + softmax_scale=self.softmax_temp, + causal=causal, + convert_mask=False, + ) + + return output, None + + +class FlashBlocksparseMHA(nn.Module): + def __init__( + self, + embed_dim, + num_heads, + sparsity_config, + bias=True, + batch_first=True, + attention_dropout=0.0, + causal=False, + max_seq_length=2048, + device=None, + dtype=None, + **kwargs, + ) -> None: + assert batch_first + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.causal = causal + + self.num_heads = num_heads + assert self.embed_dim % num_heads == 0, "self.kdim must be divisible by num_heads" + self.head_dim = self.embed_dim // num_heads + assert self.head_dim in [16, 32, 64], "Only support head_dim == 16, 32, or 64" + + self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs) + self.inner_attn = FlashBlocksparseAttention( + sparsity_config, + attention_dropout=attention_dropout, + max_seq_length=max_seq_length, + **factory_kwargs, + ) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, **factory_kwargs) + + def forward( + self, x, x_ignored_, x_ignored_1_, attn_mask=None, key_padding_mask=None, need_weights=False + ): + qkv = self.Wqkv(x) + qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads) + context, attn_weights = self.inner_attn( + qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=self.causal + ) + return self.out_proj(rearrange(context, "b s h d -> b s (h d)")), attn_weights diff --git a/flash_blocksparse_attn_interface.py b/flash_blocksparse_attn_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..9ce3fe8c1344dd33165c43e4cc1ef0f70feb5d04 --- /dev/null +++ b/flash_blocksparse_attn_interface.py @@ -0,0 +1,200 @@ +# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/fmha.py +import flash_attn_cuda +import torch +import torch.nn as nn + + +def convert_blockmask(blockmask, causal): + """Convert from the 0-1 format to the format used by the CUDA code. + 0 means the block is skipped. + nonzero means the block is not skipped. 
+ Argument: + blockmask: (row, col): a 0-1 tensor + Return: + blockmask_converted: (col, row), dtype torch.int32: for each column, it contains the row + indices of the nonzero blocks, padded with -1 to reach length @row. + The indices are multiplied by 4, with the smallest bit used to encode whether + it is the first nonzero in its row, and the 2nd smallest bit to encode whether it is + the last nonzero in its row.. + """ + assert not causal + # TD [2022-05-13]: The indexing and sorting is very tricky + nrow, ncol = blockmask.shape + # Sort does not support bool on CUDA + blockmask = blockmask.to(dtype=torch.uint8) + nonzero_val, nonzero_sorted_rowidx = blockmask.sort(dim=0, stable=True, descending=True) + nonzero_unsorted_rowidx = nonzero_sorted_rowidx.argsort(dim=0) + last_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True).indices[:, -1] + last_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[ + torch.arange(nrow, device=blockmask.device), last_nonzero_col_per_row + ] + first_nonzero_col_per_row = blockmask.sort(dim=-1, stable=True, descending=True).indices[:, 0] + first_nonzero_col_per_row_after_sort = nonzero_unsorted_rowidx[ + torch.arange(nrow, device=blockmask.device), first_nonzero_col_per_row + ] + nonzero_idx = nonzero_sorted_rowidx * 4 + nonzero_idx[last_nonzero_col_per_row_after_sort, last_nonzero_col_per_row] += 2 + nonzero_idx[first_nonzero_col_per_row_after_sort, first_nonzero_col_per_row] += 1 + nonzero_idx[nonzero_val == 0] = -1 + return nonzero_idx.T.contiguous().to(dtype=torch.int32) + + +def _flash_blocksparse_attn_forward( + qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax +): + context, softmax_lse, *rest = flash_attn_cuda.fwd_block( + qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal, return_softmax, None + ) + # if context.isnan().any() or softmax_lse.isnan().any(): + # breakpoint() + S_dmask = rest[0] if return_softmax else None + return context, softmax_lse, S_dmask + + +def _flash_blocksparse_attn_backward( + dout, + qkv, + out, + S_dmask, + softmax_lse, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale, + causal, +): + dqkv, dp, softmax_d = flash_attn_cuda.bwd_block( + dout, + qkv, + out, + S_dmask, + softmax_lse, + cu_seqlens, + blockmask, + dropout_p, + softmax_scale, + max_s, + causal, + None, + ) + # if dqkv.isnan().any() or softmax_d.isnan().any(): + # breakpoint() + return dqkv + + +class FlashBlocksparseAttnFun(torch.autograd.Function): + @staticmethod + def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal): + # Save rng_state because the backward pass will regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward( + qkv, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale, + causal=causal, + return_softmax=False, + ) + ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state) + ctx.dropout_p = dropout_p + ctx.max_s = max_s + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return context + + @staticmethod + def backward(ctx, dout): + qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + # S_dmask is None, temporarily use another tensor just to get it running + dqkv = 
_flash_blocksparse_attn_backward( + dout, + qkv, + context, + context, + softmax_lse, + cu_seqlens, + blockmask, + ctx.dropout_p, + ctx.max_s, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dqkv, None, None, None, None, None, None, None + + +# We duplicate code to return both the output and the softmax for testing +# Returning both makes backward a bit slower, so we want to keep using the other version for speed. +class FlashBlocksparseAttnFunWithS(torch.autograd.Function): + @staticmethod + def forward(ctx, qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal): + # Save rng_state because the backward pass is gonna regenerate the dropout mask + rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None + if softmax_scale is None: + softmax_scale = qkv.shape[-1] ** (-0.5) + context, softmax_lse, S_dmask = _flash_blocksparse_attn_forward( + qkv, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale, + causal=causal, + return_softmax=True, + ) + ctx.save_for_backward(qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state) + ctx.dropout_p = dropout_p + ctx.max_s = max_s + ctx.softmax_scale = softmax_scale + ctx.causal = causal + return context, S_dmask, softmax_lse + + @staticmethod + def backward(ctx, dout, _dS_dmask_ignored, _dsoftmax_sum_ignored): + qkv, context, S_dmask, softmax_lse, cu_seqlens, blockmask, rng_state = ctx.saved_tensors + if rng_state is not None: + cur_rng_state = torch.cuda.get_rng_state() + torch.cuda.set_rng_state(rng_state) + dqkv = _flash_blocksparse_attn_backward( + dout, + qkv, + context, + S_dmask, + softmax_lse, + cu_seqlens, + blockmask, + ctx.dropout_p, + ctx.max_s, + ctx.softmax_scale, + ctx.causal, + ) + if rng_state is not None: + torch.cuda.set_rng_state(cur_rng_state) + return dqkv, None, None, None, None, None, None + + +def flash_blocksparse_attn_func( + qkv, + cu_seqlens, + blockmask, + dropout_p, + max_s, + softmax_scale=None, + causal=False, + return_attn_probs=False, + convert_mask=True, +): + """dropout_p should be set to 0.0 during evaluation""" + func = FlashBlocksparseAttnFun if not return_attn_probs else FlashBlocksparseAttnFunWithS + if convert_mask: + blockmask = convert_blockmask(blockmask, causal=causal) + return func.apply(qkv, cu_seqlens, blockmask, dropout_p, max_s, softmax_scale, causal) diff --git a/flash_bwd_hdim128_bf16_causal_sm80.cu b/flash_bwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..13132e86da365c8c2e9b41fe7b35bb7f73be3cea --- /dev/null +++ b/flash_bwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_bf16_sm80.cu b/flash_bwd_hdim128_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..85a5dc88e164862ce420c643999513eeb310049b --- /dev/null +++ b/flash_bwd_hdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_bf16_sm90.cu b/flash_bwd_hdim128_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..372d0ed52fec3b6967207ed503c1fb769bd55711 --- /dev/null +++ b/flash_bwd_hdim128_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_fp16_causal_sm80.cu b/flash_bwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5d27cd97b2b85b8512c3c7c48dd178f10875331b --- /dev/null +++ b/flash_bwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_fp16_sm80.cu b/flash_bwd_hdim128_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..2d7ddf46b5739473fcf8db6a350144fc2632cd96 --- /dev/null +++ b/flash_bwd_hdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim128_fp16_sm90.cu b/flash_bwd_hdim128_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..01a1d469fc10eff9757e473a475177868c8d0241 --- /dev/null +++ b/flash_bwd_hdim128_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim128(params, stream); +} diff --git a/flash_bwd_hdim160_bf16_causal_sm80.cu b/flash_bwd_hdim160_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c18a78c764ce2c32fbd9998fe8a083b5aeb165da --- /dev/null +++ b/flash_bwd_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim160_bf16_sm80.cu b/flash_bwd_hdim160_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b6173725e918cace475713da0bd505708ef00f7 --- /dev/null +++ b/flash_bwd_hdim160_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim160_fp16_causal_sm80.cu b/flash_bwd_hdim160_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a511162dc0bc7d536ca235c23982b3c39f4101b4 --- /dev/null +++ b/flash_bwd_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim160_fp16_sm80.cu b/flash_bwd_hdim160_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c9ce19acbbe5403589dfa25027bade9762567fe5 --- /dev/null +++ b/flash_bwd_hdim160_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim160(params, stream); +} diff --git a/flash_bwd_hdim192_bf16_causal_sm80.cu b/flash_bwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f492a717156550ddec40bda6bfe4b52d13109011 --- /dev/null +++ b/flash_bwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim192_bf16_sm80.cu b/flash_bwd_hdim192_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..2df58daa2a73c9f87225b68f288cb44a1f3a1b0c --- /dev/null +++ b/flash_bwd_hdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim192_fp16_causal_sm80.cu b/flash_bwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..69cad5ae4538942012a13c4777cd83e172eb7c6a --- /dev/null +++ b/flash_bwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim192_fp16_sm80.cu b/flash_bwd_hdim192_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..3d4cab58bc840135485632fd4970b6da2baae84d --- /dev/null +++ b/flash_bwd_hdim192_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. 
+// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim192(params, stream); +} diff --git a/flash_bwd_hdim224_bf16_sm80.cu b/flash_bwd_hdim224_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2b58e2abc99a8921f4ce53dbea84a2ff078229c --- /dev/null +++ b/flash_bwd_hdim224_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim224(params, stream); +} diff --git a/flash_bwd_hdim224_fp16_sm80.cu b/flash_bwd_hdim224_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e65cdaedeae1586ef2309c542948970ea3b5b59d --- /dev/null +++ b/flash_bwd_hdim224_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim224(params, stream); +} diff --git a/flash_bwd_hdim256_bf16_causal_sm80.cu b/flash_bwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..6927445974973efc835fc68357d994e13d55a468 --- /dev/null +++ b/flash_bwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_bf16_sm80.cu b/flash_bwd_hdim256_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d718ec88be5a7dd0d9af069b8ddd8f2b24ad3089 --- /dev/null +++ b/flash_bwd_hdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_fp16_causal_sm80.cu b/flash_bwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..551c695e0559809daedecc94da62114dcbe69835 --- /dev/null +++ b/flash_bwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_fp16_sm80.cu b/flash_bwd_hdim256_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5877002631b497344106ff76506b45aeb8292cf --- /dev/null +++ b/flash_bwd_hdim256_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim256_fp16_sm90.cu b/flash_bwd_hdim256_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..ee139bd2e5431aaa8314a0b61bd6c803f12842dd --- /dev/null +++ b/flash_bwd_hdim256_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim256(params, stream); +} diff --git a/flash_bwd_hdim32_bf16_causal_sm80.cu b/flash_bwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1282939a059c1e5e3f1f9a6297413e289f8456cc --- /dev/null +++ b/flash_bwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim32_bf16_sm80.cu b/flash_bwd_hdim32_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d6d4036383f50ffbb3c3cc0aaea44be479686b0f --- /dev/null +++ b/flash_bwd_hdim32_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim32_fp16_causal_sm80.cu b/flash_bwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..60aa2d60b3b713931af611f6922467aab5333b1f --- /dev/null +++ b/flash_bwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim32_fp16_sm80.cu b/flash_bwd_hdim32_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..b06d50eaa8d6f35277c88ad1c32e41dad2669be1 --- /dev/null +++ b/flash_bwd_hdim32_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim32(params, stream); +} diff --git a/flash_bwd_hdim64_bf16_causal_sm80.cu b/flash_bwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..52b93be9d4f4aa95123f155b217fd65d56875b20 --- /dev/null +++ b/flash_bwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_bf16_sm80.cu b/flash_bwd_hdim64_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..09d9e2b75cefb6a1cbb5a0150aa5b47168d91548 --- /dev/null +++ b/flash_bwd_hdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_bf16_sm90.cu b/flash_bwd_hdim64_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..587e6b944252643d7e7033cf5fd582949525551b --- /dev/null +++ b/flash_bwd_hdim64_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_fp16_causal_sm80.cu b/flash_bwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a4ea5f465e04105c7e9d55e968aee735a5727fd --- /dev/null +++ b/flash_bwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_fp16_sm80.cu b/flash_bwd_hdim64_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..fb115ff767908d601ecab113c139ef095a48792c --- /dev/null +++ b/flash_bwd_hdim64_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim64_fp16_sm90.cu b/flash_bwd_hdim64_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..f486e870f3d1b4434d753fe01ce9800ffcd1d951 --- /dev/null +++ b/flash_bwd_hdim64_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim64(params, stream); +} diff --git a/flash_bwd_hdim96_bf16_causal_sm80.cu b/flash_bwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5f4c26a476c785edbe88f9824ff970494b60530e --- /dev/null +++ b/flash_bwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_bf16_sm80.cu b/flash_bwd_hdim96_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..224213d79f796ad9095ff6d83147cd231ecf3594 --- /dev/null +++ b/flash_bwd_hdim96_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_bf16_sm90.cu b/flash_bwd_hdim96_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..19f5582b2755f0ba5d2f34bbbec9a5e724d6b8aa --- /dev/null +++ b/flash_bwd_hdim96_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_fp16_causal_sm80.cu b/flash_bwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d0349014f92075428286a7e2ac4371e13a96a358 --- /dev/null +++ b/flash_bwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_fp16_sm80.cu b/flash_bwd_hdim96_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..663fc85924e5cd4ce2639964f9dd9e25e2e2d4ce --- /dev/null +++ b/flash_bwd_hdim96_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_hdim96_fp16_sm90.cu b/flash_bwd_hdim96_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..0952ee97144a11de98faea6926834ba978017c6e --- /dev/null +++ b/flash_bwd_hdim96_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
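// A note on the per-head-dimension translation units added by this patch: each file holds a
// single explicit specialization of run_mha_bwd_ so nvcc can build the head dimensions in
// parallel. The template argument lists (and the '&' of '&params', which shows up here as
// '¶ms') have been mangled in this rendering of the diff; for this file the intended content
// is presumably along these lines:
//
//   template<>
//   void run_mha_bwd_<cutlass::half_t, 96>(Flash_bwd_params &params, cudaStream_t stream) {
//       run_mha_bwd_hdim96<cutlass::half_t>(params, stream);
//   }
//
// with the dtype / head-dim pair varying per file, and the *_causal_sm80.cu variants appearing
// to carry an extra boolean template argument for the causal case.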
+ +#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_(Flash_bwd_params ¶ms, cudaStream_t stream) { + run_mha_bwd_hdim96(params, stream); +} diff --git a/flash_bwd_kernel.h b/flash_bwd_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f4ba0ff47207a22189961f16710d4532e2cbb436 --- /dev/null +++ b/flash_bwd_kernel.h @@ -0,0 +1,308 @@ + +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "utils.h" +#include "tile_scheduler_bwd.hpp" +#include "mainloop_bwd_sm90_tma_gmma_ws.hpp" +#include "epilogue_bwd_sm90_tma.hpp" + +namespace flash { + +using namespace cute; + +template +class FlashAttnBwd { + +public: + + // Type Aliases + static constexpr bool Is_causal = CollectiveMainloop_::Is_causal; + static_assert(CollectiveMainloop_::Varlen == CollectiveEpilogue_::Varlen); + static constexpr bool Varlen = CollectiveMainloop_::Varlen; + + // Mainloop derived types + using CollectiveMainloop = CollectiveMainloop_; + using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK; + using TiledMmaSdP = typename CollectiveMainloop::TiledMmaSdP; + using TiledMmadKV = typename CollectiveMainloop::TiledMmadKV; + using ArchTag = typename CollectiveMainloop::ArchTag; + using ClusterShape = typename CollectiveMainloop::ClusterShape; + using MainloopArguments = typename CollectiveMainloop::Arguments; + using MainloopParams = typename CollectiveMainloop::Params; + static constexpr bool dKV_swapAB = CollectiveMainloop::dKV_swapAB; + + // Epilogue derived types + using CollectiveEpilogue = CollectiveEpilogue_; + using EpilogueArguments = typename CollectiveEpilogue::Arguments; + using EpilogueParams = typename CollectiveEpilogue::Params; + + static_assert(ArchTag::kMinComputeCapability >= 90); + + using TileScheduler = TileScheduler_; + using TileSchedulerArguments = typename TileScheduler::Arguments; + using TileSchedulerParams = typename TileScheduler::Params; + + static constexpr uint32_t NumLoadWarpGroups = 1; + static constexpr uint32_t NumMmaWarpGroups = CUTE_STATIC_V(size(TiledMmaSdP{})) / cutlass::NumThreadsPerWarpGroup; + static constexpr uint32_t MaxThreadsPerBlock = CUTE_STATIC_V(size(TiledMmaSdP{})) + (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup); + static constexpr uint32_t MinBlocksPerMultiprocessor = 1; + static_assert(NumMmaWarpGroups == 2); + + /// Register requirement for Load and Math WGs + static constexpr uint32_t LoadRegisterRequirement = 24; + static constexpr uint32_t MmaRegisterRequirement = 240; + // If you want to print from the producer warp, you'd need to increase the number of registers + // Otherwise you'll get CUDA error. 
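// These register targets drive Hopper's warp-group register reallocation: the single producer
// (TMA load) warp group hands back most of its registers and the two MMA warp groups grow
// theirs. The template arguments on the CUTLASS helpers are lost in this rendering; in
// operator() below the calls are presumably of the form
//
//   cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();   // producer warp group
//   cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();      // each MMA warp group
//
// With 128 producer threads at 24 registers and 256 consumer threads at 240 registers, the CTA
// uses 128*24 + 256*240 = 64512 of the 65536 registers available per SM, which is why printing
// from the producer warp (which needs more registers) requires bumping these numbers.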
+ // static constexpr uint32_t LoadRegisterRequirement = 56; + // static constexpr uint32_t MmaRegisterRequirement = 224; + + // Kernel level shared memory storage + struct SharedStorage { + struct { + union { + typename CollectiveMainloop::TensorStorage mainloop; + typename CollectiveEpilogue::TensorStorage epilogue; + }; + }; + + struct { + alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_KV; + alignas(16) cutlass::arch::ClusterBarrier barrier_dKV; + alignas(16) typename CollectiveMainloop::MainloopPipeline::SharedStorage pipeline_q; + alignas(16) typename CollectiveMainloop::MainloopPipeline::SharedStorage pipeline_do; + alignas(16) typename TileScheduler::SharedStorage smem_scheduler; + }; + + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + // Device side arguments + struct Arguments { + MainloopArguments mainloop{}; + EpilogueArguments epilogue{}; + cutlass::KernelHardwareInfo hw_info{}; + TileSchedulerArguments scheduler{}; + }; + + // Kernel entry point API + struct Params { + MainloopParams mainloop{}; + EpilogueParams epilogue{}; + cutlass::KernelHardwareInfo hw_info{}; + TileSchedulerParams scheduler{}; + }; + + // + // Methods + // + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args) { + CUTLASS_TRACE_HOST("to_underlying_arguments():"); + + // Get SM count if needed, otherwise use user supplied SM count + int sm_count = args.hw_info.sm_count; + if (sm_count <= 0) { + CUTLASS_TRACE_HOST(" WARNING: Arguments do not include a valid SM count.\n" + " For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count."); + sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id); + } + + CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count); + + cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count}; + return { + CollectiveMainloop::to_underlying_arguments(args.mainloop), + CollectiveEpilogue::to_underlying_arguments(args.epilogue), + hw_info, + TileScheduler::to_underlying_arguments(args.scheduler) + }; + } + + // Computes the kernel launch grid shape based on runtime parameters + static dim3 + get_grid_shape(Params const& params) { + return TileScheduler::get_grid_shape(params.scheduler, params.hw_info.sm_count); + } + + static dim3 + get_block_shape() { + return dim3(MaxThreadsPerBlock, 1, 1); + } + + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + + static constexpr int NumMmaThreads = NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup; + static constexpr int NumCopyThreads = NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup; + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + + using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(params.mainloop); + CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue); + } + + // Obtain warp index + int const warp_group_thread_idx = threadIdx.x % 
cutlass::NumThreadsPerWarpGroup; + + PipelineParams pipeline_params; + pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytesQ + CollectiveMainloop::TmaTransactionBytesLSE; + int warp_group_idx = cutlass::canonical_warp_group_idx(); + pipeline_params.role = warp_group_idx == 0 + ? MainloopPipeline::ThreadCategory::Producer + : MainloopPipeline::ThreadCategory::Consumer; + pipeline_params.is_leader = warp_group_thread_idx == 0; + pipeline_params.num_consumers = NumMmaThreads; + + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_KV.init(1 /*numThreads*/); + // shared_storage.barrier_dKV.init(size(ClusterShape{}) /*numThreads*/); + } + // We're counting on pipeline_q to call cutlass::arch::fence_barrier_init(); + MainloopPipeline pipeline_q(shared_storage.pipeline_q, pipeline_params, ClusterShape{}); + MainloopPipeline pipeline_do(shared_storage.pipeline_do, pipeline_params, ClusterShape{}); + + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue; + + // We need this to guarantee that the Pipeline init is visible to all producers and consumer blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + cute::cluster_wait(); + } else { + __syncthreads(); + } + + if (warp_group_idx == 0) { // Producer + cutlass::arch::warpgroup_reg_dealloc(); + + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (warp_idx_in_warpgroup == 0) { // Load K, V, and do TMA on Q and dO + PipelineState smem_pipe_write = cutlass::make_producer_start_state(); + + int work_idx = 0; + + TileScheduler scheduler(reinterpret_cast(&shared_storage.smem_scheduler)); + for (auto work_tile_info = scheduler.template get_initial_work(params.scheduler); + work_tile_info.is_valid(params.scheduler); + work_tile_info = scheduler.template get_next_work(params.scheduler, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(params.scheduler); + auto [n_block, bidh, bidb] = block_coord; + if constexpr (Varlen) { + if (n_block * kBlockM >= collective_mainloop.get_seqlen_k(params.mainloop, bidb)) { + scheduler.prefetch_next_work(params.scheduler, work_tile_info); + continue; + } + } + if constexpr (Is_causal) { + int const m_block_min = collective_mainloop.get_m_block_min(params.mainloop, n_block, bidb); + int const m_block_max = cute::ceil_div(collective_mainloop.get_seqlen_q(params.mainloop, bidb), kBlockM); + if (m_block_min >= m_block_max) { + scheduler.prefetch_next_work(params.scheduler, work_tile_info); + continue; + } + } + auto scheduler_prefetch = [&scheduler, ¶ms, &work_tile_info]() { + scheduler.prefetch_next_work(params.scheduler, work_tile_info); + }; + collective_mainloop.load(params.mainloop, pipeline_q, pipeline_do, smem_pipe_write, + shared_storage, scheduler_prefetch, block_coord, work_idx); + ++work_idx; + } + collective_mainloop.load_tail(pipeline_q, pipeline_do, smem_pipe_write); + } else if (warp_idx_in_warpgroup == 1) { + TileScheduler scheduler(reinterpret_cast(&shared_storage.smem_scheduler)); + for (auto work_tile_info = scheduler.template get_initial_work(params.scheduler); + work_tile_info.is_valid(params.scheduler); + work_tile_info = scheduler.template get_next_work(params.scheduler, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(params.scheduler); + auto [n_block, bidh, bidb] = block_coord; + if constexpr (Varlen) { + if (n_block * kBlockM >= collective_mainloop.get_seqlen_k(params.mainloop, bidb)) { continue; } + } + if constexpr 
(Is_causal) { + int const m_block_min = collective_mainloop.get_m_block_min(params.mainloop, n_block, bidb); + int const m_block_max = cute::ceil_div(collective_mainloop.get_seqlen_q(params.mainloop, bidb), kBlockM); + if (m_block_min >= m_block_max) { continue; } + } + collective_mainloop.store_dq(params.mainloop, shared_storage, block_coord); + } + } + } else { // Consumer + cutlass::arch::warpgroup_reg_alloc(); + + TileScheduler scheduler(reinterpret_cast(&shared_storage.smem_scheduler)); + // Initialize matmul objects. + TiledMmadKV tiled_mma_dKV; + + PipelineState smem_pipe_read; + + collective_mainloop.mma_init(); + scheduler.init_consumer(); + + int work_idx = 0; + CUTLASS_PRAGMA_NO_UNROLL + for (auto work_tile_info = scheduler.template get_initial_work(params.scheduler); + work_tile_info.is_valid(params.scheduler); + work_tile_info = scheduler.template get_next_work(params.scheduler, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(params.scheduler); + auto [n_block, bidh, bidb] = block_coord; + if constexpr (Varlen) { + if (n_block * kBlockM >= collective_mainloop.get_seqlen_k(params.mainloop, bidb)) { continue; } + } + if constexpr (Is_causal) { + int const m_block_min = collective_mainloop.get_m_block_min(params.mainloop, n_block, bidb); + int const m_block_max = cute::ceil_div(collective_mainloop.get_seqlen_q(params.mainloop, bidb), kBlockM); + if (m_block_min >= m_block_max) { // We exit early and write 0 to dK and dV + collective_epilogue.store_zero(params.epilogue, threadIdx.x - NumCopyThreads, block_coord); + continue; + } + } + + // dK and dV output accumulator. + Tensor tdKrdK = partition_fragment_C(tiled_mma_dKV, select(TileShape_MNK{})); + Tensor tdVrdV = partition_fragment_C(tiled_mma_dKV, select(TileShape_MNK{})); + collective_mainloop.mma(params.mainloop, pipeline_q, pipeline_do, smem_pipe_read, + tdKrdK, tdVrdV, threadIdx.x - NumCopyThreads, work_idx, block_coord, shared_storage); + collective_epilogue.store(params.epilogue, tdKrdK, tdVrdV, shared_storage, tiled_mma_dKV, + threadIdx.x - NumCopyThreads, block_coord); + + ++work_idx; + } + collective_epilogue.store_tail(); + } + + } + +}; + +} // namespace flash diff --git a/flash_bwd_launch_template.h b/flash_bwd_launch_template.h new file mode 100644 index 0000000000000000000000000000000000000000..268d835cea82fe8ff0746d2f5ff97e2b102e7cd7 --- /dev/null +++ b/flash_bwd_launch_template.h @@ -0,0 +1,200 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include "cutlass/cluster_launch.hpp" +#include "cutlass/device_kernel.h" // For device_kernel + +#include "static_switch.h" +#include "flash.h" +#include "flash_bwd_preprocess_kernel.h" +#include "flash_bwd_postprocess_kernel.h" +#include "tile_scheduler_bwd.hpp" +#include "mainloop_bwd_sm90_tma_gmma_ws.hpp" +#include "epilogue_bwd_sm90_tma.hpp" +#include "flash_bwd_kernel.h" + +using namespace cute; + +template +void run_flash_bwd(Flash_bwd_params ¶ms, cudaStream_t stream) { + using TileShape_MK = cute::Shape, Int>; + using ElementAccum = float; + using PreprocessKernel = flash::FlashAttnBwdPreprocess; + int const total_q_padded_rounded = cute::round_up(params.total_q + params.b * 128, 128); + typename PreprocessKernel::Arguments preprocess_args { + static_cast(params.o_ptr), + {!Varlen ? params.seqlen_q : params.total_q, params.d, params.h, !Varlen ? params.b : 1}, // shape_O + {params.o_row_stride, _1{}, params.o_head_stride, !Varlen ? params.o_batch_stride : 0}, // stride_O + static_cast(params.do_ptr), + {params.do_row_stride, _1{}, params.do_head_stride, !Varlen ? params.do_batch_stride : 0}, // stride_dO + static_cast(params.dsoftmax_sum), + {!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, params.h, !Varlen ? params.b : 1}, // shape_dPsum + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_dPsum + static_cast(params.softmax_lse_ptr), + {_1{}, !Varlen ? params.seqlen_q : params.total_q, !Varlen ? params.h * params.seqlen_q : 0}, // stride_LSE + static_cast(params.softmax_lse_log2_ptr), + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_LSE_log2 + static_cast(params.dq_accum_ptr), + {!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, params.d_rounded, params.h, !Varlen ? params.b : 1}, // shape_dQaccum + {params.d_rounded, _1{}, params.d_rounded * (!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded), !Varlen ? params.d_rounded * params.seqlen_q_rounded * params.h : 0}, // stride_dQ + params.b, + params.dq_semaphore, + params.cu_seqlens_q + }; + typename PreprocessKernel::Params preprocess_params = PreprocessKernel::to_underlying_arguments(preprocess_args); + int num_m_block = cute::ceil_div(params.seqlen_q, kBlockM); + dim3 grid_m(num_m_block, params.h, params.b); + cutlass::device_kernel<<>>(preprocess_params); + + using TileShape_MNK = cute::Shape, Int, Int>; + using ClusterShape = cute::Shape<_1, Int<1>, _1>; + static constexpr int Stages = 2; + using CollectiveMainloop = flash::CollectiveMainloopBwd; + using CollectiveEpilogue = flash::CollectiveEpilogueBwd; + using Scheduler = flash::SingleTileSchedulerBwd; + using AttnKernel = flash::FlashAttnBwd; + + typename CollectiveMainloop::Arguments mainloop_args { + static_cast(params.q_ptr), + {!Varlen ? params.seqlen_q : params.total_q, params.d, params.h, !Varlen ? params.b : 1}, // shape_Q + {params.q_row_stride, _1{}, params.q_head_stride, !Varlen ? params.q_batch_stride : 0}, // stride_Q + static_cast(params.k_ptr), + {!Varlen ? params.seqlen_k : params.total_k, params.d, params.h_k, !Varlen ? params.b : 1}, // shape_K + {params.k_row_stride, _1{}, params.k_head_stride, !Varlen ? params.k_batch_stride : 0}, // stride_K + static_cast(params.v_ptr), + {params.v_row_stride, _1{}, params.v_head_stride, !Varlen ? 
params.v_batch_stride : 0}, // stride_V + static_cast(params.do_ptr), + {params.do_row_stride, _1{}, params.do_head_stride, !Varlen ? params.do_batch_stride : 0}, // stride_dO + static_cast(params.dq_accum_ptr), + // {params.seqlen_q_rounded, params.d_rounded, params.h, params.b}, // shape_dQaccum + // {params.d_rounded, _1{}, params.d_rounded * params.seqlen_q_rounded, params.d_rounded * params.seqlen_q_rounded * params.h}, // stride_dQaccum + {(!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded) * (params.d_rounded / 32), 32, params.h, !Varlen ? params.b : 1}, // shape_dQaccum + {32, _1{}, params.d_rounded * (!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded), !Varlen ? params.d_rounded * params.seqlen_q_rounded * params.h : 0}, // stride_dQaccum + static_cast(params.softmax_lse_log2_ptr), + {!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, params.h, !Varlen ? params.b : 1}, // shape_LSE + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_LSE_log2 + static_cast(params.dsoftmax_sum), + {_1{}, !Varlen ? params.seqlen_q_rounded : total_q_padded_rounded, !Varlen ? params.h * params.seqlen_q_rounded : 0}, // stride_dPsum + params.scale_softmax, + params.b, + params.dq_semaphore, + params.cu_seqlens_q, params.cu_seqlens_k, + }; + typename CollectiveEpilogue::Arguments epilogue_args { + static_cast(params.dk_ptr), + {!Varlen ? params.seqlen_k : params.total_k, params.d, params.h, !Varlen ? params.b : 1}, // shape_dK + {params.dk_row_stride, _1{}, params.dk_head_stride, !Varlen ? params.dk_batch_stride : 0}, // stride_dK + static_cast(params.dv_ptr), + {params.dv_row_stride, _1{}, params.dv_head_stride, !Varlen ? params.dv_batch_stride : 0}, + params.cu_seqlens_k + }; + + int num_blocks_n = cutlass::ceil_div(params.seqlen_k, get<1>(TileShape_MNK{})); + num_blocks_n = cutlass::round_up(num_blocks_n, size<1>(ClusterShape{})); + typename Scheduler::Arguments scheduler_args { + num_blocks_n, params.h, params.b, params.tile_count_semaphore, params.cu_seqlens_k + }; + + int device; + cudaGetDevice(&device); + typename AttnKernel::Params kernel_params = AttnKernel::to_underlying_arguments({ + mainloop_args, epilogue_args, {device}, scheduler_args + }); + + // Get the ptr to kernel function. 
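// At this point run_flash_bwd has already launched PreprocessKernel (dP_sum = rowsum(dO * O),
// LSE converted to log2 scale, optional clearing of dQaccum). The block below launches the main
// warp-specialized backward kernel, which accumulates dQ in fp32 (dQaccum) and writes dK/dV
// directly; the PostprocessKernel launched afterwards converts and rescales dQaccum into the
// final fp16/bf16 dQ. Because AttnKernel::SharedStorageSize can exceed the 48 KB default, the
// kernel has to opt in to a larger dynamic shared-memory limit before the cluster launch.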
+ void const* kernel = (void const*) cutlass::device_kernel; + int smem_size = AttnKernel::SharedStorageSize; + // int smem_size_q = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_q)); + // int smem_size_do = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_do)); + // int smem_size_ds = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_ds)); + // int smem_size_dqacc = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_dqacc)); + // int smem_size_k = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_k)); + // int smem_size_v = sizeof(decltype((typename AttnKernel::SharedStorage{}).mainloop.smem_v)); + // printf("smem_size = %d, q = %d, k = %d, v = %d, do = %d, ds = %d, dqacc = %d\n", smem_size, smem_size_q, smem_size_k, smem_size_v, smem_size_do, smem_size_ds, smem_size_dqacc); + if (smem_size >= 48 * 1024) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + + dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params); + dim3 block_dims = AttnKernel::get_block_shape(); + dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{})); + cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream}; + cutlass::launch_kernel_on_cluster(launch_params, kernel, kernel_params); + CHECK_CUDA_KERNEL_LAUNCH(); + + using PostprocessKernel = flash::FlashAttnBwdPostprocessConvertdQ; + typename PostprocessKernel::Arguments postprocess_args { + static_cast(params.dq_accum_ptr), + // {params.seqlen_q_rounded, params.d_rounded, params.h, params.b}, // shape_dQaccum + // {params.d_rounded, _1{}, params.d_rounded * params.seqlen_q_rounded, params.d_rounded * params.seqlen_q_rounded * params.h}, // stride_dQaccum + {(!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded) * (params.d_rounded / 32), 32, params.h, !Varlen ? params.b : 1}, // shape_dQaccum + {32, _1{}, params.d_rounded * (!Varlen ? params.seqlen_q_rounded : total_q_padded_rounded), !Varlen ? params.d_rounded * params.seqlen_q_rounded * params.h : 0}, // stride_dQaccum + static_cast(params.dq_ptr), + {!Varlen ? params.seqlen_q : params.total_q, params.d, params.h, !Varlen ? params.b : 1}, // shape_dQ + {params.dq_row_stride, _1{}, params.dq_head_stride, params.dq_batch_stride}, // stride_dQ + params.scale_softmax, + params.cu_seqlens_q + }; + typename PostprocessKernel::Params postprocess_params = PostprocessKernel::to_underlying_arguments(postprocess_args); + int num_m_block_postprocess = cute::ceil_div(params.seqlen_q, get<0>(TileShape_MK{})); + dim3 grid_m_postprocess(num_m_block_postprocess, params.h, params.b); + // Get the ptr to kernel function. 
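// The same shared-memory opt-in pattern is used for the dQ postprocess kernel below, with one
// thing worth double-checking: the cudaFuncSetAttribute call is guarded by
// smem_size_postprocess but passes smem_size (the main backward kernel's shared-memory size) as
// the new limit. That is probably benign whenever smem_size >= smem_size_postprocess, but the
// intended argument is presumably smem_size_postprocess.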
+ auto postprocess_kernel = cutlass::device_kernel; + int smem_size_postprocess = PostprocessKernel::SharedStorageSize; + if (smem_size_postprocess >= 48 * 1024) { + CHECK_CUDA(cudaFuncSetAttribute(postprocess_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + postprocess_kernel<<>>(postprocess_params); + CHECK_CUDA_KERNEL_LAUNCH(); + +} + + +template +void run_mha_bwd_hdim64(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.cu_seqlens_q != nullptr || params.cu_seqlens_k != nullptr, Varlen, [&] { + BOOL_SWITCH(params.deterministic, Deterministic, [&] { + run_flash_bwd(params, stream); + }); + }); + }); +} + +template +void run_mha_bwd_hdim96(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 96; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.cu_seqlens_q != nullptr || params.cu_seqlens_k != nullptr, Varlen, [&] { + BOOL_SWITCH(params.deterministic, Deterministic, [&] { + run_flash_bwd(params, stream); + }); + }); + }); +} + +template +void run_mha_bwd_hdim128(Flash_bwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + BOOL_SWITCH(params.cu_seqlens_q != nullptr || params.cu_seqlens_k != nullptr, Varlen, [&] { + BOOL_SWITCH(params.deterministic, Deterministic, [&] { + run_flash_bwd(params, stream); + }); + }); + }); +} diff --git a/flash_bwd_postprocess_kernel.h b/flash_bwd_postprocess_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c3bfc5f5576804ed5c2fc14c9ef2ad748c5f4bc4 --- /dev/null +++ b/flash_bwd_postprocess_kernel.h @@ -0,0 +1,248 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include +#include "cutlass/arch/barrier.h" + +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +class FlashAttnBwdPostprocessConvertdQ { + +public: + + // Type Aliases + using TileShape_MK = TileShape_MK_; + + static constexpr uint32_t MaxThreadsPerBlock = kNThreads; + static constexpr uint32_t MinBlocksPerMultiprocessor = 2; + + static constexpr int kHeadDim = get<1>(TileShape_MK{}); + using R2SLayoutAtomdQaccum = Layout>, Stride<_1>>; + using R2STiledCopydQaccum = decltype(make_tiled_copy(Copy_Atom{}, R2SLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per read + static constexpr int SmemdQaccumSize = size(TileShape_MK{}); + static_assert(size(TileShape_MK{}) == size(SmemLayoutdQaccumTMA{}), "TileShape_MK and SmemLayoutdQaccumTMA must have the same size"); + using SmemLayoutdQaccum = Layout>, Stride<_1>>; + + // We can't just use kHeadDim here. E.g. if MMA shape is 64 x 96 but split across 2 WGs, + // then setting kBlockKSmem to 32 will cause "Static shape_div failure". + // We want to treat it as 64 x 48, so kBlockKSmem should be 16. + static constexpr int MmaShapeN = get<1>(typename TiledMma::AtomShape_MNK{}); + static constexpr int kBlockKSmem = MmaShapeN % 64 == 0 ? 64 : (MmaShapeN % 32 == 0 ? 32 : 16); + static constexpr int kSwizzle = kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 
2 : 1); + using SmemLayoutAtomdQ = + decltype(composition(Swizzle{}, + Layout, Int>, + Stride, _1>>{})); + using SmemLayoutdQ = decltype(tile_to_shape(SmemLayoutAtomdQ{}, TileShape_MK{})); + using SmemLayoutdQt = + decltype(cute::composition(SmemLayoutdQ{}, + make_layout(make_shape(get<1>(TileShape_MK{}), get<0>(TileShape_MK{})), + make_stride(Int(TileShape_MK{})>{}, _1{})))); + + using SmemCopyAtomdQ = Copy_Atom< + std::conditional_t, + Element>; + + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(kHeadDim % kGmemElemsPerLoad == 0, "Headdim must be a multiple of kGmemElemsPerLoad"); + static constexpr int kGmemThreadsPerRow = cutlass::gcd(kHeadDim / kGmemElemsPerLoad, int(MaxThreadsPerBlock)); + static_assert(MaxThreadsPerBlock % kGmemThreadsPerRow == 0, "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>>{})); // Val layout, 8 or 16 vals per load + + using GmemTiledCopydQaccum = cute::SM90_TMA_LOAD; + + struct SharedStorage : cute::aligned_struct<128> { + cute::array_aligned, 1024> smem_dqacc; + cute::array_aligned> smem_dq; + alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_dQaccum; + }; + + static constexpr int SharedStorageSize = sizeof(SharedStorage); + + using ShapedQ = cute::Shape; // (seqlen_q, d, head, batch) + using StridedQ = cute::Stride; + + using TMA_dQaccum = decltype(make_tma_copy( + GmemTiledCopydQaccum{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapedQ{}, StridedQ{}), + SmemLayoutdQaccumTMA{}, + SmemLayoutdQaccumTMA{}.shape(), + _1{})); // no mcast for dQ + + // Device side arguments + struct Arguments { + ElementAccum const* ptr_dQaccum; + ShapedQ const shape_dQaccum; + StridedQ const stride_dQaccum; + Element* ptr_dQ; + ShapedQ const shape_dQ; + StridedQ const stride_dQ; + float const softmax_scale; + int const* cu_seqlens = nullptr; + }; + + // Kernel entry point API + struct Params { + TMA_dQaccum tma_load_dQaccum; + ShapedQ const shape_dQaccum; + Element* ptr_dQ; + ShapedQ const shape_dQ; + StridedQ const stride_dQ; + float const softmax_scale; + int const* cu_seqlens = nullptr; + }; + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. 
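// Besides copying the plain arguments, the conversion below also builds the TMA descriptor used
// to load dQaccum tiles into shared memory (no multicast, since dQ is not replicated across a
// cluster). dQaccum is the fp32 accumulator written by the backward mainloop; this kernel loads
// it, multiplies by softmax_scale, converts to Element (fp16/bf16), and writes dQ out coalesced.
// For varlen batches, dQaccum stores each sequence with an extra 128 rows of padding so that
// block-granular writes for one sequence never spill into the next; e.g. with
// cu_seqlens = {0, 70, 198}, sequence 1 starts at padded row (70 + 1*128) / 128 * 128 = 128
// while sequence 0 occupies rows 0..69 (this example layout is illustrative, not from the patch).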
+ static + Params + to_underlying_arguments(Arguments const& args) { + Tensor mdQaccum = make_tensor(make_gmem_ptr(args.ptr_dQaccum), args.shape_dQaccum, args.stride_dQaccum); + TMA_dQaccum tma_load_dQaccum = make_tma_copy( + GmemTiledCopydQaccum{}, + mdQaccum, + SmemLayoutdQaccumTMA{}, + SmemLayoutdQaccumTMA{}.shape(), + _1{}); // no mcast for dQaccum + return { + tma_load_dQaccum, + args.shape_dQaccum, + args.ptr_dQ, + args.shape_dQ, + args.stride_dQ, + args.softmax_scale, + args.cu_seqlens + }; + } + + CUTLASS_DEVICE + void + operator()(Params const& params, char* smem_buf) { + + static constexpr int kBlockM = get<0>(TileShape_MK{}); + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); + + Tensor sdQaccumTMA = make_tensor(make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccumTMA{}); + // Tensor sdQaccumTMAnoswizzle = make_tensor(make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccumTMANoSwizzle{}); + Tensor sdQaccum = make_tensor(make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccum{}); + Tensor sdQ = make_tensor(make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQ{}); + Tensor sdQt = make_tensor(make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQt{}); + + int const thread_idx = threadIdx.x; + int const m_block = blockIdx.x; + int const bidh = blockIdx.y; + int const bidb = blockIdx.z; + + bool const is_varlen = params.cu_seqlens != nullptr; + int const seqlen = !is_varlen ? get<0>(params.shape_dQ) : params.cu_seqlens[bidb + 1] - params.cu_seqlens[bidb]; + if (is_varlen && m_block * kBlockM >= seqlen) { return; } + + int lane_predicate = cute::elect_one_sync(); + int warp_idx = cutlass::canonical_warp_idx_sync(); + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + cute::prefetch_tma_descriptor(params.tma_load_dQaccum.get_tma_descriptor()); + shared_storage.barrier_dQaccum.init(1 /*numThreads*/); + } + __syncthreads(); + + // Step 1: TMA to load dQaccum from gmem to smem + // We reshaped dQaccum to have last dimension 32, so the offset needs to be multiplied by kHeadDim / 32 + int const offset_padded = !is_varlen ? 0 : ((params.cu_seqlens[bidb] + bidb * 128) / 128 * 128) * (kHeadDim / get<1>(SmemLayoutdQaccumTMA{}.shape())); + Tensor mdQaccum = params.tma_load_dQaccum.get_tma_tensor(params.shape_dQaccum)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdQaccum = local_tile(domain_offset(make_coord(offset_padded, _0{}), mdQaccum), SmemLayoutdQaccumTMA{}.shape(), make_coord(m_block, _0{})); // (M, K) + auto block_tma_dQ = params.tma_load_dQaccum.get_slice(_0{}); + Tensor tdQgdQaccumTMA = block_tma_dQ.partition_D(gdQaccum); // (TMA, TMA_M, TMA_K) + Tensor tdQsdQaccumTMA = block_tma_dQ.partition_S(sdQaccumTMA); // (TMA, TMA_M, TMA_K) + static constexpr uint32_t TmaTransactionBytesdQaccum = static_cast(size(SmemLayoutdQaccumTMA{}) * cute::sizeof_bits_v / 8); + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_dQaccum.arrive_and_expect_tx(TmaTransactionBytesdQaccum); + copy(params.tma_load_dQaccum.with(reinterpret_cast(shared_storage.barrier_dQaccum), 0 /*mcast_mask*/), tdQgdQaccumTMA, tdQsdQaccumTMA); + } + shared_storage.barrier_dQaccum.wait(0); + + // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccumTMA); } + // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccumTMAnoswizzle); } + // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccum); } + + // Step 2: Load dQaccum from smem to register, then convert fp32 -> fp16/bf16 + R2STiledCopydQaccum s2r_tiled_copy_dQaccum; + auto s2r_thr_copy_dQaccum = s2r_tiled_copy_dQaccum.get_thread_slice(thread_idx); + Tensor tdQsdQaccum = s2r_thr_copy_dQaccum.partition_S(sdQaccum); + TiledMma tiled_mma_dQ; + Tensor taccdQrdQaccum = partition_fragment_C(tiled_mma_dQ, select(TileShape_MK{})); + // if (cute::thread0()) { print(tiled_mma_dQ); printf("\n"); } + // if (cute::thread0()) { print(tdQsdQaccum); } + // if (cute::thread0()) { print(taccdQrdQaccum); } + CUTE_STATIC_ASSERT_V(size(taccdQrdQaccum) == size(tdQsdQaccum)); + Tensor tdQrdQaccum = s2r_thr_copy_dQaccum.retile_D(taccdQrdQaccum); + cute::copy(s2r_tiled_copy_dQaccum, tdQsdQaccum, tdQrdQaccum); + #pragma unroll + for (int i = 0; i < size(taccdQrdQaccum); ++i) { taccdQrdQaccum(i) *= params.softmax_scale; } + // Convert tdQrdQ from fp32 to fp16 + Tensor rdQ = flash::convert_type(taccdQrdQaccum); + + // Step 3: Copy dQ from register to smem + auto smem_tiled_copy_dQ = make_tiled_copy_C(SmemCopyAtomdQ{}, tiled_mma_dQ); + auto smem_thr_copy_dQ = smem_tiled_copy_dQ.get_thread_slice(thread_idx); + Tensor taccdQrdQ = smem_thr_copy_dQ.retile_S(rdQ); // ((Atom,AtomNum), MMA_N, MMA_N) + // if (cute::thread0()) { print(smem_tiled_copy_dQ); } + // if (cute::thread0()) { print(smem_thr_copy_dQ); } + // if (cute::thread0()) { print(sdQ); } + if constexpr (!dQ_swapAB) { + Tensor taccdQsdQ = smem_thr_copy_dQ.partition_D(sdQ); // ((Atom,AtomNum),PIPE_M,PIPE_N) + cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQ); + } else { + Tensor taccdQsdQt = smem_thr_copy_dQ.partition_D(sdQt); // ((Atom,AtomNum),PIPE_M,PIPE_N) + cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQt); + } + __syncthreads(); + + // Step 4: Copy dQ from smem to register to prepare for coalesced write to gmem + int const offset = !is_varlen ? 0 : params.cu_seqlens[bidb]; + Tensor mdQ = make_tensor(make_gmem_ptr(params.ptr_dQ), params.shape_dQ, params.stride_dQ)(_, _, bidh, !is_varlen ? 
bidb : 0); + Tensor gdQ = local_tile(domain_offset(make_coord(offset, _0{}), mdQ), TileShape_MK{}, make_coord(m_block, _0{})); // (M, K) + GmemTiledCopy gmem_tiled_copy_dQ; + auto gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_thread_slice(thread_idx); + Tensor tdQsdQ = gmem_thr_copy_dQ.partition_S(sdQ); // ((Atom,AtomNum),ATOM_M,ATOM_N) + Tensor tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ); + + Tensor tdQrdQ = make_fragment_like(tdQsdQ); + cute::copy(gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ); + + // Step 5: Copy dQ from register to gmem + // Construct identity layout for gdQ + Tensor cdQ = cute::make_identity_tensor(TileShape_MK{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(cdQ); + Tensor tdQpdQ = make_tensor(make_shape(size<2>(tdQgdQ))); + #pragma unroll + for (int k = 0; k < size(tdQpdQ); ++k) { tdQpdQ(k) = get<1>(tdQcdQ(_0{}, _0{}, k)) < get<1>(params.shape_dQ); } + // Clear_OOB_K must be false since we don't want to write zeros to gmem + flash::copy( + gmem_tiled_copy_dQ, tdQrdQ, tdQgdQ, tdQcdQ, tdQpdQ, seqlen - m_block * kBlockM + ); + } + +}; + +} // namespace flash diff --git a/flash_bwd_preprocess_kernel.h b/flash_bwd_preprocess_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..d0c185f72f6bf322a91d107671583d6963d4c159 --- /dev/null +++ b/flash_bwd_preprocess_kernel.h @@ -0,0 +1,246 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include + +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +class FlashAttnBwdPreprocess { + +public: + + // Type Aliases + using TileShape_MK = TileShape_MK_; + + static constexpr uint32_t MaxThreadsPerBlock = 256; + static constexpr uint32_t MinBlocksPerMultiprocessor = 2; + static constexpr int SharedStorageSize = 0; + + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(get<1>(TileShape_MK{}) % kGmemElemsPerLoad == 0, "Headdim must be a multiple of kGmemElemsPerLoad"); + static constexpr int kHeadDim = get<1>(TileShape_MK{}); + // We want kBlockKGmem to be a power of 2 so that when we do the summing, + // it's just between threads in the same warp + static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 
64 : 32); + static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad; + static_assert(MaxThreadsPerBlock % kGmemThreadsPerRow == 0, "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>>{})); // Val layout, 8 or 16 vals per load + + static constexpr int kGmemElemsPerLoadAccum = sizeof(cute::uint128_t) / sizeof(ElementAccum); + static_assert(get<1>(TileShape_MK{}) % kGmemElemsPerLoadAccum == 0, "Headdim must be a multiple of kGmemElemsPerLoadAccum"); + static constexpr int kGmemThreadsPerRowAccum = kBlockKGmem / kGmemElemsPerLoadAccum; + static_assert(MaxThreadsPerBlock % kGmemThreadsPerRowAccum == 0, "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRowAccum"); + using GmemLayoutAtomAccum = Layout, Int>, + Stride, _1>>; + using GmemTiledCopyAccum = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomAccum{}, + Layout>>{})); // Val layout, 4 vals per store + + using ShapeO = cute::Shape; // (seqlen_q, d, head, batch) + using StrideO = cute::Stride; + using ShapedPsum = cute::Shape; // (seqlen_q, head, batch) + using StridedPsum = cute::Stride<_1, int64_t, int64_t>; + + // Device side arguments + struct Arguments { + Element const* ptr_O; + ShapeO const shape_O; + StrideO const stride_O; + Element const* ptr_dO; + StrideO const stride_dO; + float* ptr_dPsum; + ShapedPsum const shape_dPsum; + StridedPsum const stride_dPsum; + float const* ptr_LSE; + StridedPsum const stride_LSE; + float *ptr_LSE_log2; + StridedPsum const stride_LSE_log2; + ElementAccum* ptr_dQaccum; + ShapeO const shape_dQaccum; + StrideO const stride_dQaccum; + int num_batch; // We need this to know the size of dq_semaphore in case of varlen + int* dq_semaphore; + int const* cu_seqlens = nullptr; + }; + + // Kernel entry point API + struct Params { + Element const* ptr_O; + ShapeO const shape_O; + StrideO const stride_O; + Element const* ptr_dO; + StrideO const stride_dO; + float* ptr_dPsum; + ShapedPsum const shape_dPsum; + StridedPsum const stride_dPsum; + float const* ptr_LSE; + StridedPsum const stride_LSE; + float* ptr_LSE_log2; + StridedPsum const stride_LSE_log2; + ElementAccum* ptr_dQaccum; + ShapeO const shape_dQaccum; + StrideO const stride_dQaccum; + int num_batch; + int* dq_semaphore; + int const* cu_seqlens = nullptr; + }; + + // Convert to underlying arguments. In this case, a simple copy for the aliased type. + static + Params + to_underlying_arguments(Arguments const& args) { + return { + args.ptr_O, + args.shape_O, + args.stride_O, + args.ptr_dO, + args.stride_dO, + args.ptr_dPsum, + args.shape_dPsum, + args.stride_dPsum, + args.ptr_LSE, + args.stride_LSE, + args.ptr_LSE_log2, + args.stride_LSE_log2, + args.ptr_dQaccum, + args.shape_dQaccum, + args.stride_dQaccum, + args.num_batch, + args.dq_semaphore, + args.cu_seqlens + }; + } + + CUTLASS_DEVICE + void + operator()(Params const& params, [[maybe_unused]] char* smem_buf) { + + static constexpr int kBlockM = get<0>(TileShape_MK{}); + + int const thread_idx = threadIdx.x; + int const m_block = blockIdx.x; + int const bidh = blockIdx.y; + int const bidb = blockIdx.z; + + bool const is_varlen = Varlen && params.cu_seqlens != nullptr; + int const offset_o = !is_varlen ? 0 : params.cu_seqlens[bidb]; + int const seqlen_o = !is_varlen ? 
get<0>(params.shape_O) : params.cu_seqlens[bidb + 1] - offset_o; + if (is_varlen && m_block * kBlockM >= seqlen_o) { return; } + + Tensor mO = make_tensor(make_gmem_ptr(params.ptr_O), params.shape_O, params.stride_O)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gO = local_tile(cute::domain_offset(make_coord(offset_o, _0{}), mO), TileShape_MK{}, make_coord(m_block, _0{})); // (M, K) + Tensor mdO = make_tensor(make_gmem_ptr(params.ptr_dO), params.shape_O, params.stride_dO)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdO = local_tile(cute::domain_offset(make_coord(offset_o, _0{}), mdO), TileShape_MK{}, make_coord(m_block, _0{})); // (M, K) + + auto shape_LSE = select<0, 2, 3>(params.shape_O); + Tensor mLSE = make_tensor(make_gmem_ptr(params.ptr_LSE), shape_LSE, params.stride_LSE)(_, bidh, !is_varlen ? bidb : 0); + Tensor gLSE = local_tile(cute::domain_offset(make_coord(offset_o), mLSE), Shape>{}, make_coord(m_block)); + static_assert(kBlockM <= MaxThreadsPerBlock); + float lse = thread_idx < seqlen_o - m_block * kBlockM && thread_idx < kBlockM ? gLSE(thread_idx) : INFINITY; + + GmemTiledCopy gmem_tiled_copy_O; + auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx); + + Tensor tOgO = gmem_thr_copy_O.partition_S(gO); + Tensor tOgdO = gmem_thr_copy_O.partition_S(gdO); + // Construct identity layout for gO + Tensor cO = cute::make_identity_tensor(TileShape_MK{}); // (BLK_M,BLK_K) -> (blk_m,blk_k) + // Repeat the partitioning with identity layouts + Tensor tOcO = gmem_thr_copy_O.partition_D(cO); + Tensor tOpO = make_tensor(make_shape(size<2>(tOgO))); + #pragma unroll + for (int k = 0; k < size(tOpO); ++k) { tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O); } + + // (8, kBlockM / 32, kHeadDim / 64) or (8, kBlockM / 16, kHeadDim / 128) + Tensor tOrO = make_fragment_like(tOgO); + Tensor tOrdO = make_fragment_like(tOgdO); + flash::copy( + gmem_tiled_copy_O, tOgO, tOrO, tOcO, tOpO, seqlen_o - m_block * kBlockM + ); + flash::copy( + gmem_tiled_copy_O, tOgdO, tOrdO, tOcO, tOpO, seqlen_o - m_block * kBlockM + ); + + // Reshape from e.g. (8, kBlockM / 32, kHeadDim / 64) to (kBlockM / 32, (8, kHeadDim / 64)) + Layout l = make_layout(get<1>(tOrO.layout()), make_layout(get<0>(tOrO.layout()), get<2>(tOrO.layout()))); + Tensor o_fp32 = flash::convert_type(make_tensor(tOrO.data(), l)); + Tensor do_fp32 = flash::convert_type(make_tensor(tOrdO.data(), l)); + // Sum across the last dimension + Tensor dP_sum = make_tensor(make_shape(size<0>(o_fp32))); + #pragma unroll + for (int mi = 0; mi < size<0>(o_fp32); ++mi) { + float dP_sum_cur = do_fp32(mi, 0) * o_fp32(mi, 0); + #pragma unroll + for (int ni = 1; ni < size<1>(o_fp32); ni++) { + dP_sum_cur += do_fp32(mi, ni) * o_fp32(mi, ni); + } + flash::SumOp sum_op; + dP_sum(mi) = flash::Allreduce::run(dP_sum_cur, sum_op); + } + + // If varlen, the layout for dPSum, LSE_log2, and dQaccum is that we pad each sequence in the batch + // by an extra 128, so that the write for each sequence doesn't touch the next sequence. + // Sequence i starts at params.cu_seqlens[i] + i * 128 and ends at params.cu_seqlens[i + 1] + i * 128 + int const offset_padded = !is_varlen ? 0 : (params.cu_seqlens[bidb] + bidb * 128) / 128 * 128; + Tensor mdPsum = make_tensor(make_gmem_ptr(params.ptr_dPsum), params.shape_dPsum, params.stride_dPsum)(_, bidh, !is_varlen ? 
bidb : 0); + Tensor gdPsum = local_tile(cute::domain_offset(make_coord(offset_padded), mdPsum), Shape>{}, make_coord(m_block)); + if (thread_idx % kGmemThreadsPerRow == 0) { + #pragma unroll + for (int mi = 0; mi < size(dP_sum); ++mi) { + int row = thread_idx / kGmemThreadsPerRow + mi * MaxThreadsPerBlock / kGmemThreadsPerRow; + gdPsum(row) = row < seqlen_o - m_block * kBlockM ? dP_sum(mi) : 0; + } + } + + int const seqlen_rounded = cute::round_up(seqlen_o, kBlockM); + Tensor mLSElog2 = make_tensor(make_gmem_ptr(params.ptr_LSE_log2), params.shape_dPsum, params.stride_LSE_log2)(_, bidh, !is_varlen ? bidb : 0); + Tensor gLSElog2 = local_tile(cute::domain_offset(make_coord(offset_padded), mLSElog2), Shape>{}, make_coord(m_block)); + if (thread_idx < seqlen_rounded - m_block * kBlockM && thread_idx < kBlockM) { + gLSElog2(thread_idx) = lse == -INFINITY ? 0.f : lse * float(M_LOG2E); + } + + if constexpr (Clear_dQaccum) { + Tensor mdQaccum = make_tensor(make_gmem_ptr(params.ptr_dQaccum), params.shape_dQaccum, params.stride_dQaccum)(_, _, bidh, !is_varlen ? bidb : 0); + Tensor gdQaccum = local_tile(cute::domain_offset(make_coord(offset_padded, _0{}), mdQaccum), TileShape_MK{}, make_coord(m_block, _0{})); + GmemTiledCopyAccum gmem_tiled_copy_dQaccum; + auto gmem_thr_copy_dQaccum = gmem_tiled_copy_dQaccum.get_thread_slice(thread_idx); + Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_D(gdQaccum); + Tensor zero = make_fragment_like(tdQgdQaccum); + clear(zero); + // cute::copy(zero, tdQgdQaccum); // Somehow this doesn't vectorize the write + #pragma unroll + for (int m = 0; m < size<1>(zero); ++m) { + cute::copy(zero(_, m, _), tdQgdQaccum(_, m, _)); + } + } + + if (params.dq_semaphore != nullptr && thread_idx == 0) { + int const num_batch = params.num_batch; + int const num_head = get<2>(params.shape_dQaccum); + params.dq_semaphore[bidh + bidb * num_head + m_block * num_head * num_batch] = 0; + } + + } + +}; + +} // namespace flash diff --git a/flash_common.hpp b/flash_common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cc601f9705c9bf53f015cbca195e5e1cf91be527 --- /dev/null +++ b/flash_common.hpp @@ -0,0 +1,38 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#pragma once + +// Include these 2 headers instead of torch/extension.h since we don't need all of the torch headers. +#include +#include +#include +#include + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + + +#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + +namespace flash { +// Copy from PyTorch +// https://github.com/pytorch/pytorch/blob/8b61daaf7349e9102117e1aeefaa51666d887547/aten/src/ATen/cuda/detail/UnpackRaw.cuh#L17 +static std::tuple unpack(at::PhiloxCudaState arg) { + if (arg.captured_) { + // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long". + // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel. + // For most threads' reads it will hit in cache, so it shouldn't hurt performance. 
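+ // The tuple returned is (seed, offset). When the RNG state was captured inside a CUDA graph,
+ // both values live behind device pointers and the per-launch offset_intragraph_ is added;
+ // otherwise the host-side values stored directly in the PhiloxCudaState are returned.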
+ return std::make_tuple(static_cast(*arg.seed_.ptr), static_cast(*(arg.offset_.ptr) + arg.offset_intragraph_)); + } else { + return std::make_tuple(arg.seed_.val, arg.offset_.val); + } +} + +} // namespace flash diff --git a/flash_fwd_hdim128_bf16_causal_sm80.cu b/flash_fwd_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..9383c1024927e1af6ce7d14ed358d6386913f5ba --- /dev/null +++ b/flash_fwd_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_bf16_sm80.cu b/flash_fwd_hdim128_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f03abda4869b516ce63d367551157f1d92995a32 --- /dev/null +++ b/flash_fwd_hdim128_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_bf16_sm90.cu b/flash_fwd_hdim128_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..11bb9ddecccf85f346555f585d391d93f324a732 --- /dev/null +++ b/flash_fwd_hdim128_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_e4m3_sm90.cu b/flash_fwd_hdim128_e4m3_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..04b431f10b8aae6a5b3c80d79799fc6c8740f6a5 --- /dev/null +++ b/flash_fwd_hdim128_e4m3_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128_fp8(params, stream); +} diff --git a/flash_fwd_hdim128_fp16_causal_sm80.cu b/flash_fwd_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c616628c871d1773f9cf1c28da43ede7f684c230 --- /dev/null +++ b/flash_fwd_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_fp16_sm80.cu b/flash_fwd_hdim128_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..4ff6b9fbfbbef50456a4e707638de23e6612eafe --- /dev/null +++ b/flash_fwd_hdim128_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim128_fp16_sm90.cu b/flash_fwd_hdim128_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..176c38eddcdc25aa4ba6ef9eccb2d9fd53ee2c03 --- /dev/null +++ b/flash_fwd_hdim128_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim128(params, stream); +} diff --git a/flash_fwd_hdim160_bf16_causal_sm80.cu b/flash_fwd_hdim160_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d6d4371bfbdddbdde9ead33cf5f6b054df7fd526 --- /dev/null +++ b/flash_fwd_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim160_bf16_sm80.cu b/flash_fwd_hdim160_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5af68ac38fb2ea3032026dae4fc82d27d1ee0437 --- /dev/null +++ b/flash_fwd_hdim160_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim160_fp16_causal_sm80.cu b/flash_fwd_hdim160_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ef511a6b749c21f5b9c8dcec111cf71a517a7cf --- /dev/null +++ b/flash_fwd_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim160_fp16_sm80.cu b/flash_fwd_hdim160_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..96abfbd8a1808ebff6da52798d6086efcfb3ad18 --- /dev/null +++ b/flash_fwd_hdim160_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim160(params, stream); +} diff --git a/flash_fwd_hdim192_bf16_causal_sm80.cu b/flash_fwd_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..077d25d091ed512b778039d88f3019a96a24c279 --- /dev/null +++ b/flash_fwd_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim192_bf16_sm80.cu b/flash_fwd_hdim192_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..ea5f265fe33e70461eb4c20090aced3ae9526df2 --- /dev/null +++ b/flash_fwd_hdim192_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim192_fp16_causal_sm80.cu b/flash_fwd_hdim192_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a4a7bc2422829913afe6b97bb399c8ccf096ad18 --- /dev/null +++ b/flash_fwd_hdim192_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim192_fp16_sm80.cu b/flash_fwd_hdim192_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c30c4a14fe7b4712335041f1014338c2fa68754f --- /dev/null +++ b/flash_fwd_hdim192_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim192(params, stream); +} diff --git a/flash_fwd_hdim224_bf16_causal_sm80.cu b/flash_fwd_hdim224_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a12a5f4ad7adaa632aced8f0ed390f6451dd787b --- /dev/null +++ b/flash_fwd_hdim224_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim224_bf16_sm80.cu b/flash_fwd_hdim224_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8690bdb1a40992bbda9680e1669bc669d9e9bc0f --- /dev/null +++ b/flash_fwd_hdim224_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim224_fp16_causal_sm80.cu b/flash_fwd_hdim224_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f01dad09cff91953f4e12119c8d77b2e83748aa0 --- /dev/null +++ b/flash_fwd_hdim224_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim224_fp16_sm80.cu b/flash_fwd_hdim224_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ec1e16b7fee69cb735d6ba0e92d944f253b310a --- /dev/null +++ b/flash_fwd_hdim224_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2023, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim224(params, stream); +} diff --git a/flash_fwd_hdim256_bf16_causal_sm80.cu b/flash_fwd_hdim256_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f84e978c911efabf1234530300cac47e2384c263 --- /dev/null +++ b/flash_fwd_hdim256_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_bf16_sm80.cu b/flash_fwd_hdim256_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c52f0417b9c41396b52f258f949180a65484f7d4 --- /dev/null +++ b/flash_fwd_hdim256_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_bf16_sm90.cu b/flash_fwd_hdim256_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..06d0df617bdcc9dd8fd7d6fff25961512b18b6ff --- /dev/null +++ b/flash_fwd_hdim256_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_e4m3_sm90.cu b/flash_fwd_hdim256_e4m3_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..78884313ecf2ad54eea58d8153eab7306b2717cb --- /dev/null +++ b/flash_fwd_hdim256_e4m3_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
+ +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256_fp8(params, stream); +} diff --git a/flash_fwd_hdim256_fp16_causal_sm80.cu b/flash_fwd_hdim256_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f96f7edc672bf802bca02368a648b869f281d1a2 --- /dev/null +++ b/flash_fwd_hdim256_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_fp16_sm80.cu b/flash_fwd_hdim256_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..9c7c6b93d8a3c587eb4004d38743ce723aae5011 --- /dev/null +++ b/flash_fwd_hdim256_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim256_fp16_sm90.cu b/flash_fwd_hdim256_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..0cc26c79104d2b2a77d3b1acbef1459b06a07433 --- /dev/null +++ b/flash_fwd_hdim256_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim256(params, stream); +} diff --git a/flash_fwd_hdim32_bf16_causal_sm80.cu b/flash_fwd_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e21d0408ca5d3156e8148d14d6c57d01e737bfbb --- /dev/null +++ b/flash_fwd_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim32_bf16_sm80.cu b/flash_fwd_hdim32_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..f377a5b8fa8d20578ab840f6e0345d080f2ad72f --- /dev/null +++ b/flash_fwd_hdim32_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim32_fp16_causal_sm80.cu b/flash_fwd_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..74e4d66ae97d190f2a99553df34410949643882d --- /dev/null +++ b/flash_fwd_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim32_fp16_sm80.cu b/flash_fwd_hdim32_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e85db18e39b1e24b102e4b4de15d539434812ad0 --- /dev/null +++ b/flash_fwd_hdim32_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim32(params, stream); +} diff --git a/flash_fwd_hdim64_bf16_causal_sm80.cu b/flash_fwd_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..9297e8bb68d7e4f40c6525b81c6b530fd7cd0971 --- /dev/null +++ b/flash_fwd_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_bf16_sm80.cu b/flash_fwd_hdim64_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8364b1e7ee90ef75d9edfb1074a05e473ced4b1a --- /dev/null +++ b/flash_fwd_hdim64_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_bf16_sm90.cu b/flash_fwd_hdim64_bf16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..d3839898f25d5c1c4163024fa951af4b26b30073 --- /dev/null +++ b/flash_fwd_hdim64_bf16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_e4m3_sm90.cu b/flash_fwd_hdim64_e4m3_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..471a5037a1dd0028037c38f195b1bd583b0466c1 --- /dev/null +++ b/flash_fwd_hdim64_e4m3_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64_fp8(params, stream); +} diff --git a/flash_fwd_hdim64_fp16_causal_sm80.cu b/flash_fwd_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1c6ed7ef02396ccc7769a2cc8073192ef2d13a2e --- /dev/null +++ b/flash_fwd_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_fp16_sm80.cu b/flash_fwd_hdim64_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..3c87573ba264b741b9c501d32dbe4473aeee4b81 --- /dev/null +++ b/flash_fwd_hdim64_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim64_fp16_sm90.cu b/flash_fwd_hdim64_fp16_sm90.cu new file mode 100644 index 0000000000000000000000000000000000000000..c6eac53520404a79253f0c23e84de95312bff4c1 --- /dev/null +++ b/flash_fwd_hdim64_fp16_sm90.cu @@ -0,0 +1,9 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim64(params, stream); +} diff --git a/flash_fwd_hdim96_bf16_causal_sm80.cu b/flash_fwd_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..49fae856a58661871b7bf1924029f5ef62443d96 --- /dev/null +++ b/flash_fwd_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_hdim96_bf16_sm80.cu b/flash_fwd_hdim96_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c5af1cf63451e79c43a39835cda9fbab5682a8fb --- /dev/null +++ b/flash_fwd_hdim96_bf16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_hdim96_fp16_causal_sm80.cu b/flash_fwd_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..b0d6c9928ecb62be1b4269ef42bf74bb4aee3ad5 --- /dev/null +++ b/flash_fwd_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_hdim96_fp16_sm80.cu b/flash_fwd_hdim96_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..c97aa33f8bcb86110c28ccf4582389431924d8e7 --- /dev/null +++ b/flash_fwd_hdim96_fp16_sm80.cu @@ -0,0 +1,10 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_(Flash_fwd_params ¶ms, cudaStream_t stream) { + run_mha_fwd_hdim96(params, stream); +} diff --git a/flash_fwd_kernel.h b/flash_fwd_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6b55021cf023491eb412bcca5d34ad21c447da5e --- /dev/null +++ b/flash_fwd_kernel.h @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "flash.h" +#include "utils.h" +#include "softmax.h" +#include "tile_scheduler.hpp" +#include "mainloop_fwd_sm90_tma_gmma_ws.hpp" +#include "epilogue_fwd_sm90_tma.hpp" + +namespace flash { + +using namespace cute; + +template +__global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1) + compute_attn_ws(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, + CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, + CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params, + Seqlen_traits seqlen_traits_q, Seqlen_traits seqlen_traits_k + ) { + + using Element = typename Ktraits::Element; + using ElementAccum = typename Ktraits::ElementAccum; + using SoftType = ElementAccum; + using TileShape_MNK = typename Ktraits::TileShape_MNK; + using ClusterShape = typename Ktraits::ClusterShape_MNK; + + static_assert(Ktraits::Is_WS); + static constexpr bool Is_WS = Ktraits::Is_WS; + + static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); + static constexpr int NumCopyThreads = !Is_WS ? 0 : cutlass::NumThreadsPerWarpGroup; + static constexpr int kBlockM = Ktraits::kBlockM; + // static constexpr int kBlockN = Ktraits::kBlockN; + // constexpr int kHeadDim = Ktraits::kHeadDim; + + using CollectiveMainloop = CollectiveMainloopFwd; + using CollectiveEpilogue = CollectiveEpilogueFwd; + + using MainloopPipeline = typename Ktraits::MainloopPipeline; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + extern __shared__ char shared_memory[]; + auto &shared_storage = *reinterpret_cast(shared_memory); + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(mainloop_params); + CollectiveEpilogue::prefetch_tma_descriptors(epilogue_params); + } + + // Obtain warp index + int const warp_group_thread_idx = threadIdx.x % cutlass::NumThreadsPerWarpGroup; + + PipelineParams pipeline_params; + pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytesK; + int warp_group_idx = cutlass::canonical_warp_group_idx(); + pipeline_params.role = warp_group_idx == 0 + ? 
MainloopPipeline::ThreadCategory::Producer + : MainloopPipeline::ThreadCategory::Consumer; + pipeline_params.is_leader = warp_group_thread_idx == 0; + pipeline_params.num_consumers = NumMmaThreads; + + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_Q.init(1 /*numThreads*/); + shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); + } + // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init(); + MainloopPipeline pipeline_k(shared_storage.pipeline_k, pipeline_params, ClusterShape{}); + MainloopPipeline pipeline_v(shared_storage.pipeline_v, pipeline_params, ClusterShape{}); + + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue; + + // We need this to guarantee that the Pipeline init is visible to all producers and consumer blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + cute::cluster_wait(); + } else { + __syncthreads(); + } + + static_assert(Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); + if (warp_group_idx == 0) { // Producer + cutlass::arch::warpgroup_reg_dealloc(); + // cutlass::arch::warpgroup_reg_dealloc<56>(); + + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (warp_idx_in_warpgroup == 0) { // Load Q, K, V + PipelineState smem_pipe_write_k = cutlass::make_producer_start_state(); + PipelineState smem_pipe_write_v = cutlass::make_producer_start_state(); + + int work_idx = 0; + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if (Is_causal && n_block_max <= 0) { + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + continue; + } + collective_mainloop.load(mainloop_params, pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v, + shared_storage, scheduler, scheduler_params, work_tile_info, block_coord, work_idx, + seqlen_traits_q, seqlen_traits_k); + ++work_idx; + } + collective_mainloop.load_tail(pipeline_k, pipeline_v, smem_pipe_write_k, smem_pipe_write_v); + } + } else { // Consumer + cutlass::arch::warpgroup_reg_alloc(); + // cutlass::arch::warpgroup_reg_alloc(); + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + // Initialize matmul objects. + typename Ktraits::TiledMma1 tiled_mma1; + + PipelineState smem_pipe_read_k, smem_pipe_read_v; + // We don't need separate variables smem_pipe_release_k and smem_pipe_release_v + // (like in Cutlass's gemm) because the read and release pipeline states are always the same. + + collective_mainloop.mma_init(); + scheduler.init_consumer(); + + int work_idx = 0; + CUTLASS_PRAGMA_NO_UNROLL + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + // Attention output (GEMM-II) accumulator. 
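+ // tOrO is the per-thread fp32 register accumulator for the second GEMM (P @ V); the Softmax
+ // object carries the running row-max and row-sum used to rescale partial outputs as each
+ // K/V block is consumed (online softmax), and its row_sum is later passed to the epilogue.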
+ Tensor tOrO = partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{})); + flash::Softmax<2 * (2 * kBlockM / NumMmaThreads)> softmax; + + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if (Is_causal && n_block_max <= 0) { // We exit early and write 0 to gO and -inf to gLSE. + collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + continue; + } + + collective_mainloop.mma(mainloop_params, pipeline_k, pipeline_v, smem_pipe_read_k, smem_pipe_read_v, + tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads, work_idx, m_block, shared_storage, + seqlen_traits_q, seqlen_traits_k); + // tOrO, softmax, n_block_max, threadIdx.x - NumCopyThreads + (work_idx >> 30), work_idx, shared_storage); + collective_epilogue.store(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + + ++work_idx; + } + collective_epilogue.store_tail(); + } + +} + +template +__global__ void __launch_bounds__(Ktraits::kNWarps * cutlass::NumThreadsPerWarp, 1) + compute_attn_ws_fp8(CUTE_GRID_CONSTANT typename CollectiveMainloopFwd::Params const mainloop_params, + CUTE_GRID_CONSTANT typename CollectiveEpilogueFwd::Params const epilogue_params, + CUTE_GRID_CONSTANT typename TileScheduler::Params const scheduler_params, + Seqlen_traits seqlen_traits_q, Seqlen_traits seqlen_traits_k + ) { + + using Element = typename Ktraits::Element; + static_assert(cutlass::sizeof_bits_v == 8); + using ElementAccum = typename Ktraits::ElementAccum; + using SoftType = ElementAccum; + using TileShape_MNK = typename Ktraits::TileShape_MNK; + using ClusterShape = typename Ktraits::ClusterShape_MNK; + + static_assert(Ktraits::Is_WS); + static constexpr bool Is_WS = Ktraits::Is_WS; + static constexpr bool kUseVarSeqLen = Seqlen_traits::kUseVarSeqLen; + + static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); + static constexpr int NumCopyThreads = !Is_WS ? 
0 : cutlass::NumThreadsPerWarpGroup; + static constexpr int kBlockM = Ktraits::kBlockM; + // static constexpr int kBlockN = Ktraits::kBlockN; + // static constexpr int kHeadDim = Ktraits::kHeadDim; + static constexpr bool Delay_V_release = Is_causal && Ktraits::kHeadDim == 128; + // for now, disable for hdim 128 causal to avoid perf regression with register spilling + static constexpr bool Use_max_offset = !(Is_causal && Ktraits::kHeadDim == 128); + + using CollectiveMainloop = CollectiveMainloopFwd; + using CollectiveEpilogue = CollectiveEpilogueFwd; + + using MainloopPipeline = typename Ktraits::MainloopPipeline; + using MainloopPipelineVt = typename Ktraits::MainloopPipelineNoTMA; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineParamsVt = typename MainloopPipelineVt::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + extern __shared__ char shared_memory[]; + auto &shared_storage = *reinterpret_cast(shared_memory); + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + + // Issue Tma Descriptor Prefetch from a single thread + if (warp_idx == 0 && lane_predicate) { + CollectiveMainloop::prefetch_tma_descriptors(mainloop_params); + CollectiveEpilogue::prefetch_tma_descriptors(epilogue_params); + } + + // Obtain warp index + int const warp_group_thread_idx = threadIdx.x % cutlass::NumThreadsPerWarpGroup; + + // additional pipeline to synchronize out-of-place smem transpose of V + PipelineParamsVt pipeline_params_vt; + pipeline_params_vt.producer_arv_count = NumCopyThreads; + pipeline_params_vt.consumer_arv_count = NumMmaThreads; + MainloopPipelineVt pipeline_vt(shared_storage.pipeline_vt, pipeline_params_vt); + + PipelineParams pipeline_params; + pipeline_params.transaction_bytes = CollectiveMainloop::TmaTransactionBytesK; + int warp_group_idx = cutlass::canonical_warp_group_idx(); + pipeline_params.role = warp_group_idx == 0 + ? 
MainloopPipeline::ThreadCategory::Producer + : MainloopPipeline::ThreadCategory::Consumer; + pipeline_params.is_leader = warp_group_thread_idx == 0; + pipeline_params.num_consumers = NumMmaThreads; + + if (warp_idx == 0 && lane_predicate) { + shared_storage.barrier_Q.init(1 /*numThreads*/); + shared_storage.barrier_O.init(size(ClusterShape{}) /*numThreads*/); + } + // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init(); + MainloopPipeline pipeline_k(shared_storage.pipeline_k, pipeline_params, ClusterShape{}); + // pipeline_v has producer warpgroup for its consumer in fp8 kernel + pipeline_params.num_consumers = NumCopyThreads; + pipeline_params.role = MainloopPipeline::ThreadCategory::ProducerConsumer; + MainloopPipeline pipeline_v(shared_storage.pipeline_v, pipeline_params, ClusterShape{}); + + CollectiveMainloop collective_mainloop; + CollectiveEpilogue collective_epilogue; + + // We need this to guarantee that the Pipeline init is visible to all producers and consumer blocks in the Cluster + if constexpr (size(ClusterShape{}) > 1) { + cute::cluster_arrive_relaxed(); + cute::cluster_wait(); + } else { + __syncthreads(); + } + + static_assert(Ktraits::kNWarps == 12 || Ktraits::kNWarps == 16); + if (warp_group_idx == 0) { // Producer + cutlass::arch::warpgroup_reg_dealloc(); + + PipelineState smem_pipe_write = cutlass::make_producer_start_state(); + PipelineState smem_pipe_read, smem_pipe_release; + + int work_idx = 0; + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + if constexpr(kUseVarSeqLen) { + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if constexpr(Is_causal) { + if(n_block_max <= 0) { + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + // need to sync producer warpgroup + cutlass::arch::NamedBarrier::sync(NumCopyThreads, static_cast(FwdNamedBarriers::ProducerWG) /*id*/); + continue; + } + } + collective_mainloop.load_fp8( + mainloop_params, pipeline_k, pipeline_v, pipeline_vt, + smem_pipe_write, smem_pipe_read, shared_storage, + scheduler, scheduler_params, work_tile_info, block_coord, work_idx, + seqlen_traits_q, seqlen_traits_k); + ++work_idx; + // don't need to sync producer warpgroup here + // if constexpr (Is_causal) { + // cutlass::arch::NamedBarrier::sync(NumCopyThreads, static_cast(FwdNamedBarriers::ProducerWG) /*id*/); } + } + collective_mainloop.load_tail_one_write(pipeline_k, pipeline_v, smem_pipe_write); + } else { // Consumer + cutlass::arch::warpgroup_reg_alloc(); + + TileScheduler scheduler(&shared_storage.tile_count_semaphore); + // Initialize matmul objects. 
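+ // Consumer warpgroups only issue the tensor-core MMAs; smem_pipe_read tracks the pipeline
+ // stage currently being consumed from shared memory and smem_pipe_release marks when that
+ // stage can be handed back to the producer warpgroup.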
+ typename Ktraits::TiledMma1 tiled_mma1; + PipelineState smem_pipe_read; + PipelineState smem_pipe_release; + + collective_mainloop.mma_init(); + scheduler.init_consumer(); + + int work_idx = 0; + CUTLASS_PRAGMA_NO_UNROLL + for (auto work_tile_info = scheduler.get_initial_work(); + work_tile_info.is_valid(scheduler_params); + work_tile_info = scheduler.template get_next_work(scheduler_params, work_tile_info)) { + // Attention output (GEMM-II) accumulator. + Tensor tOrO = partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{})); + flash::Softmax<2 * (2 * kBlockM / NumMmaThreads), Use_max_offset> softmax; + + auto block_coord = work_tile_info.get_block_coord(scheduler_params); + auto [m_block, bidh, bidb] = block_coord; + + if constexpr(kUseVarSeqLen) { + seqlen_traits_q.init(bidb); + seqlen_traits_k.init(bidb); + if (m_block * kBlockM >= seqlen_traits_q.actual_seq_len) { + continue; + } + } + int n_block_max = collective_mainloop.get_n_block_max( + mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + if constexpr(Is_causal) { + if(n_block_max <= 0) { // We exit early and write 0 to gO and -inf to gLSE. + collective_epilogue.store_zero(epilogue_params, shared_storage, threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + continue; + } + } + + collective_mainloop.mma_fp8( + mainloop_params, pipeline_k, pipeline_vt, smem_pipe_read, smem_pipe_release, + tOrO, softmax, n_block_max, + threadIdx.x - NumCopyThreads, work_idx, m_block, + shared_storage, seqlen_traits_q, seqlen_traits_k); + + #ifndef NO_FP8_COLUMN_PERMUTE + collective_epilogue.store_fp8(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + #else + collective_epilogue.store(epilogue_params, tOrO, softmax.row_sum, shared_storage, tiled_mma1, + threadIdx.x - NumCopyThreads, block_coord, seqlen_traits_q); + #endif + ++work_idx; + } + collective_epilogue.store_tail(); + } + +} + +} // namespace flash diff --git a/flash_fwd_launch_template.h b/flash_fwd_launch_template.h new file mode 100644 index 0000000000000000000000000000000000000000..df128c8316b57c6b99569f52ae00fa13158812d0 --- /dev/null +++ b/flash_fwd_launch_template.h @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cute/tensor.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/cluster_launch.hpp" + +#include "static_switch.h" +#include "flash.h" +#include "tile_scheduler.hpp" +#include "flash_fwd_kernel.h" +#include "kernel_traits.h" +#include "seq_len.h" +#include "utils.h" + + +template +void run_flash_fwd(Flash_fwd_params ¶ms, cudaStream_t stream) { + using Element = typename Kernel_traits::Element; + using OutputType = typename Kernel_traits::OutputType; + using TileShape_MNK = typename Kernel_traits::TileShape_MNK; + using ClusterShape = typename Kernel_traits::ClusterShape_MNK; + + // print(typename Kernel_traits::SmemLayoutVt{}); printf("\n"); print(typename Kernel_traits::SmemLayoutVt_tmp{}); + using CollectiveMainloop = flash::CollectiveMainloopFwd; + using CollectiveEpilogue = flash::CollectiveEpilogueFwd; + using Scheduler = std::conditional_t< + Seqlen_traits::kUseVarSeqLen, + flash::SingleTileScheduler, + std::conditional_t + >>; + // using Scheduler = flash::SingleTileScheduler; + Seqlen_traits seqlen_traits_q( + params.total_q, params.seqlen_q, params.cu_seqlens_q); + Seqlen_traits seqlen_traits_k( + params.total_k, params.seqlen_k, params.cu_seqlens_k, params.seqused_k); + typename CollectiveMainloop::Params mainloop_params = + CollectiveMainloop::to_underlying_arguments({ + static_cast(params.q_ptr), + seqlen_traits_q.get_gmem_layout( + params.seqlen_q, params.d, params.h, params.b, + params.q_row_stride, params.q_head_stride, params.q_batch_stride + ), // layout_Q + static_cast(params.k_ptr), + seqlen_traits_k.get_gmem_layout( + params.seqlen_k, params.d, params.h_k, params.b, + params.k_row_stride, params.k_head_stride, params.k_batch_stride + ), // layout_K + static_cast(params.v_ptr), + seqlen_traits_k.get_gmem_layout( + params.seqlen_k, params.d, params.h_k, params.b, + params.v_row_stride, params.v_head_stride, params.v_batch_stride + ), // layout_V + params.scale_softmax_log2 + }); + typename CollectiveEpilogue::Params epilogue_params = + CollectiveEpilogue::to_underlying_arguments({ + static_cast(params.o_ptr), + seqlen_traits_q.get_gmem_layout( + params.seqlen_q, params.d, params.h, params.b, + params.o_row_stride, params.o_head_stride, params.o_batch_stride + ), // layout_O + static_cast(params.softmax_lse_ptr), + seqlen_traits_q.get_lse_gmem_layout( + params.seqlen_q, params.h, params.b + ) // layout_LSE + }); + + int num_blocks_m = cutlass::ceil_div(params.seqlen_q, Kernel_traits::kBlockM); + num_blocks_m = cutlass::ceil_div(num_blocks_m, size<0>(ClusterShape{})) * size<0>(ClusterShape{}); + typename Scheduler::Arguments scheduler_args = {num_blocks_m, params.h, params.b, params.tile_count_semaphore}; + typename Scheduler::Params scheduler_params = Scheduler::to_underlying_arguments(scheduler_args); + + // Get the ptr to kernel function. 
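+ // The kernel symbol is picked at compile time: the fp8 warp-specialized kernel when the
+ // element type is 8 bits wide, the fp16/bf16 one otherwise. Requesting more than the default
+ // 48 KB of dynamic shared memory requires the explicit cudaFuncSetAttribute opt-in below.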
+ void *kernel; + if constexpr(cutlass::sizeof_bits_v == 8) + kernel = (void *)flash::compute_attn_ws_fp8; + else + kernel = (void *)flash::compute_attn_ws; + int smem_size = sizeof(typename Kernel_traits::SharedStorage); + // int smem_size_q = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_q)); + // int smem_size_k = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_k)); + // int smem_size_v = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_v)); + // int smem_size_o = sizeof(decltype((typename Kernel_traits::SharedStorage{}).smem_o)); + // printf("smem_size = %d, q = %d, k = %d, v = %d, o = %d.\n", smem_size, smem_size_q, smem_size_k, smem_size_v, smem_size_o); + if (smem_size >= 48 * 1024) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); + } + + int device; + cudaGetDevice(&device); + int multiprocessor_count; + CHECK_CUDA(cudaDeviceGetAttribute(&multiprocessor_count, cudaDevAttrMultiProcessorCount, device)); + dim3 grid_dims = Scheduler::get_grid_dim(scheduler_args, multiprocessor_count); + static constexpr int ctaSize = Kernel_traits::kNWarps * 32; + dim3 block_dims(ctaSize); + dim3 cluster_dims(size<0>(ClusterShape{}), size<1>(ClusterShape{}), size<2>(ClusterShape{})); + cutlass::ClusterLaunchParams launch_params{grid_dims, block_dims, cluster_dims, smem_size, stream}; + cutlass::launch_kernel_on_cluster( + launch_params, kernel, mainloop_params, epilogue_params, + scheduler_params, seqlen_traits_q, seqlen_traits_k); + CHECK_CUDA_KERNEL_LAUNCH(); +} + +template +void run_mha_fwd_hdim64(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, Seqlen_traits + >(params, stream); + }); + }); +} + +template +void run_mha_fwd_hdim128(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even and not Is_causal + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, Seqlen_traits + >(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim256(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 256; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, 128) % 2 == 0 && !Is_causal && !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd< + Flash_fwd_kernel_traits, + Is_causal, Seqlen_traits + >(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim64_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 64; + constexpr static int kBlockM = 192; + constexpr static int kBlockN = 128; + constexpr static int kNWarps = 4 + kBlockM/16; + constexpr static int kStages = 4; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && + 
!Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim128_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 128; + constexpr static int kBlockM = 128; + constexpr static int kBlockN = 256; + constexpr static int kNWarps = 4 + kBlockM/16; + constexpr static int kStages = 2; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && + !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); + }); + }); + }); +} + +template +void run_mha_fwd_hdim256_fp8(Flash_fwd_params ¶ms, cudaStream_t stream) { + constexpr static int Headdim = 256; + constexpr static int kBlockM = 128; + constexpr static int kBlockN = 128; + constexpr static int kNWarps = 4 + kBlockM/16; + constexpr static int kStages = 2; + BOOL_SWITCH(params.is_causal, Is_causal, [&] { + SEQLEN_SWITCH(params.cu_seqlens_q, Seqlen_traits, [&] { + // Only use Cluster if number of tiles along seqlen_q is even + BOOL_SWITCH(cutlass::ceil_div(params.seqlen_q, kBlockM) % 2 == 0 && !Is_causal && + !Seqlen_traits::kUseVarSeqLen, UseCluster, [&] { + run_flash_fwd, Is_causal, Seqlen_traits>(params, stream); + }); + }); + }); +} diff --git a/flash_fwd_split_hdim128_bf16_causal_sm80.cu b/flash_fwd_split_hdim128_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..a959c9ceb143bee1f46602c3af6a1e7d5564738a --- /dev/null +++ b/flash_fwd_split_hdim128_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim128_bf16_sm80.cu b/flash_fwd_split_hdim128_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..e608e308ef75acb59696266439073babac5529fc --- /dev/null +++ b/flash_fwd_split_hdim128_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim128_fp16_causal_sm80.cu b/flash_fwd_split_hdim128_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..3dd74e273d5a4b4b7f114799883deace0ab8c4c7 --- /dev/null +++ b/flash_fwd_split_hdim128_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim128_fp16_sm80.cu b/flash_fwd_split_hdim128_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..addacedf49c60ed3469a9c72ecb5f774860f79f0 --- /dev/null +++ b/flash_fwd_split_hdim128_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_bf16_causal_sm80.cu b/flash_fwd_split_hdim160_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ace7bda96877448195bc53500bc87b986ee602a --- /dev/null +++ b/flash_fwd_split_hdim160_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_bf16_sm80.cu b/flash_fwd_split_hdim160_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e133ec1af696fdac66785bac5148c06cb37e245 --- /dev/null +++ b/flash_fwd_split_hdim160_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_fp16_causal_sm80.cu b/flash_fwd_split_hdim160_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1723c69e08198f7e5898b5877fc387cc7c14bd21 --- /dev/null +++ b/flash_fwd_split_hdim160_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim160_fp16_sm80.cu b/flash_fwd_split_hdim160_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..892d2352aa581c8a4bc32401256365b71c2e8492 --- /dev/null +++ b/flash_fwd_split_hdim160_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim192_bf16_causal_sm80.cu b/flash_fwd_split_hdim192_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..d07ee0af2f18062323a3035897db97de10f033f9 --- /dev/null +++ b/flash_fwd_split_hdim192_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. 
diff --git a/flash_fwd_split_hdim192_bf16_sm80.cu b/flash_fwd_split_hdim192_bf16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..23cfa59d5a03861cab3e0c50ac3ff857cb623aa0
--- /dev/null
+++ b/flash_fwd_split_hdim192_bf16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim192_fp16_causal_sm80.cu b/flash_fwd_split_hdim192_fp16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..273a28442790c8b063677fc0cbc43d662e51b871
--- /dev/null
+++ b/flash_fwd_split_hdim192_fp16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim192_fp16_sm80.cu b/flash_fwd_split_hdim192_fp16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0f588d1f4482008d8df314a31d75cd932e25f91c
--- /dev/null
+++ b/flash_fwd_split_hdim192_fp16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 192, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_bf16_causal_sm80.cu b/flash_fwd_split_hdim224_bf16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ea024d9abf92199b436a598e2e4b1a36351b6146
--- /dev/null
+++ b/flash_fwd_split_hdim224_bf16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_bf16_sm80.cu b/flash_fwd_split_hdim224_bf16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b06ae5ace1fa516425b852776e470166f034430c
--- /dev/null
+++ b/flash_fwd_split_hdim224_bf16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_fp16_causal_sm80.cu b/flash_fwd_split_hdim224_fp16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b217f37891014db4e40548cb630d85a1252a6793
--- /dev/null
+++ b/flash_fwd_split_hdim224_fp16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 224, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim224_fp16_sm80.cu b/flash_fwd_split_hdim224_fp16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8cf2eabed632d46a0f69578e56be6da5c2248f6d
--- /dev/null
+++ b/flash_fwd_split_hdim224_fp16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2023, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 224, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_bf16_causal_sm80.cu b/flash_fwd_split_hdim256_bf16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..370fe9ca3eea4dea5ea26afd5bc9ab4ff267b690
--- /dev/null
+++ b/flash_fwd_split_hdim256_bf16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_bf16_sm80.cu b/flash_fwd_split_hdim256_bf16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..508f07f7d3d6eee0494b1ec96326c63487e74c3b
--- /dev/null
+++ b/flash_fwd_split_hdim256_bf16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::bfloat16_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_fp16_causal_sm80.cu b/flash_fwd_split_hdim256_fp16_causal_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..019ded67f9d00fc38be09e975b7f8e52a0c48572
--- /dev/null
+++ b/flash_fwd_split_hdim256_fp16_causal_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, true>(Flash_fwd_params &params, cudaStream_t stream);
diff --git a/flash_fwd_split_hdim256_fp16_sm80.cu b/flash_fwd_split_hdim256_fp16_sm80.cu
new file mode 100644
index 0000000000000000000000000000000000000000..708f5542acb78266a7e22323a5e50b333437b661
--- /dev/null
+++ b/flash_fwd_split_hdim256_fp16_sm80.cu
@@ -0,0 +1,7 @@
+// Copyright (c) 2024, Tri Dao.
+// Splitting the different head dimensions to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"
+
+#include "flash_fwd_launch_template.h"
+
+template void run_mha_fwd_splitkv_dispatch<cutlass::half_t, 256, false>(Flash_fwd_params &params, cudaStream_t stream);
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_bf16_causal_sm80.cu b/flash_fwd_split_hdim32_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a205b7e793a21258ca9052bcf4a0dc4a94b35df --- /dev/null +++ b/flash_fwd_split_hdim32_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_bf16_sm80.cu b/flash_fwd_split_hdim32_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..2c576f118debc6476f509037c227a2d4d12acdc1 --- /dev/null +++ b/flash_fwd_split_hdim32_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_fp16_causal_sm80.cu b/flash_fwd_split_hdim32_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..484a15e9368fbfa37dd3cbf1553ba207aa109a87 --- /dev/null +++ b/flash_fwd_split_hdim32_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim32_fp16_sm80.cu b/flash_fwd_split_hdim32_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..5474ae89d92d9e3c2f96f67e334ccf0f19ee3762 --- /dev/null +++ b/flash_fwd_split_hdim32_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_bf16_causal_sm80.cu b/flash_fwd_split_hdim64_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..8c7da41ddc854caa7afbdb6e9170a231b33fcc04 --- /dev/null +++ b/flash_fwd_split_hdim64_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_bf16_sm80.cu b/flash_fwd_split_hdim64_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..93f29dea8afaf6dbbe9389d933a7ba47ec0262b0 --- /dev/null +++ b/flash_fwd_split_hdim64_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_fp16_causal_sm80.cu b/flash_fwd_split_hdim64_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e2e12b8cb32f60bc50d64fa8123f3fd1ff06cf6 --- /dev/null +++ b/flash_fwd_split_hdim64_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim64_fp16_sm80.cu b/flash_fwd_split_hdim64_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..16c34ed3f578e202eb452fa79ed9f1c3a5fb7792 --- /dev/null +++ b/flash_fwd_split_hdim64_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_bf16_causal_sm80.cu b/flash_fwd_split_hdim96_bf16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..50080c47e57f600bded3f5d250e54426512d553a --- /dev/null +++ b/flash_fwd_split_hdim96_bf16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_bf16_sm80.cu b/flash_fwd_split_hdim96_bf16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..ae56ddd4cae718bebb1039861f83b49e77cc7a02 --- /dev/null +++ b/flash_fwd_split_hdim96_bf16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_fp16_causal_sm80.cu b/flash_fwd_split_hdim96_fp16_causal_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..ed305767e1b47334705ffcc8fef9c9f4998adbd7 --- /dev/null +++ b/flash_fwd_split_hdim96_fp16_causal_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flash_fwd_split_hdim96_fp16_sm80.cu b/flash_fwd_split_hdim96_fp16_sm80.cu new file mode 100644 index 0000000000000000000000000000000000000000..02206465616bb84092a7d3640c07f44d7fbdecc5 --- /dev/null +++ b/flash_fwd_split_hdim96_fp16_sm80.cu @@ -0,0 +1,7 @@ +// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py" + +#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch(Flash_fwd_params ¶ms, cudaStream_t stream); diff --git a/flashattention_logo.png b/flashattention_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..b8bd5d849754b640f7d815ee04de6098b2a7ba5b --- /dev/null +++ b/flashattention_logo.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61969fc112a38be106744ce2c416a2bca8026a173ef3cbb883826c998732958c +size 2738980 diff --git a/flashattn_banner.jpg b/flashattn_banner.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2dcee8582111d52b5bdba9bc7c8dbaf863b884ac Binary files /dev/null and b/flashattn_banner.jpg differ diff --git a/flashattn_banner.pdf b/flashattn_banner.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c4ad4270ae0106f1e9b45f8f3ca4b053e861dcde --- /dev/null +++ b/flashattn_banner.pdf @@ -0,0 +1,3356 @@ +%PDF-1.5 %âãÏÓ +1 0 obj <>/OCGs[7 0 R 8 0 R 9 0 R 10 0 R 45 0 R 46 0 R 47 0 R 48 0 R 82 0 R 83 0 R 84 0 R 85 0 R 119 0 R 120 0 R 121 0 R 122 0 R 156 0 R 157 0 R 158 0 R 159 0 R 193 0 R 194 0 R 195 0 R 196 0 R 230 0 R 231 0 R 232 0 R 233 0 R 267 0 R 268 0 R 269 0 R 270 0 R 304 0 R 305 0 R 306 0 R 307 0 R 341 0 R 342 0 R 343 0 R 344 0 R 377 0 R 378 0 R 379 0 R 380 0 R 413 0 R 414 0 R 415 0 R 416 0 R 449 0 R 450 0 R 451 0 R 452 0 R 485 0 R 486 0 R 487 0 R 488 0 R 521 0 R 522 0 R 523 0 R 524 0 R 557 0 R 558 0 R 559 0 R 560 0 R 593 0 R 594 0 R 595 0 R 596 0 R 629 0 R 630 0 R 631 0 R 632 0 R 665 0 R 666 0 R 667 0 R 668 0 R 701 0 R 702 0 R 703 0 R 704 0 R 737 0 R 738 0 R 739 0 R 740 0 R 773 0 R 774 0 R 775 0 R 776 0 R 809 0 R 810 0 R 811 0 R 812 0 R 845 0 R 846 0 R 847 0 R 848 0 R 881 0 R 882 0 R 883 0 R 884 0 R 919 0 R 920 0 R 921 0 R 922 0 R 961 0 R 962 0 R 963 0 R 964 0 R 1003 0 R 1004 0 R 1005 0 R 1006 0 R 1045 0 R 1046 0 R 1047 0 R 1048 0 R]>>/Pages 3 0 R/Type/Catalog>> endobj 2 0 obj <>stream + + + + + application/pdf + + + Print + + + 2022-05-23T07:35:16-07:00 + 2022-05-23T07:35:16-07:00 + 2022-05-16T17:20:26-07:00 + Adobe Illustrator 26.2 (Macintosh) + + + + 256 + 176 + JPEG + /9j/4AAQSkZJRgABAgEASABIAAD/7QAsUGhvdG9zaG9wIDMuMAA4QklNA+0AAAAAABAASAAAAAEA AQBIAAAAAQAB/+4ADkFkb2JlAGTAAAAAAf/bAIQABgQEBAUEBgUFBgkGBQYJCwgGBggLDAoKCwoK DBAMDAwMDAwQDA4PEA8ODBMTFBQTExwbGxscHx8fHx8fHx8fHwEHBwcNDA0YEBAYGhURFRofHx8f Hx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8fHx8f/8AAEQgAsAEAAwER AAIRAQMRAf/EAaIAAAAHAQEBAQEAAAAAAAAAAAQFAwIGAQAHCAkKCwEAAgIDAQEBAQEAAAAAAAAA AQACAwQFBgcICQoLEAACAQMDAgQCBgcDBAIGAnMBAgMRBAAFIRIxQVEGE2EicYEUMpGhBxWxQiPB UtHhMxZi8CRygvElQzRTkqKyY3PCNUQnk6OzNhdUZHTD0uIIJoMJChgZhJRFRqS0VtNVKBry4/PE 1OT0ZXWFlaW1xdXl9WZ2hpamtsbW5vY3R1dnd4eXp7fH1+f3OEhYaHiImKi4yNjo+Ck5SVlpeYmZ qbnJ2en5KjpKWmp6ipqqusra6voRAAICAQIDBQUEBQYECAMDbQEAAhEDBCESMUEFURNhIgZxgZEy obHwFMHR4SNCFVJicvEzJDRDghaSUyWiY7LCB3PSNeJEgxdUkwgJChgZJjZFGidkdFU38qOzwygp 0+PzhJSktMTU5PRldYWVpbXF1eX1RlZmdoaWprbG1ub2R1dnd4eXp7fH1+f3OEhYaHiImKi4yNjo +DlJWWl5iZmpucnZ6fkqOkpaanqKmqq6ytrq+v/aAAwDAQACEQMRAD8A9U4q7FXYq7FXYq7FXYq0 XUMEJHIgkLXcgUqae1RireKtI6uodCGVgCrA1BB6EHFW8VdirsVdirsVdirsVdirsVdirsVdirsV dirEtc/M3yzpOtx6NJcJJqCysl9bhgjwRLZvd+uQ1AyAKiu1QqBuTEKCcVXSfml5DS6Fu2s20bqr S3HrsYBHCjem0jGRVFFmIiapFHDKfiRgFUefO/lX9FTauNQRtMhngtXvFV2iMt00SRBGCkOrPcIv JaqGqCQVaiqMttXF/Z6df6XEbuwv6SGZuUDJbtC8iSiOVVZuTBF4mn2q9sVYPN55846pNfXWiaab LS9PsFvCl/wE90p+uKxtGgN1C3J7SERSFuIV2kKSK0dVU31zWb/SWspZ766s0tbiKzJvVspIdUUq 
¤g™€•<¤ò«fe—u/ESJn^tr§¹aoû˜IGÇjÅ$_¬)jA#q›·™‰bÍ/ fHxõfëÍÌNÏP xÝyy<ÉPý7Óây­3ûo.¥Sz]JÅl<{}ôšL83Œ…CšIñ 4m™N]"˜±ÏÕ׊XOõf&ͧ禙9`ý*ìdØË% §ÕÕCi‡ÞÙÙ=Žî_vÏt®‡q¼4Ì(±¶awÛ@K2Ðd„:€ôˆ8+–ZMàŒ{î[¦j–v¾¾‹ ÍìHSøLñ3˜Ú³S¨k÷Ó‹o×™gŠø«š5š +gÒ[›6H^HesÍW¹¹ebSdg®˜þ•ùìÿ²ŽòŽÖ/ÝLu_/çhݨÅjvÐ+mÌ={ê`r¢3)$CÌÃàÂe ËE慄?W¹|1ifV%–4æÌyº+P ¯ëk* «Ù:•ˆól&jfeR;¢/<çk?èƒsæí KᮈˆH%Í5£ÚïítfÏ$¡Ž•OgŠ¿Q ¢3}Û×ape9ü”ÄÓ‡=®TûD¾œe5{,DÄpXµòŽ'tÌ$cÎw>•ïS +M8S»5^3…÷ ³ÆæÕ5LjúHŒ§Y¡>¢”J§>2‡ØåúŸ5º-ô¦] mêc¡ž¶*Ôëã3ÔºLY±¢êék§HÄ"ŒôPL~Žûh—vcÖcùD8Z~$Š"=lÊvq~¤BÅ`ª– K8Êw[-CfGn[ ÃçÒ£) o Óé ïŠTp^ç±Æ'àô!/Ög„“¢¦E®ú›Ø G8u] Â9n’¹j £ùDÎÏ#óØê*œ_>@E³­:©œžŒsš]Äeœo¿CKÎ8Ñet|BœíTiT€2§žv¢?Bœƒªâ´¡ª¤m`{7•Ó‰¨ú@x߇µÝÙjvÞöb¦nNÓÙ† Ž½°Â:H—³9ê]ZÚ㬺¼×بI§äl6ÔÙ +’òÒ¶¸œíc¯D‚‹s? ' +­rã׉Æltm;ü°ãc¶pÒ ³![ËNLÙ¬ø‹±B­²ý§ÈÈL4e“wœ=5eCz02—óâB2ÛL×U6‰Úºpf¶-1É›ëlÖƒ:CÛ툶[qh#ã¡ŽBÚn_Kh·ãŒê €ç2;`êtáI ßf¤£ƒ¥mΔ'ZR˜ÞIQ|§OÞ¯6'ˆ1ä|¾ÌCøƒd“ÎBP̪íz›©”q!nÀGIêÓ‚NåºüJÊKãzý|Å=¼\$!G¦ÐQC0-ÅÕ%Ó^5aR4,䢑VA +/…~M+k—šÛ(!Ö3Q×AH9=$ªP© ^^¤uS«U‰ü¯{ë›)Nð ­þOD¶ßIüªjàZ“î¸Èý™Žðm Af½Š8 ;™ÊX`!®aŒŠ#r0à%?!Üþ¢þ|@¤êZP‚|úÍ[9.›]ÿ‰÷³lÎcÅ‚T5å¸=„a™q3u'¶Œyù`³u}²Qˆ¹¨)±Ø¯ÎŸú¯õ,½Ñ åf@]éêƒ.û¼ž8›ƒ4”7a!´D·bo¼–d¢D:ÐœKé êrÍ À!^0YZÓ ´þ,×Ö÷Ë›©º×¦›¨ÒÓm¥­â]P’½”’/­m)öƒÙÙt&;Ã×’GX²L~€E•9k~†Â‡Òºn©BŸR$±ð5.Å)z¶¦<Ìp`o¡m/L&’ظv#ÞÒ•ÐÆY)œ;!8¢‡7}­§ë‡’ÅnZÖXíé]‹µ ³‹ÃO«P?¥+tù©Âáþp.›ŒlÜÆAÍâ8*á1šc·…›¥ -N¶Ì€’àëIŒFæ'[DÏBúÀjDkï¦ ó»V0‰Ù¸àÉ;òHàæ|T¥ï8·E½¾JÐœU Ê “i˜!/ˆw”U½·u´ ùþÁUŒ ©(}O&xb&®´²åEÐw1¼€ôŒÿÏÔÌM_.õäþ*–¸pÅ‹žJÛ€ñ¶Ì“è1¤ƒÿ>•†þf²¯QÙœoÖŸRNØÂߘqѳí,ß~4+/ªñé,"=ØÄý@ŠBÇi0˜|'·´3÷þÍÁ­ëÜÛ€w"kK% 1n.qË.ܨ³X¶¸O(VkÈ÷ÒÌ™dº&Aì¢v駓¹Â’‚ñ¾èT< nTX‹˜þäÈcÓ°‰Û²Ð +ÔK¤H+`¬-öÿUD‹ñŠ{2¦k/uÂ(OuSè9ÉVCƽC²¯n|Rrxq‚pGP4©”š÷…îÈGA3„FYrEo×¢TïjJ7¿”…7 +Ñå› +Ï‘¹~ḆL}—SMØX ÆžÊj"ê Ñ“¯ï×`°¦Ù©p‹JÐ.úÕ8覕0~Ô‚xgÎøC­ +^Ó}ŸŒå¾Dцw¢& véêï»J´uU€ts/ÄËï\O<Þ.ÎÛƒ´[ïÅý­žïvëZ®±fz ÃnÝØ^祵óaÀn¬”r¯¤ÎÈ#^#sK2 f ²í¯8Ò\zëÆý( YE Z¼€ý•öÒ›‘«“è?rˆÓ,úü—‹–%ŸMÆ^7æ9è4–zfÞW”K¶X‰$¥ÜP_:á7dÂД‹Àö÷²¾1Ù'ò"Î= Ò1i^Ÿà2ò¨ÆÆKìD2tÚNöXå §T®ˆâ òX̱­,¤Jñ5¡ Å4jEÐ$Q4$éÕš½RÑ”©¡À3L÷p¢°Båd‚/‰¨!š®rFu¶deyƒ²®‡2lW2 I `…/!Œ@ÕtŽ˜‘’ºaTÝ'T‘“d žBÓ}Ï0Þ–”‡¬$ÁIÃêˆHÊG Zѹö'p(r½kþ"ÒÎ]Åâv{ð¸7àQoΈŒŽ&‹'"Nuù–ê¾Ú ñX­í£¨#Ã5¥è¥õ‘,©å·Ä·[‰ õcŠJïv÷ICpôL&~Ú§Õƒ"yÛ×# Tˆn^}-¦>AÓ‡”%)nG£1ÄÅù øb´†FC™(‹‚Ðëi¼ÖDð!fjBXm%ý“ …þ=̪OäjK +ÒoR1(Ó×äGP‚Ðp¨A P¼¾ìõ/“- ç[¼´¿À%ÇØ3ÿF+¾' Êa®7´Hê¹ÙSr…bp«™›ÛɪÅdW¢©ªå/£ÒNãg¯B³5 'NöèFéØû¢ûaˆÀ$ÇÞQE/X1ÔËøŠÄÖnÞ$?ɶ‹Ê¬¢©¡Ðó&Ìh¨¯'_;e,yVcY±þ”S3^ìV‹2+±–¹&3õc~‹…—‡Ä‹‚ò“ü@|8=²²i- áL0aÀSÔ’¦†p¾E«‘ÌäaÛâ’’’©N­sIÄã‚ccÆ —C’‘XµR+”W{fÙÌZ[±õþË,kµw[ª‚× +ÏÕÕ®%¼4a/›ú'é®ç[>ÅúÃÇÆ$ñŽÉi’Óæ¡Q^—eÖ4䃤÷¨Jj¾S]ê#æÙؽ7·f™´)jq¸õwÿ¯–ƒ×Ió²Û¬J,A˜ÿ·Üf!(ÁéLJ‰¡ i”u7Ôaž\ne<÷/ñ°}Iˆ$d_yÜá¡&X(ð® ÇÁÕ½–¿ ÉZò­.ƒ¼ÿ—Ôd sïüïÃÞ;gnèÄÉ䟯'"\[mð¿Ƥ› +¤Došy bÙã`@‰Uv3¥Äræ’œgOÔ]HÁtîÉ#™Fü™6øÄä&¨sóž û¢¤¾Ã£§ðu7¾  ø•©æ}jt ïXM)ÒõK0øu4T"m4¦†‰‰¯ I_4öÅß}ƒ!è+(¶î3@q€ý’Àñ+É›÷¤ø 2Ñ[óªò~®Jë Ö0#=ÍÙf">x°Iå—\TKê {Ÿ‡ F|õ>``‚¬EÀúÆHíH.ã¦=‹é÷ŒÙס~*Jt׳„µ ΩLjÿÍÔ)Ï6¡†$¹ˆÑ–òûl!!'jt×Ã#A2°ì¹uaÐxnd¼;°ï¤e«¦mhäÕ]ú‚Ø›dê +ìAÄ+µö-‚JÔ pë…þŸïçTOy¤ä#ï‘ï =ôÉ)ã~|«¨vR’f©ø´ošC=èo9H[0=–S‚d˜+«~8ÍöxÁnYs¸¸¯¹4‚Mˆa]™ñ‡DÞá8|¤/± ·í3ì™Ó»ÃcävœEÉ,Y—1>‹Ia‡"6+Ùݺ¦[ùY•W€tèÎÛ¢xQ:Õ vù°÷¯Ë*{P Íj .ßɦå:‡Ä‹Å "—) %—2lš¢†kÑ„ï˜S÷Î?§#Ú¸‰P¶«´ãg æ…ˆb&oí×þÑXMÒNÖ žË€Ü“9øç£dÉ6 L8ãtqy(³}Œ`E„áo²÷ª¤²ös¼¸RÄħ¸2´àAK ¼m#V}žk#KxyÉÐÖ1G&®zºÉ° ±S«^Gò ÔéÇÛ\ÄV +ds6àîÂ^ rZ ¼a­ é  ‚„xdúõÛ®Ë3Qo° ˜ŠÄ™•Ô\vwœ;¬%îÏbüuF¤6 8í¦ ÷éh|ØÈ /¢_LË Äayà°¡ÈØÐPR0èMŸ ‹bTgÒfõü[¦¹yÙ©p(|Í°ßß”8hÚŒ«o›–!“jW|¬Ìï/ÅÀi\ Q†úk­£¿’`Ó•fÝt&Vßß:$f Ç»üVH¾‰|dtªüø—ÌgT»ëD/~7¿•ëIÇ @~Îœÿ¶cÕs;hÌÔºAä§ÊÔ/Ï}T¯iý<\²N—+Á*ؘáÂËáç©;¡ªw4¸ƒ¸/oÒ|ŽÓ]\/©îsPzˆ ÀSGa7¼.½2Hÿãx÷//äϯ‰¤sëYs—GC½ßæœÙPÿ[t_ShÅ6Rû?Ýš¢ú`Ûnºrv3A%DV²?’% õ½¯À–xÚ—«Î”$¡+’×Tà0ä¯ +f]N?ºkŸÖ+“³lÂJý;¨?uëÇN96ú>O…Æhs„o4ŽÞ¶:?AºÜFãÅHõÑŽq®BôCyp0MYP–±ëcÑØ•ë¾FJÑøP3ªOˆ^ÃØÚã qWqûø>Y{T#!ª‡þÞô¤|<ĽCé|X»›nºÜÑ6©Â…ëòÔmf»Z‚D Ù,i‘&惭ÁÇÁ•îô ‚RûGzœ±ôƒCñÇj»¸7@¯bý…ööR±¶·ì»Àúµ +xÄ)OBX_¼²Û¯6$Y=² Šìä²-c6žT©šÌÅ?'F£Cíw)iXù箜 
ÅR8ô–ïç"+LL9ÀMøO¥NÓ¥•‰5Y݇N+GèpgF¼ëµÕø¬Â99›ï‘¶ˆâÖÐ{¨^ YŒ0ûŸ–/,‚œK½"½™2oÚ¦{lÑ#„k¶&½,lúóH#W—À#ïd¶§¬´lÍÚ[)MŸy{5éz<ñ“ÞÛžÖ}—Xξqõš\qOÁ‰NÌ€=‚”©%0ëÛ½¢VÏ]‘My¢”ºB¥o}çì®í`pŸ?QTØ8ó™ÔóæÃÐ7š´ñìÆC“4^ºéÐÄ]h’ëƒ1‰Á¹/1° T=éKïöE À-r‹¶šm”{XÃõF£¥²­j•¡³^ŠùGu£ Õòýßᾘ¸IeS°üŒôœ*=Ãر \ö×¾±¤i" êØÛŽ³…Ñ kÇ*2_.4>A.áÊ€ÿ`Æ0¥EÞ–>…HQc>¤¯k•Öàêe«Ê·—rNä^âTbøÑ,AÖï+Qæ(1jJ±Ó€çô(»ïzYize]Sk)´øJL¼$&üŽ˜§;^‹hÓP'C·gŽ,gRuK<‰”Ô*%¥ÖÜðô¼ß^X• oOeÁTé—cV9JWÙÉ=VZÉžb>=ÑÍ‹ú‰ù­<µ¹òY¤:á•«¾ùfù·dØ,!ªYâû ·£¢³d–K¿7ð|±´˜òîa*B73jS­¡ub–ý( ocT³.÷wJõ~ú6>ì§àÀŽR½QoOùI4L©{*ÿbN\ÏwôÊ ï…žZç”ðažÔÖç³­Õ}tÖäÛ™õÖŠÊÍ€C 8·c¢…¼[5žkF™€S4ôß2ΗÉÁ9)ˆ3ªûÕt´LeZÚÄ2çªA|ÜÛ95/oª›ª—›š8§‹á➜à "Pâ]R˜'«/Œ]¿ÔÕ¦vJñ½¶ÂJ,x>½FIëX–Oé3‰ØYmQú[£Fõë0Ð@î™Ô(z¨¿¥Blü )QOĵlç#ZbÜ0œÄÅ£’Çn¡,†~tnõ²§4"›³“Ô„‹ÞùÄøv/ + ê¾’GV€pc„àVÅë\ +íµS)à0—/Äù˜ ¨T¤gùÿdÍÏ6Á»`DyfwXR?8ƒÔÁ X¥¤'í„ðÔ Ëj¨>2€…Ÿ¼"U‰*ˆÀØ(J`."bK#Iªº×ö+12% ' AØõv¹RÄ `P$dúÐèI{Ѽj؉‚ž/ø#f³lôß÷ª|§V xœq â$@Úó© ÅY"Û‚)([ ®Î@¾ì±òUÒm/öãÔáØ:ø$Ck:óï~&jzn(ÂìO Ò)\KM êOöçQn…x1ÐjŠ~žˆ¹ì’0lyËê$ÈpÁ!'`ÚÀ¢$=9ÓìÈÍœå::­8›•?1#é#yOýIJÕ-»‰]9ÉFîߣzûx?´!Ž­U2|óA2涘c1\á¬MgŸ™Y˜d*ƒ²±@IÏO¼ +b!! ™ÜA7º0è&ô7a~ÏœÈUo ðM#ìä4÷“Ù(sûqÊ\h^ÍtÐÔq?÷á¶ÆBÌ¿S ûæâ©×Îнx+(tÒ¤„H¬TÂ%(¤PÆ 4L„›ør0TÌ_Çak"Ç€%p° Î'ònî£ÊI¨¹~u€YðýØáÔ‚ÞW~J¯§${üÀ¡g¼¿ü3KƒÒXñÐi’©Þqyèö¨¶àñã9ǃóS³¤ruþæÑqºÄ½SJf!ôÅüàŒÁOB‡-Æä)Þ+îЂÏ;ÉÑ’“ŒôE.{>ý¬?5ëÓ9Ÿt +£Eê·^?BuWÓ°u;pV-L>0u9Ÿä~1O*Èð1â<ømÏf`© \²Ïo8JzÞ&á—z°i«[?0@ Ãÿ5míS¤xM0x=”@°ß¶dhVƒùè]Ñ“¹xœß¦Äk«åúCÞù˺C4)lóP“D¶‰½ì°ân†°ì‚yz/'ugcS¸3|é—I"£–giôÑÿS¼ÐfGÍ4äfLdÿ]£~ ®´Cÿ¹ÊˆA¬¾1øoÌƱÁàX +¼Ôf8f5m['€eÀ¥Çãä¸7¨wüMšÀ);8¾)J(‡ÄkqbkœðÿŽˆ·®.]: ä0eûh‡ŽÛ!ã\ +“›îmzvµìØu æIâ#•!1¥mÃ7Êú¹ã—#=lœZ”jëæ]]e”Ôl25Z™&öë´m{ëWœÞ8vi¯ð!»f!‹düJ mzQ?·ãqe°òxzr%P›©_ãÉQÈ£¾xðP¶iZBÀBz÷ž™ðd#ÝÝä²*é}ÄìS̸¡EkÉ×&ö× +Ïžf^Ó‡Ž’–è×WÕŠV\c"@ÀýÚûMó/ÍB—þ„¥µö¯~OÒ‡™föŸÎ=¢Ç÷C´žØ|94!l5býSfbb®« +æ}Ûš^f t­VQí!Ið#?1ËËödc°ùÉ`à¥ÅùüÓàÚÌÖ&wPdîºúüjÖ9XSÑ´¦…£ìV·wè}ÄXÞ@ù|ˆ èE$4n +ü_C´g)¨µÁoÛŠKE³q¨ýš¼¦•òä’-E $†;5FÜ"›Î›ÛòSª:Ȥ(þwÞTpë‹ÑNÅöRa€·E*dpþ_ d¥ëÙÈ ˜3Ü«Í#këaZî¨z<Ú퀣ÑkW{\y¾&„ÆèT[;¬ÚÐ7(‘/J‡UW©…Õp‘³‘‡hŒÆkÈo]9<°ê¼èQ†´ûì(§#xFo‚Ñÿž ‹è:Ç +cKêðbòEɧ¸Í\WíýÝ~O ÅÌCfY‘™Ù«·€„ ™qÔfvÌ¥m̪MÌ6{,@Ìšl˜ã÷ˆr˜G˜sü—ŽÇ¢’ âq0>òðÃ,-ÆCÙå‘ zЫÇ•²²{|ZŒ\¡eÉW–åÎjLõ°œ ŠÌØÿV–K݉ÈéÆÆŽü©%œ>’2Ýé›u˜ÑùVÀ D(ÔÃìKº€§tØ@Æ™[ˆY#O-sÒKWÙàF^{¢]ˆ¤âø"~rZËãóýŽßøŽ-z‡—%ÝQ>ëÁJ“6/žg– xR|åƒ.ÄŽü=Z¡¢úà÷¿Ý5H`ÓV9ËÏFê9Mb&L…ÐÁ ¦»X|€Ö ºÂ¾[1& ­Y´ÕÊÉ)›÷äJ¤N`PrREA¢Sá1“\)šè—OE…\YGB6"*’ÛC†TX)™ +š§‚’MIÃÃá ˆ`®¯w¬ai»&k×Á׫-›œk™¥µÐ¬åÕ2f·„“¯SrÐód)U\å8–Jð 0?ŸŒ€&ÐÀØ'U‚©Î7sÔõ!ˆ ó]å!È…†àŠ…`ˆ„€!ø”øJ  Hu È üÿd;= ÒŒú€>@°h—½ÕNZ +”NHÇ,pC¥ ̸è‹P÷¼K-,K^v>p’ÃNÁ†àÎLQ±í$Ù‰Äí"salhº¡kì󧙞 • çÆK4})¼¹ÎX[Cb„+0&ît˜LþKZH9?a铈µtcÔA 940x7A‰ªê«¢6„„Fö—@p¿a§Ÿ#P‘zD™;ï€ÔÃÇ©n)þóï1ãxµèÄO…iùÀ ðJHîe²n"‡H¸6¦X!™åv@s¸ŒI"H’·ß½`1‚™ÅÃpŽ›QdxÈ6߉ó;ìùŽ{ê'ßN"ËYêFhâ ´Ëu€Á%âÉS5‹3ÌæézGÏ“kççá:{ò™#Ä“šÇÍBÊÌý€ à¨ò¤cò¸‘GUÿãØQüÖ–_´š;çwj»çë qPŠøÜ‘&)ØËnƒ%DEò?N•FVÑ„øB+µ0›ï²Ñ'#là2øcV1–³; Œ4Àœ¹Çbá[·çBËÖµÈÏ"¥VšõžW”¬-‹µ‚uo¹AæoSVó`"€QUËÐs!\<õk,Q#¥š„@"éùÆ8ÌÊäSwT¸v8yÄP”•&²ÜÄÒQ#²»GR)'K9+;Ëæ#˥㵷.£ ×£É߯Ê@£nÝÚ’Ø$aÝ¡iô@/æ[ï ”c,Šöc|T7{®pŽ¢•ØjÁ•2”ú^±hábnmÇ I ¨k¾Ÿ¬YqA¶º…4:ÙŠ JõDV"%ìˆFp0èÓQœ‡n,@YÕK]|üA:?ŒÏÿý–tàóëë<’·µ ¥·Åø-”å‚ç]hÛ ´¹êafó eöèÿPóÃ5ļp„ÐÑ" të¬Ûº¹ßu§^×Üÿ¯clžØ°“±Å®t|Æ°> +4ùxñ/¼Ã°WKR|1%XP¬ýí@èYÒ^µR–` Á”¸ ûz;†l8;`!¡öøÿÃ]·3¯z`¸º + ð_°î‰ªÐD +a9‘¨ ‘Ä.é…æ9a Ä©É¢ª7€ îG1˜8TŠ8 NÃo[ÒçÐàyRŠ­oÈøÍo&Ö7éÁnîÍⱄÀód«¶Àj{z†êšéÄ|Ë'ƒ‡Í³JÍ)Cxc™â`ñnJwŽ·1`û»i‘. 
ÍgpƒëÙב½u®ŽÙÔôtÀ–ŽÙѽ“¨Š†O +Ää Šm2Eèæslµ–¥¶Ùi—J»l´B+á³YÊbFÍfÅ,vÿ¯¬áÉ'Î@ȨíËÆÂqšnÅŽEb¸Ëa‡ƒÅº`›kY ×í뜯£{•«×Øç5^CÝ »ÔÕXèÚ7à Üò²%˜™ÃU.Z°Â ÖO îÀœ:¸~Ê«f¹a®ò13a°ÒÙ8=ÊÂÞ²Ô»§iChÙíg‡í˜³ñËT96‰VÐfµ½½0vèdÒÇ>ˆBoÌöD¹d¡¹2[·J·t˜]‚Îô»™åwšPc +ÃB%4 ˆ!$±)TA#ê3‚‚CãÑÎhÇ’’³¡K8’Ž—¤RÐ,ˆXȹ Ü•Óvg´ƒ A4Ò”HJ H9Z–½Ê‹´Øž'¸¥¡¡t]b¸%g jáI+[´‚< Ó¬̲LY]Èšc¬™b­öQrÈûë÷÷œXwÄV,`´bØuÌrÚ€·Lºä’¶ÿD5U]Q¯'ÅÆuè$1;X–*CÆ°#GCGøê)Êæº CÜL™B¾Î_ÙAËŠ”¿¤ìBêU†^†«AŽ%Äq¦(œ¤‚­V(³0`oxGúz¦ó}Öçž|kÑ稞€ó¤o"Ì­pe€NΙÏÁ»ü¼£–áV&$Š)Iì&Ä*çuÝX +NUƒ,9í×µ/æ|9j½ŸñÂð.ÿ5Ý$¹\O\xßænËU¶ð=xÍi?ù~¹+ëf,"bÙŸþÒW(^yŠ1 Ãi jXI̪Ÿí±¢Ã´¤&U€ÜJNójÊ`Ü”ÂHÊóŽ>ô¬cJÔ(ûm² _k°G!ºøy¨XD3÷üv 6†#+¹6Ä(–i¡«l¸d¤áq/ø‰fZҀޔ͉a‚î`¢ÖvŠ¯cº…šÃoÀ:nlj蟹Vtü0›‘&Iåþˆù€«Jö5ÃŒœõ.ïúþͼ°%›êàÎt©á´aÛåû¯© H¢O—F’v¥)ê¾Ök¾¡àž±(bã*L•¸ý 8Y= + ´ÇV×¼KuÎD(  ôèêêª^&h`¶ËÂú¬33=ÛRÇüqmÝY-«q`š,,̯šÚUÆW¢–0‚»¤.#ì9^;’¨ënDn¶J–ÍLãv¶sKw)¾§à)¯¸T ¬‹|Dò÷QÜ:—\y%È4)ó¡S¢²BŸ¬ë8ò 1ÇÉ~-ŸÕ8³:Û6í‹ú}låÈúÃÃbMÕú-ð!æ[Î.þ^^fƒC(Gž0qÌ¢ßxÚ¯†¹ú +âÚÑ”ªa‡Öê¢0¹G‹4ˆ/Ÿ ñòf’]ò |d…ÎãÂØ‘‹NÂ^ ;âtœ~Y eÐ1E[æws›žs̘køgBB´E]þ‹i§”n]¦íµ´Q…ûnÝbïÝ vËÅÄ[sõ4úžþPËF—sL(ê"Ƕ­îEÌ„²k-Gɱ”É«Ù f/x7ÑÖ „ÅqÐL ,¥e"nÉÄôPë ÉÅ#Ž ¼ÉïÖÃÐùï¦Îàj¾-$”¬T-¤0 +8`*8ÃdMvûžX±ŒÉ4K5/žÜéO_E^úã¸fâfZâ?9\-pDÒJ©Žå7W'¨FeÊù”‰Iéì5¦Kw]Ðh1˜^hi–_Øa‰¯Vu ‚*g+Êî:Ъy2C ²‰¬Ö–gª‰ÑzšsI84Më²XÏ­˜Û·•‹JÄΧŽûèdÉâðDI£?/X ææÈ—ÚfÿÔ‹û‡á$“RiÔôGÍœbÄ4‰¸¦¸+qÆB5ƱTì,: 2EfœÓ…4GÃ/ß…J¡0 c7s¢¶’…Ó‹~Ò:ÈBdŽ !°~^åVlÓV`ðÉ6ÛêqÖ P¼Ž×&Ñzxo[B9ÂœC‚Å hï¹Ù4òèPi<4jÌ}`²Ð\œñ.EHÓÇø»RKÿoa5°©¼£­ƒa&üšŒ׌!Ù™xM³2Œ%h\ £Ý…¡yûŸŒÌQj°+ à%È7¢šÞg ¼2#å;¸ÿºÖÓ¿!Àë˜é`º( +O& û¦Øˆ+o•Ü`ö“B2Øë:UŒÈÖl DFáoÓx0ÿ ]VÞT‡äF–2Oy +Ò»qƒdhê°DôzÞ˜üKü§ŠalˆÀ4ðü ŽÔÚf5nbkü4µÞÓ bÓv“^ÚÙ‹åùýgòÿOÚIÿüúŽÕÄÏ"y¡ÒV*½sƸPÑ/(ñ +)I)SJÊ.UU­( Ï»TÆé lÍB).B™—¨, +Ž¼ãÂÓ¼3ÿЦtÜážJ'd>j "GÝA¯»B0+#rZŒYºL  (Ø  H #PÀÁ€àÈÝK"¡>lPlpà ¸€Øà(8ÀJ‚ *‚d€A.À@T@Ê*Ø`Ì6(Á.AœÀ@Àd@‚ ,¸@ƒ p‚`À` 4È 1<àl V@‚ ,¸l@'È  +À€+¨À#Ø ƒdp H€ÁN0à &!y;šÊ’È,XºÛ£‚mýR:^­ƒ»X.´¥J8.¨SãÄ¿¨EÍWÌ8î5¨p 4aÁ¸&ü@Ȳ“Âl!1²†â<˜-0+éÉ# âœád;a]Œyvlždj5߃‰ù‹S†§,žE™Ê„ÐÉâ8i‚v•Èã¤VÅ•EÄ:…‰Xã‰æµó(hÃÕÄy‰å +Åë1ìTßÀâDBÃwæƃ>à îæ&ªE¥‘ûf‚󉢊¯QÂË`Ð\âfcu RQi’¡D+¶eRìL•Ê¬0âà`fÂ]jCW…‹^Ѫð±Q…(ù|¨$‘"ªq…æÀÂ!3P#¤)ªðbO¨º +¥<ˆ-j.3³,WÑËA]kuEàLj/² +õÃ-Pz)‘@B}UæÔXœ-QáXJj¾ò‚U}É—ÎýQKÖZ£s±´6/‘_¤úG­·þY @5šQÕÁ'FU€ƒ™²J7D¨ìG¬ TèOdý‚=ìJ…C¡)âg•pÆî +Ò1YþÅÇLŸ_];å8Èâàžå?hBÆöòÎØÙãC$ò8qF÷”²ÐÐI´ªøP¼¢C'ŽmßT•¢„µ‰2¥ŒßãЦQų†oQ§­â&„gòBÃjP½œŠŽ‰3åþDŒ"'‹²t†]µVO) šT’ÙÁ'¢qÙ^^d8cM=A%sWÊSi%tÌ:G8@ŸæQ´î5$EžFG,à…?ƒSëý~u‰* äq[wE ï!nDbj25x·}µhž9$Á9I®7¡ƒê¹°¡¶N¼bÓz0Û-~£ÙD¤PêpD;ÈuÆE‹Ù®¦S¬î@däeVN8n³àÀ‹s2ÉÐâUy/Ç!áÖ8F›†QGŽiûêtJn˜ÕŠ…qiîìN‰l©‰ÓÄëÛ=`EEJ’Ó«N ›s¤HëÆ‘K"o™vòrîÊ©8TKØ(«¯*F%Rc?p\GêˆjQ ‘#§pÊSÏÈKúµ9RrªE’[1é™PXΑšÅé„‘<ÒIrŠ´}¡Kf<¸\Èßp¡cáÂÉ^l„!ºX(áÔmÙƒ5ò´± +F“‰U¸SñÚì.T{‹ äJívØß«~jÑNRw‰Ìî­>óG +OˆàÁ…Š]E¾¤Sœ`hxÞéÙ‹(Äõ°êSTšžp%—=X¥Z¹GãPÁù,ˆÅ(ñXãbÆq_ðèì%Á0?h}ü½F +.ÊÐÞjpC26ÂMð€Ecí +¬~£—Aò…”JêàüðÎTç,aT6}'Vê3¥¸_eÏgmG5r¶âS¢GýQNï’©Ž’p·IYˆ1q{¤òÑKÂh^{¦­n1uMÕóÙ94ö¦Q£uejLÍc\RÊU +ò¼æ»0 ç5A¢ù«ˆÜ—M}{ž‚JB¤A/b¢ù÷;„'Ý™Ü=yÏ¿ éfdÍ;VWæ»ÄL'æ1ŸGyµŒ±¡Fœä´,{h +¾ü Qâž„UǦµÿ&¢:§K-:›ˆIU1Ö÷ÔD—:cNTŠà«f1.4S´ô´Î^jØyÐr¸ù–è\¨¬ »íÁQù:•E'iê@^ÙÏbOƒ1o®RSÖ­ Þë…P‰tìlÃÝÔ”Hßñ¼Z!…û’O˜ØnZ4Ä%Š&l¤³r„$U(žG‡¥Z4CD›mjÐ"5Q›ÈžN§ñ^B>CQ&YdÜ–èºÚ˜D§ìh щ´×~Òy-5ק$sy4ȹÛC©>(·°ešz!E»†>}»ƒÉd“]Ðý°ä¢Æ&2/Ç­Í71ÅÔ'êV¸ˆ=F„â 3ÒV‰…ç$â #CŠ‘ÂT¨ô¡0JFìE.Š½Û¥LC’ØI«¬Cc +ý‹e¢s±Ô»„\¦¬„·”Ɇ]f\ä¼,åtþ’RÍcèP!À{Ì*œ¨èkµ*L •âffîSµ@EBÑ°Tyˆ±«‹%&|XS|¢ø¿ñYÔ¶­¸˜ˆ(1Ž8‰z8q:â‰Çç°b9(îbc/š`bÉJ;Œ8+7X +;¾{B,!w±ÃME£‚&50§•iyo-M«îFo¿¤*åÐÛ­Ÿþ¥aш…Um§æŸó{ ‘ØŽ–‹%­yÖPy6ÚLœ_™cÞ‡Ð=¥)¡ß2aõ¶|r”Š–ˆÇM +»)’KļAs?#³0 ñŒ›”KœXÈ÷š ­îpÔe¨dÿE,qžrÿñ_ü(aü>+êQeÄÿ“BÑÁ*JÁ©pÓªLŠZ™¿ieŸVvì3aø¬ÌŸÅ¤5N|¤‰Ï¢L™ƒNGŒVó J£ì#* ’B! 
yPâô&³UÁ=˜ˆÈ¶¢ƒ@ã‡DÊWHÄí8üù¬XVH'Ö>p³_æA$ÑœävBRPë>Š¨U#"òÎ1‰ƒ„È'•BäoL¥‘ÉDšŽ012Iò:"†ÊéDÉæ´/ц¤¶jдâ¨áhFøGÅŒìÎl^~…¡9èÔgªí±gèj%HêyÓ’ÅL‰d&¨§òHæýyÁdd:Á•(§^Í+.Œ\*dw<2—Ö­Ùûšuñ2¡ùô½HØäXĸfº’´®c¯@³±ËÈïMãÈíPI¼y¥D¢ÎÌËTaÏ+·« WÇáŽ+f\ž ™ y†hJDbHŽËVCtxúGЫD~êÕldXe”û-pZUЋ!ó2âQ3QÅÚ\Ee¡(ÛEU±FQ¢<Ù<âOEYEmÑzªxî§6™jgÐYˆH¨3¬^èñJAòqg¦fJ35+Çȃ:s@º39Ìù¾.«‰rÔ&!õ)ªPá¥ÇxNo!â>efZÎGˆdT¥6ůÂ0W•—†H4×,ö‰>eÎ.ä ]C$¾<ä×MÎuS¬™!…Þ ŠáX¢É '>dßdô`ª’ ×p†ŠH‚j\¦¨¡ò#"ASâˆVy^qoB!ÄwDÿœÂZuÿkš¶Û‡¨S%!oÇR]"Pj„>߃.Q3òÅÈÁT$änmI•ô¶Tjwç’¸Tz™¾¬9IŒi¤Jhºª™óE*UPÈÔd2<óš–=íLQË"BÃUI-{f,kËÔ·²)#Õ–¥·‹¥Cna=¥™ªºÜ®ÖÈ·RÙm9'Þi •ŒŽº’˜'X¢‡¢4¦ + Í„Më:[4QECs!!ËxhìÔLhì yÆÞ´•}*|2´ºCJV¤z}0·ë{1X–qÂOT¶ŒCHjI…¼ðLí%É3—?&ÓÄ¢CŠEWkb5ˆH¬‰ƒÈ3SR4EŠXognPÑAýþ ÕC­KøõªBª5Bª=NÔöD/¬Pêê-i-D~ §Òd¥>–;ì#64¢>Bvrû ’Éop¤NEÈ1$SÓ"w$¿c[l‘çMÕ·¡zøI"î¼]Q‡bkøŽÏ`i„û òƒ¢ÓÒ…·Ïº2kã>KïÙÇçŸ:Zô’¦)„#ᔼ¥ÔêD‰à—Hµž¦óAt-ÄiÈû ¨ïq×Aƒ(Æ0Õð ©Ú ÿ—ðmŠTPL y +ÞtrÉ7t0BQr{-Lj§Ù‰#§0ÂÐEmä~ˆÛ¢*šü ¥“×  +–q[´(;ÊŸM¢ƒÅÐäAUCÜjÅZœ`T”¸LA sÆÕ>(Ib¦NŒšWLD†¥*Æ$¢ŠwåÙ9"EÁUòpÑ…†TJ9ðx\䟛¬¾IÅ&|µ£vÂÖV‘Û‰N( +%ùAä€C9Ùíȵé`%"%gj<¨WÉ/^— W§çð@Ìw†äÀ¦©fCæWd³æó°"&º 3¢¨â)»&á1ä4Eœƒ’ƒXó"oÕQKÝfA¤‘3 “¡Ú<¾P²Ps2kÓ/M¾L 9 ÉËäæ—„oIˆå)JÌÊDs°eòåás¢`ò4„B\µ\Är„Y£O³PÌEJ>pB”=2;2Òa2¹¤%Û¢ŠGT<ùAËC4©ÑP¬$eÇÔi¤U5õ SIä`¸jÈ4‰À™˜¡jMD)ÃÎE!%ÓXÈb™u¢;rL(¹"Ȱà«3á3a4!3.ጠUBVBQyRRG<Ò¼b!W„Ë%äNð”W\¨NðºsÄŠ=a†‹'ä€z@Õ™M•Œq‘™wQ¡he±é )±‹æ`ìá9A†>›Ô÷À,•6 9°Þðs#‘ÆvDeÌ ÑÈÂÇÈ·Ø=bˆuì†F"¸óHˆݵ1ïÍHˆÕ„n + ‰)˜ê¦ÖDSM˜sPY•tDÓSïô€eд0ó"LV Óƒ™™N…£æ;Z=Ø; +añ‚D+ +Œ"šT½ä¦ m¼Ð©Eˆñì›æ‘Kß}«›bA8¾e®ÍAY"†˜“E|΃GÇOàÁÔæ*9øM'þK‰&°¹¡pªÀƒ•D) ²?–}n +!# eIó©ñÂfµ%®µyè¤MŒ^æ¸ yÇfÑ¡Ú WôU„"215üH‘fþ‰V¼©ñPˆV2Õ¼IÿRŒ?ŽŸ—Ñ¥3 +"CZª†IʨU‘Ðå;¢äˆ²ŒênÔLós1 ѸÒZ¹Y%Å`AÒòƒ¸ tÃ! Ç"+Xö$´V'¿&­TL°b×*FR'"žJ†3¹_YɈLT†¯ê¢ÉPÙ¹h¯¢oÕ**rÏ*"cªÆÍjŠ$„ªf”J¹1#ª© )‡ƒ(8*¨ŠÆª=¨ª¦ˆ:E)–á")™b|eÆÐ7+L"ù”ئNf¢6úXIIýt‹÷Õ¤µs>.O(!s¸U/W•ˆd÷2ÓÍ=beZCRduæ%¬Î_ŸÍ«ñ!;¯­^;…: GØ”é„k$n’óNybQæÜÔWÄwŠÑQåÈT­¦¿¦ž’9îæzB©æ¿>)ã’°Tt´>?ŠÈ43%t8ÿøq"[ä=ÊÔqNa÷ôý«GHS¡‡Ô¡¾|ÚHg¿‘êUjÚ;”Ä낤eÖŒ"ÅŽšt6ړζ/!“Îú_µאּR[nÄÒò–ÝDGpIÑ­LùÑóá|¦ÅxÅê–bÄ{9dî’Ú~!Ò’¿$÷†JÁŸÔÜ©¹ìþ9²—[ºxñš£¯ò™fsÎ}3Å2H&dQ³T”ç`;0?Cv³ºÓš¶Ö¡êPM¨šNѪhŠ¤ã* Õ©šÀv¦¢SµÙ¸–Ð÷à"®«Ž`ØÐhêð˜T„¦1Q5!8Ñ(’o†¢Rãpñ±?§ÏÌ,ÌÔfõM©:-7Ôš¡±„fµ9(: ÍVöIÞ¦¸QYÊ•+Ê0 C´iÌ01tÌ¢R¡Oh#¤ÁY‹ò @AKÞÃs~<=*–z(Ðã–ªÐbv˽ëÕnª¯^d f™âå š°~Ëï‰B逛š»C4¹ëo£¾ºö`‘›AD¤k Õʪšªt §h4Z(î{†¤"#ý"S%‡bñÇÝ¡ g¤4]HŠ#Ç´Â94)/¡”Z#×B™âXf!R™‘1md$Ô¦»øô”=)¹hC-Kd MEÏAKhˆ}›\¤§å‘9;4Ž i¬¡e4gmh„©FÉk¬“D¨^¥´ÉG†hô#úÜâÉñŠPþXV­`Qˆc¨$bTEvÄÁ QÄ –ÌZ$#ñ`q¤2qxLÕ¶¿±Ô•¯cúÀ4j]"a²?ã r¼‹S‹Þ­^!·S|™^ÞÔ«×TU¢Ä-¢%JìZ©žÔ”_òù5Õyµð¨®š©ðËR×DÙ¨ðD&Q–ÖËT!ED5WTyÜKÝRTž‰*ª\ñ äÒfj܉Üõ +*‡å•V+¢ÜL鬞)¥ôQÂfñUžZˆ„Ý”…Ý:,©¸HŒ5– Ë‹X‡UaYvñH„µˆK‰]U–ºˆƒµ_¢.¦‰g'™Hè·ôŸ:ôS…F䌨ŠDôöé42ôüT‹û¡2C +U¬B.',y)¤ÓÔûJ¨l™FIØ­ù¤å<Ä i—5—FŸ§™>Ó˜d¦Ó?íO£èôGÊDûŸ6úyM5B‹içŽ^1]Έ¢¢\팢H +Ô*ŠÆ¡Ã+bÓ>™‰""G‰â"ZùB¶ÑF]X®›X¢‘‹—mTi>y°Œ—ÔZd¡…cÜhøHë„8V»T¤ˆ­¢JWQHmxà"Ÿâ®ž­ª(IÅ%8’ü†S”w!Å +I𘪾÷z#Η„#H'ˆ2áÍ'j܃šèE’éÈt”§æí`BÓ:ÔÔŸïòâÁƒúî–î÷Iñ>ჿ߰ò·É÷àwMħä6¸È0KE©3,^|ш¿q4^’!Ë•Ni¼xpj³ñ?Œ¿ZÆM‹=¤g¹yp%$5!å’¤.´”ŸÜ-}"“àIŽÔ¤“;بr]¾Ñ¹ÒMßä /‰ý“Ù,¨t›hËPheÄ"‹·c +*“ŒN#(HTÐÉG·©šwVT/¢ó7lHѹXÊ›hF:”©Î)Jø`¸Ã²s‡öÍØ%Œ›¦qw‡¦‘N[•ßÄê„ñv±S•*oûìÌÆQ¥ÆS±|o)=þÓ¹ˆ÷Œý³ßAÚŸÞ.—³ªf‰ÛŒ8O9ãaFµ£ªø~‰¼Óh[a«³Ó ™òµŠVnRoå&««‰0ˆ.ºhhãzo­³‰q¯`ƒ(²ÏH"û„CšÉ.B¡˜¼F4·×)å+¥Šd͈ˆ\†”šF»X•“SŠG±ˆå)Š15ÅpâößÎ|*ºe’¸‹4õêÜ$QŇ_øü ÝŠsXCUñ)„Êq!z*CñRŸÃê7šwqŠ-’$¦hv¤› h‡â'N|Šæˆ9²Žt­Ó^âÚ3‘ÇÞ’H)ÝI1\’„êäLB+çÍë›åMò4bÝá‚lÌ]»¥F¨¼ƒ»ƒ,‡—ýA Ç[¾aýÂa<>Rp/áû~kLk¦dKNãÈIJÅ9£!Ùiƒ?d¹¥|åØ%™¼J9‘2Qe§&D” ÉGÒ{¦¸ t_ÅL÷úª.ï“…ÐömB%DBû¾‰“7ò=¼Ñx¸Ê%Ɔ)³¢ŸkõÚ‹vÒ4Ñm›|UÞÅ[í1<Í\ŠÌ¶Jì$­ž#š›Š<8cù iá4&e†Ý½ÂXˆû‰¹ž…ÄØ"ܘc®73¦9U¨hHf"kßMÚ¹ÒÊF“öSr·V6%Ylµöü¹ò¦±VÞ4â¥JÏæ€ü»Kí/_H *žf2{k³xw%S]IE¨¨&®)žèª ×Èñ1+ÇiHÆð5[UÕ°+¤ù<Õ"kZi۶Ϊ“ª<Ëd¿·–ÙË£¶Ù'\GËù\µRŽún2›Ð^4ÓŽdß|*#o%£v2Ö¨“5FRåB6Qê'“šó†§YyÒH"¦¨äI[ ‹XIyš±”ˆ ò£‘ÈÝt´òÂqdJÆ=d¤:±ž1¼Ùœª¨ ±+Òí'­’L:®3‘MÆ}ÜäRÇcÇZ®ÐmX\y‘«òet¨^ßl’«,©M¼bްʼn·Øâ+,¥Š"m_¯<ȵã†Ä߶ã#º„Ò8 
EHdʼnXë(.Åø‹3Ê>µŸ,,$.'™;MÆÅ<µ(,G™âbŠ¹¸MNÔ“xœ7·.N¦©QÆgÎ:Æħ8\¼f_+׬Z¼¦‡q1+g‘Lä2”³¬Î,5ù”«·^5ù”ž|JÖ“Oiª(£ˆ´éƤ2£åêŸN‰‰¨QÐüuŽP‹X!†É)¥ò“QœœÊÍ¥Ð8oe)"îA~¯Úð3‹è ê¼9êùs®íç‹«èË5vZe½¿çž“êDÖÉG%ñÅ%WvAM?*žºâç‘?.+i#ÁS†Dïôæ»kwm¥úŒû|ÞiFõ¥ á£[][xþ^µX÷Ò¾oŒÛ&W…„~Õ>^ÕD„û'ªF×ÿ‘•„VHƒŸšçÒ=;Vûüfç7üMZŽƒãÑ.eõÊZm"üj¬§’ªxc¼ÓïóîÍ Þ\ð›MŠð«‰ð«­:¯&üj£oZlÄ“F“ž¼Ð&JRÄs¶èÔðÉDái#Èœ¸'.úSCLÙÌJôÄm.žº©˜ O³b‹7y¡U+1çH71¤0:Ú?!( +Ý‹"OÁpÇ1²½j#î N'Æ©'èщíìØ£<Êù†15‘1Ÿb(¸Æ‹SÊYÓd\&E!Ô8 GÆ=1Å0¹ºGP¤û2;}” ÍN³Ù¿V¦f]¦ÈV¾h5› +—‘SU +Kh7 +jéÒ[F¼Ð&ZÜD_Úe8O£®RÛf‹žÒ +WÙ§;×—¯´­¯ŠZ¤¤éƒc‘ôÁ IÏázrÁâg¾W×îM¶Oq;)Qô)Hs-/‘ò7’ò‘%Aò‘”­‘D’¢TrKØM,ïH†üÂ_H¶¯ÕÙË[ìÐrŽTï…gt5gâ¾(qý¬ÈÍû_èb©ú†,s¡Ž„e:óZJ Ú”efb9§ãÔ"•Ù%¦.¥:Dæq^†^“™ð²Š™Ë.¶Ð„3ËæRB ¢êc›8¶XgÔ¡+T3‰×ÜÆæ66Y¿S‡ØK÷(¹FÑÄ|uUÃG/¼½Xß ?´ +ò¥}tã£;Ý9 `ƒ0À\°h ˜À Ut†Št +2nrhÑÝ¿‰BZæ‡E¥Fê{·­:˜{Æ'íN)Ö¦™1S3ëN8Yö$]gp]wŠo'¹°µ;åô0"¤BêNŸédJûx;u§ºÊiÃŒh»SGÊðèÖ;eÍ §2³šœÉeáÿ›õÇép悯ÕÇpjôNÝ®øÀQÅâõöpcZx誰FòNí2¿ïÎñ´wÊ7È l„KØN•t€  Ú©Ï ÀoÇÀÇ¿C)ï*g?Ž¬9;U;lO9‚”y?‰N<õœ˳xªÚˆK ì§OQ~Ýy +<GÇAÍî\×0 +ŠµKðTÝ©iè‹XMÓ‚§¶<ï>:tìG8ÚQÖ‘jˆPþý=ä1SB€‚XÌÔ8‰»÷‡n+ñÑ?y§|Zññ‡à³pâ)Eã¿H +L‡´„ Š§vððVU ¾â”W“ýœ1Üó¿¦í܇“ΫzÄSƒXßéU;IfO…© ã…á)iÌ­¶°Ë!Í–ÅSýYØËaˆ®ÊîoQ<$è$ä4åp¸9ž[‡C'Œ§rCñÙª0â)?ŒÛŠ-cø<ÉÔxªy]ûë°ì¨dõ"]T€Ét§ÂU„kÒf;ž%€¤¸ÉwÇSŒôÆ‹[i}•mjã)—Øà%7\´Uþx*dv?3³ hÍñÀ‡SÜHsxp¿èÜÕÝ + …¥ÇS»½­+b„b•, 2ò”ÙÓ­gµE©Pò”¬A­oXùO—ý@§¼\I5Í"Œógg"‚X’v5ž"L„·†ñÔob:ˆGã©4g4%qò‰£ë0|!ñTNJÂàiŽg!í¼Ìîò‚áx*Ê*Y3¼:žZúÉJ¢é³§„îŽv"¸ŒrÐ5žº Ð7CÝGok®×ã©ç™h¯Gxf‚R6.E O%œ-4ϨÇSº¼E@ˆw>žÚÅvüÃñ”xH­ú` +QbEÔêIç ´ñTÏ +ܳ"O•ù†Êêã)L xì)²ó´§{¶`°Û󶧀VñM¯ÉÚµ§ŽShtãÛž²ÙŠqC©˜¦9œÚžR´Í½.÷”÷bgÎBü H‘{ªC L§.Ó=ExQ‡ùj÷Ô\HÃÜSL“dQãžÚB,í) ²Ÿa>¥àS´½sd ¾Ê9qd¹›t¶”zDO›vOu×u¹§B·ñ¾ØQP¸PûcOhÑ1^¾§f[éåhst$ø{€<$:•N½ ¢¾§ž­GÙÞ=|ݳ°Õ±{ +,_âÐåÜS\YÝì›öÔuÐ󣶧¼Ÿxä(„³Ší©¬úðœ5ÜîöÖ#Ð4&b{ôR:Ž›Ì*ÝF{Ji øcãZβÁàŒÂbgïì© q×OùlÉ–Ó¼ÙX¶ì)yȬ)ìàØr…=õYà9JutX³§.Ó>ýŽ:ŠêU¸cš,ì©ËÀ{é¥,ØSôzëo1ÄöÔV`›£‘Î6¸m·§­ÚP¶§ÐÉ:iø¨<6åvÁ¾ÖDQÓžbì#Fí©Ü +šÏëwö-ZEÙˆJšX°î) +Lf“TIgƒ*Âé0¨5™Á³Ô¸4‹ªÌî_ &X®'sÑôð)øi rJøÔ®†Å%| +{ðÒž!AàSØ )=|yãG_†^ZÊç áSßÏ}uܳ] Ÿ’Ípœ=| +¤ì‰§0·4ÃÑù° Ÿjo *ÿ\ž$t¬¥6ºÈ5‰Ól >ÅVJ&çÐ`j¢’糌»+ˆýj\çHÍ9pª pº$¦‰B8ÿð)©ßÅeçâûl²Èv¢œ)– Q߶§èuÈÙ¥e~@R/–ë=S=øÔ¡Pཥ¼Õ>•Ïqw~œ×ý³%²ˆ„Íð)@Tï½_Ì5Ÿ¢.‘£A±$‹ÔÍØj}Ñ©¦‚OÁˆÀ‰¹·ù³7Ìî$5a'äøT«ÁmµBøħö©iÈñ©„«özП"ÅÛ&aëY>qè»—;ZÀ‚O!’Y $œ‚•ßßSÉyÿuÝçôž +]ÑàPˆyj›nïžÒU–ñp+Xi£(¾“¯æžz˜|ÌBùÒ°ÕBJFîžz£=©ý>3Fò®önÜØÜ=EwžG`ð)R6îú`á²JöôƒO1LòæªçקF¯ç°€ƒO uü67Þdæ¢Û2|ê²\1#Õ#àS¤±ó |ç˜3Ž•‚OŸó€ÛàS÷¸")ÆBãSŒÉf +Ÿâ…Å)ð]©Õi¸ÌŸÂÚùKi›C>J;éiÚJçŸz> ò)I–› +|®|ŠxTŠO ò>‚~ˆOñ³½=H•OÑJª|…”‹Ôg-{w˜ò)¾Z†SaÐÐLd¬ùÕVÜåS ´•öàϾ»{)åS¸ßÈÍCû—OE!ñ0ùÔ”çªÒ@*£ØãO>•ó³³´®4Aù_ùY/p2 >%Ó¤S–˜)>•Š²Šêù(þþ+Ÿ Ÿ"&ì4=&&ð)—¨(˜ž*"à„°²TVw£h—< 𤋮ð,T4Z@BÙZÙ¢ñQð)c¿Yå‘羶q°%|ÊòÿV‰ÀYjoñ/>5ãÔćYàŸzŠzŠ ³PѼ¡•’ëy,³øjçA¶·‘V‚øT¡5BA¦ñÓ/T|Š·ª\/ãS}ë4ÞK¢ßøTx:·’`Õôó ÚøT´@Pǧ²ÖEîp?f¼§˜‹iM®þž³ãYô/˜jOÉÓÚ‰7!ãöÐB*(Ä”-ZpI³nkO•¦#!}µ§´°Í"ʼn}J˘ªAˆì½½RjâbŸ +†“šŒödq~ÕÙ§¤ïY’}Šp{¡8bÕ§ +ÍÙW;5´¨’}êPöz´VÂJVìS”¤ªéÎRT(õ©O‰\ÿa §‹¬f“‡WÌÚµ»G«9Ì¡¦-ÖD¾ïó¿Rïaê­`›Õá>¥íBíœì>•ÇÐȾ%‘…„ñS+ü,:†ÉOYÖb¯ï`Å?å›iü³O7~Ê¡š°liÂOq[>ÎÝ +LÚ9QĽýI÷)­KJÚbÿtŸŠ\J¡ÅC«LÛˆ…v·uŸ2ˆJÛGñ¿Tíæ¸OdÇ{¬b‚P¥Û^\¢Ô·¾Á¢UªKŒÞK”rqª ?Ïýº=F,³ÞÕè}•Ž ìñwÅ•7ÆTš#×S»+¨H`^»&q'ô¯j¢AV*šôT•`§PuÍ30º‡ê‘·3ö§þ©Ù Ì\\”ÕVÿÔ@üoä†MíŽÊQ{”ñ¬ÉwdRŒÿTÖqyÆAP'Ž³tpùëµk€ÊÚ’C^h€j¥QV®OBG mê|µ•Î°yÒÛ^€ªÉRÁ'¸*¨œï +ö_T‚ŠdŠ$›P‚*ç&,‚7L— J)æÿÆJPݘŽ•³•»»ä€¡ì|ŠÑÁž ’3t9 ›Äèäæ½yTò$-Õ¢¡G ¡*ÁçE ¯Ø!‚ꪠ"Šç ªrµ,µ¸àª™~õS[:®’ úOsŠ€éô¶c‚ŠtÝu#Ça„%±R‚ÊffWQ·£²°=–Úݶ•WÐðq—‚jÍ*üiÈôVP•ZŸ~žˆ„˜Zî*¨þ…ïxýûA’:f.Ò)¨ZÏjö‘_/lþv22]!’»¬<õÐ \O^·‘Lú1õ⦠ªÕtá|~t +ª„)8Z¶rj2•F­»Ï•\˜À#@ž&D’Ò®þ,°¡3ε“l‰•©i'Xü´Ûàç¨ÞôÝ%¨ +!‡Äîm•ø-ºè´„ƒ*R&øh×m‰aTËšh˓ҭؑؠ2ŠbÖ‰36Ú¹T„±^)·tP‘®dä&Gû¿2¡8¨Ø^NKÙI<Ùª’÷fxC#Pê ÑA…__)ÿ“¨&°XÏAU÷’•IÍk}–½·k N¾Â<üSf¾&ÂSãpë09øu¥œû8¨JÃÕNN0WPµ*¢æŽÄƒ C¯m@ÇAeñ–±`ýEB7Ÿ ­ ð9^}zª*'²ËÐriŽ ÎAÕ3BûAå$iƒP=6'^ý^å~„Š/•¶]&ïªB¨D^sZ©å<¨ªÄp]]"þAÕ`a³ùöàA%›×”¯"¶˜?¨­ƒåKoðr„¤ƒÊªÎUµ@2â 
ê*¼ÜâwÞ—dLÓÌÔÅæîtP-¸’Yi+7îÚ„Ыåë÷¾åÐAEB Ñ %æÀAµD]õyx㲧ÃÝî(SÔ-Žæt`ªÏA%_w}èõAå#X‡ð[‹=¨¸ŸLìÀ(TÎg9n¡³}P!|™¶FX©}P},rÈ‚êV0Go»ÅEªçº }£48¦™Šñ°\ÐpGMŒ’ „J2Ü ÿn¿2 TzZ_Éö ÅŠcj„êø«¶œ1” çµ(êfºˆ-Þ3©hOÅ;ÌÙGp—S»;}Á’P…CœO¨ªr +ƒ-20£(^A¤ +¼ºˆU²…  àOØI~ z(Tì¯Úe˜ ‰)TIôW`WQ & +USaJ‡ú¾d£P%ÔFŒv§ê*RÓrF ÑðS¡ò“òû]™B%±èÔ‰ƒXɱ™P™†–b!ªBM ÀB¹d¤½˜OÛ=¨,;¡¢ÞëÔ®´Ø —ÖÀû¿ñ¸€ˆ·3€Ý„ª _†Ä‘Vš¿Ý‰Iuìž¾ïT=µ Õ*ÞSh\ñÔ¶øøx0\¦\¦õ ³×ó¸Q +U­_ƒÉ0û³ÛñnÑ Põ²¦~_ÿQ Z [&æ8…J¬œ» Äñ#]¤PY¨Ñ«ÆƒÒ›BÅä/9eÃߦP'—4ª5qíRÈÍÝXÙU…ª$‘Æ׶2D ³%ËÐ +U;){¡ZhÅãc¹Mˆ­·“9š U'Ÿÿ31…á /T¨l0o§ÁsOÆ]¨ÖqZ¨’Çe¹B’ƒž !kj]Êk(Té\Ç1Ëßp*®»•V¦A¡š.©ÅD¡/I#1P)TJ¦ôèú(T£6ÿ‹Ä’B*îCÓkG€¨KüKq ,*µUO¨¶í"„Ôn,§b»:½h9…*«"Ikiª«)Å(X+‚‰„{̪ÝÌJHÙ*„êŠó^•§PõÁ]Åxú7ãVªÝ V§¥ÿ*½¾‘MÛñµõhöŽvó|°ú+í¢ÚòîéOYv ŽbÕS|K*ÆÚ+nÀü›CpS¨Lp^Û¼*]¯û¥u +@½áŒ¬OžÁzŸP¹’ ÂÖÎ*4«®TÕǸN“‰ò*òƒ1“BE°æÛŠtæC¡Ê¤ïW€å,c7ÀŸi| ¿ñ`>²O)þŠ7E~Œƒ;Da#ô€Ë÷‘*²o¯ò<€B•aÝFþ@F¡êúÑó¿/208|<§P‰æGƒbqŽ+T¬êEÛËÉð÷ô_*¾£ë×r^¡òÕpùE§|°Bo®"áÆÿfþóP­_:ý§¬sûòÖË'Ø?¸F˜=u‚»¦ž+´*;_Ë3T¦ÅG5!_’e¨ì+/wD×l^ð¯ÈPí‘X\W‡bÖj¨H]h€§,+T¿QÁºÍŠgÃS*ùfî&³ÀIàZ†ªL9®ž¡: +Yü¥}Šª DaA†ê‹yïR»b]Ë0Höì‡|ú|7*T¤0°üçsn¢’?N¢ÿ¾†¡šÂž•#ê8ÐÂÄô@Û–CƒWUF¾·Hª¨ñ7øP’[‡cƒÒÙkY=jDtÿ!ƒypÆú0Åþ‰|D+ÜÚ¦ÊÜ7Í+TüiÁ¶U¨ÓN«‰,T[Ü8VŽWoè4Š^ o Õ¤áèö·è£˜20™Š¡d>&CE5b·bûÂBPP*ÄÔ4˜÷.T” m)4Žþ UªÊyˆOBTIqCÐMbÿ—Ñõ0T´çÍXà4èL¨7œÉ`†JÒ)C:f +:kKfQ«‰a–9ÇSÔ}™i„,ñ5xš¡Ú´C›õ41CõÇKö Œ®›¡¢+Hß±1§.2TN +ÏZ«D8C4¡2T‡™&õ“ˆ«Nn *ûB/TR— Á‡URR¼c-ATÙÇåô½;­ªu¢o².ôBzýW“±•ôËÄ4ÿ©ÓsÜËŒÔ%írÀˆ'å~†q{¿xC ~Q87kš_[*ý^‡Ò–ÄP9ž2éw\ÿŠ¡²››ÀÜ). u"Ñ÷2TœÂ‹ˆª:í»é£Š5L*Q2»–S0¡¡‡+Sé]¸%Û»´Ê0×O`¾oTÊ1Èø¥?õøŽÙhôø.|t) ÈÍk¥À–*/%ñÁÊÊI"˜¹‹ÎP•¢8*kTø^ªG‘Xh¨„d‡n,X«nÔ%êk¨N(c,1€4TN À!ÕP}šŸ×Î8Tc +æ4É¡ZmÞ"õŒÙ”²Ä¿ÃQE´ Ugˆ”AuñÚÂZB£ îé·ÉÁP©Æ¼Ö° QuÌPUØ,v=½@†êœ!=f†ªÅç@>€×ð¥ ȸùô`?†ª£Ñô-+­L,C6!Ä”²ÎPe¬H¦:†j¦î?¦&3#MD—¡T¾_¨¸Žá +¸-²X ¡:M | ”C¨Ø½ ôÝ«@A T;E¥Ym¡‡TÐ?ÊN‰*zx:h9ßJþ9ªìß.s&Éþ\fEžÖ"„ÛðžVŽ=£ƒO#¬´ÂÁ/® œšá§Ã$8,3* ¥=ãò*QÞÉ›¸÷v×àèÕâ“Œkö)2F‡8âþ&CçIùìvXŒz‘9Ëã#I–-œ +tÎ ÉCMUžDOë*@FˆOAGè„ F·i~! ºù9ÏÞ :6&˜ávü— Yù°=wÄ +»S‰+gÐ¨ß J8Ô•Ñ +*8Ì/I5b–ùØ$5¤ŠtT¡u3üÅ>s¦Rsa¼ˆômlÂ%6Tùí'0=a­X +V-Žm ¤G”pò’XuTˆçíýdªä¡Ë¬“S^Þn«…ÐúœŽâ^EîD‡R ¯²FÐø œ~_9í%žsV†1Œ¿ fÏGØZ’E…LLd8öêýMŠAþJ}©[p1:ÐQNg„]ôŠ¥¬1ãTö½NbßAX”âƒeø©Yñ¡j2‰Áeªû4öG·)dhWç”me®˜7,%á*ÝCÔÀ_t¢þ7ñ¨¯TþHX{®|eŸ"GftDñ¦ã»}xÚòι Ž© +ª«ýUŸÏÝD7ð;)žk{Oe@Öbâ!ì. sø¶ |!ÄæA‰]ð‡diQâ6Ü S¡EÛä»g׈?ˆ0)ªYÓ%ÈrÀÞ¹DPibÄ>ÖS“P ­·™¬Ê`[S=;•1-q¹kÚó:Îf✠ Ö´'„îOœÂ: QýŠOžcD ´²- Š©Éðt0»Æ/MG8Óí s†Ç„ðÎK ÖÖ³–Ú™ÊH*$³‚6ŸcfD2ª8q™Ã.¥Ÿ }þ'˽"™ Àìü”RôØäþ8T#¡ÄƒšEc˜96ÞÉ ÃQ<²R×á:–AO¸¨0¿4wQPœ¨uñ@‚(âRwgÞ»X™Õ [.áSXpüY¶»!#uÝâG(›ŠÔ.ƒ¿ò¤ÿü¸¬§€eïªäk“f˸m,B4T“½½°ç}çòDgž§ê‡‚î_™77¨â:Ö,~é_ƒ[±=B©Ž(ˆ¬ðˆPfÐP*5¯ë…'œæ ã@ 8ØÔ›W7´£–¢÷Y±6`†&Ž8@êÖ@b¬®yeÖQ¬#öööI½¤˜²NöüPª‚µê#<Ì_gÈZ`ü¹•p"x‰ø©’ŽŒT_äâx*­}Å¡A§œpŒT1 „ÍDÄ„$‰â&8-wQM.¨"%¼ n¿äðé-âL,Mç•‹jqRQÅè!y£¶O!DAeýØ3ßáÆzõì"]†C³«!Ž#³‰A˜¹¾ÁGEáèáÿrŸ1újÖø¶™“ð +*<ë(ÒÚ¨Ð?>hYˆ¤\Ä>}GK i2 Ë0êëé"€×žÞî à6‹ +ßä@Z +¹S¸ÖL xrua›%2=¼v—?3ZÎã™@+úiYž®.›ãbæh|·1»·Ò#GþÀvÙgGÇüô|×ó +ã-ªÚB +ç\OùÙýKkx¦~±ì™¥#©|Jãö ×ðgUJ~öívF7ó ´%5rÈÀ“ì¯~F˜©¶S‹ O0‚Qšø›­í‡ y¡LYõà×gM– t3Ùë ‹ PvY>¤Áà¢å sÿ+BX´±Ax ®Œ{ N eÒƒmÙwí)“$õ=¸ôÕZäç𠔸-p1Žm;Õ™žøµ.¬ù)œl5•ÁÊlâ¼| •°÷p,€¼ÿšFŠê–=]ÑÂ݈3\äç1ò%nÞùv]mŠLÂÔ{›ú8ÆÛ_l¸" ”) ŵ½[æåF¤º¿·‘BOÛóíìÉÿˆ&øÛ_¯´iS +Žõ0ãz‚Y…€›Ýúw!à‡šöŸ÷›7*É`£çeMDØ=íŠÇ¹ôEI•êuJ¯Š×Î抆¸³V~D«$Ú·Ì é“ÖŠ4 +ëÆ@ |÷ã >Ñ¢*Š—T™€ÃôØ„‹¥Êö&Vøå5igqsZ&ZØeÁö ²UD•Ï¤EjGÖ°}Ë„'Ç0a-´Om1}ªT¼ºó¼4ý‘µòÉÝò“®ªp—’Nè© yà(Ô5U# 2£„ÍÁȵlwô£Mâ#憎ÛÁöèfj½ËÔË Œ}‹IíKn„õG:u-ȉåubÔûfúu”äEÙdïÑŒ/ÁZmñ”)ñŽˆKÐõ|™˜^sb(a¾ x¯J9Á×Hó©ÏåpSÒpæÀ6ßJNU±9‡ Â…G. 
•­Q‹ Q‹71>ÔçtuŽfÀ¡`Pµäz¬ +ýŸâ×o „ âªùY¦Ixx‘^éÑ(ºV¸C(ë"ú ¡‡BZŠä¿ƒÉ?~‹qÐ×ñlΘ_òUÈÆ”À=H¢Ø¨1u&zq½P‹Riø”i 3±jÜÔ/ÇDqþdj¥W¦ן/ áü§2àÿ€m|ÿ’¼dš2ÆôÙ Õ-S·2åû>|¤3UÊ$¤´ü Ïsèj¨gô#ò#¿CWäÕ)˜>ZU­Þ@ÛÖA€Z\P$kLžæá­´z,‡k"b=St¥ðòÌÞµŠ°–ákë„ÅŸöÙò&`†Ñ…Ûë~p*Ü•7‹š‡ëY⃳²(#@bõ¬l#I{Ú¯Y…£„1çAãϼ<ü4釂ùï­´á¢ä±t¸Bü[ˆ±§ijx:æxñ­Ëª‚Ý^aùµ»µlêQ‰áߥ¼N@-æ´T]6YH£ã—ÓÝ4¡QG@Gi]ô”ÞÁÁpb—†ÜQŸTÂã5—o_ʼn“À‡p¤Ÿ±BWy–§Õ/1qô§5Ÿöbq#e*÷¶xhŽèªÁvÍ!üÒâ7ò4qøó¸³È†Ð_yv|›ð…22øÕY5—œã[îJ"éülÊ“Å•éÛ&Aد90ãÐ^mïíƒB»ù\'¯\V+X”*j@íê"„ÆÀ +-ƽº4,~áÂt÷7‚“´lŸÏÌ*nJÑ‚Æ +2è= ×”¼0ÔÞHa'.`6“jÚLÆ•+B›PÝ(ÜÃNׯ:#§a?Õ¹È_ÈzL·½má)ÙMèÃF¤›Äæ&‡¬ÇƒêéLYÕ±ëx@.×f5KiÁ»<õYiT14u.é–‹7WÂåø7£„{1K{”Bñïê1'uÛ™âtC°¤³ï}MyµS«"Í–_—_#iéÕ!y°[ž“!¼æÙ +\!²¿ÖEåÖj¿$ÍQ^Â%5©lY{‘Ï$úÄ ©hbr*å.¦‡pO¥¢s2ÎÝeÛ´t¶LwÑŒ"`S P×f0ÉJŸuÉ쥃Wxý¯v‘•€÷ðÏÖI<`Mc='IÙ¬Íǘ>]a ›&"Bj‡ÃR³L¡›<ï 1z/¡5Æe%õ1ÅÝV¹c0ÊhX„¤žðz(Æ^À© §|¾Î3æ üWgh“ zŸ.DrU$g/U –¹èQ ÏB‘2<·….LZÜ°¶UNü(Ç“ÍuÊ'ÖþS}s5—<’®¸£²'“¥Y€L­‰“÷«6æ§Xf¶l"e£JR­±ý°eSß û dÝhöÂææÌg•Òà2b§µ H…í±]°Ë8†Ð[qb·n!žX,äl»íë ™4¹ȱjoV t¨&øÕ˜AoCeLm‚u¨Š¿¦ß–;$¼ñÍSÅ\Bw}nÅ%úUë­%†)Û*WÆ`ghŽ‹¿sÞfYŽÕà ß}AÑkÿŠ&_} +¸-eáÉ\ 'Mªy²µ×u[w[Hþb–AIËÐTI3ɹ3&šL#òy»Áé5àÖsʱµ_E.ç.:¨ŸùàPÁy«I+¾ E›{ÍuŠø´”áèŽ*‹;@bk+r +ìôoc1Þï<Úû2û…»ñIv`Üb!j=¹hÊóS¢$ ,òI¯r$þ­z†í\åÊ6F/ea4Ë-F‚{@–©ìSdµÌ”˜ÀÊîÆ +û¢¥Á7;s(¬nT®õŒIŽ^"o Ò’©¡v«ýw!†ÖêÎ…+V å”¦ Åq“ŽsU&øÕìξza9„쪌^¤ÿƒcÎÅæ h¨A¼N)؆ÿ¨DÔVÕï™>ty 0js,:cu®€Ç¡ÆAª Â\b4Xè&ö´H÷Û4¶êÇCÃpJ¨ê¢ƒK2÷…{-SnÉ—¼V£SZÅ\E»®‡¶œn¨Yr=1¿TóäÙ¦¯Æ‹|´-r!vçîLÂipDkOÜF£’•Â½å÷«‰ÐŒˆ¤BQQ٠7¬’R´üQùÕ&WmÓˆù¼ˆ¸¡®Á¦äÐkæ) ^_ +ò_«EÍ ÝÍÚ÷´&º!+›˜ÏQÞoz-±B0øyßü3)]Ùóâ´còH[(Ö(°3Jwa‡Ç9~Üó +¿qŠÆnP›43’R’ü fS¢Êã(Ô—úIYªÃr].R,Ó Q×G¢jÚŠÅq­.Ù§f˜ƒµa\÷ ˜[B1Ëv×Ú ,XZÎÎœ +øx`Áá90ÅÝO}¿‘¨ÆÊõávÞ[à¹8Ynñ–©XgøÂÚG”C$mŒ³vºÔüsóŽXîWFè·¤×^°å@+„ˆÍ¶°7™Ãˆ¸û³Û‰ÄòB8!¯Ÿ¡P®õ½MZ1 ò.yôI w–ÓC±3…uAd^,+ õÂÝ n=xdUùˆw™¡Oa$ ˜÷ ¤CKð8i†ÿñeSs üstÝØ"E"3“( ? +í +‘cs|ä劥K¿é¾¨PÇÙÉðs¤v¡<ßöPÔ|¦ ¿Cã™z‘‹ÉKGMÌèmwn"j´±±´èÚâV2¨û«ð)$ÄÎb²&AP§S^îåv>å„NQga%îj‹ˆƒ—TvúÿZàãHù«ñ—G¬¸8(Þ …°µË:–KÃ+4õ»ÃÐOEU(–u’á£Í,QmÐ邤ÄRœ”Q‹Ø’´A轄DÕHÔN¶_æé‹€@@´ß²nŠû8:øhòeú\u½Þ´‡•ž9âŽRœk‰FPá±™ÏWºãñ„ó˜²U^…“Š„~c½jé¶ôŸuj?\ãïIôÛ?!R·¸ÇÛLô°al1¤8²—âD–ù jé[Ò.qR册)I¾>&S5Qe^MðÂõ´ŸyúS¦qOãM“g“Ú"cnš8î +‹ð¸æ@‹_n߃äÕÁƒ“(DPÊl™Ÿ¬I 1zÎÉcÂC&ú ^Iò +¬Wø]b1¹mµõ%Eã£{Ï7‘WÐ3;‘ÉŸº)Û X&÷HÖE|Ë—,Õ¤™}AxÅï©AN¨Ë|¾ñÖÖðŽ»”÷Î/S *w"|žÚÇ™"·6Ê9í ØMÙÁôò»Û‹·öóÕ?ÄÚù„– u+m# †Ý¼‘õ67¸+ÄïõhÀ £tgW Š –I1’á‘]“J‡:s€¯×ÁAYxŸ‰ÝIß±Sž€‘A´º”~½ƒg’Ò +ùƒÎLîàÏQt6_tú +Š KÙ´8ROÇp;lìåâA|ƒ7U“/ª7l¶ï­†j€Pw²\©Õ'ŸCC‹%'yW¦?Q!ï~¹Ï&ºE²Æ}ÛŽ£ÒeLUJº€mÖR2:·A>k+OƒR¡pA³çÍÝ®jÔA:pS;Xï'm^ºÇÕ³^z ×8¿…·S¢—¡1Í<„eåVq^ì¢úú¾*KYkñ¾m¾™rs'`Zض äV§èÄÔó¯þŒF/„žêGaœþvÍcN=–h¦Ü rGÂØ•M 9 "‘¤ç‚ò±Û•¿»¡:¼«&¿¸F +ÙpûÞÉã°”Šþ®œ…Z¤²Ô0›‹´Ü>ª˜é’ƒàPŒmUÄû#RëU“"ü ØÉ#_lŸS™ãë]õAÉ=Äë3+Æ뇈«W ѵ˜sÂã·ET„Ÿ<-ŸSž¼C©E)¿}†;—½¥‚ +ãûÅ°ÏÆü± »˜ÈSèø3MeŽè¼ãmÛMiÿKû7‹‹ÝT¹ØFêº@1…ÉíÝ„G»Œ#p ŸEs—˜1—Óì¥Ô£Ò,ºmà,›ךÃ>iN$UM=¨¼-Þ¾ÑCŒJÍ, ñªRb¶Êÿ© -¡VÐN«Œ~• ®ÅÕwû$S:$aBŸf«55 Œûú°Î¿£¼p=·Þ6¸Ï oœÙ‚LUP¼>»Ž¸øHjugó‘éÅ‘È© +¨À V£â*ˆc7ÞMüOºàçÕp -¡Ž>5—„¶Î•3 +ÂØm`åç‹øi=q!ïî—yv!ÙåÀŒ#Ê5_Øä± ¨:t1f¸¤1וëõswy’ÔØ]‚¢è2»vÝ)Qt§åZ¶°Ô "½˜² S}~T\Q˜BÚÈÄ·ê\©Ç|h%î|tAoFžqnÇF}M¬GÜqçIš§79m<'¾ÝgpFø‘ŠÔfcâTÿ›=0,jXtãB§_âŸßzs·®[øŽsí&* ÅV¢£p:×C—8‹¯Ò •ðR³uÝ›vÁÆrf¿JdâN«6tš[ÿxq´}<žþ‚vÌêΦ2›'DŠLÆ1ÕwäJì ·)×ÏP„šÕ•æl+­½“쿧$# ÏJÚbçͱ&k®¼tyÕw7¥0sW™°å®Lþ”T +ÐÁժʡKHrÎBàÀátþ6¼*ÈÍD/峧N•ãé°—:ÔÝæo¸é9úüfX`ºêÅÛ¹•HX“«+ÔÂ*±—'…ÆdnÙ›Q®üžJÚ;AoÃÍÙˆ°PÛ®â:7‘Bý9¥èð™>ƒÅÑúN¹p6/ò&Ü¢¿ ¤Ötllûÿ«•£Â-:ÁŽgφ˜œIž9^‘„åœWµ0³K´7ÙõËÁ“Lñôˆÿ|A«wF{JÐ iE’Ó˜› xmF’HÉ)‚MhÀ¥gîí“Ã5XI½£®éG*ZZ+¾ÜH5µö£b +Œª + ²/z½Í,„Å7¼ÿo7È]0ºó0Z. 
Á)`š2»¨•þÍ`È m÷»kjø–ývVÿT䬷ˆÃâKcð‚¦æƼa6Vyvõ"Ú_d&ÈygÊùÑKéÿψ[^àÂƦ[+)ŒÃ='=å=剰 †Æ’~…)t\(K*‘K'5AŸBYHàçsíJ›òB>Äs +8Ç5m'iã©jÇrŸ«òÛ™.dœ(½è~ÀmbQÏ”*"ªs{etW¯•FaÈ(˜³O÷†G?EŠÔƒ…Šlj1¬²ónR‡ÔƦ¹¼W5‹ÙHD× »dZE +£ŧ´&iöÏÈùÄcǨáEQiö +C‚<{\ÀÜÙŒÛ>8äOTdˆî•Ò ‚ªz˜’‘£åÎÈö©×#ùbQ*.”£Å!(b·\´†÷Ü[z Ä}õò†:¼§¨Îƒ% +´æk,H´ñ—Ä%=¡T~P!|ËC©R3&óà5=ðY誉Ãázbæeó£5þ³é³™k@÷üelZÓO)¥ÀªP–èõ /xöy°Är3§¾GgÍäù÷5íX®ÄA~‘n¼#×9V^|ŽtƒnEHQ§*#²©ŸJt=†ð{ÄF£Z«£sŒ7‰YXIƒhÄ‹Ž¬ÂÇÍA'»Ì†S !SÜ%­ö1("ÏT²PXÙÜ'é5/ÛG÷U%8Ë7ÈÓ˜@‰ AÁßFÚØÎPâFUl–e÷ÍØu5oÅݽ˜:åˆsk‡½ß`¨Ê`Íç¼B3åò„ë—ZÛgó²dJ5ךO‚¬ò™÷(Ô`­›rË6Q©o)H?ÆÙàЛ) ÉFÂ"óL´B™n¤<0æ$w- +¡$öž‰|B«KgŠ)ˆ +@dp¯–hõÀžŒBË^'ØôÀö>y@<þX؈8J`E¥€hR¯RË6ˆþP߸.Q¹œƒl—–…"3Q?Z×sŒ ږ÷ÆöÚcù…ŒS.£ê‚¦8†ƒ—÷n_Åè°Ùkµ"¼—$¹Â –H|ê„<³©‡QK$ÄYöÔ¼¨É´ãÙ… s†Í3UÚ¤RNzàOBL:;ÄSÍQúQæT¹mÚL² Ü„rØ«´0Ï´j•Üæ–’Ú.ËM6l=)¬¾XcJªk3æˆ`:Orí4$ÂjåÌ£C¦^ Jõ´G û¤ÿk""C9®Ñ)Åáù€Ô$ +7À=Mîðõ¶ˆNm¦ =þÀ¢ž•[¬ˆ{Bö¹R‡ÈN â³_ÌÒãOóBä¾³ëS¤ô¨d’"Ï ©$»:Ë¢šÚ¾é©2^Š)Ä–&>†0§«›7•RƧ0VˆD¿Œ™¼=S’‹&»>Ÿ2Iê.\³ð†ÈûlAø€v[A$æÄ÷®ræÓþ@µp-›%æ"® 7ö3@0‰ž+X)ýÕ¨˜çGßmMò¾ä€bø|'Ù0i×á<És–땺ÆðÊUó@"C†è÷ûƒî Nd•³nJ5‡Çm5db†®þ¹UÊ“Bn£é´+tľ@–+¯4mbmlAÂ…bœm°‰þJ[3„Qzý²é9…H²ŽL–MH¦/múî8§/‹_?e3T1€Ü¤q“ŠFPŽ8Qy84¥+Y+Á+hç¢ç*t&â½a +xhΕ6 ½ºM?ˆ<¦ƒ‘ü‡T€¥B¤”é0p3`±—D3mÐùåÚ%ªh)袆'îL†t6!oú¸³l‰¼—‹,ì"Š +ÇÉéºIg‰£Î#¨€“ÁQ8yhC6_MàýŽ~U?3\g§`ä!ºzR#63ÙfЅ׿¸X^Ý “BƒL- –H‡ª÷¼p£fj¢Ó¢…ŸôX-á/ÅÖW¶~C’ Ì“ºSà \Ð|Á¹_Œ~,†§&X×àÔí¹‘5Y´7‰9wX‚´ +•Tx+ÒûÂìÃÀ¨æY'Nˆõ¸ÛQÊoYã,En™7ŠBn68¥gd×VFùób$ª e$‚0˜E ãi§›û;¦Ó xà½^Ô;>ËìNÐœ·ãBŠ7s?¤ÆÓ9Ðfép#L¹<ònÝ¿sñ¯¦p ÐÝ¡ª=WÞ §OÄmòx<‹û|…L’RÄSúuø㘬1¾£ƒÚ4wJÜr0Ip óa>¬æô¾ <ÙÁ1Y,”;ž¯`œ4§c×v6údVZí„Gâ-z¯µò4Ô²ª!Œª>²xGàÃ6@ÖLí{Q Úá„…vùŸÃ%ù'’ƒ‚®’þÔf’+$Joöê;–°óW³@^"ÐRüͬÁV‚©æÐåî(‡ÃaÀÕéĘÊÝH/´´ +C$¦ê;xÝdÝm"9Ô€;DÀûpcò3ÞG¸.¶ݳH‘²1Ò fÕ8ïŸfË5âÜð)=Í5/á8U6ëò/e°²ýÙtt£Â›cá¤E×![„¿Ä³¸‚þ^ããŒZ¸î‘eè† óÝĤ/‡LÑ”€} +²”WJ5Ù¡]~«¬F#6°6ûæ“®µpJå0Á¢XM<ê"©‰À²g>ûªQ” x)µeê»v˜§+.Jb eÈÌÈÌâþO¢.Ÿíj/v¹`›x”—MµÍBLnò!8Ê€[¦WµegJÅD4^3ÏË•Bl"„‡t$k#´¤Ræ_n‹‘p¶dß(lA§æ\½q9!˜ïI;irò[N¬ŸTç«%9âÖˆ‹Q/Jäl}·ròU2sŠŸÄí_ƒ,Ó` \þàõ³S€‹m_¹¨ˆ²¸ +†Ø#Åà(îÇ„7o‹~¨ávïæÍY[îe&}·J ‰ŠŒÍ{×óï’“O®³ Xmà‘¡ ìjK`PÈn…º™Wèàq2lÈX ÑPÕ|ã«Ö  jeËç¡$‹\ñí úuU&ƒê/ËǦ`cy©(t’Ä”„¥_z*¡6¿”MÇGвûÕÁ & ¨öì‰åv ª0çF1 #(éïJžþ¯@ªÞÁâþm)C± ²¹3däö‡Ò&RºK†t¥Eo Ý>ñ¶†¦Ô·²BËŸ“‰gFƒHIÖë{¥.¥!%ïh3”.€{l‚ï×øA\!V…lŠ +áž`žˆ°°ÆëÀè Ðâp NøAí‚ÑœèÝ;ªh¨-Ót}v%—V앯Z¯®’/¿Q&ý5dº€C±$= |­¥›jD>I?‘ÖbÐÃêSTÑ3®0yÌÉÁ¹$$ÒZé‡$ìÉ‹äÏøÃkײ0+D¼Þ»¸ðnɃ箖§†5ŸF…ÉcóÐOuå‚p_Ènã݆С¸•™³™?-ôþû»ËM»?j Pˆqƒ/o†,̦ TzbÑ2éØŸB¼çH—ìçjã–jz0Ø(Ò~5EÑ´³`ÿ/­^@—îÖM.ô,/hÄ0NŸÔºä~çIúR\âÊ(¦†³ éÛé}YSInÙjµ°›XxTPŠ îÇÂäÓ™UÇ‚}ÿgœÔ¹AÞuh'o%€¯Û]a2ÃóW.Y­Úµ + T€g€ TÚ)“•ž ™U!ŒA³ÄûšD€Û,\@–Hw¾2ªdWÇ0QòB;_M”I¸ÀˆÆ÷@ç`EóWfLHÆý žsPor•?,)p­³¦áµÎ¢¸Tâú·" Q Y[ØÓè›NÒèýZŸÜÍ@Z!iõ³²¾Øù}nåãÊÒcÔcÞ˜\C7B6ë0Y=#à“q9ZTÁE÷¢(š± ¸Ä-^)/å‹M6@µ)$–œ³¢Äš%ŽœÏŸM‹ït‡ëì&w!AÐx4›Õ+ïãÞÞ+£â† äŽÞ!žKV·Eì\üûÊ‚LënI¿¾+i§«]à‘ä0Mk¢ôA'«·?1ÆJ8qS½'!Fˆ…£g-””ÇðZünBÉ=1Ý’¹bø…¯Ó ªûtbðâH×LAŒ-9yì}12ôœÛíѤ9·:‘*ô<ÏšøÕ¹ñÌÖQ ]ç5_Æú!šZ¡€GÖ78kÀÀHWT¹ŠÐë†ÎTáhžñ‚Œ=l š ëfÞBɈB_‰htŽ£øÌ…¨²)zâã!«î邚õm˜4KŒP¿ÃmôRô¬4¡ÅJôÜ1<©®è¡1öö83:ò³Ò•@Ë–QÑu[Ö]f`%òéÝe#±àÚ +Îk«®¦µ´EX„A4ƒËÔKôŽ*‡ŒøÞæ¶/µðG§L­ ÈåÅ0Þ]º)‚ ÄñέÿÝa^ÞnΚ_‰¾éžÐÏ€˜­útcŒýÕëñmžªF‡-†P©4…¿¯bŸ­Ì0@ã9™E™ZÛ‡âÑçEÚª Ââ®i\ÛºÈ$¹ý¹Ì`šÏxlÏ™IÌ1^¾k[ÄC‹È Ü§°˜-9m¹3–’ŽÐ"¾: +['`÷úýÂOruT00 +Ç ÖFr)0Î6PÈ]2|‹ÿƒ™ÂþQÑ“Rü^™Ÿú +X‰UÉ<eË—«á‡NÆ0H`¼×˜ÐL¦$ìp¦Þ'ä-ËAʃ#'=ƒY“µ ' ²ñJjGâ‹ÂÖC»ã¢TØ€Ý`*‡ëŸ‰ø¡"Jýz}è‡>BÉ¥ð aöYeeFê_®.ÔZ54€›.è„4™‰@‡nvË6Ì [Á·×óuª¯òúØPf +? 
¤3Þη…<]Tˆ÷Q|Mj|qÍñu”ÂØCI{~wŒÖfc¢Ð÷¢+X’Æžõ0Å5úR@Žd*XkôÂ!­×µÚ‹ç°1^öá¦Ìz.dä;„ƒ"è죓mí‹Á5š»•Ð»ûð²¿`Õ–€§ÄiìØœÛ,ÀJ–|øKTG¢7î_;wžéÝ!ð[Áj–LÓrg„» # e9SñœWÃ2×HOSIªlPÀ†ÓnQ +¤IötUªÎS„ýW«, £\ÎZ•ÎtQvM¨§XÚ£{þ÷yi|–À1ŸÍ˜ˆ ãQŠå¿‹ai,ꬪNV1ÉM“ÉüÈÝ<Â%<膿.`Mý`èP†®epP@‰B¹,¥;*™Ú@¢;%ˆšhðÍ4XMÉÕÏ‘·fÒPá*‰õ°îg”5Fc†Ñq¢ˆEgã øV‎<åM˜c8¼`7)T€½tñGÓf gd0ö—?Ìá`Ï »·¬þX}dÂVFÙË¢æߊƒ*‚ÇD´ËÛy[]z+*ò~/í%¾åZÛȧâÆu=7úëš`ƒ(”$]o9" Ž Z°Z³L‚Ÿ'ü%°‰AÉ÷ií¡  Ú?ä_L#[4–ë¿änàŒE T}®§5¶\«,Ça +Å•1Aj ßÅþßPŠ³Áw#°=âÌó‚ƾ¢Õ÷÷K–½¿}Ç)Ù¬Í+°y®¡(ÞÑpƦÙ¨ÅOÉW1OTíµ#èŸô[†4Ù;8j´Î0¸ßÊ +亪å¦äìùy­P(S@ ÛeÕ›§¯DŸÐr1ãòƒŽ¢ô›ÏXñ"F JТŽIåçP_Èýh3â!4ÂÈ€ÏÆ+Óÿ‚Ä‘#8yo81G|‰—ünÚh$ã÷ öÙkãèµ{D¯žæÙ“[©kjŸ]=ÝaúÄ-øÄbÎv ‹ì5ÊfõÏÚ9#Þ]û°a~¼0¯ôÎã«4ótÃòÐ…Ü '34r~‘à)d `sÊ[¥ràJ“öd‘ÊV ¤Ã¾JïKÛ¢hÞf¯ñFVKΊÄÐwäŒÃ{+”D‚ÑÇlÄï²Ó "üðy Ñ5þäܨ™,ßçŠàE¼ïç×tQvŒqÅ‘¤¢?‡»B¡Õ´1÷¤ÈóÕ{'æ®7_ +‰2; 1e/¨Ö)þŸ‚;Ùµ/Çö0ó–hðL”UýZI/ßú›å… bªÏ¶GWšÙ€|Ë­é•hüë'Û?º…æ7M×8üC ®µÐÅFÑÖ(ÒªÎÆ—þç}"/—þzÜ#HwK¿N;® ƒm1V±üŸ:ËÃ7Šœ8PYä6ÒOLŸâªÁ¾ +•î4èó:?›õ( d€#¯Ù w xà›ïƒ6»lÃs:÷%Gž Dð}BÿæTþg²˜ ŠÞ.¾‚& ê+“ä•ú¯” ûWF¢Y&¢iÓ.1«óϤ¸»9%öƒ™4£DÛO;æ¸4l‡„-³! +)¤jh9ì¸è8^[Ÿ¸—5vÜ y²wJdKx¾ÓØ{ÿ›ÍíÉúmßm¾\>øâå×z·- .ò†oÞò[5Œ¿0Y÷- Ý1S2ï†Ö@ot´,~ &„ýV¸½e¹úþoªœÏß¡!6²·Ë+T£9ú²×ñ ª´ÖÈʲü0FÁé~mú¬R™y¹þÒÜGÞ 5ÃbqòcÛÞ?,’²Ñg“xl¼‹;0‰6뼸1+5+ŒÔŠb‰§ÌO™*K1‰˜¬?ÆÍ g‘•×ðJ¸‘Šf±cfw «…0›~ Ù‘¢)62ìG ðNAÆÒZÏ´‡ê“Ï2‰ÐÍÈŠœ0~ÔìXuö7êø•=­Rwb+X`Y"|ˆ“Q¶ùžÕ•±o¸¤_9óŸÂ‚’dDw=T€ÌŠÝoæ' ‡Û`ý‹‡WOX&µ«á¬-j4PdôzzÎ_A–q¬U<àgpA•2T2ìIoÂ%RCJLº,:o¬,“ òŽùžœOW¢§äì ‹¦2ütˆ«pÚíò¤ß.c{& Àp2EPÔy"“ÕQ¹…GjÃű¬$8FùóÕpmËXíãˆcï¿žGÅðµû"ùUˆ`îòÓ#ü®dI¢•½×õåJx+L“.Sˆ®åz/¿/¾…í——öÕ»þ)ð:ƒÑè.ça+IeŒALûæqâ´ ¾LM„*oÛ/'në±Ãñ? +Âç1,ˆT…lø +âtl8¹Y£¯FÄKKWø |HÁR¡ +Ö?ëð­ÑþƒW‹ ò~.³±¬e…)ëPŒ½c®ž+éGy˜9?üÊFN2|ì^ÒÖ-sŸ¸ƒ“üÖ@z,n1L\}”ƒÖÄŠ‹+À ®ý‹{ÅŒõ« +uÆ.²ùIžª¿Ùúâû°¯EÓù¹µy˜&I‰­Ïòa’ÐdH2¡fEÚðiL¹»)æ"òñ‹é Ö-ŸÂ‚ß±c¶u ° +œþd­'ñ·+y¹Rê ‰~°bzÛ‚“pÉ Cý}h'xÅ:Fnw¨a5¸nÛ¬^Ë>lÉPŸ`ÑÎ%ì³5AÄXnZðˆ¢xë +Æ¿Ú‰:MÝBÚ$­89QJûGókÄ*¤>æJk&$Nµ‹ô ².I‰÷K¨õù3£¼m¼.Îð÷k7SÇò:¹d > ÍŸ¹ÍŽ¹Ÿ º‚´ò?“¨²ƒ`ˆÕ[GxÌåÖZë$¶P•av¸VypêÈŸóãAi# œói^ù0 s²ªyDÏ£Øõ½! ‡…ŸÓ¤=6î³!ÎW¼ï'Ÿû÷Òrªð©I…êFdàca\ˆÛ†+µÞäcðoÝ­§²M;ùço™L‚¦P•ì»˜ ‚| …ËïOÈò¢ÝÃʤQ‘!C ÂË}ŸÃ†íð§0—‘wÕ±6¨NwH«I0#«ê A, ò×<°yEáñÈ7(üSÜPàÊ»S+…ð€¤³J‹®Y@$4Èün¨ùðórlŸÜ•!,‹”—\UÕ›Àj´jé2úI§ƒ|÷€:«7ÆŽ•™|©Èk7ö¸ÊyHòS\ͪÿI„lÍye@åöNdááŠ1¦`¢Bc鞸2ÌŸ§ØYÃhÓL|JÓ´'e),\0ðK2xÇ1¥"›¨ì3MpFô9–rˆJXGÖø{ê@ØÇf4ýû2ªi¨u]š%DÇx+0¼2ᆅ¬¦¢`é‹a%ãXµ ɾ#bô/3‡ÀìYô©,*ت¢œ‡Ž$-ù\¤?Èw&“‡ÉiÖ§YøJŽxÐ ¼CI›„„<ì¼`¡Í ä*1Â|â“ÄÝr¤‡G­‚Ž‰ãM²Ü“eð˜»>¨2™ÁR€¢L¸ô¨þ8Á£„cs=n*É`+Ï¿ÿÀìëê=™Nщȣv ÚjôÊ;0ãí»¦¯ÞTòZ}#/¢ùÆ&§Äfk$Ã…ü£‰2Ÿnù(t‰»»‰áOž’# +ázü¤"h°æ«%µ¶d¢³•K™5=¦ ¿yi<Ìþì#cùuzì8&DJ%›fuéÜN£t‘¾1غÙéÎ ½¹¥zŠ±ôýei`|ôªiç$Êá„ÕjW»ì…EWJ‰Ð nëI¦gÙ‚#yW¨¥ÓH9/wF¸6üΦ™Ý§Ç#Y|îv}ä_!Ð’0LQÿì =®€\þËR…¤ Ë +Ç—zhR€­"š¨g!gîò #Äýö. ÿ›€Ò—ŽXQP€Ìnø–gØ '‹¨æø_—_’E<r ƆђÄìÁ'®m&_ +ÝÚEIôØHa2 ;Ú¤ïõK*oÂ9ü·rôÅVâ™sÝò¤JííëÏûIà7w8‰`9?÷ +?1fÖ”Všz~9­j_›‚xç åóm[eØYÄ4ùÂÉjí¢¿JSq®„>#Å”T}Ë‘©¡¨jP3"K¦YÐå¤ñ_»A¬ñÓù–ïh¯O¤¦ÀÉà镪Òe`„>U•+zŠØãm†¬r! 
Ë–=åjݪðMoÒæÞ0X},‘ûéCÃ;bì¨Ò0üO¿ƒ‘4w2år-ë¶cZˆOª–ž_zÜCöü>?¤z«î>c$é”vÊEOÑh;÷nÇ¿ppAìoä¬Ôh¬< g©s!r¡Þ8d’bt\Eeø;øýD_à;¨‡?S¸4¸mO ²ÄMÞ8V…†'æ("2›dZ2ݦS_‚â¬Ð"~U±&³Úaêí7 ܬßùøâVàJ.1pÇ?ךÀ 0s… ãzÈlÆ7³ê`¼#×vÅ¡Ò¹n¦DZøîÆç¾à^ÍŠºcM{Xz®ÃÂá”_К=äP0vnÐ`$Tê9‰ ‡½î*…X·D!&‹Y|<õ{¡C¿mÝdP>É{«,Í„³@ÈÖ2ÎQg@ª<(K’ +¾§³óâ&9ñB) cHPÌÑÈÖIˆo3ÑYk¹Íúr•B;’Vº+_ +îâ‰;ÓÉÁ†‰ri܆Õ”(8}T©Ÿ!Dƒ3®­ŽßíØ~gÐÍí +¦¥(< Xíµ<‡ãy'5¨Óˆ&‚ŒÅ¬ý‚ 9´'7½5Š$êZ1¦8ý_\âC•´`˜qg“-DA¥僙³FÔK€$ ‡ƒn•)Ú_Àù‘?—mHêH¬;ØŸ·nĺÉͱ/"±à&.C (øïu(=ÓÍÔ<€t^ Kqs„d¥Ã(—þ%æšÐÀ‚¡`U@»§Ë1L› +–|V%ûýV䃨àH±7ؾõ¬Lç1œÅõDÜêhEæðº&¢ðilÁ›Öšv?…™D<3xFxOñ‚¸< ³ø±ëUµŽ<4“ÃéÔ×Ê5Ð3½ÔmVá|™ ½§iš_*ü÷*ñc‰ù•éK@س&nµ8h\szÍ`Ý3WºÆÙõüéÑ …›Py¿(=?aàÖX)kØ+)ÊY§Nd÷ƒ¨A‹ +?dˆGÈŒto8—Ó±„/‚:|³9ήÂ˼~P¶Q‹Á7˜U™5p5(Fhh|~¨—“§Ú±©s,ëÞG3Œ¾9££M0`«y»»ªT .OXÑX0ò§o-º¸‡,¤" ·.¹¼È3ÃÇýŒO¼ÓRg´úñ7ÍPÝÃZ` ÿ”ñ\Ö,Bêœ"t +2ë•ÝCU+56€˜ÓÌV,˜·Y°€Zèš}2 ±PÎœ´2MCJUŸ¨cf¿l›7™¶®2R0ë… @/î¬ÝKCž“Fà|”íÊV/•ÈU ¾÷ä¬ Á ©iÀ³Í † 픫Sæéž„Þ¦ æYœÖŽ 6D±x’VÄPŸ¹9dýã¨ziœaJPtm²äa}– $ < E®Ë%V ¨…ážFmwt©,Ñ`-<¹vEþT(l³ oµÂ:ã¹&L‹EY…êó™U‹eÝ}wና4 r*n²rúl)ñ~Š¿”\ÅaÞ‡Q ó³“|¢6ÌPm99ñçÐ/½£Pâ$8O9ï·gBÒÏ甋aÈÀˈú<Æ؆ôO$lÓ{ñ÷pïQ  ?†PÏ‹­{êó"nÞŒX±,óH„·Ú9_Lñ×IÑàôˆõU¸T%®gœxpGÑB%P5í·ª²rËÏìΕÞâ´&˹¤J*æ”àd6Ô|CÚ¿+Ú>’›M~y_ŒZ4Òsnù>ÆŠ†¥EdRÀêÒ4oÆèð3J¤= #a™¿ÎÖ¼½.ÆYhÎhm¶Ú`~ž|Àëjd¦˜­2—) R¨•aŒŸ¡l^ÛL b2RÝ=Àì)Ú¡Ñí¶âŸÜMoC1ñÖ)Z8·„UÅ6e[›³-J4ú-%²×”Þ©A®Œ3*`¢â‹QwGÅv=¾o‘)ðJà®Ø*‘ãÞë`nß׆Õb[V¸Ö¡7‚…+¶M¼·÷«f±E†ÛÄs­W3*À[lƒ)¼‡à%Ù>R/ý'¾Î_<†¢âïK]‘”L¶ìSÁ/¶¼+8¶`F +Hçdh[D&ˆ•6‚l߃€¨¼dl ©WB¶ù(slM‚É+†l1—ÉÖ`Ód©°Ìd»ÅP¨ƒØ“ðce»EÛÛc,ÛÃÎY¶kã +°l;Ó­ϲe¢PëÄ¥Ï,´l?t‘øu¿e¤¤ÿ³Hö‰O»&þ6Æ.Vï-qo‹|Ê“|*€ªž3ùB¡ªc·Á^%çÓe;* 6ªªXKŸ„ªNA8¿Å UÕ‹‘iÓ³ª¸æÈ«: ²]oR¦l%}Â"8pW5¼N›§¥4ØÓ´>ÁèÓ„¬ê&kѧWUU“ÂYU>™O•)‘¢O¨ªªYX N9‰¥f,àVû{å’ßDWÑŠÈ–ë+å^@C씿ì"Ênþ\JËJcÿ=¬1®W¨]@å›Ã‚.ˆº4´péVÀ§þ˜Ît­Q k5‹šÉb¡„B k­Ugî«SŽIlêÕr â°Ô%Qù³r Ư› x_½°µÞwe;¶%¾›«„KÍ…Øx™‹qÉOÏÌu¤R/­yK'}gŸÐ!ècLþ9‘“’:7sYiòAÓù–5_Çxð©>ñpLNÃ5sy…ªJb®#ÀзË@È_â¢ðêQ †ö1÷ïŸT$Ü™0©¿gKYÙˆëµø¨ÒÜÕ¢,A¨Eˆ ×îh-´ÆùY«MbºlV˜*©Ìe­¢Ê‚Êô{4מ_xÑb}e¸‡5ÝO_7à8ü0¬óÿ£ø*à=gxHpŸ#í«[OØ›³Áúšo.`C=¨Ð¾þÑß7øùBp®Um ¬J_Ûé†séäÿê\™ùa¯ ùðªžhK¬$·ŸÚCü¥­`6×–¢^™ÿÍ÷t',ňù‘*x‘ÄïŒ!Düøì Þ‹Æi å§ëG~Ç5îåa̘ªüH¼fö°Ã!FùEÂPa¼=·˜,*¹B~ Š—Ñ›»ÉÃb §ºh|7¹\Ä…7G…A ;†²ÞÓK xEX.ž—ZîÛâ1ù†ˆ8¼ù XWt†¹ä*SÛh¾7(8nVd]) ^µç%Y·…(n‹þ¿—ƒk>!ÜbçäëÀ•KnÆO~Èz£\©Ðz”¾l‚šÊ,fú¡R.fã:XžÀÍöÁ‹“ m•/–rmˆpåÚ÷î[˜l~0×cÞ}m «Ó%˜Ùb—ÀC¾µDI@ò €8¥äãåáX./ɺU·(é¨[¥è“´?ˆÅɧÆ.£NÄ@È–Y¡J™Z#ر†J#¦¢¾óõØÐÉÊfÇŠ_>®`:°óÝTR^ýªAaÆy¸¬öÒÄà"‚âe`8½¥I³Ði©!{\AAÄI_ +É@…É$Ö`äé/zCôCE ?x©1ë`à(tµWÑg2†ƒI\ÀGOš«õW¦¼»3bˆÊ.\+;}4°dP€a §€§ÍÖ ¬ŽcqÑ€JkŒk[Ø S<Ô…›²ù_X^ž¯ ¢><ÜVÿõ€<'^8wq³w¾ŽÑ Ýjz5äšêü½üý Ú1ÒvpýË5–’h•F"\üâ\ÀNÎ÷d9É@­¸\.x ƒ©ì-ô½>6º¯¬?âT“j‹mn¡NЩþ ü$ÆæggœTÊÄ/×ðoÀûê3®+÷ˆ8®¸O{ + Uìá¨~ëóä¸ü¢úì8®3š^Ç)ˆ>þË“§YWæƒà´Ê¯ºqQyºæå2.†˜~®Ü#¼ÁBÒé[!ºuȾO-%{S_çõ¢Ýí{h´3÷Â×Ûáq¡²vž ÌÙý9{mO~]|ã*ÌnôZáóÕ-ŠšP]xÆåbA= ‡LÓŸçB%ÝÂÚàŒî‡Æå´B'ž +²áç^!1ÖØyÇgÍxn~3±–š/á¥/gWqeå'Õä Ö’ÈL\íûqÚ„>±8Nñ;Œ+N ÅÇ endstream endobj 1077 0 obj <>stream +ûpe*®+´pe¾<Õi 8¹Qým½¾ùA{ƒ›Ùòœ‡îž +à×Õ¾ï.L‘xV*®Ãö…àZ µYsÊï}vo—ˆÌ¦‘&dŸöò°‹…è×ߟØu‘‰kÜuˆ+me'âÂœœûþt ¡š:ÚÕû0jvI?7>-lÙ?©éLf|K«ä/ú [âÀFVM톤^`4ói^§D ¸ÇÛýâê Õxþs¯ˆËvïùú¾(ÏÊLá5¢sï‹ç²ÅQ¡6›£þ4SÜáò™iÌ"z‰ù Þó2ñ÷cRý±êåi€Ê¥RÊ.ËcòÕõDr0&ˆ<ƒ‡ 0 sØ ØÉãØ®à.Žãêù²kÕ˜.¶¸µ×)Ö}¸êŠ$˜xUý)DÄŽŒ|äß 6°·áæ%_¸âÃeÂŒž·æJÿ0LË–E0¿† Š œÎÛ2irx€h‹¥»?‡ýÀº·ÖP*)ªVüäŸ2§ûí +i'ÕË_¡Gç¸åþé\hl2n㲨âH&ëÇ#lŽàŠNθ¾O_ç+Zc^ËÓÛQ(­`›ÿK +°ö§üŠüËÆz¾EÍÒV„Wͬaê*X9±ïó-ò%ؼ9n ˆ·H€Cyiª½4+Ò5¼±úe5UX…üIGêªÇ6ª­Š¨ ÔU=¾O/PUd½s©zÜáôâT€f9Jõ­`:å¨b -óªo‘Òy@þK&ÌSo†!Õ §"EÇÝhê]Çèb`*lŽbWêÓh¡“Šr`¤^{z‚¢¢m7êdù.*×]Ò_¢È »Ös(=?69*k7Üé +¨ýœ¼½ý´ó‰#®zò“ÐäîœÔé¥@¨ ÓHkʹm(³H8EI‹zå¦w[Ä°iAƒs‚AµÍ•r‹¦÷Y£1F­åë)S$©¥Æt½dµ"Â^"͵—B¨É-¾Ãi×-ñ(ÌÐR`Jya,…âBÊs´¨3CFKûHL•ÂE +uNJĤ€â—Læô¤…*½1 õÖ³ó³bÑòvø¬UângHÒsp…°Ž¹>é"­Å{:¤÷ˆ[ƒqqËúG«Á°Nâ£çŽ…<Š¼¡±¯£õ~&rôê!а¢&bg ´Ü. 
þÌ虿çTŒ(h8ò{‘óù[jÑ‘ïAcEüµ¤ˆËª÷¸6Ñ.]ì8‰¨r¾QØöªº ¢uŒ…D…èCpJÜלK¢á~¡À/¹Ba—¤Ä¡sk«ynïŠ$@PÅE `û*èÀq"ȵï¾ D±ûfïüWäb±ó÷ó]±€lýHK±Òs~Î&b+þ¸$–w¼O“ˆ4´¿&ˆep«O–‡õLôaÀaAóY3,”ù¡]XþA|.Vµ?dÑã¶vOÅ™çí¶òÄ™Ùã™x€ ×— <µzƬ¶8ýV,Ý2zÂ^;KóyÈ;ñ¯y.iPÐò…‹:BòÔ«tÊièØ¢t3J猬ðÜŠ9“Ð('å;\ÃWˆ†` Þ½¿8—>wvÇF{;Qœþ«}sc]³™½™™Aòûˆî¿]Gˆå&éÂd=:ïêZªãÃ]EÔá³+<Ó«6æ=:uMã†>†¬ñ~N—);Š®ŒÝÍo \Å™wÃóåR€4©¾r&ä +ewòŒøŒiÄzsdYa_ð¢î‹W[áF~µ[í/¾äV UxxÛêÏ ®lâ§K[Y¬ßÄÉVoÉ7Ø*0¥7òÖ +QàaÁØùuNÂsÇ›Z¥Ê¸IV î¦×‹é¸m¾E«>¯=i“tº\{g#Y¹D岡.Ð~욺Å_6eeK~=rjÑ­]38Z>Ú­!V³$½¬é,pìꆉÅ4®jþ–œzõxÅÜ¢æ¬\ùàžFµZI›¦qÀXA”·Êõ0iŠ¥Š;M¸RÉ´E샇v·N9þ9Ì”_ý3±+ÅX,ì›k`êI”ÑpÞ$ÀMÐì/Té=@²Zõ ?Ía ­à¼•-ƒ 6ɼƒ <áªÌÕs`†¸Ç€FE P«rïåõ• "9k|+)0¿nl–Mš×%\%êÔàª,]¸U^«–ŸI´«µ +Ñ♵…•«U[ ²^%ÀÛ&¨ÅZ•U'çÃå­B>Y½D'ˆ¬Uqã6žÕª°¦nÙøa 0Ѫ½'>õPjí5­°þèAìñ½µ­ß>Æ-]º€fåÂØqSOÀôþ•Oúµ2–18qNÃp +%\0VÂéçÑ2€ðgº…–·,Š‚šcÏŽÿûö F ’–d º`?ï²'+’Hibû¹”æ î¥n" oZL¯þš"¾’|6˜Ù¨»¢8;!æÑSßÃR]©âŽæ`¿ã|ãAw¡bQqŒl^‰5Þ‡ÆétTm¼#ÂýUÖà¯#…@ÅÔÎâÈv•Ÿ6733WÁÚ®P ÉÅOý0' ï¨,jÜ œö´`뺩r1 +ÑÏL…ì–’}ìÑIH™'4ŸÆ4ëÎõ¦º(ð¨}ä>ªšx[CVÎ>Ö ’"ØxtåPékFê&åü 9@ÊÛg›|°OÖ…n«¾t˜¯øàaã¯ßÁ5”ˆ&~Òòæ^¥C Ý „ž“?àɯ0‹ª$¤þJ$…Ó6²ÕSÌžd$5ˆÒ¼^Và5ð^êÌò¨ õ ™ýØ»±±#³£•ï%fÍÕÈi$í3rÁ,™ÄÛF,Ï[5â¹Ê_°d$¾lVÕˆzÿ +'˜!I¨Ë–X"ßÁ2rƒo`¸bû®åËtjD)Na-­bV|ÈËFö~oïX@„,™üG`lT‹ìeÉì˪xØ_Ö…þdŸ‘Q#æË–Q# !ùõ˜ÞteJcvAlÄ+]äÓlGÊ"Æ¥Ó{wíƯxÞâÑÞØçÀùâ–¢ƒãƒoüûc»[¹pì)߸ ¿¯,*.xe3¶¯\­h}3)o?‹,L‡uIŽöµ™òÓpp¬‡q:‚´¯iq¬úê ŽÏI©,îm¬ø¬]×8‹B¥qÉ‘äu‹"…3Öò6ÏŠ€ï¼3¾ÁÆ4f/_' UEÃk_©FÎX%¿“q‚õôg̼ñÞ×PHzUu9b%Ø»«„%¹qs\Ï»¨ñX%j0ÖX‰HeÏ"õlÄÞ¾(UÏzAg¿2„éÑœf4»ÀF{Q.™ž9£Q©‹f§ÄÙK³Åæ öõ› Í’ +š£/;V5IÈÙ¢¤$Ythö@' ]e(4»l5›pÔBhvLå´ÍŒfIÙ$0ÄÂòeÓÓ©I ÙÐG™E Ù¢”i|¿ø™íUÎÄqfûŠ“¤MvfŸœgd“Dg¶ë £Ä—tfàëÅô–èÄ3yº>³ñ9 €dM8 iÝÞRJ÷#Q÷4r|ÞóÓ[÷t»D4×1n5ŸÒ8 $]º€X8錦/ßÔ.î î)Nà:-@ANZ߃S¸ÿ6¶WÜüð&ØÁ;«#¸—»—tÏ?uoá4â[Ãî™.œJQ{“ëˆeZ÷¯Pì*Nå°Ðƒ¶½Á¾WõsÕá÷‹^{ïᎬà \Pá°ƒŒž2êzoiø¯äXCØC‡´ÚOûö,´QÞB›po€|P:‰ý„[93ž®n YYÎ`Vc¸1YàzõÐty)È>qŸ± Å0Î@2çÖA…™ê%¿~BÒÑž”sÓߦp†Ú*oœ…ˆ‡A˜¸ê“°,©t¿¹³c›À@´GUYK@®äáÞKÙ4yOÑä‚Ï»ÌÓ9¢z 5ÔsæovqUQ¦«óŒ8\(';Rëy™¯q3 óè›ó²*ä)J ý* VÂÉ'åYIP +çTÊË}Ù™ïæ‹LU”Àú @Üž6ÓZ»¼vàÚœ¿÷ë"j´¤ù ñ»e‡hâÈ’ãa±Wö£¥QŒúá„XÁµ!×?¼ +gŽN<[ve¨%ºnR™ DTQ‡¢éî¾ ~ž-‹‰¤Ž©«²A€¿F1¡hC>æ©Û%ê®ÓŽO¸•Q$YIýY¤}g¡ŽizG‹Ý5†Š«ÈSí4Þ„‹)¨ð{@ŒÌcî‚$4öi¶'p¢‘J‘LSáJa„S$¬òø+²š»cXýil½ÕóoL lÌVöK†ÔR_È .$L{Ì™î*¬0ß’zð6º5º{ ÁQ¡LW6à䀯ë XIJ:", Š læ Ap 3©ìïÀÏœb@}eŒÎhbäX·RïKY Ì¥oiKÀ+÷®Ðu @¼Ô„—›ÝO½Òi²çM{òÔ¿ü˜r´qó O³¦} ·$¬eóUû¥‡›ðå‰Íaüµ0ÔÕ€]£É-07¶¡Éð¨‚]öäihm‹s­i.¦$†*Æaå'ëR˜T¢3˜åY&ü1Î@ÎÈiFaE–Ì6Dä,’†Lo™³dúTÕy +(û£¢ŸH˜1óäÍŠ¨žh˜‘xÂZÃe§àÃ$èDý¹mš>r¨¿g¹æ𤊈Ç%tî®Zä´1Ì”`³nqHb +Æfx[ŸD¨Y¼ŠÙR-«­ +Zñ²ÐÔ˜vç +Ö~¼FU¨tˆE¦ö{~úÅë8«#´õ›©IïS™ÂY¢K\ã(‰þ@*7%q=%^LĆ:z¡šâNëBM;ì%T>:Áâu¶“!gq¸,WÚ0ãF¨LšúPÍd ¿‡`Xvï0àBwæ·x%|»z +œëÊêOjÛSûKˆ#à~D ä³Ý\T+zpãÅJ;*3'ù¶Aª€ïåM¯„kþË ùi•ìîk£cæž7I8¬¼¦`{3 §ÓË›¢êÚ„Æ·Ü8rñµÿê&—8¤6ð½Á‚>¿4ÚCUðqÕqT=ïé㳎 KjP›HhÈ€³fë =T8”AÜ œ÷™ÕäÚ ìIc)'Å°yiߟÞ?âl¸Ç@ˆÝ7( m(”¹$R-Ò (s¥Mƒ^: ’ä ðGÖ,:‘IÐû÷¡A ‘ÔS$g0/#@ÎN@r¯îÓsÒ7$WZU¯Ì+3-x¼$êK]iì[/*iOº:»Lƒ$) »ýa{¿è‡´*]„ÈÆðZÌ-®2´`f“°·ñÝ–& +UçÀSÝû‰¿S%VWqÔ§Öå•Ž!±é€‰ÀöTh\ü>+a~Èͤ¤®SM1 îäV–é§û¹,P7GéîϤ¤ìø )•l¾ÌcçLÝj{æÞ*Ÿ5v´ÞèìážJ‡HöNû–ý™R’ŒÑ#xâ¤}1+ ¬h÷ä‡ ®àíÏ,ŸQ ŽÚè¯ç."›&ÉÈ/)öÕX·t†+5aS·šBt”„Åx‚ù•Æ9•: d(Tàˆè[7ÆZ©Q[Ðá.Ê{—.Ú¶QŒÅ¤aíÉ{o=ˆþ¥ëPD°NAu× u½ÏaïÇò‰÷Á“Õl½àÿlñÃ,`‹½7#:NÇ5áá. 
ƒ±øò¡‡ŽÞ ){‹,’Ô6”½D8¥ +â‘ú 6ù¶ÎŠèû+h½gºôßDwÅ=Õ",ž¼§ï=˜l%ŠŠÍŠc“YÈPÄ‚:#@ðH°ž ‰N쎠D›wD‚’ +Ø—ã•cÛ”­5Vü³ýÚAÂPeýœ +ÄýyÇ!¦4ëâU˜8s¥šb¤8Ý_Z:M©­óÈK®\ÂÌV¨ÐàHäT{˜YÄÈ Ò.sØ,›‘w¤²0Ug4 +p91¢ôx¤–•4Mi©×SHÙ“TíËÏô¡Â.ùré€pÓßGÚªlæwèß ŽöðÍd¬aŸÊŸ¨ó0lô¥¸»  U©B[“¼º·ü%cC@Ò:<€é£‹µ¢á@!\#€¡¹‚SAdç»çh姄¯«4l‰.ÕsnÖÝ$¥ +W¿“l nˆ¶ غân g¼A@}l’oy²òsºôÁ<…·ÂV9Å®U1Ýž™”Ã]¼¸¥qœð½!öù%˜–nË÷*K“ŠŒlc4U1㑺þJ°€6©û‰öÈ\…yÐÇ Á2 íÀ9@6”&¹2|Ùº¯Ï¨Î¦ZðôÄdH+qOqŽ0•V¼º@k•µ’Àc&q×ïJµðÞ¨ðt¬gh°áü9J‹,Òô•§üA¶†fqªÇ”+®5´.ºyùy‡€‡4Q‹‰r¯×Å@Q;H.~M™âÆ ÔÅ/ ƒnÄç'ù2e_Ð ð†,ÖaúÆH€¢ÆÈÊ\\>‹ˆ³G»J!:·Yr·!zº°Ôn&¶5­ ¨Nu†ã8 ç–‡H Øàëç’‰5XN¨âÞ@¾[VŠ’\€ñ–‘Ÿ‘cÍà;&ä$ 'À£ºì‹°¼oø•æŽ5À/ŸáœðB è\½½dƒFgê³#EÒ«4ðWÜÌRÓXƲ±é¤ˆ(¡mýþ}pøJ—Ÿ[ cβGÚÛÚñL]&Œ·(„½Å·+‚½ɬðð$Ô«µ´ùÐeûÁ Þm›þæ'åx¶“îãa¡Ïeg ^ÂAùe +ó†{Ëf !CšV2qÅ’ð$ô8Ž¾Oc)·xUXݲ¬Ð~Ó°«j%-Ž¤L£G!q@ÀÖmûW¡.L!ÚÏí6Ëïd®½»xÛ^U£‘UNrÞÀsN±RÉjG•—íT‰h'…ù’ÿ†¶‘ݶv°:ÿ›'üjÊÅKÇZÀÞÓÌ-ÞɱIæ·ë†H\SØõ'ÃBE¬£Û2Ûqþ‚©ñqzT(œƒ7o0\aÙ‡/ÚôÝ¡à•UÑÑ%Kx?‚›‡üxìËøÙÁ@¿.yåµÖ2¥K;’¬z¢·zÂU€—=Ñ&‹YR/†z†çÚßs¸¨¢§.¦Ç<꣒£$Ó 4N°RÈdz:/¸…yN§kŠþ¸ÛŠÓ¶kRüºñ†:·î®ýˆùŒì–:?ú+Þ•aã/1êÎÚFbz¿R˜fk¬™¸ì •°…!×ì¢]VÈw¼ GRxƒIl„8ðÝr;Ò + . ]¬¬ø6î²A¨¸O“TøÝ9*CßÿZåP‰+Y$=ܵä8†ŠÚíŽG,tr>«eªµæ[Šà&*ÀÅy,ù`“.Ó–Ëo^Sª} (m"—¯É‘Jòð ¡¥8G$ÐD( ¼ +›(ˆŒìž,«Úϲ»og¼ƒÊÚ¨Ðh6]àÌ`âerÙFæ·)ËŒ§¢/vkšàŸ\ýrØW>:e“xS@2Ö¦6dWÚ´¯FH`KÛhÙˆ ‚VÅ7e `¹]‚¹:î>™äÔ¾çDÒC•àÉ‚×ðá“#ä¯JÆ{,Îß=¬ï¼™‚RV¡—e™ŠéÍpšÃþÉT¨ª= +ßÆÓéÐFq!Z5nTäY¿{ÎÖoN÷ä¬-Îà< Ð⬯ä„Ö¡ÁÙN Æ çd=ÌXï{yôʤ$êwöBÅNB_ÝÕQºþÆõÉ»ÕA”# qò&æŒjã;?´‹­pw—t/Ö™t;XÇò¸ÊÀ)àÊÛùC¨Æ.ÝFæ|¦KåÉ7/ ‰Iç¨Þ‡þTãñ;®K'ÙÈj¿‡Àè˜tuÖôïO6Ïk²ºðéR‰b9ðñÌXGþŒ‰ä#·“~Yc«š.ñ1£n(”ÄÒMbG£ÜüŠ@ØyOõïýªÞÚ©…¾Î!<&Ⱦä'õ¥ ¨>ß‘+·]àþbcé@¤Ta'ª¬òn¢ˆ”ÇÑ;€öÕ  A‡#äý}E8éÕL–çqã|B9%çÖ_O¹*åç°&qjäa½H†– _ìõû»©$G¤G Ab8}ÁÈ)ã~¯‘[ »×¹]”Ð`û‘#q^s&¹‰”î×€j¹*§ÚÜú±äHÞLiàÈ€‘%ç†ÄÕå¸n{yúRì$*ÇQÊKù{‰„> rÜÀ°åI€©•8xç8ʉþZþ- Žc•ŒJ?ù÷|‡ã\ìLdí€{‡ èÏû_Š!ªrãLP‰×$nù|–8’½"¯î¾%îFkª@¢Äa*Ãß„ÄJQÌšÝï+õ¤]mÏR¤ˆÓ0™Æ=Iš9iø¨$g¯­‰O“sŒ±ñ= “«¤í…Xrâ¾üºxtÍP`…%·Œ‰žpbɪ'JŽßx„‚j¿ˆ‚ä\yã=pxïŠë=TÞg±eçµðËÎ¥p›Bu±d:*Œóü"‹4ÓT‚Ùºët*‡ÐÛ›uA?·À`ÄŸMšž}BCçLÿÞêžãq˜îçèm*Æ8œ–0UŒ·¦¿œÅM~I^]!©P¼¡o1@Bñüy (Ðy((žª+Bx‘ų±§ßdÈ/JQ\¹ŠÛ@½7æÅÿè嘾FŒœªNÜM…ô«cqâPkfYèÄa÷I²Ožh/Ÿ–Ç ©:q“£1¬ìÄë>PK®ò‰f‹K,'Œ Õ̃0ο¾yÉñÉM-°¨œã3¡OSÉq-ž†9 nù«.^•(Œ«øn{ª0ËÇ“Ê}‹hSÀÇ£Jž¿,ó°yǃáK#âxK<Ë«Öl»q<Œìý¹ãÉ$égO|†ÈL$ + ²I°¤Ð¬`‚©VT’RÊ”’¼5¯ó™¥•‹M"ôú M>5E’ŽÂgNûþ‡tAî=§ƒÜeˆØ3A€àÜ­6H§´L˜®Ã˜'óêèá8/ùp¿h#¥‘–Zô«ÔÚ—F™š‡¸Jˆ_…h\‰" +aÑuÏ8!!ÆŸfˆÄ‚Œâ*#‚WårhÐîÐ?Á“_—ò+Áª`Eù!Áé6œ7³{9É7³‘‡Òa«“,ˆ"/ífI(¢xêSù´>×X RûWù¤Fî""b"_+wÍ×#3ENš„G^3H–GÍÿ×Ê'ÈÇ”¯Èîƒ9M9˜F£úÌV4¥ùERa‚S¤±¿±‘ˆ ™Vá3&Dªb_¢øâ™@ãŒÕ¿^ 2 +E$j熠ü|˜ug”ZijŸª Y&&»Ò' é™LEWÝ£iæLhWœy´£`”‹' é òÎèIàíЇ!ICM¨¹?j<jd 4´ /=dN´¢@C¥ˆ°9:èÏtÂLƒEH£Ó|Ø6a; ’‘Öƒ+3¼#Õáð®cÔD& 1«z•IÄæÌ x.md¶J°bxùp´¥d(‹©™çQ–ËMo%iø°¬ÀáÂ7†F¨ŒÞ뛌ˆ`,”@€``XÃJÖa† Ã. 
ÃpaŠB ¤CyBUu5<ô ÂH˜!†ˆ•è½#ÔCã9 %y‚,2F¹¨¹¤õe’*ãºÝWŒ¸¨bû8-ú!Öœ1uMM¸Êt3Å=]‘_Š©é:ÑbÐ&>*]fFZ1ÿtdHL1Iœs’G [#ÄÇ/¨ÇIZäÙH·gºôB#³Nífé%'œš×ÚÛVC<ó1"2SÆJÓ·Ó‚Ö’hÝiýº9c³Ò»ó‹+}•Q)Z¿Œuå…ef|MÇ¢®ÁÏúwo-“~@ÖЈ†¼S[çV$Ö‘ø,Eã{°œÿµKÚ§*Œ¢gÑšºÑž\ÅQ;ûZZY¼;WÉÄVö¹µUHWÛï[J)§2Ó™R|¦/Åg:M×ÚîÐtŸKmu"’§®_ÕœEÝš§˜F;Ä)’ö4-jnÂîÄ/3È®QŠîO 9T“œ¯j’b!Í™ˆ5£š¥¼«KDf~”]˜ãU]¬õ$Œââ4ZsñÓS袩o&¾pTnUæÊL?!‰$3žyx*ì/,ŸÑÞ$­âDDú W"ÿØ#ôœÔ—IÞâŠúœòFúÕœz1‹¼ ?“Dbišjh¢{Ž9ÏHvúÁ’ÔæH]ÒÓXo?È–oT¢±‘J¥<£Å3ºOjü£µÕÙ¹B2»D?DR=ŠÓt$^êÄ;£2áŒÉMg:BBãRˆ°úyœ¤Ÿ´â5íp»Œ>ôH½Ô9T¾.3ñOcDÓÕ+ï(§ÌÓIŸLŸSî +F?Äõ«²¶dˆ§$[]2Ù¼àúªFoây"Ò»ýµ¬ÑPQDª¨»¢‹AaÅ; –ƒÁšžHQs/~•+8Þªt‰æŒÆ¤–™H¯©/á×çn3}ÂóêØÁ±=÷„ââ,QJM˜HHéc ©I\ÏöŽ†)©Òì%è ×ì,Ž‰¨’ö ilb^,ñª¼/ÈYTk“2òAÆÆ/â˜ò×:“ìë0ÖÉT²pí  ã2¸Fs„D8d.í9ajˆö¢¶á˜í<8¶ïìÎÅ#ïEÚxä{È)4–í†kt[$æ²päÒ®ù§c_3Ƚb»Ä+³h¾Äƒä÷Á _‚” Îø%Ö‡ ¶¾o–~f.•Pùeðw°br-V šL¨o‹ò.2~7^CN<·DŒ‡'ý³>Kå×ìLC¥ãQ_Æ ’<ã‰qU.µš.cùHƒiQ¹CL‹˜ˆyIB«’®f&±ãe3U}Ç|#‹KàÌŽbõ5{sJGUâ¨zæÍ VÇu<2Caœý˜ö‘"gB™sBô}"bÏg¶’¬pY\JBjf’·f$5£Ðj©)D¢–»f¯Äzˆš½ÑJ6{ã¤lñ|žL‘fÖIâ"º jˆÎ¬9…[kná¸*¥5á¢Xwøæyt>¶œ=¯¨mVzLŒ(Æô˜]vü|9Ôtɨý'´Uå0N¨ +3E„ò¼)¡Z¥¾ã +ÓÕ%ɶt…ÊT*ËK ©Å–R˜ZÍ+ºMÁÅ8‡½”À¢mÄ@"C) ÉZ¬ªV”Ti!B…Uu Ô°¢ +§±#DH¨f¤µ +‡vÑYr4^tBdê+ ¤Ohf&x&xH žàa ëÆôxƒXüûPWª +s¹„ù/a~~ ¯½‚ü¦"Ix}þu5¡1->«E”&Fš}”G~Lˆ‘;¦‹ç±ÏtáJ™ ø4Š‘.œClÒ’6ÙæÉùÒ4lÂfÆS Då° +E¤ ã Ã)*™ 2AÆr)Vå5‚:U¢·É%H^ÛFdh–¢Eããë2_Í?/ëäx ’œ÷G–(¸&$Qœ°+œSýðÚ(äõÍ›&$ᡨ‰^O%ääx¯È+qÈ‹ÆC½\ôŒ’§ŽìÔ)G™‰¦Qì3Ö5”õœ ^Ö‘GÕ‘J³î”îF}ÔB_F 2®*HAíˆ!¾)êö+¶dìÜ4r^¤"_Š œ¿(V®\j–ˆÓĸ%Ísú…!!rMDº‘¯´"Q8"]‘t#£Qòkf3$—«ö˜}!ßÁa…Ð*:r¡Mkd–Ï'í³çÇ꘹p¤›»*|ðˆN‚š ê…OP˜™0ã0³À©ÄPcˆ(jÁâoIM½ŸsE‚¶n+=\Óšˆl«uã¸>¡§om07óGmÙË]jKÈyé³FÈÌ%f¡UëB‡ŠÑlòJ%]LÍ&j>Ã=j¾á,¯f"j/Ó5Á܃Ajá°uÏhcéS‹É'±I7ªiÏÕJ(*%“ÞŠ Ã-&H„K£KWzÔ\:%æÒ'±ÖôÁ¡½¦Ä{©11jÉAŽ+3!é’qµ*jZô‰šöeL•J}ЭhÌÒ]¤R1Ji•N¤%¥¸©}ŒõÕGIc±MŦžŽ‚¦3ñ) …LI$=Um¸}çt8q‰³W9W2¶´Öu–pABŸA#­V­¤‘4boÏhR v,–Ñœ–·ÐçÅ_âPF[Z¦˜©TjZ¹`#È‘Ùµ^­N;í4NØ¢ K´“Qf´;ƒœèGëÆ2ÿhå}T.Á}„6™– ·“N6ÝÜ®.ÏdÚ°çìV´Ó”+m$…ËòtO—ŒJ)e9wÙBºÁâW&F¬,íPV¯SR^[my YÙNô&ÇâIÄõã~Ä¡%&hµ¹ŽXdQ(g´B9£Â¡œ¬ƒ«PNFÔDÎ-—ÊEƒcˆQ¡·f››¬èÄJ"ª7í¡£$Ž‰d¢¢û:•Åy:³Ü̲n=?†¤‚hÆgiz˜/Jo™]xZåàȸQ)U«ñ–ÿ–¢R°CJÕHˆEÏ„H5O•Z”)ÎE™’ât8œVxD¯Ç _ + g ! $k̠˲ǩ¡£JR̃IšM]“)™KJý!‡SŽÒAB¢Ó¢x“þÖü#&ËÊOî‡N¥‰ÔaIÄ¥5¾¸ë¼üÖØW9“¨Šo½5È ¹|“(ZPœƒËÉ·’I±èqÈ¢c.UDšºìBc|3íLão¡£(D‡_U4†ø3õKZ Ÿ„>7ÕhÊ—2ĉ±ÐL’ô°¨âùjhD5CoÐæEÙL§ÆšÚK)¡+2ü$B>A¤×±{Œ•tßu«%èÕÿ›õålm-Ì•·^nܢ–ZÅšOdPV*}b$¼ 甚ªáœB +KÖe£´ÒÑGq2³³›¶él%‘ÔJ"Ç7ê÷Qö}˜Š¸ör)ŽV¯Ñ„-ÑhÂÒ4ÚÚ¥dÅšÇWJщÙÎQ5Yt•å¢‹^ffÑ—}«. ÉÙš‰…÷ðè[Jâ „K…Õ>¿PQ:Yh3¤©KüM Αë–sµ/‚QiÝŸ! õø‚dHk]TÕv~R ‘†fÒ“{"Üää'ïL†B[GV>5S‡N5'È»ÍbÌÅÔ4{êbõÌPe÷P"õüeŸÎ©ŸPO":ÿëød›—…uBL†ÂyŠb`õiéÛèóVNUðWÌ‚6£ÕÆýGdÍ&,´ª’UGuªhçXÈ{,yV›16Qo"Û§*SÇIY×F6ge㪲šT#‹V/ÙlÈœâ·)¹g‰T …dH¤bkVt©´anÜÚkõïojtÎPŒi3’XK#Ú‡äuÇw4ܬÛÇr†! 
+Aô1˜^J*B+L˜ÈAf:¡ÓqZÕGÜŠDŠ¡üš¼ì^ @0@\«å +-©ªàz¯@š˜ +ÿ!š +djX.á"á² +— M”†‘*) ¶'2Ó‘ŒÌf…GÄóÜÑ)†ÇÙiÕ)¡È*#2ªLø‘©%‘¿ 2*œ1k¿0Â÷ë›ÚÑÅ×Wú#')™cꪋ~V³’`±RA²ÇxŸKñ0aEIÒ4SyËŠž>ZŠ‘ëXž–Ñ[%«6o²§Ný…=’„æx#å­Yiˆ*5ÙfÊ Vo»è›ð'b1iF$’¬d—æ¥0¹‚C‹f¢%C +yµÙ–[YWÚ¦JßÃ^ +‡<ÅêU‹’Õ[cM"š«T“•éÑ¢¢¼."ìÈZ•öJxÔ!"š¢6„Å™L;2etÆ»‡ø™¿Bÿe¦RLÅùAZHŸê D_8†14£„ŸˆˆT|Ìmèh©8Å*^8ä¡OŸm]JŽ)Ã8·R…|I)QÇåÐ=Ašq’§ä„tB>ÂCyãhì­E¼†1‘^‚+—~ÎîD¸q4g®»¹Í1KÙU=hUwÆÙZi˜¯»£7#å1Ö(‘Š3ᥤ·È¶dœ,8¬“¬bà׬/3Æ:㌱ŽFÓH® )$ùe•Ù€¦n3"2ʯ z\$ãÙàÇòtûq[mDÔšÉÚÒdÒzSöxâ•0Äñj¤Ek¯Á’/’49œ0IsÛ|³ÖhR#ëGŽò¸6E6<UÕ‚ªÖ-؉Ò=ý¸œGg#ù.ÖbGÞ _ºé´‚&­hEpÚ7<«ñ&ãQ»’Éó¨í…Î+XåŠj¼ê—4+$CÆE¾MQ¦ŠRS%5E©âÔÈT+ô!k/hžkùˆ8ETk3S‹¸R¿,¥I#HD,‰b”,1«\®ØnlXÄez9X¡,:•QpH® xF’â’j^\®67œð®õC¯&T2Ù7Ú¬ÈP$„h>aÍÛec0nÉ +"‡S_,ŠÖzøYŸR0øVGHì:©ÁóëQ]#+¥Ú¾Ö|ÏWëh5³Kuö¹F6Hr¤‰œh°'W,­õÐýP*ÃÓÄ5ˆ´žÃµ5kƒVagøm5‰œ‹A¤œíØΥÊJE©­è4¤¢%V´Äèê×.üǹg S¹bDoaÉ- )ò4R&úêT'5‘ý„ì¼'µ=QŠ4B"F{éÞ|eŒP‚e£1|QFA'+8ÑÑYDÉ*N‰&Êù:‰:Ö¿ ã,åJk­›­õæñ:µÞ¥åÈ;WZY7(ëXäàôĪyåàùv¬‘æÏ©R*$’¸è,¢ÕxÔ´»s±†8$"%"#¿HqÆëGŽ”]#íÒ†ZUÖú*3žâi7r1’ýž¤'sª¤D…ïRduY¾Üœ&¸[<ÏD™!¢xÑü!:ÄC·š£d¹æÅá0e«J3„p(œîfP¤ 2ºDJ[–’Š<9´9!ãHéQÜ¢ÇÉZ¢i&ˆÒ †£45C3g³y|Ö©UËìïûÍ|E;²ÏOOtOZ$8“¥Ö?,5²QÚ°ãöQ<^+T[¹ßÊí'©0qœ7®¼8»#‹W?“̲NìÖ©Øt*jJ›~S‰à,‘Š4,z'*R{*.V ŸC‰ã© +Y«Ç¡Fm^-¤ç"}þW#Ó‘x¦&5G9aWC^m´žÚ¯í¨Tj½d&ó°À ƒáà˜´jß½^ :B1hh€J¬" È–b¦ñ >hÄË–LA⃤>áxɤpºæÉÏÙ„Ÿ/óµ0Òϸúç;bøàT¤d§âºöóa"‡ÐÿùÄïõ÷€™ŸÏ¨·µíXC(F$>—à¦-Ú=àüoÏç)×=^œåªEˆðùNÈ=_×ød÷°|’ëù8‰1Î×»6ô±¬Šš ú¼%½MQK裎 ú´$ï(÷ÿïÿÿßUïÁlÜñùþ߃>zÞƒ© OÛ6=RôAõ´ˆË„¾“úî³tîwï«ÚµPïÁyÓªqêâ½.ª÷ ‰{ ml?§õ}Ï£=àµ}=XJ«Þ¨éÁYBôÀ>Éó ú.g¼Œ©÷SGE –†‡÷`†Áª0Ã! ¢À ÄåÁq0ðï÷/xôõ ¬Iöƒ½#†B¢Ž/p“„±m».§£¥"¾ 4¾@÷di,Érž3åh¤ï\%}A%_DkÁKm|AYÒ7{ÖF´"}FéûùŒ|ÁnaѤCÆØËå ìôø‚j‚üèƒû©±P—Ž ù‚˜LIN•0¶µ,»HŸ•,éûp¤ïfù‚´WÒg40óò1é›*_p~|%}x~ÏWW5ä Np|úIŸMÛv—Òg •!'}ð‘/(Û‰·ªt|Aí;b€r„iþÈprô‚6éK#/ˆ{~ø$}¹íÿ׺@œMº`mn.ð"}TÉÆCâÉ\p†ôÞ-8ÜÌç²b¤o4ׂ]‰yU/-°KúâZ Hûœn³ ÆN(x1Ypþ |Ç‚­¶.§zÙY’>¨Áæ¾îç%ôÆ&øX”íè{‡Å‚kU'˜A¹ôC,}|ô™—>bZ,}B†:Ÿåy¿ ÜÃbúÐC,(´‡™¾-®é;£ÿ²]™>ØA"”Ï`*C’® O,€`ú,BÀ:¶)ªéÃœe¥)˜-Jei[&Aq‰С¾M:´iJvŸ>‹‰—l#¨OS²´$êû‹Q”EŠ³¨/JgÕ÷ð½Ä‚dbÜÏÖ8£X€âa ðËVûÓO¨ê;,`ýù +,ð +fS}©w>B-#—¤8U`äË|qyq6Þ¶ÝûƬóª&¢RQÏ‹+èÊ8Leè×µè¸È[ýßu^õ¹m¯d¡‡ÂG™ŽB–Rš´n¥ê3ý­ö½m7]}ôä +ö…Ö•‡ô«ï\®€RòÅöRÉ|Rä7ðT€4Ïü­ÐØ)œ¢³:]Á(zÞ‡5IÔˆ|º‚.8ºub^}K%X_Võ‘Ÿ¬OÇ›±>e€~½M"¬]$šû\Af‚®ÀIœ® ‰—°—Ä6E”:½ºíÞI~¼²¥¢ +À_}´­šN5 +’”¥XŸ<é^WàÅ›žsS¬qTÁ;Ö— W"lZ›ËúÎ4”W¢@›ˆ­ þ¬/) õmµÖ7>© ;5~¬ ^0¾ +.a­‚7ÉH¬ +ô\ëšµ>»U÷ØúJ·Ögãh}™Z„z꟱* õ;•›é‰êŽóéªÀl€Ÿl#[öê)qœ +zÍ1 èJQAšÒ8³Jë3"­Û¿ß +ösžs÷¯ ÷½—Z&#»¬2SÒÖW¨`Å¢€ +ÌÀß·ÑŸи¾õ)¼[Ÿ°Êgë“â(0‡2úµ¾p²§RðsK +¼ãB +$GÃ)£`‡Z]Š‚ýQ‚ÀPPy>(°]o[ßÚŸàŽ#‚­e?Á~È'­õ¡IO`»Ê©„3ÅíD x‚[¼ô:uqOõ§ôyq‚ ãM ùÙM0®õ‘² úç± z~`èþ×»ÍÓn[ë™Àz+¨Xë[œ‡Á(¦‘ù¤eIë;fð¥`  ÊUûà”àwY”ÀËÌ“à°Ö·5“``3M‚‹º¶5 r¸MÝOÖÇ/;UŽz%¬º |9@‚¿Ö—w ©OwzžÚTqÉ8§ÝFP[ß»É7¨ï™ïSïÁ%˜2AîäÁcÀ w"ˆ¿¾3%‚ÓÛÜ4͇Ô)L¢}³¼‚÷ë‹Ä!ˆ=4 +¸g__x!h‰ói\*fy& 4„@rOrîë Õ&«ÿÀq¢Ÿ È;{ ­ +‡:ˆjt¯¯A|`Žcž?ð‘g? ;(?†A§®y}ˬ~éČԧÍ{ñã<ÚI²z@8=™ä3äÙóá×7åwà¹ëàFÙÆص©uÀH>êÀc†²mDL:çȾ^ßo€%JÙä4B÷8`~,ì=ì®/2^_}N-Šå¸}¸]/oà÷Fb›–ËšJ7@'#AðcK%ŽðFŸ L¦ ˆH±Àk@Y]Ÿ­BÎj EŽ¾ëók`‘H†h@ñògà»í èö6÷t}c2ã²eàØ3™nI¼È@¿1àû}1°çúx&¾¿ÃN2îô*" pÿú‡ø’‰^àlëû.0vh]`~ë#7x ‹Änåv <òÙj¢Z Z@›ë³b€ÙÇ \ßì¯ÀÒpI^+çúÎÅ +üÆ>ýfŒ-Þ觀DbS F·°»ød=^ÙRn°—P`¥(àÏßøyw¾º>Y9H®„›À:®Ï›š@°L -L@ßà¿ G”\_˜@ðÔØ•@o^‘¯8XJÀu†×”“€Nj&Ç‚ÊK´Àöë‹áur#°P”0›ë#v°*~í—|u}£‡œ·Ç~®•B`ê@XŒ^ر|¬€@ÞáiÓÞõÝó0[}€Ý©áÖlö­ë“= Àø燆:)´ùÄ®ôHL½>íx€òy€B?ÿ¼¾æú¦õX×$Ç¡r}äÉ`çyÏÖª)ÒW$ÅëúŒ¼°¢©¡7 H²ðÒ­äu}Å]ŸuÐX’³œeŠI¨‡Mu}Cy4:à%å†}Bp}ýðû¦@=4šÆtUÔ÷û @¹ôŸEê|YÜÜ €³ô$•—›å2 ¨Bã 1€×é¨t|Íì`Q/Ѐö®ßŠrXfÀÄß_äQe0ë@†Qc '¨2zjlÛñº,Â'¨¬O©W†çZ¨ Q0‘%- &€n‘bǧ®Ù9½‘GfÙU„ðs|æcÀÂÀe° b™’Îè Œ +BÔì,z¤öX<@Åð†§6¾Ñ䤱ÿå @Ñ»Р@NRƒk¦y!¹ +Eš‚Ûjƒ¹ƒä °é!„óó>ï)·Ž¬Ü&'~'%E¦ŽéÕ¸“gr¹8,SxypGîO÷3ฮWIWöÊqr $L°lìSÍñfùdµÑ&E+a˜…÷˜{ÍWѵN§/Y­]:µcrØR°\žt>¦ÚaÝ%h+< ôsa(FT>‡ÿú+ˆ¡Ó. 
ÉÍÿ·ìGÛ{çÿ~bÉû ¼ ëµêJ©þ‡4h&S¦Ï2ÿ¿'š.fÿùÿ£¦E÷ß-Û­‘†|ŽýçšUzŠ,.ê¿Üôî (±`DŸÿSŒ«Ýûr¶'fùOaí[5¡9n3Æÿv°ƒÞ{/°@ˆŽú"yihÂ÷Ÿ.hÔÛ¡ßâ*@÷_ªT±¸³µª9~[FðhQ³^ì_Õ}éB#™¬ÿâ2–XIÉaBý#a\¨×å{sÌ磆õ`Õoî‹®’|þGçªÈ¶¨ÃšÙÌ­ŸF.™@]½¹úf?û©=ñopM ›õo#øÃjþ5¦¹\u£Uìø?&÷Nû¢T Çš­OÃÚ«þH¹èÔüg˜Z4…mL¦+B±¶K}æ8SÕüCVah¾u(ºGóò{@!ÂíWÍ¿©s£§‹Vó×üÛUHM8Ò¾\ŠÅiþ%´ƒùb%øM5çœÀÐü:íš.ÖÏ(r Í’î ÿ5Ç”›æ?AoŸÌPrJÏü›Np +Fj·¡ò!'n6‰{håèa»QƯË3ÿòDlºë<fþ!Þ·ÿ)©¼|¾ÕïÌgöÔÏÇì5ùŸ¨lŸXÝè˜ùßwçTÛŒUäy3ÿ¡¿ÙT²t’½¨Íü‹²·»œ ·)ºÖ3™n}ç&ÇÌÿ©èÀþ{;óOàî©ÊÌXFŸ¼ù[øAËÏÓçyv¨/ñü5‘#çùBj3\¨?uœrNäP}¡-æ^M¶=ó ±cÎœdöî`Èý.¯}Ž:×W3ÿÙ0¹Ú@qf ÅŸÆ.âÍüóåUñwãŒVfþùJ$Ö÷$3<;U$$‚’IæßbçV!ÓË…öËÍ‘oNæM¬o«ŸUæ…£öfšU1ÿcé@”ÃhôÎŽ#ó¿’h@Ó¨†ùw³oD­3Zza’ΧÊl7Ї-ã< ó_¿ÿsqõRúÂ@¿üßWŠk‹Ï&` +bÀÿwcÑ8ýˆY/ÿ²Ì + eef‘8$¤!c•ý—ÿ&P‚•oœúò/RBì—1Þ·ý·:qÊôÌZ4_þ}dÊ<ÏF`âË?8P, ¾ìÉFc6lìÄ +;ÂÂÝÿFpÒž)¿0ÿ.)\´ Âó¦© nŽpÍθæ¿\úðrÂ\,—nÔàu¿»¼ñª :˜Ÿs¸ß£<ï?â8ûs•ù7Š•u2x)Í_ëÌò@úŸ_1ó/¡þÁ0!73ÿºå¼phY“ë3-¸6 KMfþñ”9È ‘ìþ V Œgþ)Ž®úÃöi:®·‚¨3ÿÌ\"?»®ýµ•3ÿ)ë  v$=væ?(a4ó³ qi„ü{`·cHµOY~`6Š%×ÑÂø—¼¡‘<ŒÁ§«bü+½&Æ»‡ÿ-ÃÿÁ5¿Äj«á¿@¦æÂð¿ðâH‡ ÿŸØíey£Â†‘ìUø „¯û7›C¢8+6'µÒ ~ÌVÖð¨WªÆRìð/ ÈcÆm1ÃÍáŸâ>žL¹RgÕá?VÁÐ+C†ÿ[ÆÙ›'8‡ *˜§iˆöpøGÎûÎÚ«ëÐi‡ÿ>;XסÿR1üSËAè=‰Ä&¥¶å ÃUÞ£CrÚåœû—R-³Ì8ôðþXþ£zü †¬uŒZ&MûE™„Xá…·þ÷:bÖi ÿ³¨K†;ù¿ð’4-©‡€áß1ÀïnÀ™Ô|åˆ(ÊËðox{¸ ðIª]åÖŒFÈ—aáŸ;‚Â>ieêaR@ñU†!4 À ÿ* +Éwø¯Ô@ÿ†ñ0u¢b¥ …hQÃÿêý7ÛÇœåÎ%XÈÓ(B5‘ B׆ÿÓAïyê1F§Í—!šº¿ô 6iNâm>{Ãÿ~XN!Wú‡t<5Wçð¿èGȇ3Ö©-H5Ž]U‰Ï¤…ÿ#h¡Ì|gôšªµ#ümƉˆ(ÁÂ$ÙBZÖ²(+7üºù·»¿nWËÓ°âˆë@nI_¬š)"&iוŸ:ük­Òô^ÃèuèôF‡ÿ·ÙŒd÷¶W=SÉèTÚðOD.›XÒðÆq0´¹<§‚à†ÿiSÉ:¹':ôwë¬5ær9¥¶lRU0’>×eÐ>ÿ(ùQé:W1­á_JtÇ·løêÛ<û[™¼tì°¼T ;ÄxleðÇþ®_MA´µ,ýð——‘âÒ¡ìázfÿP‹ŒÈIº­ÂVÛ’•9Ä¿Rí.€;ˆ-!†U£T'Í€ ¾mJg—I;ˆZz‘C\lÿ²??áSš²SjŽ–˜ÿôC팾Š˜X$þƒ§´‚uæh1®í„{Ñ"þû|’"¦èÅV1ÒfD¼Œä‹ˆÿ’,£û¯—ìO¡nBñ ‘-ÿ­Ó!8?د™m÷~?Ã)­ ÿ… `-"3šþÕÿýÑ9݃¤ˆ ¤|Úû{Ú¼í3~‘!žÝîoO}vÿb| H‡±Ó“9 Ÿ:îOÈ"Äœf¾Èjû›BU7f…ö?AßÉBôð ŸõfÈþ¢¦@5w9HÏÕ_ã¹Ø³Í?) [¹Ø¼6këðÙêoÿåA ICÇ õŸÁ&ÜwÓ¿Œ¯oÓ¤"$ýKÿ=P«~‘$ú{§ª«ziº˜N:X|Üß¿þü%6¹Ä¶§mvþ50ÙßEmþÖð£ÏVºdþþv2h8»•-ÿN|7&2 wh&å?ÔAÏ,'ó¦¡Þ?4§~õ¸p6îc”ÿ2àƒ¤=8…™äoJ$ ÖÊ8½mD"šPáP€[—IþV0ýè9½¶¥@þó ÷Q±àâ_5€*#Ž>P%þmeš4)öÎáßJdd\´Oy”?âø…k|© ¡SðwtU*+Ø@§ÃÿïÖ$#ð÷uAÿÿ~;šýœ‰íò³ïŒà±MQUt{¿îª"vÙ}S!m|Òòþ¶%¿òþ’|«Ž÷«ô¶ÜÚx^É0Dz¦¤‹O?M‚dt`nx×Iå—§¨Të¶ýÑÅuXÕ[ûUHm=iÿú…MMŸònö—À¶«¬?_"˜ÞÊ^?²ÿ䬗$tYû †óÝ”¹Dý-Z_³±  +ooן.‰¹&Ñ£­Ÿ?–È—¬?3úÓ56ÕN[ýö85ÕêW#`“¢Ë}±>Qý¶Áe¨þõqåS¬f’ŠSÿ£†Ï¹ý+½ÃP±…áü_³I '­Ú~þ±y[àœ…Üå¦ç—"úr•"+ÀZþH„—*ikq~"Y ç×_³ÜQº »Å"©°ùѪ˜JŠð™?.)u‰ªE8bÆüE'9òµÕ_‘ìågY@ ­U!Z~;‰Ê")&u^åÏzv'¼.d¨êŒò󛚣ÕÂ%A“ŸH&ÿFݯêÔJà…äG*ÊrÊ…ÐCþ²²Î¢¡{˜ÛïVú×nÀÒ:þ÷P–³>ž‡oØø5²MîvG4QIiŒ¿‹ ”­Whx-~a‹`1;üy`Ø;ŠÿI^ ÐÍÑ> ügôb,KMCWøíLs:LM‘4OHü`þå[ÃÚLâOÄh鉊~û*²Ê6üÕ hÃAiáŸ8s e*$“ð“‡Ë‹ I&ü›Ì;Ö@‚ß5¬ ·“ ©M±m€À³ÛýÇ„F•eüïçJ®–ÏØT¿ßJ~Pj&Z,oú^_+aÍ·nÄŽlµ-/0vïoå#Ä%BÙúˆÔê}Fy$Íž¥ì4˜÷ÉG™<Ѧ½‚2ox_¥Ô €“ˆû«»w¾Gzî~ŹØýýÛ`ÛþV°¼~îé>»2úc£¾_QÏ}$€ó‚(¨4Vî[›Ýh/Ü¢÷ +p`ƒ,؃û7qðÝÿu7ÒÛß êÄ®a„¦©>%½}êmô U`ˆ[´}ä´?¤À”ótâ.˜8ÓOôà}í—TTk ,Þ M¢ù×¾ñ–ÿ`±À Œ+dDá#Ö ؾ¡áÅ pÞ¿Ú?sß©µP†3Uí{gÈÑA‚Ð/œøª}å*æD‘’½iÈ:±Ú?<Žf<©?Ckõ(gSÖbY^T¯Çç­àl¢×jŸ&’öh‡qåj_2(¶ìDaûiÿ÷/º³°–ŒÈhÁE‰¥(úp¶æP¬g_—«/¢­WHš}=ºxÇ»U 0Ñ Ui×e‹Ýq ä‘ßréo¶d’ZÙWÅ®¾àÄ–¼º·Ô†XF¨¶¢²Ÿ/1|o, T1h•}XŠïæ#Ù7™UÛl&€%€·Ê{v…䈨Ãkˆ}•Mˆ‚‡D6ÑÁ¾Ø¤{¡½Ê±Ìþú,šÇ€D@¥¬zý+äHÕPÏ¢e×°ò›p‰Ƹ¾¼w.ª<×?ë 9³ôEûmâúñÖŸ£VFÄõ½…{Ëx3®Ïám š¸W’Z\ÿYØÁi\¿nÀÑ#=¡µX™!¤šD„AiŽëg¹î@@§`e\ÿ‰®<¾f\ßb‘ªÐ„jÔ'Z¨”ëw ¸ÖŒ[ÀÃ?uš”ëã”R&U†ºÍõmå§åx"4  f®Ïè«·ÈSrXîæúäéÞ0Ö²Íõû?ªMZ.–á\Ÿ 6äNWhÁ["¦õ¬AµP¤BþȱUëúo N=Þ⨫õáxP Vü’0´ZDî–ÐZèø¢à†ø¸L?Óh»Z7û¼ çЪú` 7¼5É©Ö—•8@†ZÿPbB­Ÿ ¯Ž0q¨õMk½¬ÊX‚}ëè½Z$¬MUÚS\B«Vëã¸Ô¶ *Z²ZšlïMŒNË ¹àj}g"¨F¬Š#KV›ÇMÅÓXoèîÿYüìî÷ÚÁºj}D xä_ˆp›µ>¾i9å¦ut¬õë_·,GJ™1oªX)e +ta뫧mŠ–wõLqÎË„¸ßc×âØ[?¿¸Àظ%ð ÙÐi¬Ig§¸Y¡XáÊÖPã ŒOÃœlé “Øúµ…@Dãù‚Zlý 2ÆP@!~ ^9¶¾ãöeSá·²õ›G|³õe‘Š¶êÂN£DÃòbÀ³õÕW¡:YÑû‚˜¤±¶~0¦ +L´‘˜n6ÚúqúŒ±~jcq» mýÿŽÊ2 ýÿúYy ZèÌ·êñè¡ï¡™p³õëXIË©jfR_[ué+ïÂØ´õµaª‹1Plœà§’foæ)¥afy±øÆ£ú€MÚv™¤Ñ‚ÛÔGÚérC~þŸ¤þ´Ìãi#̈JÔ{#æP’)=Òõ­˜ä@©?ý2Ÿf 
+÷M4¿gŒÖ³‡4‰“­²Ût«ì&ž_ø7ǬDt¾)žCo¬SZÉUj­#êdæñ¤:k±j¥¿a7¨9Ï-RÓÕHÅöÃ,3c—š #T™Ëµ*®ÄJ”á— “€hŠža•ˆcTŒÄSëáÔº+¥å Âlú…ò‰l‡û7-"ËuÒ~—&˜eyƇð™’©n¾–;†çó4çq–c» +QÇb Û_ ±Út¤æ¸§ ³¹R8«ÒüJãzéuD_³¼ªVþï¡žã®`u\P‘e4VÇ$ä¶óAfù ak)É1› ›(¹ +¨ùÇK-åù7SÅ‚{¡nÝYñ>(h®—⻋à¹RkâU7`ò #!ŒJ“¡ŠÝAç|öȼ¯‘‚ÁÉ0År¾(oEHÖ˜Q’WPÙ’¤`ɬx×*€®Þ€®æ¿Ov¼ÿÁžùÑîûÿÔªý6I09B,»pûÅq +fÊõvD¿óÐÓß‘’åÀAó¼ß”='ªî!Æhm¢'jÌÅ°ëE)ƃƒævCCñºªØ=ÍåxUFĪùÕn».§‰§ˆ)mƒ‰ ÒœêUÏ• ³Ü:†[m!H°¸áÕMD=óQ’\pŬµiXܱ{¶;)µíTGc{¢Îoi¢ï+LóÅ)¾µê?r›þÛH³Ø:€Ns”j•–HMWÓãOH9’Kpa*[áó’ëúÚ¿(já‡Ûó¿È=ï{´c¼/[–CjÛûâ´½W1’ãNø¬Ð^üÂv`?Ã4÷µAp B|Œ”üp?˜Oq’aÓˆ,™‚•i˜ƒf0Vòžo;æŠuÆU*ßa¬Tlhšê4N5]°úÆ“iŠåJŠ]l(€¸ÞTåGŠg?ÒPýÆN“©L_¢ö‹Iä%×rd*kÑãUg¨±’'¬HÁ.Ð Ñ=Yoʯ¹hÊLƒ ”™ ++·“dX vë³aû”¥ÔÙ ßPZÃO{%ºe¶ ÛQšTk~Dq•©UÚÏöëžcíÒ³D³äb4£Ø@µ!XQ>Õd¿àN ]ö¦WœÌӬ籞õÈ­úÿ»Ž÷=×0½0[žë Z©©b)µî<‘›æÇD½ð'Ï*8M¨µŽ¦ÚŽ7ËUÚâ¯0^Ù\˜O!Ï+³©Ö Õë”-÷‰\t +nÿ†äv3Mõ¾#h5Vô¥Û yÉh„ˆašcˆ‘"ãp£Åægê,…°*É5Ç5½û2PŒEDŒÐ@Ãj\o½8mó;†Ygȯµ6Tä΃5Íë¤T/{ƒ;\5Z*föoQ’ï1Ks>†‰Ö—,½àJÊR¥Öd˜b¹o·ŒFÉòê,¥É…ÖñszØQšŸRµ¹@Åù"÷¼¿÷Ó ´þ;ô®KjÕujŽâˆXÅ„L¿$°|Ó.|²ÜNŽæ¿KRÜ9rÅÍPÇø5ÞGAšñ(|´ÜNü`½­è¡JCY†íÆ-zÿô¦ù2ˆVd>êY.™eûK’âû)]²†"v‰ž*5“$™Ó4óg–a¼‹r n…Щì†úuwõ7Õ¯·”§W[ˆ§² 4JõŽ!–5Lî-ó°lËdˆÔ:RóþFZÞŸøq³`SäÆ€³4O!¼rbÑü®&8Ê(¾7I†µ ±i=öû¦jÓÿ”gØÞ ÛU›è\ŽÚ-Ñ°û/;ÞŸ0½ì"Å-¸Ä«¶”¥Ø¿¢4ß™è‘z‹1”ZCYV¥é\Áà¨AiÿôÛö«Ñx>±7 *úOÌ®í~×4ÝËóKn%™Ga%+†pR!Œ’?¹†`œ¢;ÆÀr?` ³€#ô–¡z¹e‡Øý!—¼wqŽñÂê<ï%É¦Ó Su&¢Ç +M%IÞï@Ûw=PyÞ‘š®cÑ^;pB\z¡<ÏD©ÖðPZ) CIA%;–ÀêUá¥i¬$Ƶèÿ‡§¸I“kÍfZöÏ jŽè‘bó s$Ÿ +á…;.l¢óhÃømœ„§V&S‡qJC8efaÆËáƒZ{aIJ»8Vµ¥86Éw¼^n.Ö®6+ùŽÌÂën¢æþK’MWâ†ë-„Z á^0«-'ŠÆ Ñÿ9X1û‹!—›Š_œ´m—C%û}Ú0_9äõg! ªS òtbqŽÙPšbød—Ý*–ovÙuDCr<'£9Þ +08ô=·EÄfA6KVÆOÿ4ó{žs@ªÛxMÓ-¹ç}‹¦3aƒDp½bnÝüž©[Ÿ…û‡ÑypúžóeÉrÍ ²LÞ§$µì 4·@€ÎÍ­û$Ê¡d[öC…öqüŠûH‚Å­4Åz ÕMg¼k â¸Ó¼‚g½sÿˆQŒ— $ +VÁ…KV¤RC„2»¡Žï^¢6Môj Ä/IÞâüº—Åò-‚\m/‚_m!xªÐ€dÍÀhf½R‘ùžg½n¸Ã%7AŠ÷fžÊ>Ü8u`ñ–#”H¹@ð(Ũë~ ÏѹÇ|ÂÜR£Žý"ıŸÄV[V1 F¸ê0Id| ÖªàôÌ/Èìà#(”‚Ì¢ «éºB§ÊÍ\IQ¼Ÿ‰šý2Rñÿ¹=Ó¡˜©‚9øø½4³Î6ì4•]€9‚;ˆá¬’À³*k™ŒV¡G‰~òôzãðƒß‘†ïÏ­;ïÔZ2‚èb¨ÜbüÄú–åz®gú¾ƒråz6H+8FÄp?à©c§àÄtߌxËh´Þ$ù¶ðhvyà1rC·ä~0ÚÃØ4¯™Š÷1O4Êòü/Q’û,̲Ÿd†×²ýJBe7@|”ÌÀ~ƒV5iM×ýH½ä@’[÷dž¤Ù…”bCAlƒ™vÙ?ÜÉ'Ô¹UŸÔ&ä|án¼lH®4¡U—y†¹±b¥$½ÞŠXõœ  Ó¤è¯‘š÷+~Yx4Bó b¾n :Aò@Eò¦Ó˜ìe7I6©e°™ŠÐTÉÔhÓp„æ"~Ng:Õ09œiØžÅ0 ΨeÛ µã?ŒbTš„œ¡¶$9…›¡ø"ͯ^­L¤Un'Q©5 ?Dn˜çW‹SÚ C+X¦bàpÛh”Ê6ìD™ÍX^©ý¸_q?oXNôÆ£vå5Tžb~ ;BtJWj²ãH¥úÍvËí‹dî7·a¼Œ Ùɲœ_Ö“èÙº—ø±zƒ~ÉåTÇûA*²£ÖPõÌÏ`³D¿0³T/ñ“Õ6$ëi”èÿ ô+_â'ôöà :³$¿Ú,¥9>i°A‚ƒÃù“¥¿£¨…gA†í‘Uö\‰ 4È©!$(U@‡“(­yƒŽ™š":Ê2LŸ¼ÎõšÚ98Bî(S®•¥yÿ¢' NÒñ2°è)Ap)Bò+`7LS OÂã +@ `àDk æ| +ž)¶ /Kf\šÎ4äL¥•(Çz¨¹O´¾ó"Œ ¡8hðp … +0¤XU7acýÊ õ¢¦:fàr-KC_Èù¦Sœ_qZžT`¦T"D+µ8EíÄ(3A+¶@,¶H¬¶*V›+Ö³ð‘2ãp£4WYŠõ]šb¿ˆ2ËMd¹7ÂÕ&Á…«†ÀE*fBHô>yB¡-0"†‚FIÍ’dÿCŒä¥³ f¨c 9Z³Œ¤•Šâ™0{æ¯j; 3Qf2‚bûhü‚ ”.€“#XäHÅF#í² «Þ¨hÙ0Q°üŒá“Ú-Z„ÏÔ™Œc×FQëŽ8Åö±Ìr멆åNð|ÉF¤õ‡ž$ùŠ±<]7ƒXž“Aï;Œ^vš§¹oT·1£ä’£PBÝB±S¥öÀcjnÀÅ+ ÑÛƒOTšŸÔšœ¡9‡ž£yL´ ÿá¤Ø$ÁHpèÃxGáEËž€Åz¾p3ôÆ ³T—p“4¯€ãTÆ‚¨åV³ë8‘ƒu ˜…³?㦠àÓ$Ë.D–b´r’@%š¥²ÔR±ý"@B  +fªY|Es2Ùôˆ ’Y #U Õš‹ Øýêßp)AÐâ¥êˆ좨åVhÊ‚ ,Üv‡ž¥3 8Ct Œ\xàÕZÉ/ëƒNÒ !•ZT=GÂ*­Ðp¨³:Wq%|¸ä<ŒZq@Úð]E‰´†`âÃÅ“TêZ¡5(ñ¶ ‚;@2{$ÎÓ<Õt3Ðòÿ…ÞGx†)EÙ…©$ÂG N€Šª>H Xªc 7Eò ?FsH§:J´ N‚ +Òi‚J³©‚‹–| " T¤_r Ò2ä8•Å@jÁùlÅzç=ׄ2cñƒ*K¤ZãÐÃdÆÁÇɬ£¨×IJ;Pé¦!ÌtÑ-ŒLói–Û^ X„Ä'É3ŒÏàÅzVe¹å€ËóëÃoiL…‘ªìkæ×<É|=Zh/Kr¾ƒ¦ +ÎDËÕ’®œÉS,ƒ RýAŠPü¯ûÆ÷ÀåR@DK3ƒÌ·üâXÕö¢xG +-PÓ˜ dÕšÍÕËþã‚á+¸`Ïj¨Î\”o½e;Eyþƒ(¿ð#Ë-¸hX.D©·bh•6"‡‰ Ã,b ³Dü˜ÆZ³Þ\·âJ¥Ö\§ÖX¢]÷0Ì©,Ì' ([u&~St ¹"XÙª1ä8™™ÊzSÄz arÅ¡,¿îT¢Xd¶ZfªÔœ:"Ë, 5Bò8Bp 6Aj$\œ-Û3x$Ød»"Ôt׌±"€©–)äÕAø¬Ø` ÅùªW~#ø•#Äø€ŒWÅU›"P±J¦d ^ÀñP®iHžb d°ei˜¾€ /ˆ ”XâÐStñSJsà*f€$kN@ƶœÀŒmÙÅqÊ,¨-ÿY¸TÉTèȹÀãíb‘|: d:ãðK1z{8“ÈNÌRü¶Ör ê9/Cô.Bq 8Et4䦋‰†ã>–TdÓïøé"Ä€Ý#|¨Ò0à(™9¨Ï;øDµ‘ø]µ±8Åò;Öò?äxÕ¶¡ÆJV d(5!'¨Ý€F»f ³USÈQªŸb½¹@Åü(X>™mÛÙ@Íþ)Yr¤ú„˜ 7ƒ“®z…°KÎDYž+QŽù+Ͳ¿ÃÙ“ëm†’ + ‚Lõ«dV)I®Z c :Cô‘$XÞʼnžS4•ö9Å¡GiÌAK×Má†h.aç o0Ó]Ÿðy½© ÝtÍk™çû`ÏrÄmz_¢»Sv·vÝ7ì@¥©fÙc¹äd »ö-Nqlà¨0Z’VWètÕ‚—I9±áŽ”ÔqÕ¢r[ 5ÜÖò³ƒ 
ÁÜ€Õ6?©]ûOø‚Èn¦S9Ø®9õ NÏ׌á¶ç«na„«±ŠÅ ·iüo[†ëpB½w´`fZ!8Þ©ÿøx–'¬(ÕG€bqRk½Ž*¨MϨúÎG(ò“Àˆ41P*·ˆœ¥¹†™¥úˆQ¬¿ÃœßˆRÏn,}Û,ˆRÏú ">vPclAf¹ÁBu~<ï§(¿Ðl¢å8hß‚¸¥ÆÌÊép†UdF„Ib˜LÌ"ñÙ Ìm…o©ž‚ÇÈ ¡Å»f Òmc¸Y:³ “tÆóßCèT·V ‡Âh¾Yqšj› ¢×Z+3>Zi/K5yÏ yÍðzžb{ +“KMˆmë®f;JTj,Ej•ã8efBØD„¢ß<Ó÷9Ce+Ã"ͤ0Vì-wÝ`E‰5¦êå¢x…¦¡‡iìƒOT™‘:ǃ!ÜŠc!„¢ÓLÃgR¡uÜó{¶µã¸iØ-…é•æ·q»dxÐl{®ÅªUö4ņ šzËp±Ê\ªTf/’Ið4Û³•jDORÜåéeá9…Àb8§Ú(Ñ,´škØÍ&ëå泋ʒãVœb·¨ŸÌÆý<Ú²›–¬ö2 Ÿ5·q0«Æ½ä·âVoižãˆÕ·ßHˆ~gýÂû=[ñWìlÈE¿ÙXÅk*Ô,³+OV|ga„s¡tzË8JI`i^ER„¢à¦6:s©v¹ñlÃnGNä»ÏWk-…Òé ƒYjÓoÈ+|_-·¥0¿Î>’Zh"|–àÏmÚÎýÊ÷'ÌðÙu7jÙýàuýwjé4‚ÔõókÎ#¿e¼Šá”œGºÞ˽àBð(ÁY–â·ãUΗfÉòšÙz^Žt­²ªù!ͬ³ç·ü†¯ý`”È”P͵˜Œz@ƒM‹.ÍGžYfÏ 4?æiæ·,ÑzŸi›îÅ)æÓLÇ{.ÑšOÄt~y1¡Û°Ce6+×XNVL ºu÷±Û:3+¿#bÝu?,z.ŪŒÐ›¸í2™9¢¼¢0£sÀI¢¯V­ÉPÅðÄ®9—ñ"Rªú;~+vÍw–'9ÎG*¯ÓdÏÙ@Ñ{”圇1‹-ÉÕVBfº5BEù$„ªå>Ôl±ñ$åwP^¸ ÆEv ÌKgwe¦2¼)µ|÷€Â…XFÏ4H{†§Å¶]Ð,ËÙ0×úäÕí÷‚åxDBt½.8ždÔf€"¥é@ +™§ô­ï0òE+À¡fHÁ§6Ul$x²à>è\½ÍŠù7ë³*œh‰0ãÆÙâôB{`eÚõ¡§é­’,Ûm–í~5[†÷4”–5tV#FÌ—<Á ›ê +*4 Ó/DÄh1œ’°"¤&A^­)¿å=Rëö¿x¹†Ð0#¸â¦ûe'©Œ€1D Œ˜ùFàÁ¬)‚Áý¬ë:£ÕçÄÆõH“}À dµN‘SÔ.@„‘(Ýs¤1„øP ñÁÂL·Òy†pâçôÀ‹#U†pû„o©L‡zžZáu3¿&÷(L~4Õ-2›fÖ'Ú3/Ke/Š[o-Q°8ïW~wô­Óø02Œ: Ê2ä(ÕΙ‚Å…¸ ZO@f5´RdŽ +Á‰.–ªÖ;àJ`˜|Lª Ll„ˆQU€EÉ“'Pf°\ ±Gȱr™H¥êxŽT Ä(G4ÁWÔ'5ˆ,Ø•¨ `”™‰ S™ßÑ»É=÷=JrD#uHtj~B2ËÉ‚áu®â|Êòëžâ¤Bãàû¢GÜ5Õ°¸ŽãÔ˜r m²Kmç*Žc‘z½µ0µa¿äÅ"7—§–S >£JÛÜ¢@.-`’|ª€.Ÿ<½â^¤\l¾+¢¿ý¤n¹ª£1;­¤1»©#úµÛæ'»êüp‹¶K»ð|<ÿ«JÉMül‘¡$Ív˜¥ØEë „oj­KMGûWb…*Ã’¢ë€¯ÐD™æ ~žä*‚Wh'~[m&‚Vj.”Oõ+–Z‹dR›dêtƉ¦û~†Ų́²˜*Ø}% +f¿¡¢ù?*üË«QD¦÷ýÒéº]ù?gZ~{fåÿ(W^×ýÖïºÞù1z¶+«ñ9_5ß™žïÒlÎ)ˆF°êö·8Åø“¤_b Ç}­Ìl boÝ-š;Ÿ›±ÜJ³¡Ží4Õ/´jÚŠSlì‚Ë€bô>TÕíŠKjÇûßýYb¥ƒäÿ£•-÷Q{ Zá|«wÞZÕù#—›Ñ—n'ÃF&Á„ø½R£¥÷)Ñ®·ͪ´“(›É“Ë-¦ªµÄåb›‰íE0˜Ç«åÆ,³›@¯Ú:ŒYoD©µ -Õñ R/–åÞˆ=÷Y’\lpŽÚ/K±8êvLï'ß!«cy¿(½ŽH¥ßù<Éw¨”üé]ó?|Vl1n¤ÐTŒh¾ +^f*­‡U\ádK1nÙQ†Kñ—§ÒÛåjµv€‹ßÒoy$¨Õ†#óc$¥Ø:~’Æ@~Vg3E¬29P+k˜=OÓXƒì8ÄN•ÛL´Ü¿©†á™×õIeÏÁØA’k`¡r½4¿ÒJŠ`øP(±øÀCôÀÇÔëBL’<„O›Ÿ%2?Je 1À†K!¿ 0¨èáxð¡…£B +œ^¼`'Ô4½,àHÇ~ˆÜ+OfÄH‰pÁÄ£6,„S¨FyÀy*CñÓzás¥tŠéƒe cú˜„>¡Å"ÑGè€ÙV¬c J’T¬TÃHøœÆb›ÊˆðtEüŠÈ0’Su“æDøÌÀˆª€@Hx,³ˆtÊp£kIJ1€r“ ç)õ@†ÊUgH]âgiìäyµ¶ƒ³ï`Éölœ…$÷ ÅýÒ.œo3=ß™\÷^ï Õ錄ÆîN’\jF°µj›¯Ü¶ç\©Ø&Ô Í5ì4ÕM¢Zl%×(3 %)3¡¨3E#x“j~íR;!¤B‹`C‡Ã5oø ­I“Ú>Þ¬ú†œ&2?Mc=Ù08ß6 îÅêµÆ ’ó«Ý¹ÝöK׃*"ÓóB*Óƒél7®±ááTÅq1U°6áÖìV¬šßº[²:®"5¼¬!·¼'° cU¾ßvÇꂾô»éŸì²ý3Ð1œŒ"Ú +RŸy®õ)Ï07êY®ƒX5?ar‘M¹ñº—æxâGêìA¶Ì Å¹µç«^Ñgw-—ËýP-}.Y•ÓEµctOî›ÿCUë[ ­Ò`šb¸_ô|Ÿb×rCDï½³ÛÖçPÍù_¶¼ßÙ‚Å}$­Ð&8yjU€±‚¹HF©\«ÎŽE[mÖ%°·Ÿ­XÔ‹Þ ¹í€Ut¿Ã¸µ6òÌZ£¡†áp¤ao9R±7âV÷¡¶õ1Ï4¾º¥×uÑôšWùÜ/fÓyš+ØZtŸ+—å·jüK}gšåR|Âø9Vk"|Þ+~Û¯¥Ñ:ÄI…F†ÊÕÒ¬*KrÓüšhZÿqsóøa±}¹Þx¤è}“8äŽñ$Ç08`n7Ãø…Gi†Ý;ŒWi3Šaq0Ë5_Gʞ㡪ûØ ÷Ü궛Áã·g’üŠ|2s bG£‰%KNÌdEÔtÇ#xŠÞ°h¹2àÑC䀺5AÍT¬ƒpˆaFJvÀÒƒC-3P >5¡jð):kБ*cÁƒõV“û‘Vw^ÒJ¯a¬Îi “bþZäp88¡¢,怉3ì@QJ1 ·+¸xÛ$x˜Î|Rõ] Ÿ§5J¤4/ätËH”X +¤À Z ¤©u!†è b&‰Á%èmÂç„æ!lBƒ "EÉ1ADÇ +•ƒ«˜#&rŒ$8vµ(J™Ý\ÅìÜ!·þ;ÔæÏHÁÖz®âµ¥WšÿŸà³'+fƒyŠ¹¥8Åâh¨å¼ÉÑê,MÖK†²ªþÒ Ã¿fÁ:‘yø%Í9ìLÃ`‚Q¤«0ø ¡e4­ÎD•æz¼è r¶f8Ýt†Ñ:ô¥–§+ï4÷Vö¿eŠ…VÈÊÔý*k¬’‡ØÁ2côrѪù×-™Ý‘^˜ìo€8Y«/ø¯b«$eïs¨æ¸fÙ"$ßušìgûŸ²4ã±Û2½&µm‡¹ÄÀ ’ öKÍG +Ÿ;1šù#v”ê\ _ˆ§ Xiv} ·Ð¨”Ü~'»…pH ˆ+fÓÁ†ã\¤^m¾kœï:¾ÿX×û«w^‡ud–î®ù+‚Pr=Hi9Ch›hîúÿ‹‚Æð9—æךñ[vÂËvcT}į >Á9%A +(,dª|_3ö¨|R2ã‰]²½iU§jësÂéûCˆ¥–áÉÝBíjCnÛþ$p+!· +N’böŽäU™ŽUìíÉû}¤m> £š +SÇkíÃÍ’ü´ö'¹_o^šP"„Eh‘cÕZÇŠmƒ‹7þ€r‹…‘œÐhì“`”ŒdS\d‰Å–ia滆@BcÀ#¹¥€“kƒNRüç»—“‹«¡Šß–\·ÿ¨UûWœc¹å8ÿq »‹É}!þ$IÇâ ëAøœÎ<ŠYw¥y.ç9·A³«ð‚d–â'Ë®Â,Û¹L½ö¾-Ø?Ep mƒQ<†R +NE©u7Nµ¥ÈiBÀ#D×–S#C€AJÍh6™5€y†PzA°éº+èÍG˜Zn@-Y?«÷3O.µ 6]/d›ÈUÛ¯Þxú=ÇáT¿ÐH’anª»nêeÛ‰üŒàÄ0.‰èyRÇb­I0£©ÅqAEÉÀŒ)—*†Slp±*À¢‡(E‘é Î4Ì€(ÌP†DÐHÁ&É–Ô€Ï%Ϭ3‘¡˜Þ‚ŠÍA¬–q¬Z“‘‚¹ý¬hû Ò g|Öï’Õ1=›ç{Žš5Ã3ú¢ÙM±fu5Li+µ ÇU¤FãçÃC!®÷Äj›Õ’Ùe¥å¡™Ìð¬êwÐ-Û#ö¶²4Ë™ÃîJ®a¨ áB¢œú€sÔ~±Ef2äbû]×{'¸%V–À ¡¶L½ÖLU¥…ðM¡‰$¹àXœ^q<Ô5x}û¿Þzž!¶‡™-3‹Æ' ‘å‘Ïù 5Vi$xºÞTúÒW͔ڮ#äUÀCGLPK†óIÙsf–Î + ¡“dÀ›¯ϯ 3Grá´íG¯÷>^œbÐ,o¦bmX¾ë*ß5'2@®·–&º?´ÆéÀÞ³.X®Z´Èy­é8¡!x”뮃#U–Â$è|!„[ž EÍcÇÙÒœ0ZˆñPËz8Cñ)ʬ=_ôI³ 
+diµöqÔjÙšñXDiy4Í.¶>EjÀ¨ºO +Ÿë‰ÂéˆSº²J§Kbåÿ!–½1FÉS“Î8Ô3ÜÌÌÍ&JÆ—,Ãò$L­8 ¸ËÞ2}"{Ð"½âÉf ¿Yn3šOh~ÈxñsJ ÅÎ$M®?Jc:[¯8E#÷­ tKm$)&GÁ¦(þÀJ5Œ‹÷|BUFcýzã¥óÇîØ.c™uV5ç‹×÷Ks,Gƒ<ë]Œè9èú.e)¶§‘Šñ× 6]»ö·0Ù|/ü‰*6;]÷;Vl1‚Vk*L0|’먻µã=™#¸ƒŽkØ/úƒŒ‘ü‚Ì—ý€æjì²]…äĨñB€R\"Q\ª‹2™8¹àLœ^vœ*Yî*ÔöGÁé½/k¾ëTÓ÷(¢2¼®$²»¡oÝn&™þ§¸¹zQSäî@´6ñc¥6ã¨åa%ÚeÀ +Ÿ/¨T ©3 6Hh Z°m/P1‰!X™µ€‹š,„+W+t¾ Œ,«.ä ©o¤â{’¨˜€;J tü¦&ÌpÙ7Ðqþ%ßQ„Xc¨zŸãL÷EŠe}”NÅšÍ}»fuOBgxë¦3½ ‡…¦½ºÝ`ú¶ÕHúªÝ-Ñ÷]/Žyu÷Yˆã8=^l3Js^ƒˆ2ª!ÂføˆBŠRô›©øÍÇj–KYb±•ð)‚WˆÙ¦xÁ“á…^‰b¡ø©GšXfBîXŠõ6ó³ÃDÇw“%—›!wI2KMûÓ­ïÿ ®5¿®·¤yŸå’å}Ójp·d5\‚Td«YÎg9UF„g«‚'ˆË%iæ;ðbÉU@”ç%j¶ T²e :Jf ~¢Ø\œâ»’Wþ?ó¬r@Æ5ÅO©ÝÛ¹$åv-Hú] ³~×¼šé9³u¼;CeRŠEH®NˆN‡Ì¾ífÃr1„Uk-„Hó>Cn@QdDkþƒKéÊOè#å>!J™ÈbƒØ1‚ð=‰˜ÁŠU á–7øˆØ;Ú¯¶K«´B%¹ ¡ô  $xŒäÕšŒ N;ÄþW½ì¿N5,ŽêmÓI»ï¼¤V‡)²3^o/~ã¼ +s¼zÝÿ­P{Ž¨Uë‘×w>ÊmÏ¥0¿ðxžÆ pǼ¥Aˆâûúž» +±÷B-Zn%v{A~½á@Éûc—œ°»JòÊ9B8BøqÉ&ŠEsÁ¤9ÎU,oC%÷×õœü¦#šßi¿q;éVNçDT†çõŽÝñ4çu(pÎ{2^x1Fr2Cî’cž¥Ée¤È´ bŠŸª9Š ÕÛ!z"O1"J/L”]¸PËv„Þ!Mª53Y1À#xJ‹Ù]ÿYèH­Éð±r bÁÅ^½4¿î%J¬´Tô݃8†+”•ÖÁ‡i¾ˆªL¦YÎïXÉyN':ž òOuýÏ™žÿ-†Vkb„àPÉ0~ZoÛ-Y!!uŠu‚Ô:#â¦ée³cuÀjûþÌÖõ@iý¥×§õ;hµ~7Í’Ý•4ÃlK,4ê8ä¢ÿ)ά63S®2U.f®„‘ «æ™U&‚¼R“°âEÀy"›1»ç@ÙsÈë{NèK¯‹2¢ß ½é¼ÝºÞH±ŒgI®õHjÞ±Z–'œÖí’ÑúÝr:·3Iv¹u€—_ŠWf>ײ~…/ ·‚g|/ÑÃå–B(e6‚ü‚3I~Ù¨æÿTG6b¢V)€IcR’[Àðb²´rr%Gˆr>…<©Ò>Ü`½(Œ,“\œa6—#˜Æ‹mGº®kjåyMCh<±s3Ms?ºžëŠÅ­(ÍrÓ­Ø\»(1±4ÁÖLèX©‰¸©:3Zßv^0Ñ÷쥵^ÒZ·à‡Ê®»‰žóÑmü.eÏuü„ÙYy¹¥Ù}˜ä»ŽgŠöK‘¼ô6Ù¥²'Ä£ê¸ *´ %T¯3¢]°XbEX1bkÀ±:Ã`#e¶Â“K É#€á2… „ÆbX”Î`S ØÑ–[¡ÒL–av©9OÓÄJ“™j¡4µÊ¸àùÜÑPý®ôî%¥í\Rj;Ô{ÿƒš÷F¤Q=Ñ0´¥Ýg†éGðp½™ø¹bŒ:ñS¢Ï@rµ¥0Áìfô q<bá§;u¡Œn€âUgØqBTµÖÔŠC‚Ý“[t_Y?³í3†No¿"rO–¬¶ò Ÿµï>½NÜT÷¿ŸÊæÞQuÚ‰Œ.‡úµæs³5•ÕÑýCEo¼0‹~ã©¢ã–[wÊ}ïWsÜ‘üßDT¯‹‰Ží3†_v fÛoµ’Ña³cuN­¼Žy×ÑÉv +HšW5Ê.¶&xð¹CÓ™&–˜$ŠaQÚçj~»zÝ{ª×Õ+æöõôÖ#¹l9%·­—vá{£ ±SïoÁï>W¨ÍŸYj¡©a³$¢¸_'êåÒÄ* fÕv©8ÎF*Æ»@Çr@U5ÿ#XeæÀ„)…¢§¨¼Ü¾û4P´þÃø—ÂgµVóL瓆êtd&2:®#´¼ + Ç¥ 1×*¦ø 8¥ë­Ù4;ž'¼Ï#e÷¡Úy”ëæµé„Ú-»-rº:Og¦õM×ñË‚“pâ}W ‰Š'ˆ„š „é•¢ÈT¶ÃÝz 1±5¸$›L’Pð¦š/B<ï]øºÖV]iYp|¯44öïB2³+ZÙ{›çZÿƒ¶ùHçû•Ö킾cyØn¼®©Uïcšd<ÏT}2bËåT¯Æ\“Ô1’We xŠÒb¶b,‚ViD/å•æ`YÈrê$ùG"(Õ¶¨¨.ÈÎÐ fô@0ù ½ *¹™Ü±Çêס‡iî²$Ç1µp:f6^‡b<ÿUø€íA„ä笂­£³[He5l˜ê=Š17ç(=»ólÏ…±Øb¥Ð4ÜÁ-Сc¤`m(zœÒˆa=C!r£°™2 ‚Ž4Íz¦Y/ó,ï?|’ê ”p™2БSPÃ-àb)‘a‡¤ùKT|~×rNGòÿ/Jæg)B¯4ÃnG-´çøÍgUÛˆæü,$²:kþ?fÏöj›¿Ò”*;A\r³@Ãq!z–È´LŬ0·0ÄdÃnˆÒ VœZ¤—L¨QerŒâf˜æ'Ê0¾fJÖÏPÅø*Ð0¹#–íÿQÍrJm¯äªùEDo}ŽéUû™]öÜW(n‡$$ŸK +"³kfçsÂjœ.hmÏ••Ònt¿m9¨•üf«Uö¢ÿm˜ç| ž¬³–àØÍdxŽƒY¢åj h¹"¡:µëžf½Î–`ñ ” ·ljîâ—…6RÃÍ0×{¥TÝ #xÔÀ š*Œ`}FöƒåBÃáz™ÅXÁÐŽ[õÞHhž—D¯û±ù=Øó‘‘[ŽÉHÎOª×õTÕ÷kúîäÆ÷M.|OEtF7ŽóW”Zd<Î󛑗̨j¾$±Ì4è(½=†\j.Î2>xMó£’ÞuÕ®|Ž&º–ƒnçsծ܎ˆeû…X¶~x}ó“‚ÌòÄQnÿÉŒ7nÓ÷*·Þ?#µåIÉç¶Aí¹ŸµÝ'YŠ¹‘ø±"#ñ3E&Ó$ߧÛ8Ý ðØÁŠËcx…¢F©Þ d›vÀŸ±¼²ëhÁô`ºb I&çÕY +àÓœÄ…Ъ¬…æ¶TDŸ{:¢Ï…—È좈êvE౺¦¢ú\R ×±oýK³lÇYÆý>Î9ÝO*¯#bÙú)ø,7ãg„vrÉV1¿düŽ”¬¿à’£@e9Õq¤JSAŒ2³™vµ­4•È˜ø*¼,¥<øÀj(F¨/Í+$T= +Fç‘ŒÜ+˜ÍïÅçv¬f¹¥VN·¼ŽÑ(RÇ*(VÉ*Fáu+Fô„h–CIžå„Tö½z%Óëê‚{ð2”ÆÀb¤þŠÙMå|Êò¬jÕ|«8 Ë'µn»"œ‹"U™‡rkmCŽS™¤¸†œ¦3d Ƨ`ÓôZ@ÄW´Ádzl¥ÔˆAaqÁnÙMúuç«\2½kWž÷Ô²çd c¼¶[Çsrá~'¢±¹+#2»!ðüHmó{¦o?̳œw‘†ßdœ\o ´:¢fË€ˆœo.ŒOV´d +6^· æÓÚ‘h‹ÿ‚Ùd&èD?YŠå<èL©i¸q:“a³s½s;?Ke,Îðiˆ~–t†V2»ƒ‚Ó}`W¼Ï±†åSü¨ÌRíþÏó¬†ôšv•™å&2=¬w?y^Õ[ž^kBÞ3#?[m ´ˆz)€ë…â期)ÒíœU2<,$2¼d¶|7ñkJ·0ÃЄºj6`’d½ÈÜC‡‰-ã'ˬBJ“Å[5¾çJ~CrÕwi7Îÿ±¢ï8Ô1\OU¿rÇî>¸X³:àx¿^œb·b˜ý…ØÕ2¤J{á³Do9b¥É8Ïx *[ÏÔÂû2ŒUf.Ð1Q«Î?µó9/× ¯XMËÉDËvœé:ŸtVgDöoñsTn!dR×DÓø›({mïo¢i½´ëö'¯ë¾ˆ"´†/ÐulŸ1ÄR³`ÓTÇ ã4ö±Ì‚빞çn¤ê¹!vM—ZÛQ°ò¼ºÓ ë(n¥ùP×x"!º_‰¨>·…TfG5t–ÞšáQ·óñß?•ÕYå%yÝn,¥hz*G5߅Ы-ÄèÕö¡ÜR3b•½@¥èZ¦RbÀ_!hÀg ,Ù±ˆ'3À¥1—¥ÕZ¦ǃ,p˜¡óÁG¨,âÌjC^¹ÍL¿ì?T÷ÜK¿ úÒèwQîÛï ’ÏiƒâtÓnüΩçµÉùŸ9"LóÔ÷ÒPü´Š½™.±] ]i4Ñ2¾dùÕÖâßAŽb{ ôœŸ©ûCEï;ž¨YAŒ)Öƒh“È2kã4÷_½ô#pÊRÜp×9Y´+G­;ÝñŠ „ÏŒ:bÙöã•ÞNëê–n§ýÒóº`:^R]ÏÉuûÚ¹¿Xûq ê=‹mÇiÎëˆS2=¨vž÷1¬JS †³*Lи„iåÖ¢ˆe¡ã¤–! 
áK";!D*Óà;Š§N™ÍH~Ùc½ì-ÈòÞÆÙþS·tºžèXOõ²ûCBrº£¡¸ÿ+ôî;!½ûÚ!µ%ªÕC˜•vŒ–á-©kD¯eT¹léZNÉëvAÏVÜ"pÀ œÀén°Az“˜¹Z ñ÷ÊéÙ (:‡ Z˜J.€ æ X § 8¹²1ð,¡y ¹àZ7sûî{ƒâÿ˜g–Z!‘z¸5ÇÕTÃÚ˜]5ž Ž÷Ø4¾x]Ë!±ï=u[¿{`¢Œ"³õVÕz3Mô`£Ä`Ë\¸è7Qd4OtÞû¥×u»ñ¿m8dyEÖ3=ßUŽbq%H1»GQ,îGú¦›jëxXEi4¾[¸_ÄNÒY¤(nåu+Ë2Ðך ž*²‘¤ÖÚJS쮂Xu†òìj;Ib­Iãìg¨ì¡´BûHnÙ_œcþK³¼Ï¡ŽóÃ.9£Œ¢ÿ@µÔJ–]o-ÎòÛ¶;ÿ3¯k9hÖ­ßi¾ù4Ë·ßGï­r>–‘™k¶«o¿Œþ?áßgÑü!Š“Ë *TÖëH±Ô¤0™TøŠÆ6Öðà Q‚h—?¥$E¨L†TTŠRp˜V‚]sLõ«mØ-ïs¨âüpl÷øï6Î5?ºÓÑí¨èuß!øœ Ÿ#¢ã}½e6„S³/IùLsÝ¿‰’ãX¢]g,Ô¬1°Ñù~Ü’á>–Vs 5çyÊëv¢ïÔ»þ?½è~²‹M‚ ÕjƒÍÔ©eš%™†ÝKüPÁ5(™ž 8‘“T€Šž% +h®ç—é\NÕ O!Fj%²ÌzçHßw¨œÿ²|ë•×zøK¦Ç4TFL©L‰hnG$D¯ +’ÓyÁð¹'¶¬N(%Ë Fëy;ÑtÿèÅ6fè|r¬"C‚Å endstream endobj 1079 0 obj <>stream +üÆ*¶ 3KcŒtÓfšÌ*Ø,•¥R¡ÉLÃì'N¯8”f.XÓ©ó¹lw̼¶ÿ+L-5é2H,µëY/†¤vg…߇‡Ê#Æ«:ÓW톌­¶ +*àv†›¤2G¯;Û®;ªß}%™å‘‰F±°éjm·ô9&¯ŠÐ3?/¸5Mc j¶Ø<àd­yµØ¼_yô*¿Ã uÆbfÊLHkíˆ5ÿ×´Ë’)²`ÖÚñÚž‚ŠÁQ£G#˜u†"Ô²Kø F=Gˆ‚옃ÙÕY‰2,ÔžÓ‘ŽåT ­Ð€°æ»PPÏh…Û‘Íûqàq2{!çƒUøh5óY˜`oÒîzÏ3eÿ]œé½Ž”í§në4†¼h8·ñ»áUýoñ‹R“1L’O½hN4lß`ÅzF`ÄŒ7¿$ .Dí@`öê9åÆíP†Tg ˆNh<³ÎDø8MÐ2ô~@%{á#…F’ËÍï„Vv¿&jÆ3³ñ·[¦ç!ãWü¾âZüºÞPø¶ÞHŽbw£ï °—ŒÆ‰™¥vG0«ÞÔÂë”Ù8š'[†I-LôkƒÌ¶a&h c8eR¼J‹ÐBý‚ â¼Ñã]S@ƒE; Éž+ø|Ï2Õ®´˜h.ÅÐ +-Å vWzÉ¡ J­$»â\šf‰R¬ŸyŠù/Ì0F M§zÎÑóšŠê4J–[pb”ä.zÂì$DôÿFŠþ#½æüQ{Æ¿4ÁÞ^·Ö‚Às=`•[%Õ1&Y1¢7Òp3œ\$|˜êÉnXÃIrª¾¬ +ô*Â~?pá®'ÐÁ?Œá{±¿S:‚ßy=½í>|‚Ò#l”ÊÔ®|Ή¥ãÅÛt+Ey]HðÍ ¾ù/ɸÿÊÖ3¹í>vü†c¯íLÓø£8ßâÖ«ø}á[šæþË3l/CYõFÒô²±ê:A¬;ˆž+8føþÓ–ùQïûîÙ…ßyÁõ» :W0&×=×ÔÊé~ªí½;¯ûvÓluÙly×ô¼ŽÒ耠ð} !Pþ—)¶ùfžÊ*Ì@¥U˜‰jóŽý^.Z ¸]ØRÿýJ3A\bO¿hüUZ ôm7gI®b,ËÙ4ás-~Zk:Ͷ?zÓ‹Ñ„. C V«-Ëú–ÕðM×w'p Ô$(™Š-@B¬"à ˜“âÔÖ¼7rÑ|«˜½úmó³Ûù] +Ÿ®·=]q`®ØRìpÙ¥Y·]˜’™ åÚ‰ò+íë­Óm³nÈ$Ûs1K±ü‹\ ¬¶›hÚ/¤5³ DI³ë0Ý.Ò°?CsÀ*8>­¸$XÞE¹eGY†åx¤ê=’Y–íW Ÿ¦t1«ÖK¹q;%µ®ç¼j Qs$opfÿ¬æ¾÷kþƒè€Ó +ÀZ±#´~QV¹µ0¿ø,Ð0~ vß±‚ÝAŒUifô¥àÁ~y$§Ò4ðÑMôH¥y­Ühšä>ÏT=÷ä²ý:ÒvÊÛ5 ™ÕXfÛt(~Zm7Pó>™}ÛùPÛu¾ªš_T–gåÆïFÐ0Áe’æ8íwž—Û£Ðɺ7°b)Æa…j@ +!S™¥9Ðʾûà!Z/°B&ZÂ÷ý‚™jÙ)PI^5#çÛ@É/iƒ|î8^­}$½à‚Ù³ŸÄYåÆággz½ÕLÇy>)5 6Ngp¢Ì:ôDՃ׶ÿ‰¨ì†”Q½FMÕ Žå)‡ód׉»ä\ÐTE‘©ø¡*±óT7IŠÙ‹„ây+~Qg\Àê 5Hðc8Hj-ÄO“Ù +ôKî'ûµ‘óMw°á~ÉHÅÜH‰’!H±’1ÜÁ[œb}Ó«¦c‚Åzå7 rtšÏ ÃhLmû²‹KæÆû›ZÏh,«f4ŒU:žrÛ¦+rÏ{•)—Zqk†ãzåv$G1} ª4 +(^sœ¨4 0Fõ 0Bð“eWœ¯ú…×ÀÔñ“:S~Ùw²ay$WM·ä¶ï¼Aq»¤ÖkMhÈÝ÷]Ñ÷£Ö­jÉîvœï~ ³l¯q¦ñC«{ï3u㓾eÿ­¢´;aÔ½¿Qžù3KvKÒ}ç­ßNçxÒmœÎ˜eï[¢bm¾ëùîä¾ûÈk|N„ÍY#Ü´Èp,7¤Îï¼]:Þ‘Ð4K¿á¡*û²õzú¾Y ÃIÁ’`qÜB†ò‹ÁŠ–=&(¾8µĪÿ4UñÝGK†Ë +¹å†Ôö^ÅÏÜ‹_˜ˆp¬'Vé5¶_¹ÞÑÐX §.ÇêeæÅ4÷+«p: \s'€N¬½OY­¯oÿ—K¿ûàd@†9+F)ºŽ§Á†iÍjú*;M‘ÙŠŸ>ÓM½BŒ# í· ÿWµcy*|`qZŠê P†æ n²ØŠ†âuÌjÖÛ/Ò.À¨É©‚±½ «ÜLè,•Y°ù¶'äXÍ `šá3U2 $T ¶w‘ÃU_€¡‚%±h¿3ÛÎCYrÅÉ@fµÉHn¹5µr;,£3Þî›Î· bçHÝì²çŽØ4]HÐ+Î~S1¢÷%l®Ô$¼4«q»˜%ÔXª‡œ¦ú ž®; +£¸ $zMTÜ×~å7H„Oh ™HšMfhVœ„©86A-`²À8½Z¥Ð|Ùq^IIMç£õ’ûðCš³ N™©NíäÕHk-GIæ'©r¼!ôý_NÉðd˜î|̲-7¤Òí~¤î¹–dºáãé5¡n4¥v3-óÜuŸ‡fCA£ÍÚ°"ú‘¢å^à4É)°€Ñ"|¦Ì<€ã}‡MUˆO îäˆÖ×8Årh²b ¿ªy³Z–§³„ëqpI£àC¾ðR÷ÐCÕæÄjû±¢û+hŒÐb‚Æ,L1žgº¦£q¢éJè8™Y`éž3ÐÁ_³Øp¬bw 2Ø1 F'?RkdÎÿ]Õý 5ýGR×u(@UmjžÎj’n;$üîBJ݃ˆúDËÛäfÇ‘®ÿ7N³(zÖ§,Áð\²bÄéY¯Ä¶ï>äHµ¸Ér#ªý>è ,wL/ëë¯æ¹@Oe*„UcÎjZ5Êz4YwZ8»œdÇ"v¤Îdžå}ö;¿K^ßv+‚]o'J1>˜MûIøŠæZ¦b`¦_#€ˆÞ%J*³£uË ¨*†KñÃR‹A’ûG(<¯z=»àŠ5»‘´$w1cÝ 6ÅOœ^qÐí{ÎG-ï?–Zm§´J¨ã7Lõ"˜^eižKa’ýH0¼Œ#Ü ašË3ÌŽ3=÷=ð@q™:ËàòD6CÎËu/€¼ò'IqþkÍgUïuªç¾ +ô âWÕ&#·Áðq¡‰ ™šs¨±*®çfˆñDÚ³RAi4@¼ØÁ60¢ä×dU_1¾ëdð€Ý?°(É!”¥C€ÄìPëÚ °öí‚¡\„«IëÌK¥ã5¡s¼–âX®CI낈´ë„OÕÙŒ#X[JS÷#“cðýúÀ3ÄFa~Á©cz¿¦\†˜®K)®íV˜áü*˜ÇÏéí¢lÏ Ós=_x7Vj@`r&I2Ítü?I†ó¬ø=a.É8n¡©Z¥eÈY{Ðr4ßà…¶‚‡ËN¬¶é~Q±¼%ZªN¼pÑ^Ù4Ñ{9Oe`‚âfŠÆ&´m˜aBc1®ñ>”X¹è0ÂÂG¨ŒÂWÄÞ@#”¦b ‹àESË‚ ¼YîÛ4ÛwÀªz®…MwÌÀ$ju]÷KŠbþ‰*6}£©;ŒPg@­¹ßZÓÕDÁì)‚XjÚo\Ï×Û‘žù&|UhR®a >–U(tšêE,;Ï$×Ëy²éX½àN”æ»Ê2 ®ÂM7mAG{®`^Ï’ån mº*»NÅé¥`£-Ÿ<¯Èœàt~éE×1 ¹š|HÁ&ÈtÕ€Hd0P¨3(¸ìÿjűêŠë™¶ëL˜bz$~DÕ#yŠßÕZ eÖÚL–+íÞŠ[‘|"£¡‚¹!·l<A'3>Fl¾¦¸G±ËM’ .ZÞË8Íþ 6Ht7Kuå×¾FêÕI^ÝUüªÚDð Õ)Ðx×pˆØ>,:_rü²رJKÄÕ¦Aæ(’üÂ{ ½äR–å~›¾K|Z´TFÌžé>ŒXz#Þ÷’"·‡+5>Ogz’ÆH˜\pÆmšNä¹ÕVSço¢i;¥™.Åoë.ƒ&·Ò,÷Oœar?k›nå)–÷PjÁu ³ÞZ’ª´ßWœ¯YÏkJËn‘q»fþÿiºù&`”Ú2Bæ œƒ‘%¸…©ú 
rÝ':ÍjL«jX­k,©h5f’j~ §9‘ûµVÅÂçD‚Zg%|¢ê-dŽÞJˆâ%ÄpÞhußqµs(F©6 b”[|¸gšh9o¢Ë{`Q*Û’T6ÂF ?‚ë›Ú7¶;¿{aš÷zšê@*»ÿ3mϱzÙ/Ü4¡Q˜)›Ð‚TöÇÊî¢ãUŸæ|Kï¦øŽ‚4óCÔ<‘5(©~EpéŠG–_r+€RlA+µ¢Ö¤÷̼ªë„V·ÞÍ2]gòƒÕf"Ü‚sá£*»°BÅ:TŠG(29PrŒÁE †Åµ$Í|¥Ø~FIæ¿$Ãð*~Qg'„Rf?ÕtßBÌÐÜÆ® @ºì7aþ5bÿ:á¾/ê¦Q¢|F@C  %áÄ°ÝÓs|Qt<ÃBÀL±y‘û3M6^ÎÓMÃA‡’ $N|6ðQü@ÍGøD¡mØRƒsDVa†élyî3 Ño°ÀÙš%¹Jµ™X÷ÊïK/⥠+îˆP}ÂKš$Ft(Bð–å˜å™%ÁEZe¢äJ›Q¤J±3ä‘ÃTÆAÇI@©Vø†T’Vd«˜œ "ýƒÏY‡š¨3:Rj+ˆTnÀ,—ÞwÛÏ0vÝG’âû6hm'ƒEV"¨äV©^¥…$¹î'€Rj%€è)„ೈóˆc)E×¹ŠÙG˜Zp|”ä |LqD-¶Å®»Ç’JM§:ο4Ãð^²dœë˜R”zkprM#ðÂéõ@3‹AI4ìhXƒnÚE +öfr´B“#TvaŠ¹³j¼gx¿hõFIК‚• µ cU›–Ì_"¢ç­¢‚Ë€ÓDö ³ÓLÉûÆ­7;Hó@OeÇ/·”(pkÖ“,·ì&†Vjz ÌV³ÜR˜c~M§Ü¶ëx¨iJsŒª²çdšë¿‹³Ìojãx@Pö‡š©´-Xl;c¹>d9g]ÏšM»Ç¹þφŠ¥m¿‘WÍÆNSN7C8–Kñ¾Ûþ¨Üè:¶3¿å» +1ü&Áe &ìR+F×.PÖkx!¢opÇ+¸)àK`…O +Šu죞÷ I¯¶$õÌ‚¥ÜÆ ¯ü +ª4œ¥8 ¯ú‚—-ÂÜbSfÙù‘œo˜3~BÁ/;  ,< ,|ˆ$4 >’[V¬aB­µ_ÕMWa² !Ò !ô +ßÓØ +!ÖÛ "TZ‡ïè=!¥YÕ€eÊåò‹bË|À¨28NdP¼i4Gò’cy_ólב ³Ü&€á3Z@F¸$ ö–«íJÑÜÝ5”¢º–":$2‘¤–}GŠžÛ°£D†¡†H£,÷UŽk;6Leb¾ì;Gu 8Bñ?Hñ)X\ 3L‚“©XŒÚÍSì·y¦ñ:|¶î2|]ø@±?CJS K1‰MQ:ƒÌÒ»¥¾Ë vÉ͆õCˆb? ÓÜr¢é0ˆ@·"Xaƒ•YZ™ÅømÙ) ù²;ˆ ÕS¤@­­Øùâ{°±ºwÐABYVÝ)(azAø4b§ª¬AŒ¥Ö*|NNºn 4Qm!l¦Ú>Ð(Ío¢ãþ ž¡7/hº#€Npâxß n S™$º áÕˆñ+¯#]×EÅô b‡$( zW®Põ2^·!N0>–Z¸6!ˆT‚ɲär«Cõšð‚Å +Qv©±ð1‘Iø8^1øø!¢cýÒ€“$w0#Ô¶±†í[ø’Ìð€‚ea‚‹ Áò'€SqlŠÌ"Ì ••øU­ÅV¥…È1j·Ø8ì(‘Q@ƒTwàbDŸ@†a)þâHÕ¦ƒ Û3¹í:-W3†,˜ITkM…ÐéìAŒW­ÀŒå Ï, 4LqÈ-5Ÿ¬X®0H-!'[µ ·±àa"3ä&…&R÷4Íý噟eÓ‰è¡b«‘šýÂ+›Ž…ÎÜÉ(æüîç•Ë{¢Ùï?ŠáûN”=—ed¦÷¼Âÿ.~^j.€a¸ Yÿ‚„×q·h ÅñŠäx#BqžC™Î²ýŸVß áåɬCÌFMS™I\Šœ&µ6Mh%z Ð~¨ê¿‰rŒWRÓ,ÉùÂ3БEC@ÔŽ`FUü@ hB‹v # ³÷HÑr!JšP Xè\i’é9*Ùµƒ—¬z”,zÉ•¼‚ç©^ó4÷}Ò7žh¬BˆC3P°6¾¨²(ß6$O±`üDpbÆë`ªÕâï$»à&ø(6‘؉ªŸÅò0†WnE-89Qh*v¦Ø¨`ÍnÀé¥VM·Â…F!¦(΢†jà –Ý‚ Ó™ˆž&´ŽŸ§49Ogv¢Òdšâ½Ot¬Q“ôÎ0sqcçÀCDg se§.‘É@­Ö@’Sh~’Üí/¤†ñ$t˜Æ8ØD¥y¨±‚û€ãÅA¼Â8«Ú^·Ü`šä|÷K§ñÛwüªÜ<Ì Õ3Ä ÑYø€åE†_ze©‹açû–a”B뙪ëp m¿pŸ“ï\@+ +DŒXl˜ÐNìL©Á†÷t,½$0ѦQülåMôhÝGàL±è©ZË‘šéN„Qk` ^ Ë«´ï{œ«¶ŸX~ùuŸ‘†íS”^z“â—ÜŠž©5!U.4[°F°;¿§L¢\hˆÚ`’êT´h9Ió╆™¬X†î + Yƒë;F¸ø‚`$Êua&©^ÂGµ¦ƒ ë·Š3¢¤z´š‡$±ì\ºóH®k<‚àRšÈ:†^÷¨xÒ ËE 1š‹è¡bcñëÂwÃ{"ù/#‰eabű0Ç{ 1à6ž"õˆ³ëmCOÑØ2Ût™.:ƒ 8Mâ'eÖ‘ÄR+yj¹uйŠAÐâL +9b™=ТXU"”3ˆÜf¬ØL’gÿL³]gã\Ï©ð‰ÿ=TöÝÏʶóØiš?`Bg‹œÒíB¬©á³ˆ6«m¦YÞëvk%¶)Bªy-v¬%;Kè¨U #Áô ¸ßbçêÍO–=ŒVˆo¼ÿjÏnL±ò»,X1 §¶Ôzf·¬ŽåYx·5ØÑ;øœÐP«Ö~P2=*•-)`œÆ ¬|×3„b¼ 2ý?’÷3‚a= + )6;Sh n¨ÐlŽå}‹Ëí€4^)I)cØJñÌïjÁ½ bÅ¥øÙ²Wxa"“ñ³¥VÓ\ÿ#”P¿$¬$ÉCxÄö0t¶î)tªÞ\ü¸à|¢p;¨Û®ƒŒ\åXæ·Ü¤`ŤdË ¤€zE@Z{á ó{¦i»lÏÄO{BM6ìò Û)@r/by -:BÉRYš­¸—cÛîy5³ÀÃEÆߌ_ÚŒ3MWÂW·ÀGqK&O> 8! 
‰zÍ Šá}R¸] +˜ ób¢(L±=PÜ—BõÆÀ“+‚ 0YŒgfãw[/ÝÆ D'"Ȩ8Mu;^p!Â3KRãçdÖAz¿P"ͽI_ø¥$6 &29Qh-x¶âJÄ4E@ùºIølÝYšd¿GÑK®ÁŠ{ñ î…ëÁEèÀ h!X2Œ¡Öý†ZÆK‰ríw°a}§˜žå–¿~áy:×0¹ 8Am D¬d B¼j,HtÀð^ÇJ¶û3¢Dhá¦zµ")UÆ'ÊÍͱƒ—îš…ðjmåÎÓPÅöB¨4DŠQ`¦XI«5?Pl%|Yq1†bz@-¸@+6@Sh;Ôò¾BÖÜ`¥{6±ƒåöáûGä|åSü¾äD’bzhXbXÅVCë·é¼ä¸bçŠm‚”"ù(Hs8Vkǯ¼²ËNêìÁŒ·ý±´r›Á¼jëPV­}ZªKЯ+è ¹3ôÍWŸÆZœWp „@ÅJŠÜ&z¸ì-~á¼Ér¬q¢÷/Lµƒ Ú +óì—¢û?SwS;Ïc^ßujšÊLÜT±Í,Ýx;Î: åúϳlûy˜s;Ÿ¨»ŽH5Ë9É禂ØŨþ¦êmÆÎþ…Ûˆ^ÍÐÕÊï‚Ø~å½7Qz¹!£ð~òi†×aÆJm‚ŠÒ|P¬Ÿiªíbše?X«VsJFãÅ•Ù‡%9p +Ç3±3ÅI– A‡R  •íâgëþ•×€^ÍÀb&ȼ€„.^ðhg Åî ˜(jqh9ªËrÝKŽã~ÓZfV{fAЩž»ð¢Çûbg©]ƒ„㙸éÂ[hZó ƒå¦r\ÏÙ8åx-ʶòJ·1çIâiý‹ÒófÍÿ=\z;[÷ )Gô?¬5éºo¼Âëdžbv-W/¿«²é»Žç §g "‡jÍ‹à–%‰ÕÆS=ÿ;„Xn°µUäT­p)Š7™Š%HùºEðdÝ5ØX¹a°yBIV­ ¹_÷ :Kcj¦ÔJÀ`µ…° Ç‘Õû¤µŽGÕ’€d(åÖ²”BÙ–õ,˵Ÿ‚Kƒ^wÔpÔ/™¥˜¾ÃÃ(·àBŽ\q0ΰ{ sÌ?Á3ÕÖ cU*Ûs+U1JœZ"t¤È‚Ò9™¢w$ѯ?Pf(ɳ +ŸUÜ/\‚íZ‚#y„˜#ùHrë­úmã5(AVy¨“$Ç÷$¹?BÇ +nÃÌQ½Â 6ý€Ið*?(E'3é>EÏÒ؆&³D­¸Å®8’áÖ›GñÊ-d¹%"È4V­"‹´r{STFÁKR ®8‘áz®†)§¡óŒÛ؉ší&x¡^µªRëLÙü˜§¹î,L®%àö«6@­8虨ª¾+ñ›:ûH~¹™×w3€Ø4bj-É/¹˜fúÏÂ,û=ŒWpÈ-ÚrJ-‚NÐZÄïi †‘+®Vœ b×ÄÙgcã›F©•,¹ì3Õ¯< b•Ú…œ$¸¦8 ±<ÈŽÖÍ3=wUÏí@Ót>Îôÿ)f7I†ÝžØùuKVbãèÉb{áû‚£AÂõ‚¶k'é•™ÈþHAfx?N9^²Z¦§íŽÙqµoÿÇO¬ÿPãDæ!är+VÙý,·ŒÆ‹¦tŽÝ©ÒjÓ¶ÿ7Ï÷ÜŠÒ¼_á»Zëð‰ÝI„쾌òMg¼¶ç®[·‡œ(³­7%û®ãÇKŸB§Ê ć랢&JM9ö‹ ³Þd”dþŠ2M÷¢‡ŠíGª¦^ÝwÄp>…¯ Ïâ—eA~ÉåHË|!0=ïÅ(ÆÈ©BÛY¶ë¤Ö3Eã6ržë;'»†OØ}ÅX\Œ_nY%ËÛ E–ªŒÎ -7¨ºÿ«¶ý$Ç°8D-6¤–³Ë–jÏy‘$ÞÅÍ–ýÃ'æ‹Çü–&ÙOíÆõ‚Õ·Ý‹àÚ„oØ°ÿ×<Ót%|¦Ö0È(ÕSø`ŵø Û¯$Ñu=U1þuË.â'êÌÄÏ•šM³]—Â<ïo¢iýúö«uÏlÜƲ[Î+»i½‹S|·q–ý9Nu˲}ׇþKøx*“0bAód曪â6T€Ukt£&°h»>ˆ[p,†YxB-;Š’Ì!†á?‚Zm/Ç-·›©Ø^ÃMÒƒ!8„/›ƒ +R„ÍÕH± O#ãaËtLŒT3|Mi —Z– 6!9 N0f(¹Š)p¢‡$»ô H¬¸Æ'¾(ú)3 6Ks /Fn 6JôB«5ä—½˜]û}°dþ‹á×iîÛ@Õ–f™ÿâ',·AæÊM ÕZ /¸¡µ]÷«²ëR­Ø@ðD‘uøP¡}µÞV”è:!ù?" ŽÄNœ‡1|fËü%——¡9$Þ·“!÷ Yÿ3}ëAÁkH¯Ø_Eщì„pi¾åzsaÔ‚#„Õâ‡J­„p +fJöÇPÃø@[ñ +«øâÀ(:ÈL®#Þmá*@9šû)|´î(~°âLø®ÚT±Ôz¬å·¥!3?QPßG5Û“Ö5hÊî §r»œ¦šß̾÷Ø.}/ô5«Ó*Z»±$Tf·Üªï!F­²¾09 °<:ÏÛnéx¿eºR#U‡‘CUæÂë·Ý2½ŸiœN†©ÞÃËøãüÎæXÇ£IºÿB©úßrÌZKIŠíVk™Þ7ÛvÁ+64Pj*n¬à@À8¥­(Ïu>Ô·ÞJ¿Á²Dÿu¨hþMt,‚ìzk!Äj;1ŠñMŒ`|%˜>©eÿ­]µ?Eùõ£èµFä5ËÑxqšù$DZ>†¹¦[žû1FöÜÎÓ=×¼ÒñT·ØBŒbv”d9ïÂLóMŽb»IrŒ!ŠÙwžè>RËæÅÿÓ s¾¦*¶wµi £õ +LŽæ?I2,Gó,ëÕ:^pêžkY†Û€Wö¾åI–KY‚كѴŸæ™¦“1$÷gÅ|2Hs8IuÂ.8ŸhÚbì‚ I‚­A»p=T;ï£Ó{9`v“¥×Z’‹–ë¥ùYðšNxeÏÝDÙ_sž×vñ™ý%C4ÝǯKîä7•ö¼ZóðÂK!ZÅy¨i¿ Slÿò,ów¤ì¹¤ĺéºà¹žTLÿ€ óÊ ø|"hµv²$ûQ–å¹£ž…°Š-¥ù…WyR¥³ç= Ë •®9åÙÕæ"H•öç B S1Ie-†[wf2´2;a~µeØQ‚»R‘ÕDËzH-7B¨2A,6ãX®ÇŠîÿLÙtD+ÜÅùÕ6#ÙÅ&#èg"‹˜ý…¹þëHÓý™çY_‚{û©ŠíÒ/û̪û(Êòޘܗª¶6]wg\ëW bF›FOÑ„¢ô‹®8™æšîæ©®ƒq†ña›Î\ _qLnyŸüŠù)ÄdÇ.†WfJ-¶’gVÛ TlÒìÂÏTÃ|1S° &Ë$™êÛ +ôKíŒô–Ç}ßé8ÓtJêÛjß5±ï?pK~#vÏmÉoZmi©ÝV¥«™~‘0¿Ê€ªè¸­w®§":ûŸ—Ðþ\Hdÿ"VmϱŠ×¼Ýøþ»=³Q¤ŽÑðEåx*v¶Ö4¼\¡uÜh±áÇrH^3½,!ûÏ«™Ý2;ßW ±á¥cu)z²Ö4Ì(Ñ3À(•ÉòzS +³S/©åe·duÃ(|©[vCˆ{¢´nA‡)4 3Oô8Te'€Zi¿kZ® ¦f¡F¨ì›Šµ)µï:š§Ù_â +M xÝa‰L„TZ Rì^5”†GƒÝ¢ÓXÅƨÚ2þÓzvƒ†‰îËz¹±ðe©i€Qz§Óú+Mω¥×‰$Åp3Œä¸Ÿ©û?¤Îïvœj?ÈðJ-…ɵ–Sƒkfé~p–ŒŽ…}ŠŒx=ç“Ø·ßÇIÇK‚ÅY‚0½Ø€°éü\¿³Bªçµì¿”íßyªù1DZ›O:¾½d<°VsÁT*û†ºÖ\§r½šd6Ë‘šÙ|Íóìÿ¿Aï¿“‹æÇâjc1²ÿMmüî +©že4v§ rÏÉ(Ç|0Ul2|ÀÚ†Ô4¾(É gu¾»\»ÌbÀWð¢ÒV|‰$6F"ŪÇHJÍW¥æ#L®´èK«lgûÕ&ÜžíL­ûŸUïmžèýÏôMd]÷Vö\íWif¥É`·Ìb,­ÎX¯ÐD–[j#G¯6‘d˜½&zÖ÷PÑyç— L­†ú…£õj•ÁXR‘,³În ç¼2;Ç[bézÏíÈ5Û[¢akÁëûïSeûy¦nÊò,rÃÝ@×ø)·î÷b’÷Øq?®Yí§Ê¾©ï9™çyó<óW’ê9c5nc%…ÖAÆÚåâüJk’åF’bö¾­6>¬6`OÝÊñª”âuINcý‰!6 G›õN…­ñt¿Ü@ø0Ñ9Ü(Áe$³Òx°`oN¢®¶çQÜBÍv¬h+-§)>Á¥(ã·µ&ô¥û‹„Èì-T²[‡è«¼ÄÚúš†aeiÇ]U௰—I+l\;o£ÄpêÒx>£½Bã|¯ÐH.zIˆ¬ßnJ£ƒ3±Ým¹es/Ïô½GÚÖg¥ÙÕóB¯9ÍøM›M•Ðp› /õ )V:¦ëe~ÂB«ad•²†H_j(’y̧Švû†­‰,·ê>Ù²• ‡õÊõbIizL«û"ˆ•æ1ôj«qžá~®é¸«X­ºeÛ§Þ¸=k´F“’JŸ-‹Êh8V3[NõÜ̶ïÍo%bß‹ŽÜy¤¢9ßËd6#’*¿•–Úf1R04žh{œÖë‚ÑúÎS^·å¦áI ­Ù%õغ];T_‰¹K¶VdÔ¦0UJ¼‚åZƒÈcPKs6+7,þåNJ⇊äó ÇKü\µ#¤$±+˜€Ó2~¨ÒšÜ·šÞ÷LÕ{›­vê‹^ÝÂ*û¬¨²Va¸:©w~8ÁS_kòMuT\O!í”–˜f),gV¹…Êܧ =BG*Þb<ç½]:—ïÈè]ÇS sû¡ïä«vþk<öâ$Uu2ŔԢu4ÃÉ«Ý€TŠÈªJ[Œµ×XR©‹­nQJ–‘TÍÔ“Ž‘UYf«¥®év½—Ebc_Sk6±©vÛo+F 
Q~Í8†½yÏû™f›N+Äö“­Òù¯’x­DÚ%ŸÙ‚Á‹P]g£*2ˆeR†"‡b—NÐW«D“•Ëf´\n瑸þó;K¡n™½LÃà:ì4Õ=àH‘Ù4×}pP½î+$÷W‡Øwj‘úízÄvš*›%ºÖH&/õ Q–¹„hikÊiŠÈ•……V©*…Q²Mâ› -6Ž6¬lÉeç½ßµß§ºÅæ2ÕB#j×ú$£ø()ÍG"uÁ›²Êv¢¦ØA.(1ìI‹'ŒuNªj¹BºŠQt¤åÄ…6m­{–°Ô@F]ïæRxŽòªÛP³â6R²Û×q %ºJT+>m"£R[LBؤ +ðÌ- …¤™å«Ñ„]T}i+yr*$š/(¥Ÿ¬+ªfUדÕ8ÍõwS7™ù±ŒÆüBèb<ǯo9¨'¹šø¬&ëx]-p5‰„™º!c£éHÙh +55–()sfÆZ¡‡«é‘$|”Ù¢Zá$r²ðG‘X£¼„G`k-RÚìKÛ +¹ï¿+ÙE’i­›ÝžCg6æYÍF.cÑ\òòE}qYÇaSTu”MØÉÉ×UEÃ’ú WI­@EM¡(=]©05iÝ@Uue¹ÄÂF_ba,˜Yû5'ú¦Ý!yÝjhµjyÒ.Ýÿ\*×Qa3ÏWÍVÔNÙÉ{Bêêä ”¾Èš]ØKY•®&"’«$£/©$ qUS ىɇŽjú}UQ¹È:eõ ÅEf6Ÿ-ØZNuüöˆEÖbÝ"Û¡r"Çøº}°B&-íØ8Ð<‘A€ÒƒÇ̆ +¸h €·†4uhBUv´»Pµ&nt ë(¥©©I…h*‹ª”»zåzßÔü¦=¿!³ì dQÁà$Ôi±¾gÈŠ¼¸HéîëBÕäôÄò +÷½[ ­âtÌ,ŠSpç—§’R·]‰yɯg'óŠ×ö["Šø“Ah†ÔdFßš“ WH [Ϥˆ.*Ôc,´e1' á ›”|ÂRSЯ-­¤°/q“˜:jD^KVÑú¤+¶ˆ”QR‹¤'Ñ‚1¥£` >8P&p€á‰ Xõ¼Pp^òh€C`¨q“!ð"+Ð[jÛÖjB +ûúÊ.ëACo|´ ÷â·eæBý*»j‘ÙZøÔ̘{ ±Õ¤ ¿ê`p?m&˜Òi°d3’4D@"(0p…áÁ‹Ç…) ˜ãmRWÚ±Uäí £zº¹z’B!RªÙv½W¤^u™©XšÌÌì«„VS:}¹‘Q_h)®¯­¢ 1w9ÃÌ%Â(o“º8.¸7*ŽxZ>Tq>\M•(²ºWVW+ 0/V'îÔÅÖ³–ˆÐ][Ô¥ñ›Ê¨l¿Ïå‚]´š‘)üvª«¨NÎDÛ]Õ¢‡#Ÿ”J7$ ɘi±²„„'” _Ρªâ' ša&Ïš9c_¢Ö )q]x}¹Dñ”•qêJìªþÁšÛx¶á4œ¥©ô‹­jI64½¨°ŒÑ‚ãÅ>.pQ€àÀÙ0£F„lV$ w”‹=&вÀ¹(çU—ÇEä kiáx¿Ü*ŒWë‘&×üˆÉÌftÖƒAw­ÆÃÔ9#D%'8|#(à–˜Àá-YAôc ˆ +ü¤èÀalA‚(Œ +tKX@p#rak,Qqƒj©¤¡˜ÓŽ”RTéK rÜŠ¯ËkØ"ôòË)“3ÉX_³ÄžD +XÊ› +NAšÞš¾.$LMf |A¢+ÉŠZ@0!ÒTä‰òõ¹‚œ…ÖÌ™ºD³‹’”Š×JL#c +‘×|YóÛ + E›5ñUÒ"!¥n¸úM°=ùX`úq"¤—¬tZ¢yáx1á¡€á  ±ÀÀpFöPør"Cpi¡'룭Q>¥TÌÊâŠén©[¬Zp–ë]ITV£Zy©‡aKV2žÒ‚n^ +ТtpüNlàP!ÁÄ|xp€Ô¤EÕ‚`H$t%{RÌ·J”´"k˜I¯RJ†YIEYuM‹ÌiÐ(°¶†§ŒÎN’ +µŒ0c‚8N(`@@u†Š¤&8’*B°ô’ÂàLÌ©@gú6=…)àY§MYXŽt¤’Eõ£„µ¶ÁŠ­¥4ÅlZ0ZM‹È ÄÐÆÈ– IP ~  i D')8 1RpüˆàÀácÚ¡‚€ïˆGX‘ xSB"Ðâ‰Añ¶E¾°%][äF^XZ¡Z¢+’éUÛ·§ù uÍÁ^\f#û‰Š¦ŽdbnE>¹ õ.É ¤3NpOtà¦èÀ,‘ ¾@ñ“Î̃0ñס=E¶0û*kâ”°Õ3*¥,0óØýdf×r‚»)¡¸Ò:ã¦'õ¬2õÏ%²Ï R;(p\AZ@´"B +f8P`@Óƒ…D,&­ŒàŒ1B©ÇW;(*ܸÅ7¢çŒ\)¨¦«é ¦ê˸5ǵX¡A\¯R¨ ÙŒ0\ R‡ ¾#8N(@ À)£K.( |Kf$àT#3ÁXøƒÎ^x;)ܸE _ +ëƒsÚª¨‹¾Èª-/-Òx®äDNCјÞT`"`ЧQ.йà€ÃÙ‚#§ƒ0ÀÑrAˆ\@Ø¢ˆ0\a‚£‡”`RXzA`;Òbªœp)r¬uáµÝ¢PÒu¡ÄlîÐíˆ }¦âD$Å ™| ;â"mŒ™´ `¥ƒC™bG‘„ŽjŒlI<8~BlàØÉ0ÇõƒN3#˜ƒ2¸6Æ¢;,ò&ëM•UÔŽ’V˜ÔXÞ$UVû6Ó’VWh1¥¨D><0]ˆ ÚBÇ’E Žš8^84àøI[À!•ª&!È. 8š1°fö| _‹+޸ʸ¢jOüiˆjƺšfi•_‚¨ÔE¦-öN×– ,ê6æI +Ç  F8°#08d>€à`ရ8Ž!. 
¸²”`m èå½üH ŠÆ˜%mÆSKÕ+¯°eW­-*¿ñx®8È”°8š H@eaqÀç……'Ÿ¯:ÊT±¶N?t9›ĤL–²!mÐݪÉ[·š‚zsuI™¿ÄO᯵l, v㧌ÈþKIð7*%+ Ò|Fxp€›""A+è# … $Às¡² +Ž 8<"H Àf ¦#$ÆÄß„.¢O$žåè‘„NdÍISú6åPNL©,´Ê5+þ²Í‚« -}Ù\%a•À’f+ˆ‡ wf8¢)8py˜@€×¤ƒ€é‡ ŽÀŠÄ8@jfÅ,ž!# üœ0¡€•ä#ƒ¹9d9#ÊNÓ¦–ŽÚZ[AR`ì%ølÅäV{Ú·«oª)eç(Œ¿ÀŽ¼àp¹ e¾#ˆÒ G&& +ð©Hñ +øÓb Ûä 5ºPô©pk¡:t'¡e{*jz®º¢ƒ•S–Te €4±&ìL±:l©Q.VW(5 0Œ€(±€Ò„%BtƧ' ÜŸêê´£|òÛ­œ‡Ž¦mONÕ8¨¦"ýiI)—j‚Æ¥¢¼Æâ2]« ÇÕÔÖ?»d¸!%tÛ–8\æ[§Àþ¦ž-8èt¬px`@Q@Ç hej.\%Ÿ-¬Ý\S®Œ0ö+36D„Âæݶ¸ mO¤¯T3×£ZÂáZºÚ1Â"OƒÖjÐ!óYŠ“ÒW†)ct¡›œ€-\p˜x ߈ U(xÆHÁÃÁÉ)ˆª™@‹GG‚(œšŽj^$?'ήˎ4o³ç Š+”æʺc'¾æÅ¡0™SÓ:Î*,FΑš.”‘Æ®$%04pì¤ø°€Ó ƒ¨› +–pt0H%TŸÂJ*0äØÑ)">m¢È­4ÿ¢ìÌZÒvù¶Jrzwi™ÃÒXXd²g“½Eié«Øùu Ùr³€ÏÅ‹`6PàH¾ÁÀUŽÌà–˜Œ!% +@Wd&ˆ +"'\—°ŠÁmëÓÇ–4äU³bò²³®˜Â`f¢£5ÛUÒØßìTß_Äc+\ªq‹QÕ˜©H ¥©È +êUÕÌ‚êš±ôʹjºŠna}5e¡ãEM쳬§x\ù‰,/n:ã}ÍþUDjwFBcýkÐÛ®ÕD¿?mu­«UH];GBY]¢* >D©„œ +ÆQžל£ˆ# Ð ¬Qz²$õÊ°švG¾¾Ep€xèp€6æå®KÌH=:¾”ÃhiÔC|ŠéÊÁ|eƒvÊCKQ±ª0Ö*¼N…Ñ[ž`g?Uu›—Èí&Ät.ºk¿@1…Áz0ÊüHhºÜ<8š¬h`²Ðh˜ÒøÀ°’ReÂDI(NU!Ü¡tÄt ”75á„1UÕLI…y€™ÀXŽ¢ÂÌ)®4Sª+=Ôŵ& +e™}$¥æ7Ù®5š.VY·IìMûÄuæRÔ [¸"—dš–Ö,ªT+¿_öÃ(¨ +Ȥ/š—]…Í“ÆXl¨ -"¢,š&§/o×Všì‹fu4÷Ç(Éw?/6l¶N‡…T6wÅ—;r*÷¬ÎH’M¬@Lˆ!šÇ"HÔ+J`¬ •¸‰Klît%FC6]µ_ž˜¼€¼¤¼`PuŸ:¶°M#`ܨ—)¥.WŒÞëqšð~Ít ‡ÌjJû¨ÁZSÃ,?.\ +0¹‚#`šˆP`¥ñE°Òø¬Ø’^[ÌÞ¢âXQ“ÊykÄ«„æ-z¶£¨žj)+e8ª‹eežù^±³Hc´*&¸~kj}GQÊbAàÕQ:ÚANX®ôT€ÒìÈ W¿*f.•NØ‘²Kjmó”¤r!•ã¢1‰WÖjWVÈ8or]Y%î°ŽR[bESåùÕZM¿7Âk4CUð¤%ò ”¶©EËÚÄ"›áºCäàÄœ†|m¡d¶Z%ZÔ*RϤ`Y$6¸i9Oºå®ÄÄØߦôXSHm–tÇ“¡äú¥¤4ÜÍ4l¦M·}»d|oQú¬FËåÎëÕ#1²”­RÙ(jL¦ÄÄVÛ‚éøï·¬_E”v—5ôfCêk¿áÕ®Ñ= ¥Ù5‘ù¯Æöe¦³>œŠ.Ï[‰Òq)Ï-ùNU¬îòŠG›¬ÐCZRh(Õe¥F’"qŸÆdÀ'«³ªŠÜÓd%} +¯±Iê:Ó’¼ +~Çq»õWÙ>Ê¥Ë%µî7¦!2>¿Å$ßW9­ñ8Ü©±Ó rSTšÄ‰(+åé(ËçIJÜ4%VÅŽó¢J¯5M‘ǾIh5Ÿ”¾_^ÉüÚ!¸\Ï6\–s-«i¿rüÖZ¬„Ö2“ÓJ"“›}‘QÓÓmñ‘Ÿ¨ÄjDOb4%‘˜ ù%§%ƒÈh¿¡¯¹¶j f§y$Πɨ´Ñ¼Ï3UÛI”aj5Tò[~s¥á°à¸”ÑØ^ʨ>·¢Eßð#dæ¡~­U½m}NtÂLj푔ª#¿ä¸¤$õ=©éÌfTâr» =‰¡|—ÌT£°µ#¤5žÉ]ÇáHÓnK¬|¯ýÒç°–äsEEj}ŠÔjþRí"nÏoHîý²ñNoúÌ’ó€ä*Õ+²,Ùè§Å`»êíùíW]ë×v_¨Mçs²^l1Ö®µcW}OGÉõQIñ¹æÖíRçÿ>㶗&˜—i¼F4Çu +“‹2mµ¹)©ŸVT端0=È)œÔn,¥ÞE°Í«dNû:‰¡EÆn0X¯:RÉK~…Ò"› +ã€\1·ëØme¶Vâ§ÄF¹B©yŒ®ä¾"­²ŸWŒ-çgÁqþ5轇JŠ÷µ[/7+¿#5C°‘V<ÅëÔ†é^Ém´`d×!õ[˜“œ_—›ƒbÅ|&´è‰€bÜêt±ÔJKl<³,g’ßc¢c{P;¶C»ì}wÈíwy.¹ÈÑ Q¡ƒD]e,X¬zñ;^ûb¢Ï©Ÿêue':]•RœNFqüVC5³Q¿p;ðºv›™–Ó”Ü5›ÔÑ\_u4ö7 +¢÷ÁBo<7è-§Cí2;rÃgh'úžÙEÃ!³ìû–kf7>jËS"óG»l¹ŸkùíxmßSµÆ<ŠX󧡸œrë†ûÁ¦× ^wÜSë–’רݹœ}σ [{Ð2 B±&‰‹C]õ˜jW}çZV[fÛv¡/Y?«(FJÓg™å†ærC`3=š íŸ zËMÁmüw(žgnÛ{h˜Û lmÙ]Ã%‘Âu!«/øÖ–˜ŒÈJ«Ž$ +CK…ëÏ#ñTȬ¶rã—_tÜ,-Dn«J¢ï¿Cìûó{v[~Ñn4×0²¨þÆ‚Û‹ŒÞrÃjØÚ&D"7U¨Ø}Îh(ο±~©¡P¯Èd¢b8¢mÇ©ã´šàx/‘ÛŽýÆóQo»¼šó1„_ld¶ÔfÏye8®£ØµV#-ۛܵ~˜5ßUžal"Ç®4ž©ZÝÊç®à3\•èÌæ$"ŸA9¡Ù¬žÖqÞ$3sˆ¼ö«Ý†ÀctMl‘*¯#>ÅÞ|®[g=Y°µ¬P{¼ª÷,„Xf|JkC*úGÒ«­ãXuFS ³£D¯Òè\·h°Wî¨Ù ¤ie¦B¨…Öôb+!Ží:Î7S;Ç {zï—ŽÞû$×-‡$T6‡E´FcÉuß•ß´›õÛ–[jãþm¸me;7y•uÁâF£°)Ö[XuÛ½Ü1t«Î/½i{¶( ç=ZÛ…ŽØøâ—üVí*{‘~™A¿ì»Ü– +¢Û­d¹›*‰´ŠÞñ„j?‰Æl:X­±L&sˆcÑ—MöËl:„ÖÛh¥ÐP¶Jm !®²•îÔÛxäŸ.…Õ³žÖpZNlû”R;ODÏÛ`£5{ÐÆ︭犾¯ Ýùk[ïe¶{•ÆnÅîø$zEÆbÉÔ.¡4"û|½ÌžCå»QHüfÔTFSK½÷OGðþ–}¹ù!K­´G§9ˆqª‚ÜRciŠ½y%•á5%­óI¢ðOÐÙNV]ˆè­Šû‘„ä+¢²º¬£3¼òZן´mo‰†ËJ¨WrjU¨ºŽCRåz¦¡1¿’|/Ü¢ÙŠâ÷­¥9~ +Iž'jÇn;Ô±›¯jŽË‘®ånªc7ëÐ:N¦y5¯°³ [v×qY¯\ÎÉm··è3çn·zåzb¶}O"ë{!¡ÕE»p}QPÝe4F·ÇûÆìŸC{Ó±–íÄ/™ ©*¬>…%FrE2Áa³"&´š©œ†ôŠ¥é\Ãh:Ú0[L– ͇¶Ft¤Î—0“Ü+ŽEhêÐ9nˆEß{¤e8b•_fÙqDk:ÎÃJírÄ‚C%¹ñG!1ÛöüãÃå@Ïz!¯ §":]RQ|ÿŸß¶Ak7¨"2~1‹Æw$©Èn¤c;4ÍRß©Zmû‰U9]‹3üf‚(5æSeï¨ñMs½7)®õ/I¶Þº­ÓaÝ…•Ììr°[d¾®•Éɬ•—kVåý” ½.†œÎƒ‹]Åoªî«žñÐ-ÜOBŒŠ—©â@ìZn%öÖ( ípÊì„O]„ϨýaˆÄ&‚ÃØ|¶]lÌ©1˜©¸-Éeï¡Ý÷?©mû…Ø5_©uë±_ù>Ú•çˆæû!¢5~è-¯·g¶áum?fÛù —| +™õA®X.Ð ,J®Ë´ë]̺ãX–å7aîåi¾ûdËoJ¯:®éMÇ ¹f8f·m*zß¿Gj·)‘™ÇZ¶ƒ°¡2£Å(Â'UŸ©†ßd¦OòR€Àp I0X-ø‘{ÆYÉyè•Û‡ò(þ¥—²äZ«­¸°éŠ}e“U‹ÎjJ°yÈènÅp¿­Ø.3ýZ³™žßš[ø~¸ýB£ÉRÁe¶Zs+Œ¥T½ÃUg©n©ùpÁÚŒ`rÜ\†c‰ÕœBbl?Û1ÛP›~knßú(ϯFž_´tÍ&|°ÒT’å<ØŒ  =CÉíZ6YщG[h@Sò½¦9ΣÉ{¤|ÜEÓ‹"*ÓC:µÎDüœÒÎ+vÈKeêšÁïPCivVEgw@kùM˜E»UÕùCDlû°[^S©fÕc¤ak7Ó2œOUŸŠí8P18äUZ…ï¹]˵k¼ŽV MçÛ%o~×pX¯Üï«¢ñ@k_¬¶ù6Só}Ê÷±ßø˜E˱D¿ÔL Zg>Y±}ø¿uÄÖ°Jak/šKëš«W·ˆm¿ÑnX­à*Ó.³« +Õ 
+gÖæ"ß[‹à(ˆGíjÐZ?ó<ëK”`p1Q2\¶œ–jß³àøþ+×+!ÁñCðxMç{¿cxF¬ÛOÌšã|Ù°µ wì6ôŠÛ„àðYwˆ-'Ôší4V±7fwÍwvÛ}œêø òªÎ"xuæÒ4Ë4¿àb$±Ò¦\yîq¢å~¢ò9c• oy­Ó1¯ï½–[§afÕk°Vî%§s[6ˆ}÷’ßR€ô:äYgûOÔ¢óO.Üßܾù'|Bä<à/`-§ÂE3t^”¥&"ç‰ ƒû3j,Y÷r½B{ÁŒš»c·›¬W[ 5ìFC-Ç]Õrßn™øÌ®(èÌîÚÏuÎ>í×ØvèÇzç}dÖ½÷aÉr¾¬Xî—=ÿ#¬H±Pì y­@ÃÌJšag3Nµý¦Éö›$ÅöÂ,3–¦ÙÍ&º¶óTÕwé.— bßi®]dÀ®8íÉeë]ˆi©3À§2fÕÚŠ¡ÒœÊQ©¶ª‡ –f¢ ëê‚+á«j“0#Ÿ±~+`È€q"ÚB† /0P¸p!ã…fÐ *‚H(“Ä… 2`‰Ã¢°ÈâÄŠ1N¨€ÃÅ 1Z OW[´€†ˆœ 5xr ÖÁ&‹,XHŠÄç0éÄI¡a 1ˆ/†2ŒA£ 'Œ/ˆÂ*T]œ(‰Ä0„Â0`¨h³²A|x@CL¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{@àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àp¸Üî÷€{À=àÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞÞpooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo¯ø|+€ +œ®;…’#y+¦á6TjJ+Ù›gšo"H•öâ,û¤sKê™LÝ{Šºk7^’ä¿ Sí§bÕjdÝ/.áv€­+¨DÝt1ǺîVMÜíØ ” ˜½Å«Í£èå–´Òñ|Ѹ]‹ ØýÏ—Æy/µ¦ +ìVív¬ŽTLn‚–k˜‘ MJˆ­‚ì¶^8ÞKPBu†Áå ŸBG]wRŒëqÙÈ +‘ñ.–Oo”f˜š*§S1ÂçbüÚ>]x LÀp’Ÿµ_ˆÛV×K·“QôZ[jásECô<(~CXÎ{ Íø©¼Îè»fc)hMoê•û«à÷ßû»ƒbËòÀ]5`nÛÄiYäp½‰èér{a¶ér ò<%–Nãˆ}ÓÁ ŠáX”ê½TKvµ[·ˆÊ¶“ë;ÜpÁ…ð¹ZsyŽýÀ©ÜÆ…’$º+¤`(zÂätœñ<`5žwå–Ùká ½®U ÃŒÿ»Ù7£d¬ø ã3@9ªCðâ«þ@SÕ&b4Ï §d5†ºl­t<¢uŽwôM³@Ê•Ó)–ûj´ÞDè„Ý…T9]’Й %"2Líï&êæW¹d6œÛx3Ï4Þ‡š.»«µ“ã¹ïÄ–ÕÐfÕ.XÁsæOH ƒ3Q³æÓÉj£mJÉ@²ìŠ“qübSi–íªü?ÍžÕè‘ÂíHètÅE02u6IÖÚ ’m'´¾ïºà: ™§¹ïA,óWŽt=™$™Ÿ©Üši¹.æ©®»ºó&dŠŬ´hnã$éµ—0óTñãæÏ ÕôŽR¶ ¢Z3]#µÿ¦úe÷8‚½å8ëyÂ7ÞÈp®·rœ×ÐÆküHé7|Óø `~CäH »+À8Åaäª-$G®s¬×@RÓ,xbÍlÌ<ÇùH­¶ß4nã\û?Šaö%Zu+°íœ#¼Î +|À;énÕ:†çG+…˜Þw³oX¯m0šé&ÌðÂðp¼J)¢ûÓ+šKk™C²œ/:ÇãbÝðÕ¶`(…㡸‘:‹à´KBJ÷ÜB¨åƼÂÿJ+|nEÐ + à UÚ pžg|º@øT+@Í’ì‚áÔçÂ,ïOü°ÐFbp0„b»I‘ýGFÓ,ÈfÕàã¤ßPJÕÖ¾ àX¥Û•ð]«ó»d6~g’$ËÍ šóI*oé»VÁPw­'vŽ'Ãdû[–é=T+¯{J»±ÄÎïdœêþG‘¼`•f…Ë®¢Ç<÷3}Ó0Zçz-ÌtŸª­ß¥Êð†¾t¿ð½O׉/·@aq;Ñxo +·[áceFr,Ç-±u Ñœ¿iÆ픂Ðn|¹f4f–ìþÇP ã&̾4ó} p¼¨¼MÒÌgYŠá}Ôö\Ó×í¥nP†é>†–)5 0Qo#n”Ìfí»¨5íèµìË2ÍïvëxWDk$©cy2€Îú øPn·ã¤çñDãx6MyÉq½—i¶ùX-Z žæû_³dû_ø„Ù%|µcðœù#?·Ý1ý?^×t!I®5é9/®Ó˜jÏ.ŠÖ+01¢ë<‚æÿʱ]烒áI³fxÖ-ý±JmƒŒ}h¬FÍ*z–ÙÐqÒi¬ Õuh–ÆFŽãû¯êÖëXËp3”ZjN¶e %Õ°Œ U›Ðº¾;^Ùw;QrÿD&gíÂérœî¹@ar"f¸â"Ì(z“ÍéùSÝjƒ¹Z¥±DÃqÆ­šÏä¶éÙ·]‰±¼?Ñãu! 
ÛÓ<×vB+oXë™$Íýd¿¼ŽÑ bÓ.(NÇjÐ0×t¾h<dxWÁÆ«Ö±n±ù\Ãö+€[h@Ѹ_š-ÓcVÑjH­h6®Ø3Ó-=&*· ‚ßY¿ï;kÌ¢÷0vÄò&f¾ì2J·]NTì—±V¹•Å3ì,É—Ù¹žÑ:×CIšù&€®Ö`šê¿Ð*Ï“në4¤\zN”í_aš÷#ȱ^ÅX>FÐü?Ažÿ/Îò>ˆUû{¨nºãŭ/J¿r|ç™À÷#¨T™©ðƒšÊó¾^³I+–äšN Óš*¾o1Um.Ê2]›ÎÓ‘ªï^˜êºhºN(ˆ~Ci%³ñ{žU ƒ<«ññC–ç|ÓåHÙþ£×¬7n×ü&{ÿ›Òi©sQ¬ÙÁèüÆ ò\§…ë±rBݵÐkÜ¿ÁÇhía´:²²û5η¿¦ˆÎ«ðX™…pIûpCÅ&£(¦§yªùBkœ?¬Ò÷Bé|Næ·›9ºó–Ô²ÜkÚ £t~÷3•Ói·h4¸Ú´ A³œ  4žå(¯óRïe¡÷DéX^“ +Ÿ[1–ñ*~ÂoÔ+™žØÍ‚du,ÅhÞ—ÈóGäX½…¹Þ Z8]:É­‚b˜M„N”œ„èÕvÕºï’T¶ÈNÖÚ ›¦z +ÛTnä5«áÄÒë^–l{S÷{±lX«oð1Éô^’h:BqÞE0,ÎGêæS³dx2€bp3€bq?ιòyVcø4«a“”ß¹ð ã[¹Úb­Ì–\´ÞɈ^W¼Îé|Pù 2ÝG²çѲÅ(š #Ô¬ ‘¼ßðS§‘~á}ÓuÝ âXnÅOXŠ1í—YºóTi¼?Yo0‚Xk!H07"¶Ý +Šã³m3ϳ݋`—}DOÖÚ Ÿ™¿Rl×õLÍ{#ù΢\û¢c6b’r$n¢ØPÇœ\Ç|Ke,ŠXn3Õ°¸ëO3=çEŠä< ŸÙŸRtãÙ0Ùw À¯¸#º?bLÏá@ÛvH+=ÏĨõ*~Är5NvÝðÚ¾‹‰ŠÉ±æü+·RÝü#¡1;¥¡±;ù~B'—Â'|߶ë®^ùÝSû¦Ë¡žñ5Ô°5°œŽè‹V{vA ¯ü-˜a´tÕvšÊN’á¼Ó«®Ë¡’ó0ϱ½¥)¾Û<Ût"ÀqÞDHÎïDßt¿jÜnh•ßåL×þ“¥Y/¢G NDÏÖýÄO îÅYÖ¯î:kw®·í’U Ó¬ÓhÆñD~çý +²M·ãœë)¯u=aužw¬’ÕðÒo˜Ùw'Ió\ `ÖýC[‰p^ÆÉÆsrÝzECõ>ÒøÝ‘zvc(E« )e ÀÇÏì5–8Eo1Ì*ã‰ÂýÛo½Æs+¯á3…ã­ËÃJÿsšc5¢Ú± +`€äü ßÕ#8JòëgÚæ—Ñr"h®Ò€pÏJŒê3~ÄèúgjÞãTÍqF+ïn³‘ŲU@Œ¢Ý€Q®ÿ4Iz›åœFÒZÇ‹Z»µÖõ\‘ï0€Öúc”­å™`{îXηøãa•ó-HöŸèKF㉈,Oy¥ßÙ Ýs2A¹ž‘*¿á3mÓ­ñ9+¡· büh¡©Æ¬AéýÛïR ‹3ñ¢gaã4F2 çRyÞP*¿£i²û4Ï´] iTÚ$•L…Èî“Çy˜e™ŸM÷…W7ÝK–÷­ÛA±cxG`4Q«š •c{RTïA€ÈyÀq~†Ñ˾²ÜrûUÏw¨¶^ô5£ÑõšÕøQãwK,=Ï)¿ÎñTüŒù0’Rk:Ù.ûÌT ¿’Dë}œîÿO*·+Vë4\ìlÉ=ð¢J®àÂDö!ç…Ö7ÝP‘{îÈUÏíHÓ™eYßr<ïcŽõ'¿µÝ‡¨²>[i8Òö¿h¥×˜ñ3û¨É3¨|Û f˜Ê>Œ^ù>¦±¥Õ™ + §zεn?MÓm'ˆ¼ÏàâT&å©,dˆîã4å4€¤r1Oôþ£XÆË(ãw0Jw]ËÒ¬'vÅ­ð¹Z31–óSíYk·Ž7íºûÉíZo2‹3 –õ2€ã½GpÌw!žû,rÂn%Fô½É}÷†Æ섾fzH+]•ÛØô œ*7”¤xNE)¦›œZ“‰ŠñOpšßs-Ë}Åì l°Ò\ðˆï>Ï:â—\ÏÚ¬Í&ªæÛ4×t/J¶ÝŠýŸ¢ýK-/Êçm½c5¦Û1ÔíX ŸuýWY¢÷*xÐúgÜy«Õª]@Ŧ]`}ב Íû>®8;^v>fÿ s·]ãù¤o¼àtLO»]³`»U» Y5³€‚ÊSš Aí+Ho²?d¨þ©ezÙ­šØmšFªY.Ä5^‹.=œ+7=Zl$DZy­Ó°nË,°rÉ*`^Én¤$Ýs*HxŽ“^ÃË¥Ó ¥g7cWÚ .¶;`v–d{®ª=³Ñó|Ó­ø…á>Šd=É2,o=ÿ­×µ|­f6~ kK³|ÿQá{¨¢ø\ó + d™•ö²ßa„ô<ªôE\40«n»¦X΃vI†å‹U: ?g9–£üŒÖï’Tú]“ZFãX­ß]·dxMAh5¢V4CÞ´àn §ºbL÷?ˆh¾J’íÇ‘¦ûÃê»îI(Í‚&¡³ + Ü8^зŒFvkVAõŠv¯Y¯bLëM”ã; Ÿ3rÓ‹bés:Ó´Ö+§[Ë»bÕjü¢cw4Îuäøu_áóŠ3NßtÒl›n…Ê줆ûUÓt>Óò§úuOyvÅ©rÁÈù²_¨ÁZƒ‘†íC­Ù_òäzsy†áy í:åÙïÂDû¨ï¿}îê•÷„È겊Ôj ¯q:šçšOÃt×Åø)ÿkšñ<è–ŽwõÆëŠVu'êþs»dyNDez>皎JÙ„§²¿09ž©û®©…ãù\Éüìx¯Ü¶í~¨m»gûîäXþ_ Q2» ƒ4A캧4Ãø!"7Ýr˶#1–÷1€c½QÜwiŠý:Q1ÿˆEó}®è»ÐêÎ? 
¥á¿dv3Nv_ÃÉ +­6‘á¹ÿ’TÏ ­ê¹ßUÍ籢嘌äuKDd7†S:ÞŒá9Ȭw1Êí~Pzž‘êžkbÝtJn›êmûÛõ¾ˆ…ÿw¦kº 6IddšÈTìºhÚ¼®éZ ã|Ž”½÷Iãv/˶]HíºçF†ñ:¥¼N‰•çE»ï¼(š_¬Òk¸0Í}9MpE05ž*»Ï3}ϱn»š©yãTóy¤ð:œh{®Hëý¢ð;(»Î7…çáHÓtÈlœ° +Ï›‘žý-Ì´ÇIÏ#JÑ.hVÓÀ¼’Õˆšý#J±¤8ÆÏ(†Áý¦ò»éûŽäXÖÇ0ßt2M·ßgú¦;bãzb‹ hA£] ’„nû„M—Ç9¯±¬¢ÙáÑJ£€â§ô¼›§›N©ö“øx±q˜‰ªwqµ¹,Éø¤µN÷n»¤¾ûFkÜ/õ½  éL»1"c¥Feè á _ÀcJî`•6óDÿ‡¾f5h”ô9'•ŒnëÈì.l©N§åÒí’V9ïùþ³ Ûs/xÈvèÕ,oÇéî‹àÉb{1®ÿÂiÏx…ß ­ìº(¶ì6kV‘÷¬¥ 3`GizĪünÆÉž£Q¶ë„Î7Ýó:·ËnÍôº[4#H1{Kslÿ~ëuÔnû?ódïe”o Ò=—b$÷=~]v ¨›.¥øåö †èŒÒ ‹ëØq*{ðÑôš`åJæÀuVB4óeŠï;£y?Üš÷0R/¸Ƨzp+Æ+­n»›ª³2NudoíÒç’ˆêsGDó¿ö;Ÿcq’ãœØ¹ßjˆífu,"Ç°ž&Zî«(Ëý 1Ui#Åñ½Ÿ³‚Ûü¤ø.âSΣÕÿ’"»Ÿ¦CÁƒÕ&bG*­Dù…OnÛw<ÕôŒb\ ¡Ö›ÇO+.åxöÓ<×v1Ðñ^¹]ÏÅDÇøc¶=7ÝÖiðTÕs&|´â>à\µÍ †Ýu¦l¾µ;ŸËzã|©Ù Gª¾_µg4–Úø “¼¿0ÃD÷ŠÝŸ]øøh½£BËm‹Ò{š©9ŽÅiŽ[já~à4N§HkMDÏÔYˆQœ÷MÕt<Ö0\6¬öÓ¢ßz¬j½ŠR,¯r<ÿMÜtÙ%¤€ÕÈ«·–)ÞÈUç•\7›–óYÏrÂiœjVã¦)¿†åNð@™ÉDÏ} õ÷Cmß­(Çú˜§9¿3]ûeÇx”äyO̲ëŽÖºôªvQzVA +³þ„š_2|Û §f¼0Ë|–¤ãDëSœb¹fYß­ÓøyŽUÇœª÷:Q¸Ž”=4Û«ô»(¼DxÞÀù‚;1ºërœt½g¯ˆ}ã!pá…úŠ¥²#uö“DÛŸe`VÏ.øAÇòL~ÀÜ8´™«Ù4*ɱ\†›¦zˆ¨zM­¯ZÏô¼[5Ñë ¢-› 浬®y-«KâÞ€b%jL Ï6]Šqí÷0†íÅ+|îÊ»#Yf­Ù@ÑrÛìšUlšÇêX $6®¤ºñ>‚á{š©²'ºâ›Z{³ V¯V¡Iµs<ë¶NGJÖµj?J2W -™Uÿ{ é†”¡µ"öM3ÍòžÑø‚ëyÇ«{®åX–KñójsVçÿ1R›ÌkÜÆDZˎ#Eÿwžð<=f¾1íRÇò’Ø:ž4Ý'RÇð€ÑºÈMרÌGÁcå框JM†0L.huãù¬ë:G/¸§XÎ7u×™$Íú”¥yÿ¢Øõ&óDóc¦dgšÆŸ Ão7Ís¦™¶¡3¥–a'é‚%·©–ãj¦g9ß5ÝŸ‰žýp¦Ò<È(É;ŒXo*Ñ/ü/‹Î#·ì½\¾-¡õ_¡8J¯ã‰ÆíH„bù:Sj8Sõÿ§Ú¶#^ã4|¦ï»›éy¯SEïQ”c¼ŒÝ÷±ªçˆV9Þ±ZV£YU»àæHÏãQžU€³,³!£œç‰ÐÙr»03Uæ!Ûa¢d½Ï5ÍÿQåv/Év]Š0{ Q½¿¶éHŽá;K­7^åuÎ,‘·‘rLÛ‘Ûv$Ŷ‡ý÷Mßw-Às^*}»À{Å %³kižåD”á7$ùÞ×Y­i4´ˆÜ.(RÙ}å\5[§QÆ (£c9¢ï1~å{s¬niEË[ZÑòªY4h¢Ê>j¨ÐŽË¶]²:¦çsEßu¨êP2;ð¶í†ÔjVƒÄؽæX§áÅ @«M»QÕ–áUµcyÔm/X…óV7~ iŒN¥ÈÅa%Hí¡¦Š È9VÁqjvÁÑZ·1ÝÖï˜Ïu^CLÒئ©v+ò¢Õø0ÃGÂF,ŸRtãe7˜WùݶkF£ª=³Áb8uæc-ÛÔyŒŸÖZˆ,®æ©®;AšéDŒå¹—¤Úî'ŠöŸXqVAàÂçmQ^Í‘Ùuf9ÖŸØ¡J£ i~ÂWåæ!,ëC†j?×;·ÑÌÊó~Ð7Uû¦ó‚éz+Bó^EΖ›‰0<°»‹ŸòÞY5«‘„VAMt¬‡”"›zátA^´ÌjZ6Nùó¬ß­‚ßµý—rçu5N·%Éþ«p<$"x £õWUÿYšf¿‰-¸Ðô?Êß­@VÕg¦d6iZÏ¢\ÿUüºÚ\‰ó!n¼ì`¦ÒˆS2Ì,ïåyο0v©½8Óû&:ßâ<ço¦i¾O•ý—¦õÆ«|Ë-ËûYÙ}Å­4À«2•f¸-gª–^ß}_uÍ7QŠÙYšå{”ÿ{¿c4Z˜åÿŸ«´C­6%˜=gºž[Y¢û-€\p&|Xl.Ͳ>æ‰æ{Ãö¬µdY®ÅY–ÊžõZ¢µÉ]ïy¦oþtK¦÷õ–Ý@bÝz*Îq^¥YÖ»8Ñ~¥¹CE÷ƒÛô> .ç`õŸ&ªþßDÙsÃ*Oè‹V“šÏl½FÌ3ÝWY¦÷)Ç´ŸCÍÖ›…ª´Ä«4 -ÙÎr÷¹Cì¹(8ý÷aÑû‘%XŠ –ÚÉ-÷ÜÒñ¾ß²ßï€Vw è;/H•ëùHãy)Fõ¿Ãg=÷”¾€˜dóKÔ‰g¥àÆ«|Ž(=£áÓ„ÏÅ(Ër3Ju»5£áå1 •ÑF×z£˜IQ-÷ƒ–éa³l6ˆ¶q\±l6ŽWyÝo:¯“¿àJ(N2Ì´¾Äù…öC]ÛŸÖ²º,–í®–í8û€gûÏa'I½äm»aŒÖõJŽf= Ó|‡A®ÿ2Ƕ«vc)èÌî*i>GÝÒí¬ÛúQ™Þ÷;Ï{iªõB1¶?16㻌¶õB^3¼áÞw1žíEÛ7 ¨„Ú,ønÍòŒÔ9ÿ¤¨ÞƒÓÀ®8¨wí§vãv4Íuÿhã=³côDá4XŽå?ͱ]§2LÏQyjQxÄßMøT¥m¨‘:c)ªÿUìYÔ.üǹžsIªçz¢ì:›§›®y%ÃÍhÅì*z¾Ü^ŒîÿrªVÁÔº€ óm“dÿ[–ì9”eyÿÃ’ñ[¡öpK–{zÙz#¡1PTý7ñÛZ3Q†Å Ñó۴ŶÃ@f± Ãî ¿sŒßynçé®sq†á-ðtÁ„Y´KŸKÁãÕv¢§Ëí„È®{!Òm€™çh˜ï»è».ñK­f*vëÊóFì„ï&|]p!E´pýq#¶wøÆö%–~‡âk[båû&!4=fö}7R$ï]‘ó.€Êy4`ö2[k>Óö¾ˆuÿi¤æ½uͯ‰šù%Ær?DŒ×Ý —Ý‚K\§²'ÚÄÂïr¦ê¾ÍôÌwy–÷1†á»ñÜOAºéZ˜l¾2;¯›nëxÍmÜÎÆ:ÎçXÇz’e—]Ù%úŸ¨³Å/·ê9´Âíx¦kºêy̪õÏ/ÛÏôªû?X4ÿweû“[·XeÏu}ÙMø`©ÉPj©ÕHÍr*Ì0ÜIsk-†Žã¹žó:Ö³œïªÖ?µõ;é¶~·C]ë¬n?ÏTíGn×ý*¸í'f×þ˜¦úoê ðä€ÊM“›2ÓÓz‹m»à†YÇ[)ªù;Éxvú&€/–­‚iöMç«šùx”Þ1HóÝ«m³@Xß­î~Ñ:ÿG³duLk½Î8=ÓcZÏj¥ô<’ãY´žáÂHjxRBk4„ÑyLS½—aºÿ5N÷ÿƹÖ#uÓò^·ØBø\©9µë½XÒ™^W›v#)£ÒówóTû¡Zz¸ ÏHîïø‰Y©Ú/Ó¯:u+×k³jzA^³;+wNwÄÂç°Ýú –óƒT1Û + U†¹†‹ZÏh0«d7Tøhµ…u¹Ý0Ëzšf{NË%£ô%³€E‰þ›ÀÑZëà ³£Ñ~—ešŽGº¶{jé5p¦çþNÕ¼ŸnåuHà3½%NÇâ‡ë-W íŸ1<ówžòº-÷¬Æ+¯Sfås\.ZŸg=¯…Ž8¡D]"'K­¤¨þç8áx^0=,ô®3ibÑHÅjÂkº¯"GªÌAÈVŒÄZ™*µ>¯7Ÿiû_åÖõRšd|Î4í?fÛvRï[†i¶ƒñ ïSøhŵø}áe$·ÖŠÚt\ËÖ»<ËøꎧDT¿À(ÈlAê»F  1ß#xžkñï} r<še\ÏC8ÎÏ,ÙsB©YDiÚ.È9Þ‘Ý_^éy.Ð0¸$¡±¾U‘ÙÝK³¬wf×QÒñ Ö³Ç*ž¨ünGÚž+A’õ(Êó8ßuÃy;\ø.Vn#Àq?fÉÎcQžé~Õv^ `ÜŠ!×Ú ¥×›‰áŠßV[ÈÐÜQ¾õr ñ¼ëúÿcmÏÙLË{)˜½¨]Ó…È©j[ D‹– e›6„åöaËÍ8Ý~f¼Ž…ð ?² ßW¢a;VŒ¯¹†ñ$L0;ÈqŒoaªõØî^´+Ç#~É|«VÊ nÆ‘«íDÐ +Å«-$ötUç‘Û6ßÉ…ÿ‡\´œOö|·¡–í*†Wh)Ï°¸ëØ̪ùQ®üîgI¦·òSÎwäp¹øHÕ³V9]‰oë æh¶[±k6¢ºrp>Õ2³§B¦© …/ënÂÇõ6­žÕˆ¿€ægS «…â|"ð¸k¦Ç"|ßyÉü!™¯‚dû­Üx ßÿPÓøg:/ŒÎé¤V³¼)ÖLo8¥ÛM³t»§œŒ¤–ÚP‘:ZË ©ð¾ 
óM'#g¬ÿ=ë4œ×2ÒíX^kŒ)?¿¡-©‘¸¬))žZ×øà•-¼²õG,¼º¥Ó%·k9Â'ö„š!°ëÖŒN)͆òÊžÁUV¥i¬…˜%¹žãzçxKlÛÎIÞã8ÝtÒkiÜndHΓ Åð8Qµ?˜]û­BkýÑ»_ ³íPÓúŸTíGNãwÉé™&pÂw ™”Êñª\ù8hNw6Ó£YºçXð„áM|YiÉè»®Š5«@­ÛÈqªç`”â¼  ©±5IvÅÀ*ñB¤~!z½üÄûÂ09š'ÚŸeÓ©0ÏúèV~·$dv#º}Ó¡#µÖÂç%ò ï?‚c¿Í´¼_zÑr5Ñ0¸£ÚÒÛ®Êeßù l»•$z.Ï×ýcXþ›ÓuÉ*Y9PwŒ­?VÉh¼Ïÿ`ؽãWΧð‰í?êÛ®¨]ÏyÁò9pœŽ§zÖû¨íÔŠfQ:Ï£Y®û.J÷\1íGá#ÎË–óÐì˜ 'ù¿Â÷¨m¼¤NƒÆ™¾+9ŠýÅ°þƒ–¤zˆž,´%Ü +âUZÍS½Ÿjë6Öx"C.< ŸÓXÊrŒI†ÝO­Ô~¬ë? Q,_âæŠí¸çC%ÿªm»è›îC˜î“è Ó§ø çOø¶Þ>á;MUÌQzÝS–å=ͳýO^ëy>*›.fúuwq„2YnµÙTÉr3”[n|Rf*„\i?Vö^ÙeûƒÝ1žf«µ¦BÍRyf¥‰,¹Úh¤å¼Ïõ¼bÙ`Õý'áÃrK¦ÿÞjÚ /¢· œ¾o7"¬€Ã`Ž^hWEhySk}D—[Š°=ËOº.ù³€{M«À +Œ'Àw‹v£ÃPz$Iåö‚Ýr&Á«³ )_8’¨·<âðIVANC_4?V18À¥öÌSü6wW›f#Çùþ³ø}½uÃàdšì¾êþ ëw=W´ÈEà ÑÿAÚ¶¸Õu ¿0>ÄÍÓØ +ŸÓ¼ãfIÓüZs‚Çp5ج2˜êWZq«¶#³pþŽóM‡bg*íC Ô˜ ž«2%Ù­)èLÏÊåæ"E¶#Çô7K¸^Ëý÷I×}ŸÔM7‚ã¦oºj»¿Â4çA„èüÈ-'¤Âí|ªê¾ÒkÎW…Òü˜éÛ³ËÖ Éë„X5?EiÞKP±Z;óuoA¶ív¢p;¡.Z /·,/ÄøÕ¶²»# Ñ`Zåx+ų?˘ª6:\v>XiB,ZîCP î¨^¥ÿ(Äp> ¨WN® -Vl1ˆaxcVÝgbÕüĬ3i˜ýÊ•çaµg ‘ý:ŽWX¾f=Yn.ˆ^o`"7tŒ'I’ñ)Jô>iãõHÛv*˳¿Eyî—(Ëw軣„Û=±ŽGè@•é@Ùt@ÒwÑ*§Ñb\ã¹è “39ŽñË®šÏýŽá­Ïý(Kõ.Osãš³„Û¡tÀ)<Ⱥ¾ó :ÍGü•Võ~„HæSPBÇ ÒDv‚4ו𱲇 Ãùë8S{‹zÝýìWŽ—2ü’s`Ò +‚W°E®6iÚïÂ,ów¬çýu¬'qzÅù´b|ŒþÃð±B«€‚¤vpE·à–ÿ`Cþc ÙŠk»û¶ãþL¶ëîÛŠå|¨î98XnVŽê JÀlflÍh˜æ.Ñ°¸–h˜ý ÷ÒTó[šd¾ + §3 4ÝôðšQÛÃÏSýEfŸ¡Žñ@×õ_hçÍ8ÙuÙ­ äUÍGZÙýšæZÿ’Œ÷¼v ÀÔj,Éÿ+€Tò”#ÛÎݪѨfÍî’RxŸ§¹æÃøeÁm°QB¡“ÇrDË£g5F~ê~ rMÇ´Ö+8Néy3Êu9Î[ˆaÃ`ÓDÆX•&¤ªùß.š Ÿ©›¦yæë8ávJÞ4Pච+ªW2=a4nçÁ&*~rä*S +2£s^ÇîHÜT­Í Ýx8KxÞJRM‡Å® …Óˆ´b›à&¶2í*óeÏm>ëúò+ãgøºì.E50:§ä-»(Jö›j‹aÄRãvÇð| nºÂ.8>Vk(x²ÖFˆh¾Jíÿ‘¶ë:´hà NžV7Î2~š=«!H /Aˆ×LÁ„K)†á‹VøÜ•;†ç¼¦ù$tˆÒ#G-3p×ÌFQ—Í굌†ÊQ¬gA¾ë”Ö±<$6N$·;Af¡•ð=½;Š\i<Ð6Ý4‡Ä‚3a#”ö°ã5C˜·9è4Ño¢g> „ÙJ²¼·aºñr”o7̱Ÿf™õ†Â†ˆ-IUëKÄîW¿ì:¦zî„NXþÉ‘<LZ0œOYŽ÷F+»Îd)†ZÛÿ›©Y?c©…#™…†² ‹óvµ­0Ïù"• 9E«á1˵h~WSü.ÉÈý—šó+vÈû*Ukd¬ÖZšeýûžK9¢û'~¾â`žeüô ¯[fçx<Óõ\ à×ÝÃUÛ +cUZˆóŠ,¸-ëAŒb}‹Rí߉ºíXšä9Â,¹.Û6¢¶J4ì–ä®ù;Ôr¾ã¨õ&ƒ¨÷ÛŠÉ5•ýס²ŸDNÓØ)`t ¡—}Å9ÞûpÃö£·¼OÑUß³”vñ#–¦œ¦@gI€ _‰ž¨4‘æ•ZÉô +méEË%¯s¼<ŠÜJžÎNètÉ¡øù껆õ%ËpœÓËí%:Î;µq<:Wm ¼Èž¡´b€Ó +CTÙ +`¿òã§Al¾ý.jH Fi¼‚Ÿh|.†Ð«Í÷lÿ—ÔúÝw›¦×ÍšÝé(Ý} ³NgÓ|ûÓ1:#U>'#˜u¶a‡ Þ‚ãM¶pÙ¾cFÍn\¯kø@éwJêY ."8ô<ãx"|Xi)~Te:ϯ·d؉öû¢qÍõ‹%ËûEÝsBêüÎè›V²*·;I’å~¤î¿³¼—„ÕFÂç$çɆµídÃj:×q[¸Ìnè;–W´ºëz¨é9—fùߢ§ ¯á%iÞ‚»ïHË{ :Gì&ù^ÝŽåA³ó<¤¹äu‡Äe¿SÔ!³ÄNáÿ›çÚÂÇ Îe¨ö§‘å€Ñ9ÞrJ¿ƒ)–÷(Ã/»— ³H±ªNdäÞëH×ö$:ä·µfÃTÿ™Xz^3¿cfÕÿ=Iî,Ó,d•¼˜móCð8™9‰†9è:1yjIXY¢Ã"ïƒSxÞ +ž õ‰¥ó5½7A‚Ù)¬µ'Hùº‚=PÙ®W·îGíÚ.E(•V"§¨Ý2ÝJ^×þ=[w *FcZŒâB1üˆqœVÛt2R19B#w ìͦ9ÎÃà™B34ÅÖ²Lóyžõ;"Y.ÉEó•’Ö~ê‰ó+­¹û{¢q<_p z²Ô|¦h¾0‹î»DÅö)úîã…€©zƒ£™%AÄi…âg©N’˵é½6,®‰Å¦"¨uO¬’;GÂF‰~B§©¢ÜzarÁá`ÃqÊ¢°¶£Ñך¯ûåÆr Ûaü„ó'~²î%|SlG¬6™jWN6,Nks‘ÓŒØ| ×½ÁÇ·À ¥–†š$wÎU|Ÿ©ùÈì›.Ÿþ§c7€U¶ÞˆŸ(8 @Àl +Gj@Yi!I1¹g÷·¢óSø|áC„ä,ÿA,ûEŠé9fºŸì®ùEEñÿSY^ÍÓÌJÉêœ×z–Z!¯Y^4Kÿß4Ûz#{?¥šáyµk6’¸öä,Õ{ .ÙðŠß;i¨žWÅ’å)§cuB(îšE£±¬ÒíÀÚ´aE÷ ©r¼À©1gU]ªi-÷ z÷mžë ²Ì)žû0Ì4]M³=WŪÝ8)–åBŒ\kB_²<îÍFOtŽ·mÿWŽe9¡˜ÝÐÖZŸƒnás É®3è—Y8>GÝÂñb’bz=Qd$K,´ŽcÔÜåù¥Æ$TVw¿åDÕ{cNw¤Æñ€œe˜Óÿ›gú/‚ô²G §s¢d9ìnó“»³Dß0ãtfB' .¥hî÷8ßsÐ,ÞHÌm„ÏêL8¥ë… ÅøŽ_I0­?±ƒµ£\ÓÍ(ÙuÁêºÏbóCbk/Jv~»5£‘œÒiÌÏ~>Xl)zœÆ<à4yÈ‘*ƒyŠí4Óñ}ÂÌ— ‚GÉbÜj3)†ó(|`y9^q0~¼îhÀm"Ý&*ÆgàI‚SÃ5‡»î#@^û>Xrǯ~Ì3Ì?A%{Ž€ä¹Õ2Å2Ù–óh Ð0¤(ÅÈÑrƒa²é|¦m;žišnÅOS™†¡öÉóë^‚‡JmÂJ|F° ¤Æñ¢V´ +j˜ì¹¦˲ë F‘‹ícÙ¥&·õ7Pµ„ –Z…&:‹2ÿÅOL†0li–ùÃ-8F¬3eÖgZÉò"n+‚è¤À¡R„–"÷‡1LN¨Eï] \k3’^m2Î4G1k?á¥ÊF pŠ‚• õŒ¤É“ ®Dš¥–RÅ"sÉb…Ân[°<‰¾ƒ`rE;0?x!’B™™<»ÚŒß1Ó¹ˆ‰ ·S»£±ÜZè0•¹Êrsy†éc¢aøhºÿ3…Ó yÓ,8^Ýx,„ToZ–ÆDÌH•uÃðG2?Gº®si–û"Hqþ‡ +¿Ã‰¶ï`Çü7\n.€Äî9Ò5ßÙuû­_ø_F*¾oµm4’ÀkwJGî½/+æÆc5¿}µk4v’ñ¹’_—ÛÈ/ ×Ã4Ë#FÇô|M1¾è›ægƒÎû•g˜[sÇbÍìÂEn7´ˆÖlx»õ»d•Žw¼ÎóX¤]n6[¬3dÑWÚ ¶ í†Z¾­í¹*¶ž—äM«ZE»1„’ÑHY–å`¢ao]oOç™Ö§,Íw¢µ=Ç´¾çœÖ7Ó +¿VÙs0„SezˆÐÖnº)¨ìîÅÏTš3š` `°)z°ò]ð=‘éDÅìKBgy*Í0·"0YTš èÕÌHPÝOá ljءB«@Ã4c¨Õ–ĦçBv¾h 6U.§Õ,½Êiø@×w0KòžÄ—ót÷¢ðÿOºö¿4½ìtÚ2É4~iÛVÁ6K§q2ëWàlµÅxàXnÅOøM†éö_·e5^ü€Ý-¼X­}°±R{9ŠéIˆc>Šñü_bë7Z˜c}"Á£E¦µÖœWi’õ,„]q*|^qÂ0ý `—ËòüGñS¥æ@FH )U–ä²íVŠè; +KgP¼m +b¾o,AïO€Â-sȱJ{ 
³ûXÕyä×|g‘†Ýh¤c{ÏTýoYªé<ˆcþŠŸ—]‹‘üA ÓÙŠŸ*5©v~]ÿÀxÏ/†Td'Ê«µÇ,8¿-7@°81Ug+FsþÚ}× áã´ÆàeZÆpã]›(¿ô&xŽê” «&@qjUÐ1‚zsq~½-·ï»°~ï:L;D•Z ÕZŒb8neY¾¿<»ØLœUf@­~k„ž+~Ç@Zq^¨ÚBèpÙAŠeý “­7Zët8Ñ4Ÿ%Æ ³é?JÒüï–éHèˆýGðxáS¹â^½âv¢ïÿŽ>wó\Ϲ(Ëú虯#m×¥8Áð!̬¶§×Û[v×}ëW˜ä{HÒ‹íÊ…÷ÒwŸ„OkDÏUšòíÒ¾]°½žÑ Råu\l]ÃŽzEOÓ›«%«;êžÝ½(Ùû¡Øn"GÊ,(Öß@Ûx'R®7_l­C¹ÄŽ ƒõº©~±IÁë}«Æ£$ÁàHäL™Áø‰áS’i¾ñŸ3bËr ͪúOkvû~Çî”Õ¹ŠÑ¬7Ñ“ÕÖ¡ÆhL‚>OP¬bCª{®„ojìÔT=„ŽY°mßÍ8Ë|çxïÒ4óq¦eýRѼΠ†Ï­Z•,Åâ¦Ø1½ãtŽ§eÓ ¯í9¡5íA#TÞ óÃ~ß}”å¸/Bû[ðtÙOŽä¾Qê®sRÝtÔíÛ.ÆqʬCš§‰ÆmÓ*ЀãD§ ’ÔîÀâ4‡Á#–W1²íŠSz^v‹VAK“Œ‡a¼*ÓvçyÓkœFËÑì'¡³õæ¡æJ-Ÿžû©¾õ„Û2¿ &ëc Ü€ŒÌÎm¼n‰­ëå<Ñÿ9Qip®àJôlÙ_¿ðÕnˆayš§ÚΧʶVßw*ÉòþAL–¼¢u¶æn€„9U‡¥–rLÏÑ0ãyJk]¯æiž»³tFHÊlˆeÏðUÅM0"%Kpb§H‚Ĩ%–}+æïÁŽý:ØqŸš 5‡#5KtËÍd¹e?¬R{! ÛMìdµM°n;x)ŠÏ0n½ý´f>ŽµŒnçx<Ѷ^ÍÓ\bäºs¨‘:ÃPc•fá&êìP³Ëæ Åî"Äñ>&é¦rTÛ…Å|c^FJÞ»HÃq4T®¶!wœo‘vÙ7àX½Up±‚› …i¬„Ï–[ £—šŒá—š²êæ+Íð¤„Öl¼ÊZ ±EVãdû­Ù4Á*Üÿ"ˆ¥&!Fˆ "çÉÌÄ&'­¢Õ ÚÚ ’DÇ© ×øeÕ¬Fw«fÁqZ§B¼žCwø!r§8>Ñ[¤Xi<Ö.48Y, -R(åuÞGJév0É3_†¹þ»Ñ +/\ö„oš…)n㑲û-„_p1‚cû ÒíBéÄ~í_ü¶ì,ÉóR¼Ò›˜Mó}¤í:¿¬¶?TgÂ.8%úÿc=û‡Ú°»ÎµË E±Ižòôb‹Ôr±3eƒ¶¦â$ÃåPÕû$Vn—¬Âé¨Y9^ˆq¼GRë5¬×6˜ á²Sp“ #¸Åv]Ï!­òº¥ 5=вžAIj†æ¶ÄÊóZ„äÿ +.8“!ZŒ’åÝ<×{#¶,‡õ®ëbü<™]°azï(éw>K´.C6žHÏÔšÐR<¤Uæâ§$Yb™9¹oº¤øÞ1ü‚)zÙO’aýdWž¥é…bË} 2Ö-9[*àUÌl†)f±#µ¦Â·eQºí´Zµ +²à÷]PQºß%Bóc §Êb­Èv®bx,žÇjö»0Å}«¸Ža—ýÊþG·ò:eÛ óëÍgÚ¶ó@SÁE–Øåöbøe§P“4§@F¿4ÁÞ€¦j:,ØrŠ V[‰RL/ã<ÿ;~[p1Š\o-Mt¦ù¾#AŠû7Q3 +£¸‡!ôŽµÜß™ªï\–êºÂñ^P—0,ÿQ$Ë­ò»?ª¶äV\Mtìo!”rCñ34oñ£t¦Âç”F§ ¡ö‰Kíåùu·ps4ásKaŠáÕuÌ3܇q¼ÂËHÇÿ˜h¹O²»o¿ò<’ Õ+Ût(Ýô'Fð<\qC/|Šá[ÕÌ_ŠíE¬»_ĶëRü\¹AXA¢78 j«øeµ­ê’£Ã4æ"¸ÕfbeÆÂ4ïWè9âxÿQësªcµ¥5ž'òšÍiµtð Ÿ›Ðu£GË j5ÓãiŽß`Ÿ€ò¨¢ “¤Û¿%sÐt•{ŠX®DXÈØžÈ-òÅ|]è ¼E_Ä.ØE~Á-²‹|»è ¸E_Œ/Ø…|Á[ä‹ì»è¸E»ˆ/²‹|[ä‹ù»Ðx‹¾ˆ]°‹ü‚[dù"vÑp‹¾_° 9~Š¯èU¸Š¾ÂSt½"WñxŠ¯˜W¸ +½Â§è]Wñ+xŠW‘Wt½‚§è{Wá+ø_‘«p}…§è*zE®â+ð_1¯pz…OÑ+º +)Ù PüV¼T¾abc12d12-bde2-4512-ac98-772bdaf169dcf661dbea-1926-462b-b850-c31cf2a-7918777687548l56.209637h W n Q48>é¢:U‘°.ª'¼ª¾ì+1Z4Òy…®°vQ¶Ñ¨˜< ‡ H 0Ý ;ðCTg\/Ŷ»’ÁEÊ0¸ìïì¸Lc©5çxìØhl‚ +ÎKt¥¨öàÁ€31e[Çùjy\ly56c3d710-d1a9-46e3-aa32-e827611835d9dbf-3ab4-42d8-a661-4a48c49ef7918055773925682v`Ë£:ªp`Õ VŽÆ¹Š×ÐåçQ½­1TPç[£Q +äté¢h€ä0èé¢Z¤:a%7º­m‘XTw!š.ª“‹Œ0æCqtpL”ÿba001d1-9e5c-4c24-a0c1-3aa654b077c8944eea1-9c3e-9ffe-d04c1fef2ee4087955035305023549664382977-284762f13b8-4b68-9c60-462c2ca4dfc5e6-7186-4489-806d-ec42b305e92d3241882m10SVGFilter / : /XMLNode : (fxmlnode-nodenamvalu1t/ArrayeTurbulence;childrenbaseFrequency(0.052attribute; ,numOctaves(stitchTilnoStresult,feCompositope(ininSourceGraphiinw100%hyxxobjectidAI__id/Def ;4fractalNois€ ¨A"¢‘‘¤I¡Ýa„D‘141%æ@@)# +£8£Dˆ @E ™@&`µ²ŒD8ðŸ3’¨Fr=6жÀ‹¸¢p&4É`W$J¶²cÁÜ´iÓ›9@³Éviw~¬§ýŒ){IÊÂG6ütvø´G-Bø +™µoÙi1Zbá]݃h+WøËÕ‡A8-ö+à‡©×/'QGM +c¸ÈçÌUâý³)žnŒ6Cx¨¿D>œYi\¦E™CiÆýA6fš“`ðM†óg=Ê虆²(GµÊÁA«žºmeJ÷L°v‚‹ ·ò«G²Ôeòdd¹ÝýÖuû\w¡ ~M®*1¤G2¨¯å#mgáô>Ü‚Ž«SŒ9­y r–< ÀW©§`ØÿjùýàZÀê¶vñkB#lÜ6Áfp4¦Ø NùÓ«ã"Óqñ—_š~.ªAùrF†Q}åÿƹ¨.ðDWs¢uر œ'¯]•ÊtR¨.–±ª†ÿ=ÝVª¾L‡Ëˆ™Îwã¬^†:'k3ÑÌÒ‰š‹5jÑò¨ÎÁ/#Ÿ1‘åøEò àG÷Zuä“/p´E;-5õ6Í qêA$éü6JuçÚù Y—G•ìØëÒ‘r3í*œM +² )M£t F<4QôoÙÑìç<>¸³‡w ##®Œ[¢ˆR8ƒ‘®–/)¼ÄOõ~ú¢58½¿ô«›ÍˆPlŒæ;¬8ÇbÀ1ãá,¢¢A@º€âX hšÐ]€ƒÅ±(À±,@°è}° pb5 +Xx4‡ÆÒX Ì Œ&Q@|PèºÂ[‘<<@Kah¡ÁREÓ0 Š¦a +hüAU¤¦²ƒoŸ”bq𮪮/Že«H ŒÅ +Š,̲PÐP ‚bq @¦YDš Ec°40 +eq,@ a˜åÁ—GÃÂÂB‚ò)#<(ÌÌâ°XÀO})no”Ï÷TùTÁ÷”ö¶±XXň²0,ŽHÃX †áÁÀþD€ƒÇÒ(0M“8øY…µÖZk×Z+ñÚµ’ûËâX`˜Åƒ"£8Ècq(KcÁ °e=tÁ¢@£i40ÖRü@±8" ÈB¡Pc¡4Æ¢¡œÓŒ==]8 +Ž†iX ³4À,†Â yèV胂¤±0–¥0 …fGE¡0 ÅAnÃB¡84LÀÁÑ A£P4À,‘eÆsM  Y ‡ÒÍ˧Á6h°4 òh4É£Aµ0<H€£aš… óy,KYp4<ÜáaqUUƃ Æ¢YÂ`ÁáP˜†µ04Ð(Šå±° À²°ˆ, +‡¯£ ¡,Å€³jí9ãñž»-tûè,ú=œªïQjWÜšº‰ºýTV·{êöü´æ.ò÷v”j•ãÆC[–à­âN_DCƒõ_¹RÖÄxhõ•BO^T‹‡ñÔY ‘ÊÅSøÉØëšh Œ÷Ÿ±‹päóºùè o-‘jUϺ"Äh Ê_„x«\S%¸'Gý£%‹gO?kJ„÷{R9¥£-[Ûxpsñ’Â[„†ÈG”Œ–œþÿlLŽ‹Ð«lÁH1vZTyNÊFK¢ÅSx¼Œs0á/õ¤óã&æC®rï]­!Ô’CxGéù^Ò“%cÜ*ï+B$ì^«íÜÅ^K8û'/röâ7¦]MU›¯ 
+µ××ÎðNB{ZíBü*uUöZâZažEjÉ#^:Æ8Þäú­ôüÜÚL·û©wÞ®Rï5ÿü¨ÛG©§Œq«šo ÖSWS¯Õ½Þ§Þ®%שºÚ©Ø{IñÒZ┫ÏÅóÏ~”‹wø:©î΃‡pï¿Q6Þ×ó¤ù¶Îý%{éïwOåÛB»¶`‰£äœß»wç|Gíý¦ÎU·– ½gÝsº—RêXÂæ®1Ò-Ÿ:?w”%™®YjKø~Z•¿Ì™Ò¹<ÌLâ¿û‰C­q2Ÿj¾Vt´Oñ%Ìf”2ýmâ\¼v½ö±çã-ê”з¿IþRŸi/-ÎÝ„ÆH(QÂîæYL8óûŸÔåM´xšáè ÇFŠ2z{úV©¶Pžó{öµlٲŶŒ±å·ú\oïZýç¼ô‘¯÷Zh­úúr[¶¸Ù–-[¶U[¶l¥¤§’îI.)LÉAûé½~ë¶8½ÓÙí +%[$cÉgº¶e+ѹÓ܇ŒHÎ1JO–\B­6¡=É–­¼³ÉòHbKÜO[©=ÿ ÛsòýNR‰,IhWãmÁ‹|Kö:­­¥Ôõ&KŽ÷×èŽJœ§VíEF^~žªîO¡M½tÎÆ8©îW/­òÃjÑÚ¤²±U5Ó¡Õ;¯«—ü“ºïÜõ^”-sº¥PúãGÙÝ3¾Ry:ªüãOçŠ0ÔAGIê”b”œ8wWù:éš,åèSöC躈Ýy?fôkt¨±Qö´ü§Ô?ûO_ê<*]÷Yÿùš[óÓA|vÓ/!´äþŸ­T¤›øOÒÞãäÎǤåŸz¿gÂCªwzY$Y¶,‚ŽJ«Ë:ª$!zíGclJ9|–­M˜ÝI[·Sº{&ÇDk‰î=œPJoLh‹Ý?Œ0¨¤S e¹d4 !ã@( $ÉD3Ò Çù²n,0‡CâYD$ŒBa€@¢`@ +㈂Hb­60gœaêÄ kšW'mT²‚F9Y„Taá$è“@lÂîŸk±Ž³TO;ÕE +ŠvÐgŒ§y¹N¦Öb®ã$'®0XègÕ½eÓ`ųþè”Üàl[.žÚÛRï­Bóã% ~­†2ÕÕ`âÉ>‚tÌqüþÄ2—&WI0:Úvôì>/ØŸ0€ó\m‘’ý™ôe½Ýl½D»6,,˜ 7챌‡oQÉ+ômJ#|0žØo£úLEB3¢lÔËëH/xEm0V'ýÔ7Ûq£ dsÑtò§àäIƒy³zNÆZÅ<ôMžÔZ»ôÚ©a2ìœþXcÊXdºˆõœa=€ $|ñ,ŠþÊ–sM°Ü§A@u‹²,qÖÀÐ1gï`íRzÿvÀ½ìkÐtBVWÉ4 ¿^=YÀ£²ÊÄZ…6TÃ8Ë–ì¤Å“9*†ØNŽ`ç Zc°¼ùüQ`û‰PiL£C~nÔL?<¶Y•’.éÌÒá¥/eI ‡ ª•Ü¾ÏQû.PS Ã—íf¼²ÍG.!ÒÔã"ñtwíˆKét¶¼ B²šcÈ| hSwÍ]W;J… †.]»OéÑ¿ ô8ôÅ‹€±ÇÓ× •^árËE©r[|P_/"R)ÖY¯˜½%Ú~ªé{TÁÖâx5T·–Èfzí¦Fµ‘P÷‘0îûëö)ÛÒWe­Ýz¶èM×N"IÇ!åô!Ùƒ“~o  Šµ7³Ž_ýÎ-c›û¶‰-ÞFßñÙu +èsà8MOA§õÏ€s³8FÖ1nÇž²ü:˜_žKÔãuÿ’[Gu4QGúòU·'¥ÒM¬ŽMÑR# +Í5cÆ“ÒCòà%‘Z´8F-ÅY€ÞT› +ÒYÞ8 ¿§G”jb4´­¥ðš½âIY§œÞjÓ¥D˜’Ñhç•lãï¡ZULÈ7¿'|“E³ŒQ¤¦=1£a°…÷B&ô…<þ¶×?tÊÏ+ù‚Ä|*u;PcGË@ÃÍUùNö+Ÿ?=× ›îóBëˆã„› 4œ*j•¼ÂOŠäsœæ¤Ð ˆÂSÿd Ÿ¤B7Ó^7|À/,9EvBÅ·ý +¹Üë@J(A`"›Roo³i:9#é²?‰µQ ¹¤Nü^uS èJ8QÔ—øÛøãpÓl‡˜.Ôl!ëŸ={jd@ÑÛMS°_ÁDÏ™µ@­-_y72™©ÙLäÓThŸ+ãX‰œSÔ’!“c -G¢M6"–ª%‰xÔÝ¿ã*µYš4k>7 ,Çf·Zäæ p19pÝLT˃ŽSãëxzéÀ+@Äjâ&Ö_Ň{ äÓKù ­ÊOeÜzVt]°û<ä¢Kg(lþ-íÿ=éS2‰¨9› I¡H~Lb¼2ѲjA(;œŠà¬ø€&xlo›Q€#†Ì‡ÐÖF±ÇÕÑ ž.ž½¼¹áý`VÔ &b`5ÅXU áÜzÅ¥¬PRþÑ{ + R‹aÒcÛ&^@i—S%h¹Î"#ývwÎ1ÊW0A-IŸóÈSþ÷~G³]–¯ýàLÑàT†U  ÛùàMŠ8SO3»Ó!ÀÞ*è6½¤«<ß‚¯ßA=Tþ”:ö®ç¨/”NØ®îý“N® ÑÁ±EýƒF —€_×§Åwì÷¬c"rgb'À(>[1ªz[¨Ì){ävô”V YÌÔ¡u>BµVSò7ý«œ78I™·}è dTº»Æ"ºÒ5M€Ó¸~“z·7ÓEdP:é!^¿ñIAÇrI°¯Àùêl¹fbš ƒ†:Kj©ñ¼8éãK}WA"ð‰¨>=Iûuòš³“#I'ºFèñ~oàe-u·L‘>àd[Â(@OÊHþØ>îX‹ï Ñ,Rz8½LizAZ¤ƒC¡D‹¼"$jÐRÜk~Ö2Ò§6þ 9y©'E.вÎ4÷ù©¤†Œ6Ú’`Àñá)™PXŸx½“ŠHôæï;Sc¤tCRë#‹­·G‘a2Gb4ß¿å>)ccÇ™2%!k6J›ÑO畺6¹ÝMÞ–|¹%ÝšÿÁ½ƒz(^J¾Ž²HÜÊeëÕâÏÞÖ!#µW«o¾h!Q“ üxuôâc‚l»Ò5–tÃJÛSè‡Ô²)˜†®Á+r*»õƒ™©/?Ø]f6Qd’å[Ðúi®ÖF+³&´Ë°LŽ£­ŸÖ)ïgP#õTü“Ò*ªI›tƒ3ƒ.#] œŽÓú"[)‹óÀÊËÝÕ•K³·íÌeõó=Á7¨¼‹f„’ZØ:Tˆÿ°–¡Ú,;ÃCJ³ðS ñ";%Í|—tÑ%ú†Ãï1C ‘ñ›7 ÝGvŒR;h™Ú“ ›œ]i˜£úô,x#â2^ ì‡å”D¤í3ÑJ’•U¸b ¡è^ˆ÷Î"k„ßzDzÝïî…ôÉHáiÈ…cIè—1ȉ¿§­Íˆ®5Ã9ÚM`æ´³x±•íöŠ’Š4gfÙZmËZ_Ø›Œç˜¡kÞ²ÄDV +™f\EâÚ¯™®×(N4wGÐ_ƒÜ¼Š˜Ú°ò9*1NÍöWœÅVylëIP3då 7 RÕ&°¥§®tô?/9til6ÖáVgé|ÉÞl¬ ç•>!Þ‘“¡Ž :BŠth¯÷œsNÒ§aO”V¦ÝdmÈ¥D=ÕUÊ…–Aª‚9ØVlÉk@£Š‰&mN§Ì‚—V¼Ž–ÎÕˆâêÇ­ÄqPF±@AÒUæ®°ÒD^Ùû(Á¿ø+ôe¤«&ám9ך[q¬¿[=ŽÝ—Í2}ª¶Óe·'bŠ ö³GÝg¹¸#Èð䪵vÒÅözE,—Dº ê"ZÓxF×¾óßñÏFMb¹‘bt'ÍlÅ”%BER6>‡-F£cß•ã´Õ†™ÕÒÐ34—).4jZ·Eö›Î3†Fj*‚Tñ`[ŠmïËÊ\¾ëN!{2Çž³]äÑUjgˆ3²¬§™]²û¶LV¨,Y*”ŽZ  <àH²bË3öt҇“²«oÃ&µ¾ h¦Kß>]÷æô±oÕa$4-”Ø%=ÉÈõ`/°ãz‘šV!Â2&1 +Ë´§S¥WFÕ5{@`Õú…œÏwÿ<Ýnººp¤@ª”*™ÿâV4èÜz-®OåÛ?¥VÅž$•4Ö˜ïÐ Édv¶¤)Ÿˆ$WÙ92•ÎrзãÇRö.O°­»œÿÍBÿzr¥ŽöäÆú¡ú3TiýºóE £fâ@ÝlF € Ôæ¬ìe•d+I¢ˆrªÞPu|]õ¬Á€õº{F ªùE¶7S·ïÂɶoŽši N¦b+»OºãYˆ“’*îÑg ¢7–ÏPXǼk²À‘r9ÿ¾Pø2BËÔ–+I­ø>žRÉ‘K}ÐÂlrÌék'qŒâ€”Ò¶(¢¯±Ë@«¡ù²K“¯Ù; a¬`•ƒ­Ô›†‰x†!Ù–{¢;<™/®}Ojé5}n$òÀy4+^ÌâI¹mÑ(P_Ö|%ÈóðI.nòÝ€W)éÏS„I|G÷íÄ$ÁÖ\ŠXíÁ} Ý_T¤È¬êÆ=Lå^”MuÞ¢˜mîÓ±¬ù’§ž,^ ;7–ÁMò ›Ã:2Ê~.§}¡øÊgHÔ“¡¹Jñ¡äW¢{«VmÒM.\³yÝéà¿jòýŒ×g¸…¬$ðžr^Àô|?>zI,— p¸?ŽÂ?•f»òIõà°pHF-µ[*νfƆv¶Wy+|éÂH6Á¤HQ½,ùñˆQ ðÈ6\¢{£èúA•\+®h¾Ýc„ëb¾²»îútV>Ø‹™²/äãvŘžmXÈõŠR2Š‘¤-†àèÂç~°núët5„î*IícšBrx:Hý›,T\Øò½±¥l>}Caëã˜:å[.Q)p EM\Igø2*?”#`8‰Ø 3Tª -Üä¢ÿâ$ ØFmðDZÔ6eêJƒu-OŸÈKÖ$׬f³õõ¥JÊË<”Æ;¦ÚqN±­'!¤þIëð¹*)a²¥Ò¨ù€&F€äÙQ˜yUòqT<çòO¥¡n 4I—$ 0jO7/}$uÔ +ˆ¾*×£6€BÔ{º$—Mâ4Yd0|‚ØZ¼?€'ÙʸŽ¨>Ò"*|ßB“"sùïh ¾¸1×€B{üÔ¥.y…YF®d±C­€«…¼)¤Ód§çi uHmŒ´C’ 
þÚÍW²Ú„xmÆ°)j^õØ4™<:¥N}»([XòÑãép„‚#pÞÑ(Äj49]g^új0Ä) „´Qi¶ÖðÞ$Þqšÿ`ÍÀ×*qêР‡GÓHñË C¼Ê'%vŸ¬ ¶Éí$j–øº©µC³ðf)…4à°HZ»†S(›„¢Öü%ÓqY³Ba®œL˜Ù[­’ƃ€(K:öÉjÅýoå±?æBÎì*‰?ÙQ ´]!.. ®.jg©€‡æê½í4 +A6þ®œíýEéTŽqi6&â½ò¸)Ç&Ÿn¸Ayo'u7LRV|À€€$•(½…ñY”ÑT?z¨IœÂª­¡{¬L×Pð8e¨–Q·þ[4ãG‰1Mbd˜$¼Õ”¤s‚¹&Ù*œõ3% r6TÀÓÑMÆ»×AXk +¨'ëxœÌŽÊ‹åq{)›¨0TÐÚTç[`9Ê +¡ú³™$'˜ ¸ ž¬xÉŽ›{ ДÝpËÏJ&¸úÐíìB€fMˆH±3¨½…n‚?Ad%ôz·qæ3.|{”v¹Âèo¿ø§Ú’#ˤU‰3.&ƒ’'MDx+H¥ÿËÕf/䈣b„{Q<ÿZïËò0ÑJf_ñs†î^¿^¦„« oÍ•·¹ekJ8`¹×ʉ‰¸0Iÿ=~ÔßuòSÚç2þm9©ó!µj¤5!S678Ѭ¡½©Ié–Ú·/´,NºNcøš( ³ÃŽçCJ ¬ÇÚ+ëG¤a:ïpéÄ +íæsŠ¤7ÔôÖÈVÊ“ÈÀ˜åH:¾†0.šhŒ¢‚ÖåÐxU–¢8DH†Û$ +ÓælÑ‹«’ZW…™¦-ÂHC1Nu&Ÿ”)TÉš“žM R[4E…˜ ÈP Ê.„Ì©ÔϦsRzQÍVŠ‡1–œ­]±ZÞ§µ¨OtˆDvsÉšBÑÝâ ÉÖ2X«îÜNQJ•ÔPÅ-\ÐŽrB-`ò šð½m(ªñ¢¬ñÜüöàŠD¬8w§b¦XæBG'£P,† ZJÂçˆÁ„2ãmíºû9*ïÔŒ¸ýÁ¬”/ ÿ‚ÂÓ2Ä»¹šdù"ù“_¿N¢ÉmQ¤/9ÍÍ^aËcÔn˜«\¿û™„$-©ãÇ®ÑèÖÓ6Q<×ÞáäPèeAŒïBÊ4±HÖ‹HÖÀ~H(’[B»Ú¹˜h®ÌãÞ ³ôC¹ýÜoNµmÌ5„Ýψ˜$UpÑž×ôg¡¢} â¾DÖöó²žyÎQüâ]4‚¥%8­Ù¦Ÿ.q*§¡Ôiµ(¤íÍ”C¡F&®Ê c á#n&užW›9°o”ÿ9þ!ä n½kõÓi½BC«LŽ’Ð} ФŒ)\¾Ž¬v‡f dÐ.*ÊÒ)Ö’äÒ¢èGмâ x‡ ¯­3þ!Da€—ŠyI&E +Òn$³JKQù§JÇñÚA€T— £‘ ŽÈ‹† iI9þ/µ™aô:m`ÛiXÛ•Ý: Ð8Û4š™ÅΛ‚»@*}ù>™ñÈ&Q®&ë? +©lÆ4Ö¼RšPDÃ$¢d’ÓÁÏ£dÕqX‚¯6ƒ Ø3,KÑátQˆ2}%óC{bùk°nTY6'Ÿ¹âúYÈDpZ´ä™AŽØêBÇ%r—<Ìrzw‹mI ¿ä ~'QDVŒ²/” вa!²,b^$½Tà l½_pÝmïñEáÐͨÕOÌÁNÏpQáƒ󄱄¶4w²/ð?™†¥qeÑÏH¨Çs’*—¯›úÄ"íŽÀ¬P„q 3ÔŠú¨A‹P~ˆ¾JÉ‚¬q8ZXtüËÄ~ è^“ÈO]9¬žÅŠ­%Ða­¬Cùqüà1å(t‡x˜ø,SU;†(gÇO¿„eãÅšÒÈg¡ž\ð2Ô¯Â(èý +b§ÿ1–Öʼ#“fÀ£ì³ ˆa!#rñ¤f}"³1¤Û„ ÖJÜ/òhñg +ÚFÜu¤ä8§A*,¿QKá*u \ÿhXÖE¡Jh-õ¸Z#™A:î"JéߤœÅø hå#…±Bð&Äv*FéDôÌŽ,\ÁÒ#µè™XE&ƒÏÈ‘£¶GÙÀ%Ø‚u(+ +@v\÷rrÐ;4^  O>Kvë?ˆ"VñMA†’¬|}%Á¤8á«WÛC©*›5ÞÊ÷hm”{U¡™x$@ƒ2` ‰8Upc ^¢AVí'5]œ°G„Ï1¹•"úËd4N>Àé€ËThUÁÖ×J‘ ŸÕÇ‹§$GÐù´£éÛ­ú°>nØf/1^àGRøj晇´ê5ô¦”) X„nTl3ÍQU +!KÏBn.Né´À @@B`'Z¬5Э?ZÂ?š¸KÿËZlg×Ç÷QÄ׆QJÛœUƒ™‘áÖ¢•±¢ßš‰&Œ}ÎÁ‹æpÈ1-‡tôCvdA0@¤xÒS&|«¼¦–ü$Y,Q\íºÖ;ÒtW\iÖ¤YÝ;˜µ)Ib°¾-T g{3ŠëO!øâ ˜K—Ô|_úØøI¹£Ã¼º…K·Ìˆ%ö†““ÀX6îh./ïdÄXà¤ôþ8!ǹÏR0ÀÖQHÌšQ§Ž]J«‡wób™“ÓV>ºd)x¹Y† Ø›;ßo»Ž:ýÇÇC£bé祃8ëJŠùŒõè+‰9×Áß'G_StÔY7 =å 2«‡ò¢/S°®>Ê7Þ‡ãÆ€1-ß3Ó5©[m-L.t +ÕÀ|Mª"Êîv§@ ^˜7ðÇíA(~Õ¿™#••Ú‰‘Z i^iS¤Úû±MH•W¢ÇZsÞL‚VnMOì£Ñ”Ñbõ垀(‰¨ÊG›ìnÿðƒÜN‹ºç˃éŠÅµø2ˆrZízGçÀ¯ïâ–q¨¯$K÷, qTÐs3€¿Áíæ¹ï/è-›!G#§>%噟Å éq~U_$Èœˆk½k]zŸ‡ jç8x}#­$Õ>gžx«<¢l…t‚Aº•)‘Ñâ‹­£^QÀE/%qrà+ݽèv]‘BªøcŽ^îF—u”À6«nÒü´iˆN" +â}™âÓ$ ¼øš=7ÑÆü€0s=ï +oÔ è +èÛ¾çAD/Õs˜ºä…ZÒÄ%@}/‹P&œ¾œLYŸXÉØH ªuÑí±½w>B5L@$;C<³x¤â˜~TÈÄjÆÁfczxÜÅ$_Ô¸d7#—ìf™ƾ]cƒü/›lPt/ø7ÛÆ!%¼”1“Rüä½ÁRÜ4 >>D E'b½×Ì-奰âìÚ H;^]“r‡”eá±&ïªá(° J`)M\&¶4§8{ëE༴<ØO"xL›i¡ß<þË+$|Û?ÿ¨Uר«ÞÇ4ž¢y^©÷l&xÒ…6uUÆ­hâ#”¹YÏùmê˜úÄ¿YˆúËÃg‹‰À§Þé+yé† c¢¯ªR&°ü‹‚b Vq+ؘE쟟sF…t°!Q{žW0÷±Fس íÝÎŽÂc 4Ž'¼¸™<"zs!(+Ö hNZNuãqŸiDL÷ƒò;£ß× Y—8³k—^2C›]vÊû¼dö^°‚âH7ÿ³ËÇ=%¤óZE½.“âæ—TåÞšR–$*5[ÑÑ uVØ)‚[§SÖ42å‹Ò²™SRÏÒ†lf‰¬8]qQ$·? 
;#ö}3Ùä_!—¸…^h„ØŽš#§*Vସ>+Î&Œ!G¨CåI%í<[V;bãê’#`ëË^"Þï¥üzgܼûrT!à3ïü#";ÅÒ‡Ž¿æðâ÷UÚTfOálèfÖp!Úœ#è~2[[V§2ãj,v# ‡gŒ½œ¾ÐÝê‘ç=f½%تÑ4€s5ˆ³­”á§K¿]ô@¤Þ*Û»×½ólâ>ˆµù¼dCCôæb‘¼ƒÎpÎm$>ž%®´öÿ?Ó;iÞ„{öû³þŽFþqîѧ:9Õ bÂ…îÈDZ¦;|L¸Xw©Cú¼¦­¾„ããÎ!¤kÕõžžº¤y³ÅNñwrb¾)ùØ/z©“ÏU¦ŒrüøCUVÝùú uÜõeéó]Ëðë ô€Ê`44Qó öîi +Ư/¶ç—D˜Éá»ÝìŸqyÄUO8õÚ]Ümß-¿û™Æ ¹ñ:O®¢½®ûKžÇÎÆ* +ñæÂ"[×ݵc¹öSŸi•ñ» +€‰Ñ—¸Rs’¾Àr@e àâþ{÷« 6¿û’/²Ác2vóñ쿽Jû¤úÿ°å¡¢ù´S?ñ¿2Zò‹,_³–RåHÈíKz²Â™ì«ý׸zߦ¥c+«Ì ¾¬K’ n ŠÉ1Æt‡–bËq:w¸f½ ž±­qß¼=pâ\žæ[À—÷Ð’I,pl¯Û[±g~Ë)ßá$÷Ã{±üÚÿßMÞÜQ¿é[+|ïƒRÏÜ’5]‹WÀMšÜ¥'¯‡QËæéïýïx¿höUòþ`ôgUó6\þe^¹é%9=‰¹ÚªÜ%M_z bTƒ–˜sxH­Uä]ÑÛo2˹0ç¯×ãàáRâÛ¿ŽÐ°Ç ®WKŽó“°iµϦaHÎMÙK%ݧÁ9Ž‰Ùź£ßu+À¯v{ÜmàbL-©“¸ÌGS<%DÁô­â±DRN БÿOîûå$´ý¨¸7ÁÁëkK–ü.Xú-®è¨éåd²x¦mï7÷b¯àå¢Ý+L¦<4VrcýöàòüÌ\YvÑ€¹UzÐÛ4ç†ø\â VÕ†§ö¾ûrkÛ[Ÿô‚—Ál)L¸8Ì[Z„p†À³ß?uÀö£¹GžO¹a+8Þƒ-üÃoðOjuÌ–p½FíZå:ü’E»öb›yæò Ç Øêú‡%Á{K~I]q8]óÛÒA Ž#Ôpkȯ’üf<ÃVÊÜñWÿ/ÞÛœ^µï†ò‘ù%º€»ê8_&ëTwÉ¢-jLµÔÅr^…úÃúJE´'ðî1yö:Cwvp¹-û%L犑F鬩äÚ¸Û9û bÂõttsõDé\Ö¯ÎüÁ;ƒ~o28Oy³ÈóK—®G6«Ó¥šTÛ·â‡iÏïß÷¥50gñ7¸).ŒÊ=¨¿^íKà‡Ã÷Xl«ÑFgõªˆ›YÐtÅf—µÅz˜U¢,Ľ ?Iõ¢À ÙÃœüᦵKyDÿ0âbR+0ï@Môî:ΣÖ]«:þŠ1p‡‡Dt¹@›I1‘Dnùj*J0jeq#q3‘®Œ€!¨,@Ý]RW´Ì=UmRlÝ`×âÄÔˆ-FÎdæ¹1!NËêŠàÀêéÛºÕ\'bê¨ +mHÑqÎTÍ/I yË:5ôGsÿ/½è%Yñ¥©Ï 5MÖ´QŠWZ妳£¸Î‘¶ƒ ßý€Ñ ¢¦¦L<ÁZŠµp%€ùùGl«àìWÐÑNˆ6m‡c"0'‚'"»çÕË<:”©ÛWÓõwÑO¯®XŸËÖ‰eõÄ4&œŠ™‘½ä+·a²„ÁSµJ¼ä‹+`†9ôçy3X);â%c¶ÃI•ÓŒÚI‘aíþ__èÐÑ^Ïä©÷Í3ÁÂ@\®ÈÏ@é6cµî鮹@ë9(t•²ý‚›„Óö¯zƒ|.Öt^6#XÐÚsni¯+£8•0Q—õÁ~ˆÇt±g¶H§pºÕK`PÖglÚ™L»zŒaRP²H32—œ;¦#®LÜB‡S#³Š[Îen‘>˜'½Š<Œ3i#}‰Š:tKJF)æ"­ém—>Ã[Ògq?»ÕBNÌ(ˆ·òµM@þ%çþDñ3YY˜³¼pXÆ1" V‰‘_J÷k¢}1œR{üÕyßÑò&õªûfÙ¦!Êòu•x4â0™ÜØj\¨ D~,™ž` \ +GË^@K,Ÿºà€MpB† ˱,+ÔÜTÇ VR)ݺÃK¼šs¯=¤¬ xÙÖJÿY?<ú«K.rÛ\F0|ÆU±ðýÖ:gÙ9Æ=“„ôi»ÒŸ¨Ä¯°Ìw£ ö…°?zï ª{sL‘¨¯Á#Ô ©×¨xÓY.ÑÔ™;Ú·²ÎRßïÙëʳKï­IŒðêÍ•t :n€Ûãr9~ê3¢ñܶ”™±xámÀý˜x"gž'M¸§)‰HÎ3EYÊl-™Cue´]´ÿXxxy±—O@„2tŽøï™ÊZëÔ{€JBlÈ0 +¯ºáô˜ì° Ó{Œú— 5 ¾Ú°µ«Ì^¤ùœ“DHfzÎyÂâ©Ðš¹$ñø»³ì4žF¥+†oྪ«Ñ”Ȇ­è鄳[Üôt10Éóipo¹.W©AE wÕûK–Ua' Ÿ%HÓ 2~zÇᥪï¬CïçÂQ™0Ÿ¤jò§&™ }-‹Œ@id£\¥ÂwÊÑaì;;ê–;̵TK’° D°ïä¥\-_Þ³>Nhï>ßÛ÷jÚìnùÊ’©´|'ä$%suZ5A_ ;çþËkÍ©!·„‚¿Ëǵ·· qu3’HüájbxCñ² ?ø3ô¢l£³¤nUAÝIæ)£õý7Uh©g;=·…6ÑuœiG.||s4|¤XQ;h¡WTOQ z}¥ÄñÅš$\a€˜›e:m«†Õ3˜­ÒFUc ‹Á±IÃÐ+ËÆ ¶ÊqA=ždAmT†48âK%ÊxEÓ7_^V”‰¥;µHöœ»ˆ›ÂYœí¢LÌ(“Ÿýn¯há ï¦ý–Zúha¶ PàªôGe¡ÖÏIçŠ"¤f]6ñ.c=`Ð1ŠÌ1È{Òc}ûQ„Ú¾a2p³kÉ·‘  +^3j‚l u3úÙ +¾±…/§É,¸Š¾¶ß¨Jh£\í}IË8é°©#Ëêy5GÔ÷Ò}wOFÑT–saÔPšüýóʘEŸ· uŒ)ÀæŒç8rÈ·´Mˆ†!°"Ó2¥êü}vöt„“Ì\›Þ? +3Bæ /óEjl£l1,kЛµX8Wk˜ÆŒXVÔ ½ÜH%>œ¢Òƒ'—„È×,)¯¡ñõ/Ñ}’'#´Ó@±ˆt‡ ‘WÕcÀpw-°•$¼ +®¡m’'2KÂnY©…ÑE›¤û:vg–ÔL2ìì±JTº&£Ü"œ0ä(ïòîÃó«5FØÔÁÃeàü=bþkØÞhýzi Ÿça¼šThXoxMèQë7¬c#]U:ƒ´XĆw2`SO_^cîÅéÏ„)·}ñB¬H­Ä º‡ˆi_õ Êsù : $´­lŠ¹ù +š²‘¥;6}åÚºc^¶nÐÙ•e”S- +b¯¹ÐÙ%!’pÆRø„‚ÿà ûµo§p–%ïQ–!RÉëËàPéÀH%ý§ø_.ž×õŠï#}—ßI=zô½5f? 
äý}„ÔÖB‘¾ÒÈ”S-K=hÒ\Fnôkœ";¤ÇÙ7 òP»0{9JÕKÛ—FBÐU-,Ì´ÃúÊã*Š5’¢dw_1Ãå¸sþ}»"<®êµšZŠUðzP%¢ïÈ»—sÆ4Ç÷mž÷óÎÊÜ46Ïq§µA+Á¹ÑNÙ'CE‚Û?ïÂå&V"*×4G0ö£6ø+*ºùÉ°m€ÕŽû­™ùjGB/oÄ @¹³6üœ8ú/e¨wÞϨ$Wzsà×ÆÛmÑ£Ô‹>æÊžwÕæúÕöÜ5Os±µ ¬ÑÍO†lÁªçý[²éSh0­¨ÙúLELŽ Ñd DücµÃåíü~à&ptZ è´oÆ×Û¬ŠÕƒ ƒÀø’4/±‹0 c΄Ó ×t–戮(¼˜B‰Z9†Žt¶Ô'bzíb¬n>…/`QU”¬Ïw}‹#ÝGhBfIÇí’B™f}Ý(³* ÈwÂŶp¯|ª¬œ3þóHÄ¿UAËå°úŠŒ¡˜DÒSl]öÕÏf*]PvÜ¡Ö#´BÄg!hnèÍbtˆzᯟ›Rª¡÷ùjs°°»bVÖT +.š¥¨X ?1Y; +S[™7ð?…C÷ UøÑò`«iNAÔý]1¿>ió:"Dì…ÿb²´¸ói\j5Ü?š™‰b­þ§¤ endstream endobj 7 0 obj <> endobj 8 0 obj <> endobj 9 0 obj <> endobj 10 0 obj <> endobj 45 0 obj <> endobj 46 0 obj <> endobj 47 0 obj <> endobj 48 0 obj <> endobj 82 0 obj <> endobj 83 0 obj <> endobj 84 0 obj <> endobj 85 0 obj <> endobj 119 0 obj <> endobj 120 0 obj <> endobj 121 0 obj <> endobj 122 0 obj <> endobj 156 0 obj <> endobj 157 0 obj <> endobj 158 0 obj <> endobj 159 0 obj <> endobj 193 0 obj <> endobj 194 0 obj <> endobj 195 0 obj <> endobj 196 0 obj <> endobj 230 0 obj <> endobj 231 0 obj <> endobj 232 0 obj <> endobj 233 0 obj <> endobj 267 0 obj <> endobj 268 0 obj <> endobj 269 0 obj <> endobj 270 0 obj <> endobj 304 0 obj <> endobj 305 0 obj <> endobj 306 0 obj <> endobj 307 0 obj <> endobj 341 0 obj <> endobj 342 0 obj <> endobj 343 0 obj <> endobj 344 0 obj <> endobj 377 0 obj <> endobj 378 0 obj <> endobj 379 0 obj <> endobj 380 0 obj <> endobj 413 0 obj <> endobj 414 0 obj <> endobj 415 0 obj <> endobj 416 0 obj <> endobj 449 0 obj <> endobj 450 0 obj <> endobj 451 0 obj <> endobj 452 0 obj <> endobj 485 0 obj <> endobj 486 0 obj <> endobj 487 0 obj <> endobj 488 0 obj <> endobj 521 0 obj <> endobj 522 0 obj <> endobj 523 0 obj <> endobj 524 0 obj <> endobj 557 0 obj <> endobj 558 0 obj <> endobj 559 0 obj <> endobj 560 0 obj <> endobj 593 0 obj <> endobj 594 0 obj <> endobj 595 0 obj <> endobj 596 0 obj <> endobj 629 0 obj <> endobj 630 0 obj <> endobj 631 0 obj <> endobj 632 0 obj <> endobj 665 0 obj <> endobj 666 0 obj <> endobj 667 0 obj <> endobj 668 0 obj <> endobj 701 0 obj <> endobj 702 0 obj <> endobj 703 0 obj <> endobj 704 0 obj <> endobj 737 0 obj <> endobj 738 0 obj <> endobj 739 0 obj <> endobj 740 0 obj <> endobj 773 0 obj <> endobj 774 0 obj <> endobj 775 0 obj <> endobj 776 0 obj <> endobj 809 0 obj <> endobj 810 0 obj <> endobj 811 0 obj <> endobj 812 0 obj <> endobj 845 0 obj <> endobj 846 0 obj <> endobj 847 0 obj <> endobj 848 0 obj <> endobj 881 0 obj <> endobj 882 0 obj <> endobj 883 0 obj <> endobj 884 0 obj <> endobj 919 0 obj <> endobj 920 0 obj <> endobj 921 0 obj <> endobj 922 0 obj <> endobj 961 0 obj <> endobj 962 0 obj <> endobj 963 0 obj <> endobj 964 0 obj <> endobj 1003 0 obj <> endobj 1004 0 obj <> endobj 1005 0 obj <> endobj 1006 0 obj <> endobj 1022 0 obj [/View/Design] endobj 1023 0 obj <>>> endobj 1020 0 obj [/View/Design] endobj 1021 0 obj <>>> endobj 1018 0 obj [/View/Design] endobj 1019 0 obj <>>> endobj 1016 0 obj [/View/Design] endobj 1017 0 obj <>>> endobj 980 0 obj [/View/Design] endobj 981 0 obj <>>> endobj 978 0 obj [/View/Design] endobj 979 0 obj <>>> endobj 976 0 obj [/View/Design] endobj 977 0 obj <>>> endobj 974 0 obj [/View/Design] endobj 975 0 obj <>>> endobj 938 0 obj [/View/Design] endobj 939 0 obj <>>> endobj 936 0 obj [/View/Design] endobj 937 0 obj <>>> endobj 934 0 obj [/View/Design] endobj 935 0 obj <>>> endobj 932 0 obj [/View/Design] endobj 933 0 obj <>>> endobj 900 0 obj [/View/Design] endobj 901 0 obj <>>> endobj 898 0 obj [/View/Design] endobj 899 0 obj <>>> endobj 896 0 obj [/View/Design] endobj 897 0 obj <>>> endobj 894 0 obj [/View/Design] 
endobj 895 0 obj <>>> endobj 864 0 obj [/View/Design] endobj 865 0 obj <>>> endobj 862 0 obj [/View/Design] endobj 863 0 obj <>>> endobj 860 0 obj [/View/Design] endobj 861 0 obj <>>> endobj 858 0 obj [/View/Design] endobj 859 0 obj <>>> endobj 828 0 obj [/View/Design] endobj 829 0 obj <>>> endobj 826 0 obj [/View/Design] endobj 827 0 obj <>>> endobj 824 0 obj [/View/Design] endobj 825 0 obj <>>> endobj 822 0 obj [/View/Design] endobj 823 0 obj <>>> endobj 792 0 obj [/View/Design] endobj 793 0 obj <>>> endobj 790 0 obj [/View/Design] endobj 791 0 obj <>>> endobj 788 0 obj [/View/Design] endobj 789 0 obj <>>> endobj 786 0 obj [/View/Design] endobj 787 0 obj <>>> endobj 756 0 obj [/View/Design] endobj 757 0 obj <>>> endobj 754 0 obj [/View/Design] endobj 755 0 obj <>>> endobj 752 0 obj [/View/Design] endobj 753 0 obj <>>> endobj 750 0 obj [/View/Design] endobj 751 0 obj <>>> endobj 720 0 obj [/View/Design] endobj 721 0 obj <>>> endobj 718 0 obj [/View/Design] endobj 719 0 obj <>>> endobj 716 0 obj [/View/Design] endobj 717 0 obj <>>> endobj 714 0 obj [/View/Design] endobj 715 0 obj <>>> endobj 684 0 obj [/View/Design] endobj 685 0 obj <>>> endobj 682 0 obj [/View/Design] endobj 683 0 obj <>>> endobj 680 0 obj [/View/Design] endobj 681 0 obj <>>> endobj 678 0 obj [/View/Design] endobj 679 0 obj <>>> endobj 648 0 obj [/View/Design] endobj 649 0 obj <>>> endobj 646 0 obj [/View/Design] endobj 647 0 obj <>>> endobj 644 0 obj [/View/Design] endobj 645 0 obj <>>> endobj 642 0 obj [/View/Design] endobj 643 0 obj <>>> endobj 612 0 obj [/View/Design] endobj 613 0 obj <>>> endobj 610 0 obj [/View/Design] endobj 611 0 obj <>>> endobj 608 0 obj [/View/Design] endobj 609 0 obj <>>> endobj 606 0 obj [/View/Design] endobj 607 0 obj <>>> endobj 576 0 obj [/View/Design] endobj 577 0 obj <>>> endobj 574 0 obj [/View/Design] endobj 575 0 obj <>>> endobj 572 0 obj [/View/Design] endobj 573 0 obj <>>> endobj 570 0 obj [/View/Design] endobj 571 0 obj <>>> endobj 540 0 obj [/View/Design] endobj 541 0 obj <>>> endobj 538 0 obj [/View/Design] endobj 539 0 obj <>>> endobj 536 0 obj [/View/Design] endobj 537 0 obj <>>> endobj 534 0 obj [/View/Design] endobj 535 0 obj <>>> endobj 504 0 obj [/View/Design] endobj 505 0 obj <>>> endobj 502 0 obj [/View/Design] endobj 503 0 obj <>>> endobj 500 0 obj [/View/Design] endobj 501 0 obj <>>> endobj 498 0 obj [/View/Design] endobj 499 0 obj <>>> endobj 468 0 obj [/View/Design] endobj 469 0 obj <>>> endobj 466 0 obj [/View/Design] endobj 467 0 obj <>>> endobj 464 0 obj [/View/Design] endobj 465 0 obj <>>> endobj 462 0 obj [/View/Design] endobj 463 0 obj <>>> endobj 432 0 obj [/View/Design] endobj 433 0 obj <>>> endobj 430 0 obj [/View/Design] endobj 431 0 obj <>>> endobj 428 0 obj [/View/Design] endobj 429 0 obj <>>> endobj 426 0 obj [/View/Design] endobj 427 0 obj <>>> endobj 396 0 obj [/View/Design] endobj 397 0 obj <>>> endobj 394 0 obj [/View/Design] endobj 395 0 obj <>>> endobj 392 0 obj [/View/Design] endobj 393 0 obj <>>> endobj 390 0 obj [/View/Design] endobj 391 0 obj <>>> endobj 360 0 obj [/View/Design] endobj 361 0 obj <>>> endobj 358 0 obj [/View/Design] endobj 359 0 obj <>>> endobj 356 0 obj [/View/Design] endobj 357 0 obj <>>> endobj 354 0 obj [/View/Design] endobj 355 0 obj <>>> endobj 324 0 obj [/View/Design] endobj 325 0 obj <>>> endobj 322 0 obj [/View/Design] endobj 323 0 obj <>>> endobj 320 0 obj [/View/Design] endobj 321 0 obj <>>> endobj 318 0 obj [/View/Design] endobj 319 0 obj <>>> endobj 287 0 obj [/View/Design] endobj 288 0 obj <>>> endobj 285 0 obj 
[/View/Design] endobj 286 0 obj <>>> endobj 283 0 obj [/View/Design] endobj 284 0 obj <>>> endobj 281 0 obj [/View/Design] endobj 282 0 obj <>>> endobj 250 0 obj [/View/Design] endobj 251 0 obj <>>> endobj 248 0 obj [/View/Design] endobj 249 0 obj <>>> endobj 246 0 obj [/View/Design] endobj 247 0 obj <>>> endobj 244 0 obj [/View/Design] endobj 245 0 obj <>>> endobj 213 0 obj [/View/Design] endobj 214 0 obj <>>> endobj 211 0 obj [/View/Design] endobj 212 0 obj <>>> endobj 209 0 obj [/View/Design] endobj 210 0 obj <>>> endobj 207 0 obj [/View/Design] endobj 208 0 obj <>>> endobj 176 0 obj [/View/Design] endobj 177 0 obj <>>> endobj 174 0 obj [/View/Design] endobj 175 0 obj <>>> endobj 172 0 obj [/View/Design] endobj 173 0 obj <>>> endobj 170 0 obj [/View/Design] endobj 171 0 obj <>>> endobj 139 0 obj [/View/Design] endobj 140 0 obj <>>> endobj 137 0 obj [/View/Design] endobj 138 0 obj <>>> endobj 135 0 obj [/View/Design] endobj 136 0 obj <>>> endobj 133 0 obj [/View/Design] endobj 134 0 obj <>>> endobj 102 0 obj [/View/Design] endobj 103 0 obj <>>> endobj 100 0 obj [/View/Design] endobj 101 0 obj <>>> endobj 98 0 obj [/View/Design] endobj 99 0 obj <>>> endobj 96 0 obj [/View/Design] endobj 97 0 obj <>>> endobj 65 0 obj [/View/Design] endobj 66 0 obj <>>> endobj 63 0 obj [/View/Design] endobj 64 0 obj <>>> endobj 61 0 obj [/View/Design] endobj 62 0 obj <>>> endobj 59 0 obj [/View/Design] endobj 60 0 obj <>>> endobj 28 0 obj [/View/Design] endobj 29 0 obj <>>> endobj 26 0 obj [/View/Design] endobj 27 0 obj <>>> endobj 24 0 obj [/View/Design] endobj 25 0 obj <>>> endobj 22 0 obj [/View/Design] endobj 23 0 obj <>>> endobj 1049 0 obj [1048 0 R 1047 0 R 1046 0 R 1045 0 R] endobj 1082 0 obj <> endobj xref +0 1083 +0000000004 65535 f +0000000016 00000 n +0000001977 00000 n +0000064028 00000 n +0000000005 00000 f +0000000006 00000 f +0000000011 00000 f +0000430892 00000 n +0000430962 00000 n +0000431042 00000 n +0000431112 00000 n +0000000013 00000 f +0000064080 00000 n +0000000014 00000 f +0000000015 00000 f +0000000016 00000 f +0000000017 00000 f +0000000018 00000 f +0000000019 00000 f +0000000020 00000 f +0000000021 00000 f +0000000030 00000 f +0000452525 00000 n +0000452556 00000 n +0000452409 00000 n +0000452440 00000 n +0000452293 00000 n +0000452324 00000 n +0000452177 00000 n +0000452208 00000 n +0000000031 00000 f +0000000032 00000 f +0000000033 00000 f +0000000034 00000 f +0000000035 00000 f +0000000036 00000 f +0000000037 00000 f +0000000038 00000 f +0000000039 00000 f +0000000040 00000 f +0000000041 00000 f +0000000042 00000 f +0000000043 00000 f +0000000044 00000 f +0000000049 00000 f +0000431183 00000 n +0000431254 00000 n +0000431335 00000 n +0000431406 00000 n +0000000050 00000 f +0000000051 00000 f +0000000052 00000 f +0000000053 00000 f +0000000054 00000 f +0000000055 00000 f +0000000056 00000 f +0000000057 00000 f +0000000058 00000 f +0000000067 00000 f +0000452061 00000 n +0000452092 00000 n +0000451945 00000 n +0000451976 00000 n +0000451829 00000 n +0000451860 00000 n +0000451713 00000 n +0000451744 00000 n +0000000068 00000 f +0000000069 00000 f +0000000070 00000 f +0000000071 00000 f +0000000072 00000 f +0000000073 00000 f +0000000074 00000 f +0000000075 00000 f +0000000076 00000 f +0000000077 00000 f +0000000078 00000 f +0000000079 00000 f +0000000080 00000 f +0000000081 00000 f +0000000086 00000 f +0000431477 00000 n +0000431548 00000 n +0000431629 00000 n +0000431702 00000 n +0000000087 00000 f +0000000088 00000 f +0000000089 00000 f +0000000090 00000 f +0000000091 00000 f 
+0000000092 00000 f +0000000093 00000 f +0000000094 00000 f +0000000095 00000 f +0000000104 00000 f +0000451597 00000 n +0000451628 00000 n +0000451481 00000 n +0000451512 00000 n +0000451363 00000 n +0000451395 00000 n +0000451245 00000 n +0000451277 00000 n +0000000105 00000 f +0000000106 00000 f +0000000107 00000 f +0000000108 00000 f +0000000109 00000 f +0000000110 00000 f +0000000111 00000 f +0000000112 00000 f +0000000113 00000 f +0000000114 00000 f +0000000115 00000 f +0000000116 00000 f +0000000117 00000 f +0000000118 00000 f +0000000123 00000 f +0000431775 00000 n +0000431849 00000 n +0000431933 00000 n +0000432007 00000 n +0000000124 00000 f +0000000125 00000 f +0000000126 00000 f +0000000127 00000 f +0000000128 00000 f +0000000129 00000 f +0000000130 00000 f +0000000131 00000 f +0000000132 00000 f +0000000141 00000 f +0000451127 00000 n +0000451159 00000 n +0000451009 00000 n +0000451041 00000 n +0000450891 00000 n +0000450923 00000 n +0000450773 00000 n +0000450805 00000 n +0000000142 00000 f +0000000143 00000 f +0000000144 00000 f +0000000145 00000 f +0000000146 00000 f +0000000147 00000 f +0000000148 00000 f +0000000149 00000 f +0000000150 00000 f +0000000151 00000 f +0000000152 00000 f +0000000153 00000 f +0000000154 00000 f +0000000155 00000 f +0000000160 00000 f +0000432081 00000 n +0000432155 00000 n +0000432239 00000 n +0000432313 00000 n +0000000161 00000 f +0000000162 00000 f +0000000163 00000 f +0000000164 00000 f +0000000165 00000 f +0000000166 00000 f +0000000167 00000 f +0000000168 00000 f +0000000169 00000 f +0000000178 00000 f +0000450655 00000 n +0000450687 00000 n +0000450537 00000 n +0000450569 00000 n +0000450419 00000 n +0000450451 00000 n +0000450301 00000 n +0000450333 00000 n +0000000179 00000 f +0000000180 00000 f +0000000181 00000 f +0000000182 00000 f +0000000183 00000 f +0000000184 00000 f +0000000185 00000 f +0000000186 00000 f +0000000187 00000 f +0000000188 00000 f +0000000189 00000 f +0000000190 00000 f +0000000191 00000 f +0000000192 00000 f +0000000197 00000 f +0000432387 00000 n +0000432461 00000 n +0000432545 00000 n +0000432619 00000 n +0000000198 00000 f +0000000199 00000 f +0000000200 00000 f +0000000201 00000 f +0000000202 00000 f +0000000203 00000 f +0000000204 00000 f +0000000205 00000 f +0000000206 00000 f +0000000215 00000 f +0000450183 00000 n +0000450215 00000 n +0000450065 00000 n +0000450097 00000 n +0000449947 00000 n +0000449979 00000 n +0000449829 00000 n +0000449861 00000 n +0000000216 00000 f +0000000217 00000 f +0000000218 00000 f +0000000219 00000 f +0000000220 00000 f +0000000221 00000 f +0000000222 00000 f +0000000223 00000 f +0000000224 00000 f +0000000225 00000 f +0000000226 00000 f +0000000227 00000 f +0000000228 00000 f +0000000229 00000 f +0000000234 00000 f +0000432693 00000 n +0000432767 00000 n +0000432851 00000 n +0000432925 00000 n +0000000235 00000 f +0000000236 00000 f +0000000237 00000 f +0000000238 00000 f +0000000239 00000 f +0000000240 00000 f +0000000241 00000 f +0000000242 00000 f +0000000243 00000 f +0000000252 00000 f +0000449711 00000 n +0000449743 00000 n +0000449593 00000 n +0000449625 00000 n +0000449475 00000 n +0000449507 00000 n +0000449357 00000 n +0000449389 00000 n +0000000253 00000 f +0000000254 00000 f +0000000255 00000 f +0000000256 00000 f +0000000257 00000 f +0000000258 00000 f +0000000259 00000 f +0000000260 00000 f +0000000261 00000 f +0000000262 00000 f +0000000263 00000 f +0000000264 00000 f +0000000265 00000 f +0000000266 00000 f +0000000271 00000 f +0000432999 00000 n +0000433073 
00000 n +0000433157 00000 n +0000433231 00000 n +0000000272 00000 f +0000000273 00000 f +0000000274 00000 f +0000000275 00000 f +0000000276 00000 f +0000000277 00000 f +0000000278 00000 f +0000000279 00000 f +0000000280 00000 f +0000000289 00000 f +0000449239 00000 n +0000449271 00000 n +0000449121 00000 n +0000449153 00000 n +0000449003 00000 n +0000449035 00000 n +0000448885 00000 n +0000448917 00000 n +0000000290 00000 f +0000000291 00000 f +0000000292 00000 f +0000000293 00000 f +0000000294 00000 f +0000000295 00000 f +0000000296 00000 f +0000000297 00000 f +0000000298 00000 f +0000000299 00000 f +0000000300 00000 f +0000000301 00000 f +0000000302 00000 f +0000000303 00000 f +0000000308 00000 f +0000433305 00000 n +0000433379 00000 n +0000433463 00000 n +0000433537 00000 n +0000000309 00000 f +0000000310 00000 f +0000000311 00000 f +0000000312 00000 f +0000000313 00000 f +0000000314 00000 f +0000000315 00000 f +0000000316 00000 f +0000000317 00000 f +0000000326 00000 f +0000448767 00000 n +0000448799 00000 n +0000448649 00000 n +0000448681 00000 n +0000448531 00000 n +0000448563 00000 n +0000448413 00000 n +0000448445 00000 n +0000000327 00000 f +0000000328 00000 f +0000000329 00000 f +0000000330 00000 f +0000000331 00000 f +0000000332 00000 f +0000000333 00000 f +0000000334 00000 f +0000000335 00000 f +0000000336 00000 f +0000000337 00000 f +0000000338 00000 f +0000000339 00000 f +0000000340 00000 f +0000000345 00000 f +0000433611 00000 n +0000433685 00000 n +0000433769 00000 n +0000433843 00000 n +0000000346 00000 f +0000000347 00000 f +0000000348 00000 f +0000000349 00000 f +0000000350 00000 f +0000000351 00000 f +0000000352 00000 f +0000000353 00000 f +0000000362 00000 f +0000448295 00000 n +0000448327 00000 n +0000448177 00000 n +0000448209 00000 n +0000448059 00000 n +0000448091 00000 n +0000447941 00000 n +0000447973 00000 n +0000000363 00000 f +0000000364 00000 f +0000000365 00000 f +0000000366 00000 f +0000000367 00000 f +0000000368 00000 f +0000000369 00000 f +0000000370 00000 f +0000000371 00000 f +0000000372 00000 f +0000000373 00000 f +0000000374 00000 f +0000000375 00000 f +0000000376 00000 f +0000000381 00000 f +0000433917 00000 n +0000433991 00000 n +0000434075 00000 n +0000434149 00000 n +0000000382 00000 f +0000000383 00000 f +0000000384 00000 f +0000000385 00000 f +0000000386 00000 f +0000000387 00000 f +0000000388 00000 f +0000000389 00000 f +0000000398 00000 f +0000447823 00000 n +0000447855 00000 n +0000447705 00000 n +0000447737 00000 n +0000447587 00000 n +0000447619 00000 n +0000447469 00000 n +0000447501 00000 n +0000000399 00000 f +0000000400 00000 f +0000000401 00000 f +0000000402 00000 f +0000000403 00000 f +0000000404 00000 f +0000000405 00000 f +0000000406 00000 f +0000000407 00000 f +0000000408 00000 f +0000000409 00000 f +0000000410 00000 f +0000000411 00000 f +0000000412 00000 f +0000000417 00000 f +0000434223 00000 n +0000434297 00000 n +0000434381 00000 n +0000434455 00000 n +0000000418 00000 f +0000000419 00000 f +0000000420 00000 f +0000000421 00000 f +0000000422 00000 f +0000000423 00000 f +0000000424 00000 f +0000000425 00000 f +0000000434 00000 f +0000447351 00000 n +0000447383 00000 n +0000447233 00000 n +0000447265 00000 n +0000447115 00000 n +0000447147 00000 n +0000446997 00000 n +0000447029 00000 n +0000000435 00000 f +0000000436 00000 f +0000000437 00000 f +0000000438 00000 f +0000000439 00000 f +0000000440 00000 f +0000000441 00000 f +0000000442 00000 f +0000000443 00000 f +0000000444 00000 f +0000000445 00000 f +0000000446 00000 f 
+0000000447 00000 f +0000000448 00000 f +0000000453 00000 f +0000434529 00000 n +0000434603 00000 n +0000434687 00000 n +0000434761 00000 n +0000000454 00000 f +0000000455 00000 f +0000000456 00000 f +0000000457 00000 f +0000000458 00000 f +0000000459 00000 f +0000000460 00000 f +0000000461 00000 f +0000000470 00000 f +0000446879 00000 n +0000446911 00000 n +0000446761 00000 n +0000446793 00000 n +0000446643 00000 n +0000446675 00000 n +0000446525 00000 n +0000446557 00000 n +0000000471 00000 f +0000000472 00000 f +0000000473 00000 f +0000000474 00000 f +0000000475 00000 f +0000000476 00000 f +0000000477 00000 f +0000000478 00000 f +0000000479 00000 f +0000000480 00000 f +0000000481 00000 f +0000000482 00000 f +0000000483 00000 f +0000000484 00000 f +0000000489 00000 f +0000434835 00000 n +0000434909 00000 n +0000434993 00000 n +0000435067 00000 n +0000000490 00000 f +0000000491 00000 f +0000000492 00000 f +0000000493 00000 f +0000000494 00000 f +0000000495 00000 f +0000000496 00000 f +0000000497 00000 f +0000000506 00000 f +0000446407 00000 n +0000446439 00000 n +0000446289 00000 n +0000446321 00000 n +0000446171 00000 n +0000446203 00000 n +0000446053 00000 n +0000446085 00000 n +0000000507 00000 f +0000000508 00000 f +0000000509 00000 f +0000000510 00000 f +0000000511 00000 f +0000000512 00000 f +0000000513 00000 f +0000000514 00000 f +0000000515 00000 f +0000000516 00000 f +0000000517 00000 f +0000000518 00000 f +0000000519 00000 f +0000000520 00000 f +0000000525 00000 f +0000435141 00000 n +0000435215 00000 n +0000435299 00000 n +0000435373 00000 n +0000000526 00000 f +0000000527 00000 f +0000000528 00000 f +0000000529 00000 f +0000000530 00000 f +0000000531 00000 f +0000000532 00000 f +0000000533 00000 f +0000000542 00000 f +0000445935 00000 n +0000445967 00000 n +0000445817 00000 n +0000445849 00000 n +0000445699 00000 n +0000445731 00000 n +0000445581 00000 n +0000445613 00000 n +0000000543 00000 f +0000000544 00000 f +0000000545 00000 f +0000000546 00000 f +0000000547 00000 f +0000000548 00000 f +0000000549 00000 f +0000000550 00000 f +0000000551 00000 f +0000000552 00000 f +0000000553 00000 f +0000000554 00000 f +0000000555 00000 f +0000000556 00000 f +0000000561 00000 f +0000435447 00000 n +0000435521 00000 n +0000435605 00000 n +0000435679 00000 n +0000000562 00000 f +0000000563 00000 f +0000000564 00000 f +0000000565 00000 f +0000000566 00000 f +0000000567 00000 f +0000000568 00000 f +0000000569 00000 f +0000000578 00000 f +0000445463 00000 n +0000445495 00000 n +0000445345 00000 n +0000445377 00000 n +0000445227 00000 n +0000445259 00000 n +0000445109 00000 n +0000445141 00000 n +0000000579 00000 f +0000000580 00000 f +0000000581 00000 f +0000000582 00000 f +0000000583 00000 f +0000000584 00000 f +0000000585 00000 f +0000000586 00000 f +0000000587 00000 f +0000000588 00000 f +0000000589 00000 f +0000000590 00000 f +0000000591 00000 f +0000000592 00000 f +0000000597 00000 f +0000435753 00000 n +0000435827 00000 n +0000435911 00000 n +0000435985 00000 n +0000000598 00000 f +0000000599 00000 f +0000000600 00000 f +0000000601 00000 f +0000000602 00000 f +0000000603 00000 f +0000000604 00000 f +0000000605 00000 f +0000000614 00000 f +0000444991 00000 n +0000445023 00000 n +0000444873 00000 n +0000444905 00000 n +0000444755 00000 n +0000444787 00000 n +0000444637 00000 n +0000444669 00000 n +0000000615 00000 f +0000000616 00000 f +0000000617 00000 f +0000000618 00000 f +0000000619 00000 f +0000000620 00000 f +0000000621 00000 f +0000000622 00000 f +0000000623 00000 f +0000000624 
00000 f +0000000625 00000 f +0000000626 00000 f +0000000627 00000 f +0000000628 00000 f +0000000633 00000 f +0000436059 00000 n +0000436133 00000 n +0000436217 00000 n +0000436291 00000 n +0000000634 00000 f +0000000635 00000 f +0000000636 00000 f +0000000637 00000 f +0000000638 00000 f +0000000639 00000 f +0000000640 00000 f +0000000641 00000 f +0000000650 00000 f +0000444519 00000 n +0000444551 00000 n +0000444401 00000 n +0000444433 00000 n +0000444283 00000 n +0000444315 00000 n +0000444165 00000 n +0000444197 00000 n +0000000651 00000 f +0000000652 00000 f +0000000653 00000 f +0000000654 00000 f +0000000655 00000 f +0000000656 00000 f +0000000657 00000 f +0000000658 00000 f +0000000659 00000 f +0000000660 00000 f +0000000661 00000 f +0000000662 00000 f +0000000663 00000 f +0000000664 00000 f +0000000669 00000 f +0000436365 00000 n +0000436439 00000 n +0000436523 00000 n +0000436597 00000 n +0000000670 00000 f +0000000671 00000 f +0000000672 00000 f +0000000673 00000 f +0000000674 00000 f +0000000675 00000 f +0000000676 00000 f +0000000677 00000 f +0000000686 00000 f +0000444047 00000 n +0000444079 00000 n +0000443929 00000 n +0000443961 00000 n +0000443811 00000 n +0000443843 00000 n +0000443693 00000 n +0000443725 00000 n +0000000687 00000 f +0000000688 00000 f +0000000689 00000 f +0000000690 00000 f +0000000691 00000 f +0000000692 00000 f +0000000693 00000 f +0000000694 00000 f +0000000695 00000 f +0000000696 00000 f +0000000697 00000 f +0000000698 00000 f +0000000699 00000 f +0000000700 00000 f +0000000705 00000 f +0000436671 00000 n +0000436745 00000 n +0000436829 00000 n +0000436903 00000 n +0000000706 00000 f +0000000707 00000 f +0000000708 00000 f +0000000709 00000 f +0000000710 00000 f +0000000711 00000 f +0000000712 00000 f +0000000713 00000 f +0000000722 00000 f +0000443575 00000 n +0000443607 00000 n +0000443457 00000 n +0000443489 00000 n +0000443339 00000 n +0000443371 00000 n +0000443221 00000 n +0000443253 00000 n +0000000723 00000 f +0000000724 00000 f +0000000725 00000 f +0000000726 00000 f +0000000727 00000 f +0000000728 00000 f +0000000729 00000 f +0000000730 00000 f +0000000731 00000 f +0000000732 00000 f +0000000733 00000 f +0000000734 00000 f +0000000735 00000 f +0000000736 00000 f +0000000741 00000 f +0000436977 00000 n +0000437051 00000 n +0000437135 00000 n +0000437209 00000 n +0000000742 00000 f +0000000743 00000 f +0000000744 00000 f +0000000745 00000 f +0000000746 00000 f +0000000747 00000 f +0000000748 00000 f +0000000749 00000 f +0000000758 00000 f +0000443103 00000 n +0000443135 00000 n +0000442985 00000 n +0000443017 00000 n +0000442867 00000 n +0000442899 00000 n +0000442749 00000 n +0000442781 00000 n +0000000759 00000 f +0000000760 00000 f +0000000761 00000 f +0000000762 00000 f +0000000763 00000 f +0000000764 00000 f +0000000765 00000 f +0000000766 00000 f +0000000767 00000 f +0000000768 00000 f +0000000769 00000 f +0000000770 00000 f +0000000771 00000 f +0000000772 00000 f +0000000777 00000 f +0000437283 00000 n +0000437357 00000 n +0000437441 00000 n +0000437515 00000 n +0000000778 00000 f +0000000779 00000 f +0000000780 00000 f +0000000781 00000 f +0000000782 00000 f +0000000783 00000 f +0000000784 00000 f +0000000785 00000 f +0000000794 00000 f +0000442631 00000 n +0000442663 00000 n +0000442513 00000 n +0000442545 00000 n +0000442395 00000 n +0000442427 00000 n +0000442277 00000 n +0000442309 00000 n +0000000795 00000 f +0000000796 00000 f +0000000797 00000 f +0000000798 00000 f +0000000799 00000 f +0000000800 00000 f +0000000801 00000 f 
+0000000802 00000 f +0000000803 00000 f +0000000804 00000 f +0000000805 00000 f +0000000806 00000 f +0000000807 00000 f +0000000808 00000 f +0000000813 00000 f +0000437589 00000 n +0000437663 00000 n +0000437747 00000 n +0000437821 00000 n +0000000814 00000 f +0000000815 00000 f +0000000816 00000 f +0000000817 00000 f +0000000818 00000 f +0000000819 00000 f +0000000820 00000 f +0000000821 00000 f +0000000830 00000 f +0000442159 00000 n +0000442191 00000 n +0000442041 00000 n +0000442073 00000 n +0000441923 00000 n +0000441955 00000 n +0000441805 00000 n +0000441837 00000 n +0000000831 00000 f +0000000832 00000 f +0000000833 00000 f +0000000834 00000 f +0000000835 00000 f +0000000836 00000 f +0000000837 00000 f +0000000838 00000 f +0000000839 00000 f +0000000840 00000 f +0000000841 00000 f +0000000842 00000 f +0000000843 00000 f +0000000844 00000 f +0000000849 00000 f +0000437895 00000 n +0000437969 00000 n +0000438053 00000 n +0000438127 00000 n +0000000850 00000 f +0000000851 00000 f +0000000852 00000 f +0000000853 00000 f +0000000854 00000 f +0000000855 00000 f +0000000856 00000 f +0000000857 00000 f +0000000866 00000 f +0000441687 00000 n +0000441719 00000 n +0000441569 00000 n +0000441601 00000 n +0000441451 00000 n +0000441483 00000 n +0000441333 00000 n +0000441365 00000 n +0000000867 00000 f +0000000868 00000 f +0000000869 00000 f +0000000870 00000 f +0000000871 00000 f +0000000872 00000 f +0000000873 00000 f +0000000874 00000 f +0000000875 00000 f +0000000876 00000 f +0000000877 00000 f +0000000878 00000 f +0000000879 00000 f +0000000880 00000 f +0000000885 00000 f +0000438201 00000 n +0000438275 00000 n +0000438359 00000 n +0000438433 00000 n +0000000886 00000 f +0000000887 00000 f +0000000888 00000 f +0000000889 00000 f +0000000890 00000 f +0000000891 00000 f +0000000892 00000 f +0000000893 00000 f +0000000902 00000 f +0000441215 00000 n +0000441247 00000 n +0000441097 00000 n +0000441129 00000 n +0000440979 00000 n +0000441011 00000 n +0000440861 00000 n +0000440893 00000 n +0000000903 00000 f +0000000904 00000 f +0000000905 00000 f +0000000906 00000 f +0000000907 00000 f +0000000908 00000 f +0000000909 00000 f +0000000910 00000 f +0000000911 00000 f +0000000912 00000 f +0000000913 00000 f +0000000914 00000 f +0000000915 00000 f +0000000916 00000 f +0000000917 00000 f +0000000918 00000 f +0000000923 00000 f +0000438507 00000 n +0000438581 00000 n +0000438665 00000 n +0000438739 00000 n +0000000924 00000 f +0000000925 00000 f +0000000926 00000 f +0000000927 00000 f +0000000928 00000 f +0000000929 00000 f +0000000930 00000 f +0000000931 00000 f +0000000940 00000 f +0000440743 00000 n +0000440775 00000 n +0000440625 00000 n +0000440657 00000 n +0000440507 00000 n +0000440539 00000 n +0000440389 00000 n +0000440421 00000 n +0000000941 00000 f +0000000942 00000 f +0000000943 00000 f +0000000944 00000 f +0000000945 00000 f +0000000946 00000 f +0000000947 00000 f +0000000948 00000 f +0000000949 00000 f +0000000950 00000 f +0000000951 00000 f +0000000952 00000 f +0000000953 00000 f +0000000954 00000 f +0000000955 00000 f +0000000956 00000 f +0000000957 00000 f +0000000958 00000 f +0000000959 00000 f +0000000960 00000 f +0000000965 00000 f +0000438813 00000 n +0000438887 00000 n +0000438971 00000 n +0000439045 00000 n +0000000966 00000 f +0000000967 00000 f +0000000968 00000 f +0000000969 00000 f +0000000970 00000 f +0000000971 00000 f +0000000972 00000 f +0000000973 00000 f +0000000982 00000 f +0000440271 00000 n +0000440303 00000 n +0000440153 00000 n +0000440185 00000 n +0000440035 
00000 n +0000440067 00000 n +0000439917 00000 n +0000439949 00000 n +0000000983 00000 f +0000000984 00000 f +0000000985 00000 f +0000000986 00000 f +0000000987 00000 f +0000000988 00000 f +0000000989 00000 f +0000000990 00000 f +0000000991 00000 f +0000000992 00000 f +0000000993 00000 f +0000000994 00000 f +0000000995 00000 f +0000000996 00000 f +0000000997 00000 f +0000000998 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000439119 00000 n +0000439196 00000 n +0000439283 00000 n +0000439360 00000 n +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000439797 00000 n +0000439830 00000 n +0000439677 00000 n +0000439710 00000 n +0000439557 00000 n +0000439590 00000 n +0000439437 00000 n +0000439470 00000 n +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000000000 00000 f +0000070048 00000 n +0000070507 00000 n +0000070854 00000 n +0000071040 00000 n +0000069250 00000 n +0000069327 00000 n +0000069414 00000 n +0000069491 00000 n +0000452641 00000 n +0000064612 00000 n +0000068085 00000 n +0000080410 00000 n +0000080180 00000 n +0000080295 00000 n +0000068153 00000 n +0000068683 00000 n +0000068735 00000 n +0000069928 00000 n +0000069961 00000 n +0000069808 00000 n +0000069841 00000 n +0000069688 00000 n +0000069721 00000 n +0000069568 00000 n +0000069601 00000 n +0000075263 00000 n +0000074148 00000 n +0000073114 00000 n +0000071410 00000 n +0000071714 00000 n +0000073404 00000 n +0000074433 00000 n +0000075730 00000 n +0000080488 00000 n +0000080784 00000 n +0000082499 00000 n +0000148089 00000 n +0000213679 00000 n +0000279269 00000 n +0000344859 00000 n +0000410449 00000 n +0000452697 00000 n +trailer <]>> startxref 452884 %%EOF \ No newline at end of file diff --git a/flashattn_memory.jpg b/flashattn_memory.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d0b6b19e8bb1664e7fd2ce49c4dd5bcf1114df10 Binary files /dev/null and b/flashattn_memory.jpg differ diff --git a/flashattn_speedup.jpg b/flashattn_speedup.jpg new file mode 100644 index 0000000000000000000000000000000000000000..af9d1cf8edf979c65f6d24587a365471224d53d4 Binary files /dev/null and b/flashattn_speedup.jpg differ diff --git a/flashattn_speedup_3090.jpg b/flashattn_speedup_3090.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3e608e9f71e099c58ec93666333e620430514fb3 Binary files /dev/null and b/flashattn_speedup_3090.jpg differ diff --git a/flashattn_speedup_a100_d128.jpg b/flashattn_speedup_a100_d128.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ac6a677baca7b09fcfa62c6287cb5bdaf5245925 Binary files /dev/null and b/flashattn_speedup_a100_d128.jpg differ diff --git a/flashattn_speedup_t4.jpg b/flashattn_speedup_t4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8dec610efe67b16b322de621e6f101bbd96008c8 Binary files /dev/null and b/flashattn_speedup_t4.jpg differ diff --git a/flashattn_speedup_t4_fwd.jpg b/flashattn_speedup_t4_fwd.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4cbcc5b7749e7f35fdd0cf5ad2397de2e9f82bc6 Binary files /dev/null and 
b/flashattn_speedup_t4_fwd.jpg differ diff --git a/flop-count.yaml b/flop-count.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee45b91585fffee5424f12b311641f42491e235f --- /dev/null +++ b/flop-count.yaml @@ -0,0 +1,5 @@ +flop_count: + _target_: src.callbacks.flop_count.FlopCount + profilers: ['fvcore'] + input_size: [3, 224, 224] + device: null diff --git a/ft_attention.cpp b/ft_attention.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b307cffc2bcdd3bff117d3581d97849d594523e9 --- /dev/null +++ b/ft_attention.cpp @@ -0,0 +1,232 @@ +#include +#include "ATen/cuda/CUDAContext.h" +#include + + +#include "decoder_masked_multihead_attention.h" + +#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") + +#define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, NAME, ...) \ + if (TYPE == at::ScalarType::Half) { \ + using scalar_t = at::Half; \ + __VA_ARGS__(); \ + } else if (TYPE == at::ScalarType::BFloat16) { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__(); \ + } else if (TYPE == at::ScalarType::Float) { \ + using scalar_t = float; \ + __VA_ARGS__(); \ + } else { \ + AT_ERROR(#NAME, " not implemented for type '", toString(TYPE), "'"); \ + } + +template +void masked_multihead_attention(const Masked_multihead_attention_params& params, + const cudaStream_t& stream); + +template +void cross_multihead_attention(const Masked_multihead_attention_params& params, + const cudaStream_t& stream); + +template +struct SATypeConverter { + using Type = T; +}; + +template<> +struct SATypeConverter { + using Type = uint16_t; +}; + +template<> +struct SATypeConverter { + using Type = __nv_bfloat16; +}; + +template +void set_params(Masked_multihead_attention_params ¶ms, + const size_t batch_size, + const size_t nheads, + const size_t nheads_kv, + const size_t memory_max_seqlen, + const size_t headdim, + const int timestep, + const int rotary_embedding_dim, + const float rotary_base, + const bool neox_rotary_style, + const int q_batch_stride, + const int k_batch_stride, + const int v_batch_stride, + const int nnz_heads, + T *q_ptr, + T *k_ptr, + T *v_ptr, + T *k_cache_ptr, + T *v_cache_ptr, + int *length_per_sample, + T *rotary_cos, + T *rotary_sin, + T *out_ptr, + int *nnz_head_idx) { + // Reset the parameters + memset(¶ms, 0, sizeof(params)); + params.q = q_ptr; + params.k = k_ptr; + params.v = v_ptr; + params.q_bias = nullptr; + params.k_bias = nullptr; + params.v_bias = nullptr; + params.k_cache = k_cache_ptr; + params.v_cache = v_cache_ptr; + params.out = out_ptr; + params.cache_indir = nullptr; + params.stride_q = q_batch_stride; + params.stride_k = k_batch_stride; + params.stride_v = v_batch_stride; + params.batch_size = batch_size; + params.beam_width = 1; + params.memory_max_len = memory_max_seqlen; + params.num_heads = nheads; + params.num_heads_kv = nheads_kv; + params.num_heads_q_kv_ratio = nheads / nheads_kv; + params.nnz_heads = nnz_heads; + params.hidden_size_per_head = headdim; + params.rotary_embedding_dim = rotary_embedding_dim; + params.rotary_base = rotary_base; + params.neox_rotary_style = neox_rotary_style; + params.timestep = timestep; + params.inv_sqrt_dh = 1.f / sqrt(float(headdim)); + params.total_padding_tokens = nullptr; + params.masked_tokens = nullptr; + params.prefix_prompt_lengths = 
nullptr; + params.max_prefix_prompt_length = 0; + params.relative_attention_bias = nullptr; + params.relative_attention_bias_stride = 0; + params.cross_attention_out = nullptr; + params.max_decoder_seq_len = 0; + params.is_return_cross_attentions = false; + params.finished = nullptr; + params.memory_length_per_sample = nullptr; + params.length_per_sample = length_per_sample; + params.rotary_cos = rotary_cos; + params.rotary_sin = rotary_sin; + params.nnz_head_idx = nnz_head_idx; +} + +torch::Tensor single_query_attention(const torch::Tensor q, + const torch::Tensor k, + const torch::Tensor v, + torch::Tensor k_cache, + torch::Tensor v_cache, + c10::optional length_per_sample_, + c10::optional rotary_cos_, + c10::optional rotary_sin_, + c10::optional nnz_head_idx_, + const int timestep, + int rotary_embedding_dim = 0, + const float rotary_base = 10000.0f, + const bool neox_rotary_style=true) { + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); CHECK_DEVICE(k_cache); CHECK_DEVICE(v_cache); + int batch_size = v_cache.size(0); + int nheads = q.size(1); + int nheads_kv = v_cache.size(1); + int memory_max_seqlen = v_cache.size(2); + int headdim = v_cache.size(3); + auto input_type = q.scalar_type(); + TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); + + CHECK_SHAPE(q, batch_size, nheads, headdim); + CHECK_SHAPE(k, batch_size, nheads_kv, headdim); + CHECK_SHAPE(v, batch_size, nheads_kv, headdim); + CHECK_SHAPE(v_cache, batch_size, nheads_kv, memory_max_seqlen, headdim); + // k_cache shape: [B, H, Dh/x, L, x] where x=8 for fp16 and x=4 for fp32 + int packsize = k_cache.dtype() == torch::kFloat32 ? 4 : 8; + CHECK_SHAPE(k_cache, batch_size, nheads_kv, headdim / packsize, memory_max_seqlen, packsize); + TORCH_CHECK(q.stride(2) == 1 && q.stride(1) == headdim); + TORCH_CHECK(k.stride(2) == 1 && k.stride(1) == headdim); + TORCH_CHECK(v.stride(2) == 1 && v.stride(1) == headdim); + CHECK_CONTIGUOUS(v_cache); CHECK_CONTIGUOUS(k_cache); + + TORCH_CHECK(q.scalar_type() == input_type); + TORCH_CHECK(k.scalar_type() == input_type); + TORCH_CHECK(v.scalar_type() == input_type); + TORCH_CHECK(k_cache.scalar_type() == input_type); + TORCH_CHECK(v_cache.scalar_type() == input_type); + + if (length_per_sample_.has_value()) { + auto length_per_sample = length_per_sample_.value(); + CHECK_DEVICE(length_per_sample); + CHECK_SHAPE(length_per_sample, batch_size); + CHECK_CONTIGUOUS(length_per_sample); + TORCH_CHECK(length_per_sample.dtype() == torch::kInt32); + } + + if (rotary_cos_.has_value()) { + auto rotary_cos = rotary_cos_.value(); + CHECK_DEVICE(rotary_cos); + rotary_embedding_dim = rotary_cos.size(-1) * 2; + CHECK_SHAPE(rotary_cos, batch_size, rotary_embedding_dim / 2); + CHECK_CONTIGUOUS(rotary_cos); + TORCH_CHECK(rotary_cos.scalar_type() == input_type); + + TORCH_CHECK(rotary_sin_.has_value()); + auto rotary_sin = rotary_sin_.value(); + CHECK_DEVICE(rotary_sin); + CHECK_SHAPE(rotary_sin, batch_size, rotary_embedding_dim / 2); + CHECK_CONTIGUOUS(rotary_sin); + TORCH_CHECK(rotary_sin.scalar_type() == input_type); + } + + if (nnz_head_idx_.has_value()) { + auto nnz_head_idx = nnz_head_idx_.value(); + CHECK_DEVICE(nnz_head_idx); + int nnz_heads = nnz_head_idx.size(0); + CHECK_SHAPE(nnz_head_idx, nnz_heads); + CHECK_CONTIGUOUS(nnz_head_idx); + TORCH_CHECK(nnz_head_idx.dtype() == torch::kInt32); + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + 
at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + torch::Tensor out = torch::empty_like(q); + + DISPATCH_FLOAT_AND_HALF_AND_BF16(q.scalar_type(), "single_query_attention", [&] { + using DataType = typename SATypeConverter::Type; + Masked_multihead_attention_params params; + set_params(params, batch_size, nheads, nheads_kv, memory_max_seqlen, headdim, timestep, + rotary_embedding_dim, rotary_base, neox_rotary_style, + q.stride(0), k.stride(0), v.stride(0), + nnz_head_idx_.has_value() ? nnz_head_idx_.value().size(0) : 0, + reinterpret_cast(q.data_ptr()), + reinterpret_cast(k.data_ptr()), + reinterpret_cast(v.data_ptr()), + reinterpret_cast(k_cache.data_ptr()), + reinterpret_cast(v_cache.data_ptr()), + length_per_sample_.has_value() + ? length_per_sample_.value().data_ptr() : nullptr, + rotary_cos_.has_value() + ? reinterpret_cast(rotary_cos_.value().data_ptr()) : nullptr, + rotary_sin_.has_value() + ? reinterpret_cast(rotary_sin_.value().data_ptr()) : nullptr, + reinterpret_cast(out.data_ptr()), + nnz_head_idx_.has_value() ? nnz_head_idx_.value().data_ptr() : nullptr + ); + auto stream = at::cuda::getCurrentCUDAStream(); + masked_multihead_attention(params, stream); + }); + return out; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("single_query_attention", &single_query_attention, "Attention with a single query", + py::arg("q"), py::arg("k"), py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), + py::arg("length_per_sample_"), py::arg("rotary_cos_"), + py::arg("rotary_sin_"), py::arg("nnz_head_idx_"), + py::arg("timestep"), py::arg("rotary_embedding_dim")=0, + py::arg("rotary_base")=10000.0f, py::arg("neox_rotary_style")=true); +} diff --git a/fused_dense.cpp b/fused_dense.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52a203889b239e857cb1cf34c1260243438d3404 --- /dev/null +++ b/fused_dense.cpp @@ -0,0 +1,216 @@ +// Adapted from https://github.com/NVIDIA/apex/blob/master/csrc/fused_dense.cpp +// We make it work for bfloat16 +#include +#include +#include +#include +#include + +#include + +#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +// https://github.com/NVIDIA/apex/blob/master/csrc/type_shim.h +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#define DISPATCH_HALF_AND_BF16(TYPE, NAME, ...) 
\ + switch (TYPE) { \ + case at::ScalarType::Half: { \ + using scalar_t = at::Half; \ + __VA_ARGS__(); \ + break; \ + } \ + case at::ScalarType::BFloat16: { \ + using scalar_t = at::BFloat16; \ + __VA_ARGS__(); \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + +template +int linear_bias_wgrad_cuda(const T *input, const T *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, T *d_weight, T *d_bias, void *lt_workspace, size_t workspaceSize); + +template +int linear_act_forward_cuda(const T *input, const T *weight, const T *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *output, void *pre_act, void *lt_workspace, size_t workspaceSize); + +template +int bias_act_linear_dgrad_bgrad_cuda(const T *weight, const T *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *d_input, T *d_bias, void *lt_workspace, size_t workspaceSize); + +std::vector linear_bias_wgrad(at::Tensor input, at::Tensor d_output, bool has_d_bias) { + + int64_t batch_size = input.size(0); + int64_t in_features = input.size(1); + int64_t out_features = d_output.size(1); + + TORCH_CHECK(input.dtype() == torch::kFloat16 || input.dtype() == torch::kBFloat16); + TORCH_CHECK(input.dtype() == d_output.dtype()); + TORCH_CHECK(input.is_cuda()); + TORCH_CHECK(d_output.is_cuda()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(d_output.is_contiguous()); + CHECK_SHAPE(input, batch_size, in_features); + CHECK_SHAPE(d_output, batch_size, out_features); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)input.get_device()}; + + // create output/workspace tensor + auto opts = input.options(); + auto d_weight = at::empty({out_features, in_features}, opts); + at::Tensor d_bias; + if (has_d_bias) { +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION < 11600 + d_bias = d_output.view({-1, out_features}).sum(0, false); +#else + d_bias = at::empty({out_features}, opts); +#endif + } + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M. + // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs + // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91 + size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4); + auto lt_workspace = at::empty({static_cast(workspaceSize)}, opts.dtype(torch::kUInt8)); + + DISPATCH_HALF_AND_BF16(input.scalar_type(), "linear_bias_wgrad", [&] { + auto result = linear_bias_wgrad_cuda( + input.data_ptr(), + d_output.data_ptr(), + in_features, + batch_size, + out_features, + d_weight.data_ptr(), + has_d_bias ? 
d_bias.data_ptr() : nullptr, + (void*) (lt_workspace.data_ptr()), + workspaceSize); + TORCH_CHECK(result == 0, "linear_bias_wgrad failed."); + }); + + return {d_weight, d_bias}; +} + +std::vector linear_act_forward(at::Tensor input, at::Tensor weight, + c10::optional bias_, + bool is_gelu, bool save_pre_act, int heuristic) { + + int64_t batch_size = input.size(0); + int64_t in_features = input.size(1); + int64_t out_features = weight.size(0); + + TORCH_CHECK(input.dtype() == torch::kFloat16 || input.dtype() == torch::kBFloat16); + TORCH_CHECK(input.dtype() == weight.dtype()); + TORCH_CHECK(input.is_cuda()); + TORCH_CHECK(weight.is_cuda()); + TORCH_CHECK(input.is_contiguous()); + TORCH_CHECK(weight.is_contiguous()); + CHECK_SHAPE(input, batch_size, in_features); + CHECK_SHAPE(weight, out_features, in_features); + if (bias_.has_value()) { + auto bias = bias_.value(); + TORCH_CHECK(bias.dtype() == input.dtype()); + TORCH_CHECK(bias.is_cuda()); + TORCH_CHECK(bias.is_contiguous()); + CHECK_SHAPE(bias, out_features); + } + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)input.get_device()}; + + // create output/workspace tensor + auto opts = input.options(); + auto output = at::empty({batch_size, out_features}, opts); + at::Tensor pre_act; + // If ReLU, cuBlasLT stores a bit-mask (1 bit per element) + if (save_pre_act) { pre_act = at::empty({batch_size, is_gelu ? out_features : out_features / 8}, + is_gelu ? opts : opts.dtype(torch::kUInt8)); } + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M. + // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs + // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91 + size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4); + auto lt_workspace = at::empty({static_cast(workspaceSize)}, opts.dtype(torch::kUInt8)); + + DISPATCH_HALF_AND_BF16(input.scalar_type(), "linear_act_forward", [&] { + auto result = linear_act_forward_cuda( + input.data_ptr(), + weight.data_ptr(), + bias_.has_value()? bias_.value().data_ptr() : nullptr, + in_features, + batch_size, + out_features, + is_gelu, + heuristic, + output.data_ptr(), + save_pre_act ? pre_act.data_ptr() : nullptr, + (void*) (lt_workspace.data_ptr()), + workspaceSize); + TORCH_CHECK(result == 0, "linear_act_forward failed."); + }); + + std::vector result = {output}; + if (save_pre_act) { result.push_back(pre_act); }; + return result; +} + +std::vector bias_act_linear_dgrad_bgrad( + at::Tensor weight, at::Tensor d_output, at::Tensor pre_act, bool is_gelu, int heuristic +) { + + int64_t batch_size = d_output.size(0); + int64_t out_features = d_output.size(1); + int64_t in_features = weight.size(1); + + TORCH_CHECK(weight.dtype() == torch::kFloat16 || weight.dtype() == torch::kBFloat16); + TORCH_CHECK(weight.dtype() == d_output.dtype()); + TORCH_CHECK(is_gelu ? 
+  TORCH_CHECK(weight.is_cuda());
+  TORCH_CHECK(d_output.is_cuda());
+  TORCH_CHECK(pre_act.is_cuda());
+  TORCH_CHECK(weight.is_contiguous());
+  TORCH_CHECK(d_output.is_contiguous());
+  TORCH_CHECK(pre_act.is_contiguous());
+  CHECK_SHAPE(weight, out_features, in_features);
+  CHECK_SHAPE(d_output, batch_size, out_features);
+  // If ReLU, cuBlasLT stores a bit-mask (1 bit per element)
+  CHECK_SHAPE(pre_act, batch_size, is_gelu ? in_features : in_features / 8);
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)weight.get_device()};
+
+  // create output/workspace tensor
+  auto opts = weight.options();
+  auto d_bias = at::empty({in_features}, opts);
+  auto d_input = at::empty({batch_size, in_features}, opts);
+  // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind setting this to 1M.
+  // However, Apex sets it to 4M and TransformerEngine sets to 32M for Hopper and 4M for other GPUs
+  // https://github.com/NVIDIA/TransformerEngine/blob/a0f0065498bbcfc1da78cf9e8b166f5381613fbc/transformer_engine/pytorch/module.py#L91
+  size_t workspaceSize = 1024 * 1024 * (at::cuda::getCurrentDeviceProperties()->major >= 9 ? 32 : 4);
+  auto lt_workspace = at::empty({static_cast<int64_t>(workspaceSize)}, opts.dtype(torch::kUInt8));
+
+  DISPATCH_HALF_AND_BF16(weight.scalar_type(), "bias_act_linear_dgrad_bgrad", [&] {
+    auto result = bias_act_linear_dgrad_bgrad_cuda<scalar_t>(
+        weight.data_ptr<scalar_t>(),
+        d_output.data_ptr<scalar_t>(),
+        pre_act.data_ptr(),
+        in_features,
+        batch_size,
+        out_features,
+        is_gelu,
+        heuristic,
+        d_input.data_ptr<scalar_t>(),
+        d_bias.data_ptr<scalar_t>(),
+        (void*) (lt_workspace.data_ptr()),
+        workspaceSize);
+    TORCH_CHECK(result == 0, "bias_act_linear_dgrad_bgrad failed.");
+  });
+
+  return {d_input, d_bias};
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("linear_bias_wgrad", &linear_bias_wgrad, "linear bias wgrad");
+  m.def("linear_act_forward", &linear_act_forward, "linear gelu/relu forward");
+  m.def("bias_act_linear_dgrad_bgrad", &bias_act_linear_dgrad_bgrad, "bias gelu/relu linear dgrad bgrad");
+}
diff --git a/fused_dense.py b/fused_dense.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e45b8e609812a1545781011141ec80f6dc3af0f
--- /dev/null
+++ b/fused_dense.py
@@ -0,0 +1,688 @@
+# Copyright (c) 2023, Tri Dao.
+# Inspired by https://github.com/NVIDIA/apex/blob/master/apex/fused_dense/fused_dense.py
+# We make it work with pytorch amp and with bfloat16.
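+#
+# A minimal usage sketch (illustrative only: the import assumes this file is importable
+# as `fused_dense`, and the shapes, dtype, and device below are made-up examples):
+#     import torch
+#     from fused_dense import FusedDense, FusedMLP
+#     dense = FusedDense(1024, 1024).cuda().half()
+#     mlp = FusedMLP(1024, 4096).cuda().half()
+#     x = torch.randn(2, 512, 1024, device="cuda", dtype=torch.float16)
+#     y = mlp(dense(x))
+# Both modules fall back to the plain F.linear path (see fused_dense_func / fused_mlp_func
+# below) when the inputs are not fp16/bf16 CUDA tensors.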
+# The TensorParallel linear modules are inspired by https://github.com/NVIDIA/apex/blob/master/apex/transformer/tensor_parallel/layers.py +from functools import partial +from typing import Optional + +# import fused_dense_cuda # from apex +import fused_dense_lib as fused_dense_cuda +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.distributed import ProcessGroup + +from flash_attn.ops.activations import gelu_bwd, relu_bwd, sqrelu_bwd, sqrelu_fwd +from flash_attn.utils.distributed import ( + all_gather_raw, + all_reduce, + all_reduce_raw, + reduce_scatter, + reduce_scatter_raw, +) + + +class FusedDenseFunc(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True + ): + """ + If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel + with sequence parallelism: we do an all_gather_raw of x before doing the matmul. + """ + ctx.compute_weight_gradient = weight.requires_grad + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + else: + total_x = x + + if torch.is_autocast_enabled(): + weight = weight.to(dtype=torch.get_autocast_gpu_dtype()) + bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None + weight = weight.contiguous() + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight.shape) > 65535 * 32: + raise RuntimeError("fused_dense only supports matrix dims <= 2M") + output = F.linear(total_x, weight, bias) + if ctx.compute_weight_gradient: + ctx.save_for_backward(x, weight) + else: + ctx.save_for_backward(weight) + return output if not return_residual else (output, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + if ctx.compute_weight_gradient: + x, weight = ctx.saved_tensors + if process_group is not None and sequence_parallel: + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + else: + total_x = x + else: + (weight,) = ctx.saved_tensors + total_x = None + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_output, weight.t()) + else: + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_output, weight + ) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, 
process_group, async_op=True) + else: + grad_input = None + if ctx.needs_input_grad[1]: + assert ctx.compute_weight_gradient + if process_group is not None and sequence_parallel: + handle_x.wait() + grad_weight, grad_bias = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), grad_output, ctx.needs_input_grad[2] + ) + else: + grad_weight = None + grad_bias = grad_output if ctx.needs_input_grad[2] else None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return grad_input, grad_weight, grad_bias, None, None, None + + +def fused_dense_func( + x: Tensor, + weight: Tensor, + bias: Optional[Tensor] = None, + return_residual: bool = False, + process_group: Optional[ProcessGroup] = None, + sequence_parallel: bool = True, +): + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) + if x.is_cuda and weight.is_cuda and (bias is None or bias.is_cuda) and dtype_eligible: + return FusedDenseFunc.apply( + x, weight, bias, return_residual, process_group, sequence_parallel + ) + else: + assert process_group is None + out = F.linear(x, weight, bias) + return out if not return_residual else (out, x) + + +class FusedDense(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + return_residual: bool = False, + device=None, + dtype=None, + ) -> None: + super().__init__(in_features, out_features, bias=bias, device=device, dtype=dtype) + self.return_residual = return_residual + + def forward(self, x, process_group=None): + """ + If process_group is not None, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul. + """ + return fused_dense_func( + x, + self.weight, + self.bias, + return_residual=self.return_residual, + process_group=process_group, + ) + + +class ColumnParallelLinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + process_group: ProcessGroup, + bias: bool = True, + sequence_parallel=True, + multiple_of=1, + device=None, + dtype=None, + ) -> None: + world_size = torch.distributed.get_world_size(process_group) + if out_features % multiple_of: + raise ValueError(f"out_features ({out_features}) must be a multiple of {multiple_of}") + multiple = out_features // multiple_of + # We want to split @multiple across world_size, but it could be an uneven split + div = multiple // world_size + mod = multiple % world_size + # The first @mod ranks get @div + 1 copies, the rest get @div copies + local_multiple = div + int(torch.distributed.get_rank(process_group) < mod) + super().__init__( + in_features, local_multiple * multiple_of, bias=bias, device=device, dtype=dtype + ) + self.process_group = process_group + self.sequence_parallel = sequence_parallel + + def forward(self, x): + # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism: + # we do an all_gather of x before doing the matmul. + # If not, then the input is already gathered. 
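+        # The output of this layer is sharded along the last (feature) dimension across
+        # the tensor-parallel ranks; the matching RowParallelLinear below consumes that
+        # sharded layout and reduces the partial results across ranks.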
+ return fused_dense_func( + x, + self.weight, + self.bias, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + ) + + +class RowParallelLinear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + process_group: ProcessGroup, + bias: bool = True, + sequence_parallel=True, + multiple_of=1, + device=None, + dtype=None, + ) -> None: + world_size = torch.distributed.get_world_size(process_group) + rank = torch.distributed.get_rank(process_group) + if in_features % multiple_of: + raise ValueError(f"in_features ({in_features}) must be a multiple of {multiple_of}") + multiple = in_features // multiple_of + # We want to split @multiple across world_size, but it could be an uneven split + div = multiple // world_size + mod = multiple % world_size + # The first @mod ranks get @div + 1 copies, the rest get @div copies + local_multiple = div + int(torch.distributed.get_rank(process_group) < mod) + # Only rank 0 will have bias + super().__init__( + local_multiple * multiple_of, + out_features, + bias=bias and rank == 0, + device=device, + dtype=dtype, + ) + self.process_group = process_group + self.sequence_parallel = sequence_parallel + + def forward(self, x): + """ + We're doing Tensor Parallel with sequence parallelism: we do the matmul and then + a reduce_scatter of the result. + """ + out = fused_dense_func(x, self.weight, self.bias) + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return reduce_fn(out, self.process_group) + + +class FusedMLPFunc(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, + x, + weight1, + bias1, + weight2, + bias2, + activation="gelu_approx", + save_pre_act=True, + return_residual=False, + checkpoint_lvl=0, + heuristic=0, + process_group=None, + sequence_parallel=True, + ): + """ + If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel + with sequence parallelism: we do an all_gather of x before doing the matmul. + If sequence_parallel=False, then the input is already gathered. 
+ + checkpoint_lvl: + 0: no recomputation in the bwd + 1: recompute gelu_out / relu_out in the bwd + 2: recompute pre_act and gelu_out / relu_out in the bwd + """ + assert -1 <= heuristic <= 4 + assert activation in ["gelu_approx", "relu", "sqrelu"] + if activation == "sqrelu": + assert heuristic == -1 + if not save_pre_act: + checkpoint_lvl = 2 + assert checkpoint_lvl in [0, 1, 2] + ctx.return_residual = return_residual + ctx.process_group = process_group + ctx.sequence_parallel = sequence_parallel + ctx.checkpoint_lvl = checkpoint_lvl + ctx.activation = activation + ctx.heuristic = heuristic + + if torch.is_autocast_enabled(): + x = x.to(dtype=torch.get_autocast_gpu_dtype()) + x = x.contiguous() + if process_group is not None and sequence_parallel: + # We want to kick off the all_gather early, before weight dtype conversion + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + else: + total_x = x + + if torch.is_autocast_enabled(): + dtype = torch.get_autocast_gpu_dtype() + weight1, weight2 = [a.to(dtype=dtype) for a in [weight1, weight2]] + bias1 = bias1.to(dtype=dtype) if bias1 is not None else None + bias2 = bias2.to(dtype=dtype) if bias2 is not None else None + weight1 = weight1.contiguous() + bias1 = bias1.contiguous() if bias1 is not None else None + weight2 = weight2.contiguous() + bias2 = bias2.contiguous() if bias2 is not None else None + if process_group is not None and sequence_parallel: + handle_x.wait() + batch_shape, n = total_x.shape[:-1], total_x.shape[-1] + batch_dim = batch_shape.numel() + # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174 + if min(batch_dim, n, *weight1.shape, *weight2.shape) > 65535 * 32: + raise RuntimeError("fused_dense only supports matrix dims <= 2M") + if heuristic == -1: + pre_act = F.linear(total_x, weight1, bias1) + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else (sqrelu_fwd if activation == "sqrelu" else F.relu) + ) + with torch.jit.fuser("fuser2"): + output1 = activation_fn(pre_act) + # This is before adding bias1 + # pre_act = F.linear(total_x.reshape(batch_dim, n), weight1) + # with torch.jit.fuser('fuser2'): + # output1 = bias_gelu(pre_act, bias1) + else: + is_gelu = activation == "gelu_approx" + output1, *rest = fused_dense_cuda.linear_act_forward( + total_x.reshape(batch_dim, n), weight1, bias1, is_gelu, save_pre_act, heuristic + ) + if save_pre_act: + pre_act = rest[0] + output2 = F.linear(output1, weight2, bias2) + if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == "relu"): + # For RELU the pre_act is very small (just a bit-mask) so we just save it + ctx.save_for_backward(x, weight1, weight2, pre_act, output1) + elif checkpoint_lvl == 1: + ctx.save_for_backward(x, weight1, weight2, pre_act) + elif checkpoint_lvl == 2: + ctx.save_for_backward(x, weight1, weight2, bias1) + output2 = output2.reshape(*batch_shape, output2.shape[-1]) + return output2 if not return_residual else (output2, x) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output, *args): + grad_output = grad_output.contiguous() + checkpoint_lvl = ctx.checkpoint_lvl + activation = ctx.activation + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else (sqrelu_fwd if activation == "sqrelu" else F.relu) + ) + if ctx.return_residual: + (grad_input,) = args + grad_input = grad_input.contiguous() + process_group = ctx.process_group + sequence_parallel = ctx.sequence_parallel + x, 
weight1, weight2, *rest = ctx.saved_tensors + if process_group is None or not sequence_parallel: + total_x = x + batch_shape = grad_output.shape[:-1] + batch_dim = batch_shape.numel() + if checkpoint_lvl in [0, 1]: + if process_group is not None and sequence_parallel: + total_x, handle_x = all_gather_raw(x, process_group, async_op=True) + if checkpoint_lvl == 0 or (checkpoint_lvl == 1 and activation == "relu"): + pre_act, output1 = rest + elif checkpoint_lvl == 1: + (pre_act,) = rest + with torch.jit.fuser("fuser2"): + output1 = activation_fn(pre_act) + elif checkpoint_lvl == 2: + (bias1,) = rest + if process_group is not None and sequence_parallel: + total_x, _ = all_gather_raw(x, process_group) + if ctx.heuristic == -1: + pre_act = F.linear(total_x, weight1, bias1) + with torch.jit.fuser("fuser2"): + output1 = activation_fn(pre_act) + else: + output1, pre_act = fused_dense_cuda.linear_act_forward( + total_x.reshape(batch_dim, total_x.shape[-1]), + weight1, + bias1, + activation == "gelu_approx", + True, + ctx.heuristic, + ) + + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + output1 = output1.reshape(batch_dim, output1.shape[-1]) + pre_act = pre_act.reshape(batch_dim, pre_act.shape[-1]) + if ctx.needs_input_grad[3]: + grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad( + output1, grad_output, ctx.needs_input_grad[4] + ) + else: + grad_weight2 = None + grad_bias2 = grad_output if ctx.needs_input_grad[4] else None + if ctx.heuristic == -1: + # grad_pre_act = matmul_dgelu(grad_output, weight2, pre_act) + grad_output1 = F.linear(grad_output, weight2.t()) + activation_grad_fn = ( + gelu_bwd + if activation == "gelu_approx" + else (sqrelu_bwd if activation == "sqrelu" else relu_bwd) + ) + with torch.jit.fuser("fuser2"): + grad_pre_act = activation_grad_fn(grad_output1, pre_act) + else: + # The cublasLt epilogue has to compute both gelu/relu grad and bias grad, we can't + # just compute gelu/relu grad + grad_pre_act, grad_bias1 = fused_dense_cuda.bias_act_linear_dgrad_bgrad( + weight2, grad_output, pre_act, activation == "gelu_approx", ctx.heuristic + ) + if not ctx.needs_input_grad[2]: + grad_bias1 = None + if ctx.needs_input_grad[0]: + if not ctx.return_residual: + grad_input = F.linear(grad_pre_act, weight1.t()) + else: + grad_input = torch.addmm( + grad_input.reshape(batch_dim, grad_input.shape[-1]), grad_pre_act, weight1 + ) + grad_input = grad_input.reshape(*batch_shape, grad_input.shape[-1]) + if process_group is not None: + reduce_fn = reduce_scatter_raw if sequence_parallel else all_reduce_raw + grad_input, handle_grad_input = reduce_fn(grad_input, process_group, async_op=True) + else: + grad_input = None + if ctx.heuristic == -1: + if ctx.needs_input_grad[1]: + if process_group is not None and sequence_parallel and checkpoint_lvl != 2: + handle_x.wait() + grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_wgrad( + total_x.reshape(batch_dim, total_x.shape[-1]), + grad_pre_act, + ctx.needs_input_grad[2], + ) + else: + grad_weight1 = None + grad_bias1 = grad_pre_act if ctx.needs_input_grad[2] else None + else: + if ctx.needs_input_grad[1]: + if process_group is not None and sequence_parallel and checkpoint_lvl != 2: + handle_x.wait() + grad_weight1 = F.linear( + grad_pre_act.t(), total_x.reshape(batch_dim, total_x.shape[-1]).t() + ) + else: + grad_weight1 = None + if process_group is not None and ctx.needs_input_grad[0]: + handle_grad_input.wait() + return ( + grad_input, + grad_weight1, + grad_bias1, + grad_weight2, + grad_bias2, + None, + None, + 
None, + None, + None, + None, + None, + ) + + +def fused_mlp_func( + x: Tensor, + weight1: Tensor, + weight2: Tensor, + bias1: Optional[Tensor] = None, + bias2: Optional[Tensor] = None, + activation: str = "gelu_approx", + save_pre_act: bool = True, + return_residual: bool = False, + checkpoint_lvl: int = 0, + heuristic: int = 0, + process_group: Optional[ProcessGroup] = None, + sequence_parallel: bool = True, +): + assert activation in ["gelu_approx", "relu", "sqrelu"] + dtype_eligible = x.dtype in [torch.float16, torch.bfloat16] or ( + x.dtype == torch.float32 and torch.is_autocast_enabled() + ) + # If we save pre-activation, dimension must be divisible by 128 (relu) or 8 (gelu) + dim_eligible = not save_pre_act or (x.shape[-1] % (128 if activation == "relu" else 8) == 0) + if ( + x.is_cuda + and weight1.is_cuda + and weight2.is_cuda + and (bias1 is None or bias1.is_cuda) + and (bias2 is None or bias2.is_cuda) + and dtype_eligible + and dim_eligible + ): + return FusedMLPFunc.apply( + x, + weight1, + bias1, + weight2, + bias2, + activation, + save_pre_act, + return_residual, + checkpoint_lvl, + heuristic, + process_group, + sequence_parallel, + ) + else: + assert process_group is None + pre_act = F.linear(x, weight1, bias1) + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else partial(F.relu, inplace=True) + ) + output1 = activation_fn(pre_act) + output2 = F.linear(output1, weight2, bias2) + return output2 if not return_residual else (output2, x) + + +class FusedMLP(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + bias1=True, + bias2=True, + activation="gelu_approx", + return_residual=False, + checkpoint_lvl=0, + heuristic="auto", + device=None, + dtype=None, + ): + """ + If process_group is not None, we're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul, gelu, then matmul. + Finally we do a reduce_scatter of the output. + + checkpoint_lvl (increasing lvl means slower but more memory saving): + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute pre_act and gelu_out in the bwd + heuristic: + -1: don't fuse gemm + gelu (separate kernel) + 0..4: use this heuristic for the algo section in the fused gemm + gelu + 'auto': heuristic will be picked automatically: + For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf. + For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16. + For H100, we set heuristic=-1 for both fp16 and bf16 as the fused cuBlasLt implementation + is slower than the unfused version. + return_residual: whether to return the input x along with the output. This is for + performance reason: for post-norm architecture, returning the input allows us + to fuse the backward of nn.Linear with the residual connection. 
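+
+        Example (an illustrative sketch only; shapes, dtype, and device are assumptions):
+            mlp = FusedMLP(1024, 4096, activation="gelu_approx").cuda().half()
+            x = torch.randn(2, 512, 1024, device="cuda", dtype=torch.float16)
+            y = mlp(x)  # same shape as x, since out_features defaults to in_features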
+ """ + assert checkpoint_lvl in [0, 1, 2] + assert activation in ["gelu_approx", "relu", "sqrelu"] + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features * 4 + self.activation = activation + self.return_residual = return_residual + self.checkpoint_lvl = checkpoint_lvl + self.heuristic = heuristic if activation != "sqrelu" else -1 + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs) + + def forward(self, x, process_group=None): + dtype = x.dtype if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype() + if self.heuristic == "auto": + if self.activation == "gelu_approx": + if torch.cuda.get_device_capability("cuda") == (9, 0): + heuristic = -1 + else: + cuda_ver = tuple(map(int, torch.version.cuda.split("."))) + heuristic = 0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1) + else: + heuristic = 0 + else: + heuristic = self.heuristic + out = fused_mlp_func( + x, + self.fc1.weight, + self.fc2.weight, + self.fc1.bias, + self.fc2.bias, + activation=self.activation, + save_pre_act=self.training, + return_residual=self.return_residual, + checkpoint_lvl=self.checkpoint_lvl, + heuristic=heuristic, + process_group=process_group, + ) + if self.return_residual: + out, x = out + if process_group is not None: + out = reduce_scatter(out, process_group) + return out if not self.return_residual else (out, x) + + +class ParallelFusedMLP(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + activation="gelu_approx", + process_group: ProcessGroup = None, + bias1=True, + bias2=True, + sequence_parallel=True, + checkpoint_lvl=0, + heuristic="auto", + device=None, + dtype=None, + ): + """ + process_group is required. We're doing Tensor Parallel with sequence parallelism: + we do an all_gather of x before doing the matmul, gelu, then matmul. + Finally we do a reduce_scatter of the output. + + checkpoint_lvl (increasing lvl means slower but more memory saving): + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute pre_act and gelu_out in the bwd + heuristic: + -1: don't fuse gemm + gelu (separate kernel) + 0..4: use this heuristic for the algo section in the fused gemm + gelu + 'auto': heuristic will be picked automatically: + For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf. + For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16. 
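+
+        Example (an illustrative sketch; assumes torch.distributed is already initialized
+        with a CUDA-capable backend and that each rank owns one GPU):
+            pg = torch.distributed.new_group()
+            mlp = ParallelFusedMLP(1024, 4096, process_group=pg).cuda().half()
+            x = torch.randn(2048, 1024, device="cuda", dtype=torch.float16)  # this rank's token shard
+            y = mlp(x)  # reduce-scattered (sequence-parallel) output, same shape as x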
+ """ + assert checkpoint_lvl in [0, 1, 2] + assert activation in ["gelu_approx", "relu", "sqrelu"] + assert process_group is not None + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features * 4 + self.activation = activation + self.process_group = process_group + self.sequence_parallel = sequence_parallel + self.checkpoint_lvl = checkpoint_lvl + self.heuristic = heuristic if activation != "sqrelu" else -1 + self.fc1 = ColumnParallelLinear( + in_features, hidden_features, process_group, bias=bias1, **factory_kwargs + ) + self.fc2 = RowParallelLinear( + hidden_features, out_features, process_group, bias=bias2, **factory_kwargs + ) + + def forward(self, x): + dtype = x.dtype if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype() + if self.heuristic == "auto": + if self.activation == "gelu_approx": + cuda_ver = tuple(map(int, torch.version.cuda.split("."))) + heuristic = 0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1) + else: + heuristic = 0 + else: + heuristic = self.heuristic + out = fused_mlp_func( + x, + self.fc1.weight, + self.fc2.weight, + self.fc1.bias, + self.fc2.bias, + activation=self.activation, + save_pre_act=self.training, + checkpoint_lvl=self.checkpoint_lvl, + heuristic=heuristic, + process_group=self.process_group, + sequence_parallel=self.sequence_parallel, + ) + reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce + return reduce_fn(out, self.process_group) diff --git a/fused_dense_cuda.cu b/fused_dense_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..32600e205ff09396509d77c20f70387ff90a3538 --- /dev/null +++ b/fused_dense_cuda.cu @@ -0,0 +1,717 @@ +// Adapted from https://github.com/NVIDIA/apex/blob/master/csrc/fused_dense_cuda.cu +#include +#include +#include +#include +#include +#include +#include + +/* Includes, cuda */ +#include +#include + +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11000 +#include +#endif + +// FP16 Tensor core wrapper around cublas GEMMEx +cublasStatus_t gemm_bias( + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + const float* alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + const float* beta, + at::Half* C, + int64_t ldc) { + return cublasGemmEx( + handle, + transa, + transb, + m, + n, + k, + alpha, + A, + CUDA_R_16F, + lda, + B, + CUDA_R_16F, + ldb, + beta, + C, + CUDA_R_16F, + ldc, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +} + +// BF16 Tensor core wrapper around cublas GEMMEx +cublasStatus_t gemm_bias( + cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + const float* alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + const float* beta, + at::BFloat16* C, + int64_t ldc) { + return cublasGemmEx( + handle, + transa, + transb, + m, + n, + k, + alpha, + A, + CUDA_R_16BF, + lda, + B, + CUDA_R_16BF, + ldb, + beta, + C, + CUDA_R_16BF, + ldc, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +} + +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + +template +int gemm_bias_act_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const Dtype* A, + int64_t lda, + const Dtype* B, + int64_t ldb, + const Dtype* bias, + Dtype* C, + int64_t ldc, + void* pre_act, + bool is_gelu, + int 
heuristic, + void *lt_workspace, + size_t workspaceSize + ) { + static_assert(std::is_same::value || std::is_same::value, + "gemm_bias_act_lt only supports fp16 and bf16"); + bool save_pre_act = pre_act != nullptr; + float beta = 0.0; + cudaDataType_t abcType = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; + + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + cublasLtMatmulDescOpaque_t operationDesc = {}; + cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {}; + cublasLtMatmulPreferenceOpaque_t preference = {}; + + int returnedResults = 0; + constexpr int requestedAlgoCount = 5; + cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0}; + // constexpr int requestedAlgoCount = 1; + // cublasLtMatmulHeuristicResult_t heuristicResult = {}; + cublasLtEpilogue_t epilogue = is_gelu + ? (save_pre_act ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU) + : (save_pre_act ? CUBLASLT_EPILOGUE_RELU_AUX : CUBLASLT_EPILOGUE_RELU); + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t + // for details about defaults; here we just set the transforms for + // A and B. + status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (save_pre_act) { + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &pre_act, sizeof(pre_act)); + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc)); + } + + if (bias != nullptr) { + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + epilogue = is_gelu + ? (save_pre_act ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS : CUBLASLT_EPILOGUE_GELU_BIAS) + : (save_pre_act ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS : CUBLASLT_EPILOGUE_RELU_BIAS); + } else { + epilogue = is_gelu + ? (save_pre_act ? CUBLASLT_EPILOGUE_GELU_AUX : CUBLASLT_EPILOGUE_GELU) + : (save_pre_act ? CUBLASLT_EPILOGUE_RELU_AUX : CUBLASLT_EPILOGUE_RELU); + } + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + + // Create matrix descriptors. Not setting any extra attributes. + status = cublasLtMatrixLayoutInit( + &Adesc, abcType, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit( + &Bdesc, abcType, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit(&Cdesc, abcType, m, n, ldc); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // Create preference handle; In general, extra attributes can be + // used here to disable tensor ops or to make sure algo selected + // will work with badly aligned A, B, C. 
However, for simplicity + // here we assume A,B,C are always well aligned (e.g., directly + // come from cudaMalloc) + status = cublasLtMatmulPreferenceInit(&preference); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // We just need the best available heuristic to try and run matmul. + // There is no guarantee that this will work. For example, if A is + // badly aligned, you can request more (e.g. 32) algos and try to + // run them one by one until something works. + status = cublasLtMatmulAlgoGetHeuristic( + ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults); + // ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (returnedResults == 0) { + status = CUBLAS_STATUS_NOT_SUPPORTED; + goto CLEANUP; + } + status = cublasLtMatmul(ltHandle, + &operationDesc, + &alpha, + A, + &Adesc, + B, + &Bdesc, + &beta, + C, + &Cdesc, + C, + &Cdesc, + // &heuristicResult.algo, + // TD [2022-04-29] Somehow algo 0 and 2 are a lot slower than other algos + &heuristicResult[heuristic].algo, + // NULL, + lt_workspace, + workspaceSize, + at::cuda::getCurrentCUDAStream()); + +CLEANUP: + // Descriptors are no longer needed as all GPU work was already + // enqueued. + return status == CUBLAS_STATUS_SUCCESS ? 0 : 1; +} + +template int gemm_bias_act_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + const at::Half* bias, + at::Half* C, + int64_t ldc, + void* pre_act, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +template int gemm_bias_act_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + const at::BFloat16* bias, + at::BFloat16* C, + int64_t ldc, + void* pre_act, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +template +int gemm_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const Dtype* A, + int64_t lda, + const Dtype* B, + int64_t ldb, + Dtype* C, + int64_t ldc, + Dtype* bgrad, + void *lt_workspace, + size_t workspaceSize) { + static_assert(std::is_same::value || std::is_same::value, + "gemm_bgradb_lt only supports fp16 and bf16"); + float beta = 0.0; + cudaDataType_t abcType = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; + + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + cublasLtMatmulDescOpaque_t operationDesc = {}; + cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {}; + cublasLtMatmulPreferenceOpaque_t preference = {}; + + int returnedResults = 0; + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_DEFAULT; + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t + // for details about defaults; here we just set the transforms for + // A and B. 
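+  // Note: when bgrad is non-null we also request the CUBLASLT_EPILOGUE_BGRADB epilogue
+  // below, so the bias gradient is reduced in the same cuBlasLt call as the wgrad GEMM
+  // rather than in a separate reduction pass.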
+ status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (bgrad != nullptr) { + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + epilogue = CUBLASLT_EPILOGUE_BGRADB; + } + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + + // Create matrix descriptors. Not setting any extra attributes. + status = cublasLtMatrixLayoutInit( + &Adesc, abcType, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit( + &Bdesc, abcType, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit(&Cdesc, abcType, m, n, ldc); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // Create preference handle; In general, extra attributes can be + // used here to disable tensor ops or to make sure algo selected + // will work with badly aligned A, B, C. However, for simplicity + // here we assume A,B,C are always well aligned (e.g., directly + // come from cudaMalloc) + status = cublasLtMatmulPreferenceInit(&preference); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // We just need the best available heuristic to try and run matmul. + // There is no guarantee that this will work. For example, if A is + // badly aligned, you can request more (e.g. 32) algos and try to + // run them one by one until something works. + status = cublasLtMatmulAlgoGetHeuristic( + ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, 1, &heuristicResult, &returnedResults); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (returnedResults == 0) { + status = CUBLAS_STATUS_NOT_SUPPORTED; + goto CLEANUP; + } + status = cublasLtMatmul(ltHandle, + &operationDesc, + &alpha, + A, + &Adesc, + B, + &Bdesc, + &beta, + C, + &Cdesc, + C, + &Cdesc, + //&heuristicResult.algo, + NULL, + lt_workspace, + workspaceSize, + at::cuda::getCurrentCUDAStream()); + +CLEANUP: + // Descriptors are no longer needed as all GPU work was already + // enqueued. + return status == CUBLAS_STATUS_SUCCESS ? 
0 : 1; +} + + +template int gemm_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + at::Half* C, + int64_t ldc, + at::Half* bgrad, + void *lt_workspace, + size_t workspaceSize); + +template int gemm_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + at::BFloat16* C, + int64_t ldc, + at::BFloat16* bgrad, + void *lt_workspace, + size_t workspaceSize); + +template +int gemm_dact_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const Dtype* A, + int64_t lda, + const Dtype* B, + int64_t ldb, + const void* pre_act, + Dtype* C, + int64_t ldc, + Dtype* bgrad, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize) { + static_assert(std::is_same::value || std::is_same::value, + "gemm_dact_bgradb_lt only supports fp16 and bf16"); + float beta = 0.0; + cudaDataType_t abcType = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; + + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + cublasLtMatmulDescOpaque_t operationDesc = {}; + cublasLtMatrixLayoutOpaque_t Adesc = {}, Bdesc = {}, Cdesc = {}; + cublasLtMatmulPreferenceOpaque_t preference = {}; + + int returnedResults = 0; + constexpr int requestedAlgoCount = 5; + cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0}; + cublasLtEpilogue_t epilogue = is_gelu ? CUBLASLT_EPILOGUE_DGELU_BGRAD : CUBLASLT_EPILOGUE_DRELU_BGRAD; + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t + // for details about defaults; here we just set the transforms for + // A and B. + status = cublasLtMatmulDescInit(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transa)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bgrad, sizeof(bgrad)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &pre_act, sizeof(pre_act)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &ldc, sizeof(ldc)); + + status = cublasLtMatmulDescSetAttribute(&operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue)); + if (status != CUBLAS_STATUS_SUCCESS) { + goto CLEANUP; + } + + // Create matrix descriptors. Not setting any extra attributes. + status = cublasLtMatrixLayoutInit( + &Adesc, abcType, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit( + &Bdesc, abcType, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? 
n : k, ldb); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatrixLayoutInit(&Cdesc, abcType, m, n, ldc); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // Create preference handle; In general, extra attributes can be + // used here to disable tensor ops or to make sure algo selected + // will work with badly aligned A, B, C. However, for simplicity + // here we assume A,B,C are always well aligned (e.g., directly + // come from cudaMalloc) + status = cublasLtMatmulPreferenceInit(&preference); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + status = cublasLtMatmulPreferenceSetAttribute( + &preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspaceSize, sizeof(workspaceSize)); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + // We just need the best available heuristic to try and run matmul. + // There is no guarantee that this will work. For example, if A is + // badly aligned, you can request more (e.g. 32) algos and try to + // run them one by one until something works. + status = cublasLtMatmulAlgoGetHeuristic( + ltHandle, &operationDesc, &Adesc, &Bdesc, &Cdesc, &Cdesc, &preference, requestedAlgoCount, heuristicResult, &returnedResults); + if (status != CUBLAS_STATUS_SUCCESS) goto CLEANUP; + + if (returnedResults == 0) { + status = CUBLAS_STATUS_NOT_SUPPORTED; + goto CLEANUP; + } + status = cublasLtMatmul(ltHandle, + &operationDesc, + &alpha, + A, + &Adesc, + B, + &Bdesc, + &beta, + C, + &Cdesc, + C, + &Cdesc, + //&heuristicResult.algo, + &heuristicResult[heuristic].algo, + // NULL, + lt_workspace, + workspaceSize, + at::cuda::getCurrentCUDAStream()); + +CLEANUP: + // Descriptors are no longer needed as all GPU work was already + // enqueued. + return status == CUBLAS_STATUS_SUCCESS ? 0 : 1; +} + +template int gemm_dact_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* A, + int64_t lda, + const at::Half* B, + int64_t ldb, + const void* pre_act, + at::Half* C, + int64_t ldc, + at::Half* bgrad, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +template int gemm_dact_bgradb_lt( + cublasOperation_t transa, + cublasOperation_t transb, + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::BFloat16* A, + int64_t lda, + const at::BFloat16* B, + int64_t ldb, + const void* pre_act, + at::BFloat16* C, + int64_t ldc, + at::BFloat16* bgrad, + bool is_gelu, + int heuristic, + void *lt_workspace, + size_t workspaceSize); + +#endif + +template +int linear_bias_wgrad_cuda(const T *input, const T *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, T *d_weight, T *d_bias, void *lt_workspace, size_t workspaceSize) { + const float alpha = 1.0; + const float beta_zero = 0.0; + int status = 1; +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + status = gemm_bgradb_lt( + // (cublasLtHandle_t)handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_features, + out_features, + batch_size, + alpha, + input, + in_features, + d_output, + out_features, + d_weight, + in_features, + d_bias, + lt_workspace, + workspaceSize); +#endif + + if (status != 0){ + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); + status = gemm_bias( + handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_features, + out_features, + batch_size, + &alpha, + input, + in_features, + d_output, + out_features, + &beta_zero, + d_weight, + in_features); + // TD [2023-01-17]: I can't call Pytorch's gemm for now, due to linking error + // 
https://discuss.pytorch.org/t/how-can-i-use-the-function-at-gemm-float/95341 + // at::cuda::blas::gemm( + // 'N', + // 'T', + // in_features, + // out_features, + // batch_size, + // alpha, + // input, + // in_features, + // d_output, + // out_features, + // beta_zero, + // d_weight, + // in_features); + } + + return status; +} + +template +int linear_act_forward_cuda(const T *input, const T *weight, const T *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *output, void *pre_act, void *lt_workspace, size_t workspaceSize) { + int status = 1; +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + status = gemm_bias_act_lt( + CUBLAS_OP_T, + CUBLAS_OP_N, + out_features, + batch_size, + in_features, + /*alpha=*/1.0, + weight, + in_features, + input, + in_features, + bias, + output, + out_features, + pre_act, + is_gelu, + heuristic, + lt_workspace, + workspaceSize); + return status; +#else + return 1; +#endif +} + +template +int bias_act_linear_dgrad_bgrad_cuda(const T *weight, const T *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, T *d_input, T *d_bias, void *lt_workspace, size_t workspaceSize) { + const float alpha = 1.0; + int status = 1; +#if defined(CUBLAS_VERSION) && CUBLAS_VERSION >= 11600 + status = gemm_dact_bgradb_lt( + CUBLAS_OP_N, + CUBLAS_OP_N, + in_features, + batch_size, + out_features, + alpha, + weight, + in_features, + d_output, + out_features, + pre_act, + d_input, + in_features, + d_bias, + is_gelu, + heuristic, + lt_workspace, + workspaceSize); +#endif + return status; + +} + +template int linear_bias_wgrad_cuda(const at::Half *input, const at::Half *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, at::Half *d_weight, at::Half *d_bias, void *lt_workspace, size_t workspaceSize); +template int linear_bias_wgrad_cuda(const at::BFloat16 *input, const at::BFloat16 *d_output, int64_t in_features, int64_t batch_size, int64_t out_features, at::BFloat16 *d_weight, at::BFloat16 *d_bias, void *lt_workspace, size_t workspaceSize); + +template int linear_act_forward_cuda(const at::Half *input, const at::Half *weight, const at::Half *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::Half *output, void *pre_act, void *lt_workspace, size_t workspaceSize); +template int linear_act_forward_cuda(const at::BFloat16 *input, const at::BFloat16 *weight, const at::BFloat16 *bias, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::BFloat16 *output, void *pre_act, void *lt_workspace, size_t workspaceSize); + +template int bias_act_linear_dgrad_bgrad_cuda(const at::Half *weight, const at::Half *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::Half *d_input, at::Half *d_bias, void *lt_workspace, size_t workspaceSize); +template int bias_act_linear_dgrad_bgrad_cuda(const at::BFloat16 *weight, const at::BFloat16 *d_output, const void *pre_act, int64_t in_features, int64_t batch_size, int64_t out_features, bool is_gelu, int heuristic, at::BFloat16 *d_input, at::BFloat16 *d_bias, void *lt_workspace, size_t workspaceSize); \ No newline at end of file diff --git a/fused_softmax.cpp b/fused_softmax.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2aaed913314d33a9a7b5092cfc27b51a28a65f89 --- /dev/null +++ b/fused_softmax.cpp @@ -0,0 +1,148 @@ +/* coding=utf-8 + 
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor); + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +int get_batch_per_block_cuda( + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads); + +torch::Tensor fwd( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor) { + AT_ASSERTM(input.dim() == 4, "expected 4D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM(mask.dim() == 4, "expected 4D tensor"); + + return fwd_cuda(input, mask, scale_factor); +} + +torch::Tensor bwd( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) { + + AT_ASSERTM(output_grads.dim() == 4, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 4, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +int get_batch_per_block( + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads) { + return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads); +} + +} // end namespace scaled_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + float scale_factor); + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor); + +torch::Tensor fwd(torch::Tensor const& input, float scale_factor) { + AT_ASSERTM(input.dim() == 3, "expected 3D tensor"); + AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) || + (input.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return fwd_cuda(input, scale_factor); +} + +torch::Tensor bwd( + torch::Tensor const& output_grads, + torch::Tensor const& softmax_results, + float scale_factor) { + + AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor"); + AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor"); + + AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) || + (output_grads.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + 
AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) || + (softmax_results.scalar_type() == at::ScalarType::BFloat16), + "Only fp16 and bf16 are supported"); + + return bwd_cuda(output_grads, softmax_results, scale_factor); +} + +} // end namespace scaled_upper_triang_masked_softmax +} // end namespace fused_softmax +} // end namespace multihead_attn + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("scaled_masked_softmax_forward", + &multihead_attn::fused_softmax::scaled_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + + m.def("scaled_masked_softmax_backward", + &multihead_attn::fused_softmax::scaled_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); + + m.def("scaled_masked_softmax_get_batch_per_block", + &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block, + "Return Batch per block size." + ); + + m.def("scaled_upper_triang_masked_softmax_forward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd, + "Self Multihead Attention scaled, time masked softmax -- Forward."); + m.def("scaled_upper_triang_masked_softmax_backward", + &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd, + "Self Multihead Attention scaled, time masked softmax -- Backward."); +} diff --git a/fused_softmax.py b/fused_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..382f94f092cd3999b2378dfc2fa165a7c08017e2 --- /dev/null +++ b/fused_softmax.py @@ -0,0 +1,201 @@ +# [2022-10-23] Copied from https://github.com/NVIDIA/apex/blob/master/apex/transformer/functional/fused_softmax.py +# for benchmarking. +# We added support for seqlen=2k and seqlen=4k + +# coding=utf-8 +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +from apex._autocast_utils import _cast_if_autocast_enabled +from apex.transformer.enums import AttnMaskType +from fused_softmax_lib import ( + scaled_masked_softmax_backward, + scaled_masked_softmax_forward, + scaled_masked_softmax_get_batch_per_block, + scaled_upper_triang_masked_softmax_backward, + scaled_upper_triang_masked_softmax_forward, +) + + +class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): + """ + Fused operation which performs following three operations in sequence + 1. Scale the tensor. + 2. Apply upper triangular mask (typically used in gpt models). + 3. Perform softmax. 
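+
+    Expects a 3D input of shape (attn_batches, sq, sk) with sq == sk; the
+    scaled_upper_triang_masked_softmax wrapper below handles the 4D -> 3D reshape.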
+ """ + + @staticmethod + def forward(ctx, inputs, scale): + scale_t = torch.tensor([scale]) + softmax_results = scaled_upper_triang_masked_softmax_forward(inputs, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_upper_triang_masked_softmax_backward( + output_grads, softmax_results, scale_t[0] + ) + return input_grads, None + + +def scaled_upper_triang_masked_softmax(inputs, _, scale): + b, np, sq, sk = inputs.size() + assert sq == sk, "causal mask is only for self attention" + # Reshaping input to 3D tensor (attn_batches, sq, sk) + inputs = inputs.view(-1, sq, sk) + args = _cast_if_autocast_enabled(inputs, scale) + with torch.cuda.amp.autocast(enabled=False): + probs = ScaledUpperTriangMaskedSoftmax.apply(*args) + return probs.view(b, np, sq, sk) + + +# NOTE (mkozuki): `ScaledMaskedSoftmax` somehow doesn't work well with `torch.cuda.amp.custom_fwd`. +# Without `cast_inputs` kwarg, somehow inputs are not cast to dtype used in the autocast context. +# So I needed to manually write two `torch.autograd.Function` inheritances. +# Fused operation which performs following three operations in sequence +# 1. Scale the tensor. +# 2. Apply the mask. +# 3. Perform softmax. +class ScaledMaskedSoftmax(torch.autograd.Function): + @staticmethod + def forward(ctx, inputs, mask, scale): + scale_t = torch.tensor([scale]) + softmax_results = scaled_masked_softmax_forward(inputs, mask, scale_t[0]) + ctx.save_for_backward(softmax_results, scale_t) + return softmax_results + + @staticmethod + def backward(ctx, output_grads): + softmax_results, scale_t = ctx.saved_tensors + input_grads = scaled_masked_softmax_backward(output_grads, softmax_results, scale_t[0]) + return input_grads, None, None + + +def scaled_masked_softmax(inputs, mask, scale): + # input is 4D tensor (b, np, sq, sk) + args = _cast_if_autocast_enabled(inputs, mask, scale) + with torch.cuda.amp.autocast(enabled=False): + return ScaledMaskedSoftmax.apply(*args) + + +class FusedScaleMaskSoftmax(torch.nn.Module): + """ + fused operation: scaling + mask + softmax + + Arguments: + input_in_fp16: flag to indicate if input in fp16 data format. + input_in_bf16: flag to indicate if input in bf16 data format. + attn_mask_type: attention mask type (pad or causal) + scaled_masked_softmax_fusion: flag to indicate user want to use softmax fusion + mask_func: mask function to be applied. + softmax_in_fp32: if true, softmax in performed at fp32 precision. + scale: scaling factor used in input tensor scaling. 
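+
+    Example (an illustrative sketch; the mask function and tensor shapes are assumptions):
+        softmax = FusedScaleMaskSoftmax(
+            input_in_fp16=True,
+            input_in_bf16=False,
+            attn_mask_type=AttnMaskType.causal,
+            scaled_masked_softmax_fusion=True,
+            mask_func=lambda scores, mask: scores.masked_fill(mask, -10000.0),
+            softmax_in_fp32=True,
+            scale=None,
+        )
+        probs = softmax(scores, None)  # scores: [b, np, sq, sk] in fp16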
+ """ + + def __init__( + self, + input_in_fp16, + input_in_bf16, + attn_mask_type, + scaled_masked_softmax_fusion, + mask_func, + softmax_in_fp32, + scale, + ): + super().__init__() + self.input_in_fp16 = input_in_fp16 + self.input_in_bf16 = input_in_bf16 + if self.input_in_fp16 and self.input_in_bf16: + raise RuntimeError("both fp16 and bf16 flags cannot be active at the same time.") + self.input_in_float16 = self.input_in_fp16 or self.input_in_bf16 + self.attn_mask_type = attn_mask_type + self.scaled_masked_softmax_fusion = scaled_masked_softmax_fusion + self.mask_func = mask_func + self.softmax_in_fp32 = softmax_in_fp32 + self.scale = scale + + if not (self.scale is None or softmax_in_fp32): + raise RuntimeError("softmax should be in fp32 when scaled") + + if self.scaled_masked_softmax_fusion: + if self.attn_mask_type == AttnMaskType.causal: + self.fused_softmax_func = scaled_upper_triang_masked_softmax + elif self.attn_mask_type == AttnMaskType.padding: + self.fused_softmax_func = scaled_masked_softmax + else: + raise ValueError("Invalid attn_mask_type.") + + def forward(self, input, mask): + # [b, np, sq, sk] + assert input.dim() == 4 + + if self.is_kernel_available(mask, *input.size()): + return self.forward_fused_softmax(input, mask) + else: + return self.forward_torch_softmax(input, mask) + + def is_kernel_available(self, mask, b, np, sq, sk): + attn_batches = b * np + + if ( + self.scaled_masked_softmax_fusion # user want to fuse + and self.input_in_float16 # input must be fp16 + and ( + self.attn_mask_type == AttnMaskType.causal + or (self.attn_mask_type == AttnMaskType.padding and mask is not None) + ) + and 16 < sk <= 8192 # sk must be 16 ~ 8192 + and sq % 4 == 0 # sq must be divisor of 4 + and sk % 4 == 0 # sk must be divisor of 4 + and attn_batches % 4 == 0 # np * b must be divisor of 4 + ): + if 0 <= sk <= 8192: + batch_per_block = self.get_batch_per_block(sq, sk, b, np) + + if self.attn_mask_type == AttnMaskType.causal: + if attn_batches % batch_per_block == 0: + return True + else: + if sq % batch_per_block == 0: + return True + return False + + def forward_fused_softmax(self, input, mask): + # input.shape = [b, np, sq, sk] + scale = self.scale if self.scale is not None else 1.0 + return self.fused_softmax_func(input, mask, scale) + + def forward_torch_softmax(self, input, mask): + if self.input_in_float16 and self.softmax_in_fp32: + input = input.float() + + if self.scale is not None: + input = input * self.scale + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) + + if self.input_in_float16 and self.softmax_in_fp32: + if self.input_in_fp16: + probs = probs.half() + else: + probs = probs.bfloat16() + + return probs + + @staticmethod + def get_batch_per_block(sq, sk, b, np): + return scaled_masked_softmax_get_batch_per_block(sq, sk, b, np) diff --git a/fusedlamb-ds.yaml b/fusedlamb-ds.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4fffbfb3e5fa97124841810a3ed66207bf1bf6a --- /dev/null +++ b/fusedlamb-ds.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: deepspeed.ops.lamb.FusedLamb diff --git a/fusedlamb.yaml b/fusedlamb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8d7b2b8ede4c8733b5dde4dca6b75a942e88882 --- /dev/null +++ b/fusedlamb.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: apex.optimizers.FusedLAMB diff --git a/generate_kernels.py b/generate_kernels.py new file mode 100644 index 
0000000000000000000000000000000000000000..119e34956f97107adddd82b57425d14d762246eb --- /dev/null +++ b/generate_kernels.py @@ -0,0 +1,108 @@ +# Copied from Driss Guessous's PR in PyTorch: https://github.com/pytorch/pytorch/pull/105602 + +# This file is run to generate the kernel instantiations for the flash_attn kernels +# They are written to several files in order to speed up compilation + +import argparse +import itertools +from dataclasses import dataclass +from pathlib import Path +from typing import List, Optional + +DTYPE_MAP = { + "fp16": "cutlass::half_t", + "bf16": "cutlass::bfloat16_t", +} + +SM = [80] # Sm80 kernels support up to +HEAD_DIMENSIONS = [32, 64, 96, 128, 160, 192, 256] +IS_CAUSAL = ["false", "true"] +KERNEL_IMPL_TEMPLATE_FWD = """#include "flash_fwd_launch_template.h" + +template<> +void run_mha_fwd_<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream) {{ + run_mha_fwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +}} +""" + +KERNEL_IMPL_TEMPLATE_FWD_SPLIT = """#include "flash_fwd_launch_template.h" + +template void run_mha_fwd_splitkv_dispatch<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_fwd_params ¶ms, cudaStream_t stream); +""" + +KERNEL_IMPL_TEMPLATE_BWD = """#include "flash_bwd_launch_template.h" + +template<> +void run_mha_bwd_<{DTYPE}, {HEAD_DIM}, {IS_CAUSAL}>(Flash_bwd_params ¶ms, cudaStream_t stream) {{ + run_mha_bwd_hdim{HEAD_DIM}<{DTYPE}, {IS_CAUSAL}>(params, stream); +}} +""" + + +@dataclass +class Kernel: + sm: int + dtype: str + head_dim: int + is_causal: bool + direction: str + + @property + def template(self) -> str: + if self.direction == "fwd": + return KERNEL_IMPL_TEMPLATE_FWD.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + ) + elif self.direction == "bwd": + return KERNEL_IMPL_TEMPLATE_BWD.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + ) + else: + return KERNEL_IMPL_TEMPLATE_FWD_SPLIT.format( + DTYPE=DTYPE_MAP[self.dtype], HEAD_DIM=self.head_dim, IS_CAUSAL=self.is_causal + ) + + @property + def filename(self) -> str: + return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_{'causal_' if self.is_causal == 'true' else ''}sm{self.sm}.cu" + + +def get_all_kernels() -> List[Kernel]: + for direction in ["fwd", "fwd_split", "bwd"]: + for dtype, head_dim, is_causal, sm in itertools.product(DTYPE_MAP.keys(), HEAD_DIMENSIONS, IS_CAUSAL, SM): + yield Kernel(sm=sm, dtype=dtype, head_dim=head_dim, is_causal=is_causal, direction=direction) + + +def write_kernel(kernel: Kernel, autogen_dir: Path) -> None: + prelude = """// Copyright (c) 2024, Tri Dao. +// Splitting the different head dimensions to different files to speed up compilation. +// This file is auto-generated. 
See "generate_kernels.py"\n +""" + (autogen_dir / kernel.filename).write_text(prelude + kernel.template) + + +def main(output_dir: Optional[str]) -> None: + if output_dir is None: + output_dir = Path(__file__).parent + else: + output_dir = Path(output_dir) + + for kernel in get_all_kernels(): + write_kernel(kernel, output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate_kernels", + description="Generate the flash_attention kernels template instantiations", + ) + # Set an optional output directory + parser.add_argument( + "-o", + "--output_dir", + required=False, + help="Where to generate the kernels " + " will default to the current directory ", + ) + args = parser.parse_args() + main(args.output_dir) diff --git a/generation.py b/generation.py new file mode 100644 index 0000000000000000000000000000000000000000..0d9120c386596f25b544391af10dc479cf00c822 --- /dev/null +++ b/generation.py @@ -0,0 +1,740 @@ +# Copyright (c) 2023, Tri Dao. +# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/forward_step.py#L31 +import gc +import time +from collections import namedtuple +from dataclasses import dataclass, field +from functools import partial +from typing import Callable, Optional, Sequence, Union + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import Tensor +from torch.profiler import ProfilerActivity, profile, record_function + +try: + from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput +except ImportError: + GreedySearchDecoderOnlyOutput = namedtuple("GreedySearchDecoderOnlyOutput", ["sequences", "scores"]) + SampleDecoderOnlyOutput = namedtuple("SampleDecoderOnlyOutput", ["sequences", "scores"]) + + +@dataclass +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + max_seqlen: int + max_batch_size: int + seqlen_offset: int = 0 + batch_size_offset: int = 0 + key_value_memory_dict: dict = field(default_factory=dict) + lengths_per_sample: Optional[Tensor] = None + + def reset(self, max_seqlen, max_batch_size): + self.max_seqlen = max_seqlen + self.max_batch_size = max_batch_size + self.seqlen_offset = 0 + if self.lengths_per_sample is not None: + self.lengths_per_sample.zero_() + + +# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py +# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231 +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf. Done in-place.""" + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(indices_to_remove, float("-Inf")) + + +# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py +# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170 +def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf. Done in-place.""" + if top_p <= 0.0 or top_p >= 1.0: + return + # First sort and calculate cumulative sum of probabilities. 
+ sorted_logits, sorted_indices = torch.sort(logits, descending=False) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs <= (1 - top_p) + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + 1, sorted_indices, sorted_indices_to_remove + ) + logits.masked_fill_(indices_to_remove, float("-inf")) + + +def sample(logits, top_k=1, top_p=0.0, temperature=1.0): + """Sample from top-k logits. + Arguments: + logits: Tensor of shape (batch_size, vocab_size) + """ + if top_k == 1: # Short-circuit for greedy decoding + return logits.argmax(dim=-1) + else: + if top_p > 0.0: + assert top_p <= 1.0, "top-p should be in (0, 1]." + if top_k > 0: + top_k = min(top_k, logits.size(-1)) # Safety check + logits_top, indices = torch.topk(logits, top_k, dim=-1) + if temperature != 1.0: + logits_top /= temperature + modify_logits_for_top_p_filtering(logits_top, top_p) + return indices[ + torch.arange(indices.shape[0], device=indices.device), + torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1), + ] + else: + # Clone so that when we modify for top_p we don't change the original logits + logits_top = logits / temperature if temperature != 1.0 else logits.clone() + modify_logits_for_top_p_filtering(logits_top, top_p) + return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze( + dim=-1 + ) + + +@torch.inference_mode() +def decode( + input_ids, + model, + max_length, + top_k=1, + top_p=0.0, + temperature=1.0, + eos_token_id=None, + teacher_outputs=None, + vocab_size=None, + tensor_parallel=1, + cg=False, + enable_timing=False, +): + """Decoding, either greedy or with top-k or top-p sampling. + If top-k = 0, don't limit the number of candidates (pure sampling). + Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first, + then top-p. + We assume that all sequences in the same batch have the same length. + + Arguments: + input_ids: (batch, seq_len) + max_length: int + teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the + logits, the next token is taken from the teacher_outputs. Useful for testing. 
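+        cg: if True, capture the per-step decoding forward pass as a CUDA graph and replay it
+            (see update_graph_cache below); the prompt-processing step is always run eagerly.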
+ Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields: + sequences: (batch, max_length) + scores: tuples of (batch, vocab_size) + """ + batch_size, seqlen_og = input_ids.shape + teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0 + if cg: + if not hasattr(model, "_decoding_cache"): + model._decoding_cache = None + model._decoding_cache = update_graph_cache( + model, + model._decoding_cache, + batch_size, + seqlen_og, + max_length, + tensor_parallel=tensor_parallel, + ) + inference_params = model._decoding_cache.inference_params + inference_params.reset(max_length, batch_size) + else: + inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size) + + def get_logits(input_ids, inference_params): + decoding = inference_params.seqlen_offset > 0 + if decoding: + position_ids = torch.full( + (batch_size, 1), + inference_params.seqlen_offset, + dtype=torch.long, + device=input_ids.device, + ) + else: + position_ids = None + if not cg or not decoding: + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=1, + ).logits.squeeze(dim=1) + else: + logits = model._decoding_cache.run( + input_ids, position_ids, inference_params.seqlen_offset + ).squeeze(dim=1) + return logits[..., :vocab_size] if vocab_size is not None else logits + + def sample_tokens(logits, inference_params): + if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset: + token = sample(logits, top_k=top_k, top_p=top_p, temperature=temperature) + else: + token = teacher_outputs[:, inference_params.seqlen_offset] + # return rearrange(token, "b -> b 1") + return token.unsqueeze(1) + + def should_stop(current_token, inference_params): + if inference_params.seqlen_offset == 0: + return False + if eos_token_id is not None and (current_token == eos_token_id).all(): + return True + if inference_params.seqlen_offset >= max_length - 1: + return True + return False + + start = torch.cuda.Event(enable_timing=enable_timing) + end = torch.cuda.Event(enable_timing=enable_timing) + + if enable_timing: + if tensor_parallel > 1: + torch.distributed.barrier() + start.record() + scores, sequences = [], [input_ids] + while not should_stop(sequences[-1], inference_params): + scores.append(get_logits(sequences[-1], inference_params)) + inference_params.seqlen_offset += sequences[-1].shape[1] + sequences.append(sample_tokens(scores[-1], inference_params)) + if enable_timing: + end.record() + if tensor_parallel > 1: + torch.distributed.barrier() + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms") + output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput + return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores)) + + +def sample_speculative(logits, logits_draft, tokens_draft, top_k=1, top_p=0.0, temperature=1.0): + """Algorithm 1 from [1] + [1] Fast Inference from Transformers via Speculative Decoding + Yaniv Leviathan, Matan Kalman, Yossi Matias + https://arxiv.org/abs/2211.17192 + + Arguments: + logits: Tensor of shape (batch_size, seqlen + 1, vocab_size) + logits_draft: Tensor of shape (batch_size, seqlen, vocab_size) + tokens_draft: Tensor of shape (batch_size, seqlen) + Return: + tokens: Tensor of shape (batch_size, seqlen + 1) + num_generated_tokens: Tensor of shape (batch_size), with value in [1, seqlen + 1]. 
+ For each sequence in the batch, the number of valid tokens that were sampled by + speculative sampling. + """ + batch, seqlen_p_1, vocab_size = logits.shape + seqlen = seqlen_p_1 - 1 + assert logits_draft.shape == (batch, seqlen, vocab_size) + assert tokens_draft.shape == (batch, seqlen) + assert tokens_draft.dtype in [torch.int64, torch.int32] + # TODO: if top_k = 1 we can simplify things and only work with indices + if top_p > 0.0: + assert top_p <= 1.0, "top-p should be in (0, 1]." + # Clone so that when we modify for top_p we don't change the original logits + logits = logits / temperature if temperature != 1.0 else logits.clone() + logits_draft = logits_draft / temperature if temperature != 1.0 else logits_draft.clone() + if top_k > 0: + top_k = min(top_k, logits.size(-1)) # Safety check + modify_logits_for_top_k_filtering(logits, top_k) + modify_logits_for_top_k_filtering(logits_draft, top_k) + modify_logits_for_top_p_filtering(logits, top_p) + modify_logits_for_top_p_filtering(logits_draft, top_p) + probs = torch.softmax(logits, dim=-1) + probs_draft = torch.softmax(logits_draft, dim=-1) + gather = lambda probs, tokens: rearrange( + probs.gather(dim=-1, index=rearrange(tokens, "... -> ... 1")), "... 1 -> ..." + ) + # (batch, seqlen) + accepted = torch.rand(batch, seqlen, device=probs.device) * gather( + probs_draft, tokens_draft + ) <= gather(probs[:, :-1], tokens_draft) + accepted_all = accepted.all(dim=-1) + # (batch,) + first_rejected_idx = torch.where(accepted_all, seqlen, accepted.int().argmin(dim=-1)) + probs_diff = torch.clamp(probs[:, :-1] - probs_draft, min=0.0) + # torch.multinomial can deal with unnormalized probabilities + # probs_diff /= probs_diff.sum(dim=-1, keepdim=True) + resample_probs = torch.cat([probs_diff, probs[:, -1:]], dim=1) + resample_probs = rearrange( + resample_probs.gather(dim=1, index=repeat(first_rejected_idx, "b -> b 1 d", d=vocab_size)), + "b 1 d -> b d", + ) + resample = torch.multinomial(resample_probs, num_samples=1).squeeze(dim=-1) # (batch,) + tokens = F.pad(tokens_draft, (0, 1)) + tokens[:, first_rejected_idx] = resample + return tokens, first_rejected_idx + 1 + + +@torch.inference_mode() +def decode_speculative( + input_ids, + model, + model_draft, + max_length, + speculative_lookahead=3, + top_k=1, + top_p=0.0, + temperature=1.0, + eos_token_id=None, + vocab_size=None, + tensor_parallel=1, + cg=False, + enable_timing=False, + debug=False, +): + """ + TD: WIP, for my own understanding, lightly tested. Only support batch_size == 1 for now. + + Speculative decoding, either greedy or with top-k or top-p sampling. + If top-k = 0, don't limit the number of candidates (pure sampling). + Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first, + then top-p. + We assume that all sequences in the same batch have the same length. 
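+    At each iteration the draft model proposes up to @speculative_lookahead tokens, the main
+    model scores all of them in a single forward pass, and sample_speculative then accepts a
+    (possibly empty) prefix of those proposals plus one token sampled from the main model.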
+ + Arguments: + input_ids: (batch, seq_len) + max_length: int + Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields: + sequences: (batch, max_length) + scores: tuples of (batch, vocab_size) + """ + batch_size, seqlen_og = input_ids.shape + assert batch_size == 1, "Speculative decoding implementation only supports batch_size=1" + assert eos_token_id is None, "Speculative decoding implementation doesn't support eos_token_id" + if cg: + if not hasattr(model_draft, "_decoding_cache"): + model_draft._decoding_cache = None + model_draft._decoding_cache = update_graph_cache( + model_draft, + model_draft._decoding_cache, + batch_size, + seqlen_og, + max_length, + # draft model needs to process either 1 or 2 tokens at a time + decoding_seqlens=(1, 2), + tensor_parallel=tensor_parallel, + ) + inference_params_draft = model_draft._decoding_cache.inference_params + inference_params_draft.reset(max_length, batch_size) + if not hasattr(model, "_decoding_cache"): + model._decoding_cache = None + model._decoding_cache = update_graph_cache( + model, + model._decoding_cache, + batch_size, + seqlen_og, + max_length, + decoding_seqlens=range(1, speculative_lookahead + 2), + tensor_parallel=tensor_parallel, + ) + inference_params = model._decoding_cache.inference_params + inference_params.reset(max_length, batch_size) + else: + inference_params_draft = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size) + inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size) + + def get_logits(input_ids, inference_params, model, num_last_tokens=1, cg=False): + decoding = inference_params.seqlen_offset > 0 + if decoding: + seqlen = input_ids.shape[1] + # if inference_params.lengths_per_sample is None: + # TODO: in the case of batched decoding where each sequence has a different length, + # we need to compute the position_ids for each sequence using lengths_per_sample + if True: + cache_seqlens = torch.full( + (input_ids.shape[0],), + inference_params.seqlen_offset, + dtype=torch.int32, + device=input_ids.device, + ) + else: + cache_seqlens = inference_params.lengths_per_sample + position_ids = cache_seqlens[:, None] + torch.arange( + seqlen, dtype=torch.long, device=input_ids.device + ) + else: + position_ids = None + if not cg or not decoding: + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=num_last_tokens, + ).logits + else: + # NOTE: careful, CUDA graph is set to have num_last_tokens=input_ids.shape[1]. + # This might not be compatible the num_last_tokens used here. + assert num_last_tokens <= input_ids.shape[1] + logits = model._decoding_cache.run( + input_ids, position_ids, inference_params.seqlen_offset + )[:, -num_last_tokens:] + return logits[..., :vocab_size] if vocab_size is not None else logits + + def sample_tokens(input_ids, get_logits_fn, inference_params, sample_fn, num_tokens=1): + """Sample `num_tokens` tokens from the model, given the previous logits. + Also return the logits of the sampled tokens. + Arguments: + input_ids: (batch, seqlen) + Return: + tokens: (batch, num_tokens) + scores: (batch, num_tokens), which contains @previous_logits and the logits of the next + (num_tokens - 1) tokens. The logits of the last token isn't computed. 
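+            (In other words, scores[:, i] holds the logits from which tokens[:, i] was sampled;
+            the logits of the final sampled token are not computed here.)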
+ """ + assert num_tokens >= 1 + sequences, scores = [input_ids], [] + for i in range(num_tokens): + scores.append(get_logits_fn(sequences[-1], inference_params)[:, -1]) + inference_params.seqlen_offset += sequences[-1].shape[1] + sequences.append(sample_fn(scores[-1]).unsqueeze(1)) + return torch.cat(sequences[1:], dim=1), torch.stack(scores, dim=1) + + sampling_kwargs = dict(top_k=top_k, top_p=top_p, temperature=temperature) + sample_fn = partial(sample, **sampling_kwargs) + get_logits_main = partial(get_logits, model=model, cg=cg) + get_logits_draft = partial(get_logits, model=model_draft, cg=cg) + sample_tokens_main = partial( + sample_tokens, + get_logits_fn=get_logits_main, + sample_fn=sample_fn, + inference_params=inference_params, + ) + sample_tokens_draft = partial( + sample_tokens, + get_logits_fn=get_logits_draft, + sample_fn=sample_fn, + inference_params=inference_params_draft, + ) + + if debug: + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("gpt2") + if enable_timing: + if tensor_parallel > 1: + torch.distributed.barrier() + torch.cuda.synchronize() + start = time.time() + + sequences, scores = [input_ids], [] + num_main_model_calls = 0 + num_draft_tokens = 0 + num_accepted_tokens_history = [] + if seqlen_og >= max_length - 1: + # Don't do speculative sampling, just sample 1 token from the model + tokens, scores_new = sample_tokens_main(input_ids, num_tokens=1) + sequences.append(tokens) + scores.append(scores_new) + else: + # Sample from draft model, which produces @n_spec_tokens, and @model + # will then use to produce between 1 and 1 + @n_spec_tokens tokens. + # We want seqlen_og + 1 + @n_spec_tokens to be <= @max_length. + n_spec_tokens = min(speculative_lookahead, max_length - seqlen_og - 1) + tokens_draft, scores_draft = sample_tokens_draft(input_ids, num_tokens=n_spec_tokens) + num_draft_tokens += n_spec_tokens + if debug: + scores_draft_ref = model_draft( + torch.cat([input_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((scores_draft - scores_draft_ref[:, :-1]).abs().max()) + + # Evaluate the draft tokens with the model + logits = get_logits_main( + torch.cat([input_ids, tokens_draft], dim=1), + inference_params, + num_last_tokens=n_spec_tokens + 1, + ) + num_main_model_calls += 1 + if debug: + logits_ref = model( + torch.cat([input_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((logits - logits_ref).abs().max()) + # breakpoint() + tokens, num_generated_tokens = sample_speculative( + logits, scores_draft, tokens_draft, **sampling_kwargs + ) + num_accepted_tokens_history.append(num_generated_tokens - 1) + if debug: + print(tokens) + print(num_generated_tokens) + # breakpoint() + # TODO: we're using the fact that batch_size == 1 + # TODO: check eos_token_id + sequences.append(tokens[:1, : num_generated_tokens[0]]) + scores.append(logits[:1, : num_generated_tokens[0]]) + # Note that @model has not evaluated the last sampled token yet, so we'll need to pass + # that in the next time we call @model. 
+ num_generated = num_generated_tokens[0].item() + inference_params.seqlen_offset = seqlen_og + num_generated - 1 + inference_params_draft.seqlen_offset = ( + inference_params.seqlen_offset - 1 + if num_generated > 1 + else inference_params.seqlen_offset + ) + if debug: + cur_ids = torch.cat([input_ids, sequences[-1]], dim=1) + scores_ref = model(cur_ids, num_last_tokens=num_generated_tokens[0].item() + 1).logits + print((scores[-1] - scores_ref[:, :-1]).abs().max()) + # breakpoint() + + while True: + # seqlen_offset is total length generated - 1 + if inference_params.seqlen_offset >= max_length - 1: + break + if inference_params.seqlen_offset >= max_length - 2: + # Don't do speculative sampling, just sample 1 token from the model + tokens, scores_new = sample_tokens_main(sequences[-1][:, -1:], num_tokens=1) + sequences.append(tokens) + scores.append(scores_new) + break + # Sample from draft model + n_spec_tokens = min( + speculative_lookahead, max_length - inference_params_draft.seqlen_offset - 2 + ) + # If the main model accepts all the draft tokens, plus it samples one new token, + # then at the next iteration the draft model need to evaluate the logits of the last draft + # token and the logits of the newly sampled token. So here we pass in the last 2 tokens + # of sequences[-1]. + # This exception is when the main model rejects all the draft tokens, in which case we + # will only have 1 token to pass in. + tokens_draft, scores_draft = sample_tokens_draft( + sequences[-1][:, -2:], num_tokens=n_spec_tokens + ) + num_draft_tokens += n_spec_tokens + if debug: + scores_draft_ref = model_draft( + torch.cat([cur_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((scores_draft - scores_draft_ref[:, :-1]).abs().max()) + # breakpoint() + # Evaluate the draft tokens with the model + logits = get_logits_main( + torch.cat([sequences[-1][:, -1:], tokens_draft], dim=1), + inference_params, + num_last_tokens=n_spec_tokens + 1, + ) # (batch, n_spec_tokens + 1, vocab_size) + num_main_model_calls += 1 + if debug: + logits_ref = model( + torch.cat([cur_ids, tokens_draft], dim=1), num_last_tokens=n_spec_tokens + 1 + ).logits + print((logits - logits_ref).abs().max()) + # breakpoint() + tokens, num_generated_tokens = sample_speculative( + logits, scores_draft, tokens_draft, **sampling_kwargs + ) + num_accepted_tokens_history.append(num_generated_tokens - 1) + if debug: + print(tokens) + print(num_generated_tokens) + # breakpoint() + sequences.append(tokens[:1, : num_generated_tokens[0]]) + scores.append(logits[:1, : num_generated_tokens[0]]) + # We've evaluated 1 token from sequences[-1][:, -1:] above, plus + # num_generated_tokens[0].item() - 1 tokens from the draft model. 
+ num_generated = num_generated_tokens[0].item() + inference_params.seqlen_offset += num_generated + inference_params_draft.seqlen_offset = ( + inference_params.seqlen_offset - 1 + if num_generated > 1 + else inference_params.seqlen_offset + ) + if debug: + cur_ids = torch.cat([cur_ids, sequences[-1]], dim=1) + scores_ref = model(cur_ids, num_last_tokens=num_generated_tokens[0].item() + 1).logits + print((scores[-1] - scores_ref[:, :-1]).abs().max()) + # breakpoint() + + if enable_timing: + if tensor_parallel > 1: + torch.distributed.barrier() + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + print(f"Number of calls to main model: {num_main_model_calls}") + print( + f"Acceptance rate: {torch.cat(num_accepted_tokens_history).sum().item() / num_draft_tokens * 100:.2f}%" + ) + sequences = torch.cat(sequences, dim=1) + scores = torch.cat(scores, dim=1) + if debug: + scores_ref = model(sequences).logits + print((scores - scores_ref[:, seqlen_og - 1 : -1]).abs().max()) + output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput + return output_cls(sequences=sequences, scores=scores) + + +class GenerationMixin: + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + raise NotImplementedError + + def generate( + self, + input_ids, + max_length, + top_k=1, + top_p=0.0, + temperature=1.0, + return_dict_in_generate=False, + output_scores=False, + **kwargs, + ): + output = decode( + input_ids, self, max_length, top_k=top_k, top_p=top_p, temperature=temperature, **kwargs + ) + if not output_scores: + output.scores = None + return output if return_dict_in_generate else output.sequences + + +def allocate_inference_cache( + max_batch_size, + max_seqlen, + nheads, + headdim, + layers: Union[int, Sequence], + device, + dtype=torch.float16, +): + assert dtype in [torch.float16, torch.bfloat16, torch.float32] + kv_cache_shape = (max_batch_size, max_seqlen, 2, nheads, headdim) + if isinstance(layers, int): + layers = range(layers) + return {i: torch.empty(kv_cache_shape, device=device, dtype=dtype) for i in layers} + + +@dataclass +class DecodingCGCache: + max_batch_size: int = 0 + max_seqlen: int = 0 + device = None + dtype = None + callables: dict = field(default_factory=dict) + mempool = None + inference_params: Optional[InferenceParams] = None + run: Optional[Callable] = None + + +@torch.inference_mode() +def update_graph_cache( + model, + cache, + batch_size, + seqlen_og, + max_seqlen, + decoding_seqlens=(1,), + tensor_parallel=1, + dtype=None, + n_warmups=2, +): + if cache is None: + cache = DecodingCGCache() + param_example = next(iter(model.parameters())) + device = param_example.device + if dtype is None: + dtype = param_example.dtype + if ( + (device, dtype) != (cache.device, cache.dtype) + or batch_size > cache.max_batch_size + or max_seqlen > cache.max_seqlen + ): # Invalidate the cache + cache.callables = {} + cache.mempool = None + cache.inference_params = None + gc.collect() + cache.device, cache.dtype = device, dtype + cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen + if hasattr(model, "allocate_inference_cache"): + inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype) + else: + headdim = getattr( + model.config, + "head_dim", + model.config.hidden_size // model.config.num_attention_heads, + ) + inf_cache = allocate_inference_cache( + batch_size, + max_seqlen, + model.config.num_attention_heads // tensor_parallel, + headdim, + 
model.config.num_hidden_layers, + device, + dtype, + ) + lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device) + cache.inference_params = InferenceParams( + max_seqlen=max_seqlen, + max_batch_size=batch_size, + seqlen_offset=seqlen_og, + key_value_memory_dict=inf_cache, + lengths_per_sample=lengths_per_sample, + ) + cache.mempool = torch.cuda.graphs.graph_pool_handle() + for decoding_seqlen in decoding_seqlens: + if (batch_size, decoding_seqlen) not in cache.callables: + cache.callables[batch_size, decoding_seqlen] = capture_graph( + model, + cache.inference_params, + batch_size, + max_seqlen, + decoding_seqlen=decoding_seqlen, + mempool=cache.mempool, + n_warmups=n_warmups, + ) + + def dispatch(input_ids, position_ids, seqlen): + batch_size, decoding_seqlen = input_ids.shape[:2] + return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen) + + cache.run = dispatch + cache.inference_params.seqlen_offset = 0 # Reset so it's not confusing + return cache + + +def capture_graph( + model, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2 +): + device = next(iter(model.parameters())).device + input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device) + position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device) + seqlen_offset_og = inference_params.seqlen_offset + inference_params.seqlen_offset = max_seqlen - decoding_seqlen + inference_params.lengths_per_sample[:] = inference_params.seqlen_offset + + # Warmup before capture + s = torch.cuda.Stream() + s.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(s): + for _ in range(n_warmups): + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=decoding_seqlen, + ).logits + s.synchronize() + # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0, + # which requires that graph launch and non-captured launch to not overlap (I think, + # that's how I interpret the documentation). I'm not sure if this is required. + if torch.distributed.is_initialized(): + torch.distributed.barrier() + torch.cuda.current_stream().wait_stream(s) + # Captures the graph + # To allow capture, automatically sets a side stream as the current stream in the context + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=mempool): + logits = model( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + num_last_tokens=decoding_seqlen, + ).logits + + def run(new_input_ids, new_position_ids, seqlen): + inference_params.lengths_per_sample[:] = seqlen + input_ids.copy_(new_input_ids) + position_ids.copy_(new_position_ids) + graph.replay() + return logits.clone() + + inference_params.seqlen_offset = seqlen_offset_og + return run diff --git a/gpt.py b/gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..3539f8f901695b29454358972d65031f4c4fabeb --- /dev/null +++ b/gpt.py @@ -0,0 +1,1080 @@ +# Copyright (c) 2024, Tri Dao. 
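+#
+# Example usage (illustrative sketch, not part of the original file; the import path and the
+# exact set of fusion flags are assumptions):
+#
+#     import torch
+#     from transformers import GPT2Config
+#     from flash_attn.models.gpt import GPTLMHeadModel
+#
+#     config = GPT2Config.from_pretrained("gpt2")
+#     config.use_flash_attn = True
+#     config.fused_bias_fc = True
+#     config.fused_dropout_add_ln = True
+#     model = GPTLMHeadModel.from_pretrained("gpt2", config, device="cuda", dtype=torch.float16)
+#     input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")
+#     out = model.generate(input_ids, max_length=64, top_k=40, temperature=0.8)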
+ +import logging +import math +import re +from collections import OrderedDict, namedtuple +from collections.abc import Sequence +from functools import partial +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import GPT2Config + +from flash_attn.models.bigcode import remap_state_dict_hf_bigcode +from flash_attn.models.falcon import remap_state_dict_hf_falcon +from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox +from flash_attn.models.gptj import remap_state_dict_hf_gptj +from flash_attn.models.llama import remap_state_dict_hf_llama +from flash_attn.models.opt import remap_state_dict_hf_opt +from flash_attn.modules.block import Block, ParallelBlock +from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings +from flash_attn.modules.mha import MHA, ParallelMHA +from flash_attn.modules.mlp import ( + FusedMLP, + GatedMlp, + Mlp, + ParallelFusedMLP, + ParallelGatedMlp, + ParallelMLP, +) +from flash_attn.ops.activations import sqrelu_fwd +from flash_attn.utils.distributed import ( + all_gather, + all_gather_raw, + get_dim_for_local_rank, + sync_shared_params, +) +from flash_attn.utils.generation import GenerationMixin +from flash_attn.utils.pretrained import state_dict_from_pretrained + +try: + from flash_attn.ops.fused_dense import ColumnParallelLinear +except ImportError: + ColumnParallelLinear = None + +try: + from flash_attn.ops.triton.mlp import FusedDenseSqreluDense +except ImportError: + FusedDenseSqreluDense = None + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm +except ImportError: + layer_norm_fn, RMSNorm = None, None + +logger = logging.getLogger(__name__) + + +def create_mixer_cls(config, layer_idx=None, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + attn_scale_power = 0.5 if not getattr(config, "mup_scale_qk_dot_by_d", False) else 1.0 + softmax_scale = 1.0 if not config.scale_attn_weights else (head_dim ** (-attn_scale_power)) + softmax_scale *= getattr(config, "mup_attn_multiplier", 1.0) + if config.scale_attn_by_inverse_layer_idx: + assert layer_idx is not None + softmax_scale /= float(layer_idx + 1) + dwconv = getattr(config, "attn_dwconv", False) + if dwconv: + assert process_group is None, "TensorParallel MHA does not support dwconv yet" + qkv_proj_bias = getattr(config, "qkv_proj_bias", True) + out_proj_bias = getattr(config, "out_proj_bias", True) + rotary_emb_dim = int(getattr(config, "rotary_emb_fraction", 0.0) * head_dim) + rotary_emb_base = getattr(config, "rotary_emb_base", 10000.0) + rotary_emb_scale_base = getattr(config, "rotary_emb_scale_base", None) + rotary_emb_interleaved = getattr(config, "rotary_emb_interleaved", False) + use_alibi = getattr(config, "use_alibi", False) + window_size = getattr(config, "window_size", (-1, -1)) + use_flash_attn = getattr(config, "use_flash_attn", False) + fused_bias_fc = getattr(config, "fused_bias_fc", False) + if not fused_bias_fc: + assert process_group is None, "TensorParallel MHA requires fused_bias_fc" + mha_cls = MHA if process_group is None else ParallelMHA + serial_kwargs = ( + {"fused_bias_fc": fused_bias_fc, "dwconv": dwconv} if process_group is None else {} + ) + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group 
is not None + else {} + ) + num_heads_kv = getattr(config, "n_head_kv", None) + mixer_cls = partial( + mha_cls, + num_heads=config.num_attention_heads, + num_heads_kv=num_heads_kv, + qkv_proj_bias=qkv_proj_bias, + out_proj_bias=out_proj_bias, + dropout=config.attn_pdrop, + softmax_scale=softmax_scale, + causal=True, + layer_idx=layer_idx, + rotary_emb_dim=rotary_emb_dim, + rotary_emb_base=rotary_emb_base, + rotary_emb_scale_base=rotary_emb_scale_base, + rotary_emb_interleaved=rotary_emb_interleaved, + use_alibi=use_alibi, + window_size=window_size, + use_flash_attn=use_flash_attn, + **serial_kwargs, + **parallel_kwargs, + **factory_kwargs, + ) + return mixer_cls + + +def create_mlp_cls(config, layer_idx=None, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + mlp_fc1_bias = getattr(config, "mlp_fc1_bias", True) + mlp_fc2_bias = getattr(config, "mlp_fc2_bias", True) + fused_mlp = getattr(config, "fused_mlp", False) + if fused_mlp: + assert config.activation_function in [ + "gelu_new", + "gelu_fast", + "gelu_approx", + "gelu_pytorch_tanh", + "relu", + "sqrelu", + ] + fused_dense_sqrelu_dense = getattr(config, "fused_dense_sqrelu_dense", False) + if fused_dense_sqrelu_dense: + assert config.activation_function == "sqrelu", ( + "fused_dense_sqrelu_dense only " "supports approximate activation_function sqrelu" + ) + assert not (fused_dense_sqrelu_dense and fused_mlp) + if not fused_mlp and not fused_dense_sqrelu_dense: + assert config.activation_function in [ + "gelu", + "gelu_new", + "gelu_fast", + "gelu_approx", + "gelu_pytorch_tanh", + "relu", + "sqrelu", + "glu", + "swiglu", + "geglu", + ] + if config.activation_function in ["glu", "swiglu", "geglu"]: + activation = ( + F.sigmoid + if config.activation_function == "glu" + else (F.silu if config.activation_function == "swiglu" else F.gelu) + ) + mlp_cls = GatedMlp if process_group is None else ParallelGatedMlp + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group is not None + else {} + ) + mlp_multiple_of = getattr(config, "mlp_multiple_of", 128) + mlp_cls = partial( + mlp_cls, + hidden_features=config.n_inner, + activation=activation, + bias1=mlp_fc1_bias, + bias2=mlp_fc2_bias, + multiple_of=mlp_multiple_of, + **parallel_kwargs, + **factory_kwargs, + ) + else: + if config.activation_function == "relu": + activation = partial(F.relu, inplace=True) + elif config.activation_function == "sqrelu": + activation = sqrelu_fwd + else: + approximate = ( + "tanh" + if config.activation_function + in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"] + else "none" + ) + activation = partial(F.gelu, approximate=approximate) + mlp_cls = Mlp if process_group is None else ParallelMLP + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group is not None + else {} + ) + mlp_cls = partial( + mlp_cls, + hidden_features=config.n_inner, + activation=activation, + bias1=mlp_fc1_bias, + bias2=mlp_fc2_bias, + **parallel_kwargs, + **factory_kwargs, + ) + else: + mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0) + # mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer + if isinstance(mlp_checkpoint_lvl, Sequence): + assert layer_idx is not None + mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx] + if fused_mlp: + if FusedMLP is None: + raise ImportError("fused_dense is not 
installed") + activation = ( + "gelu_approx" + if config.activation_function + in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"] + else config.activation_function + ) + mlp_cls = FusedMLP if process_group is None else ParallelFusedMLP + parallel_kwargs = ( + { + "process_group": process_group, + "sequence_parallel": getattr(config, "sequence_parallel", True), + } + if process_group is not None + else {} + ) + mlp_cls = partial( + mlp_cls, + hidden_features=config.n_inner, + activation=activation, + checkpoint_lvl=mlp_checkpoint_lvl, + bias1=mlp_fc1_bias, + bias2=mlp_fc2_bias, + **parallel_kwargs, + **factory_kwargs, + ) + elif fused_dense_sqrelu_dense: + if process_group is not None: + assert fused_mlp, "Tensor Parallel is not implemented for FusedDenseSqreluDense" + assert FusedDenseSqreluDense is not None + mlp_cls = partial( + FusedDenseSqreluDense, + hidden_features=config.n_inner, + checkpoint_lvl=mlp_checkpoint_lvl, + **factory_kwargs, + ) + else: + raise RuntimeError("MLP type not supported") + return mlp_cls + + +def create_block(config, layer_idx=None, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + sequence_parallel = getattr(config, "sequence_parallel", True) + mixer_cls = create_mixer_cls(config, layer_idx, process_group=process_group, **factory_kwargs) + mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs) + use_rms_norm = getattr(config, "rms_norm", False) + norm_cls = partial( + nn.LayerNorm if not use_rms_norm else RMSNorm, + eps=config.layer_norm_epsilon, + **factory_kwargs, + ) + # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable + residual_in_fp32 = getattr(config, "residual_in_fp32", False) + resid_dropout1 = config.resid_pdrop if layer_idx is None or layer_idx > 0 else config.embd_pdrop + prenorm = getattr(config, "prenorm", True) + parallel_block = getattr(config, "parallel_block", False) + if not parallel_block: + block = Block( + config.hidden_size, + mixer_cls, + mlp_cls, + norm_cls=norm_cls, + prenorm=prenorm, + resid_dropout1=resid_dropout1, + resid_dropout2=config.resid_pdrop, + fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False), + residual_in_fp32=residual_in_fp32, + sequence_parallel=sequence_parallel and process_group is not None, + mark_shared_params=process_group is not None, + ) + else: + assert prenorm + block = ParallelBlock( + config.hidden_size, + mixer_cls, + mlp_cls, + norm_cls=norm_cls, + resid_dropout1=resid_dropout1, + resid_dropout2=config.resid_pdrop, + tied_norm=getattr(config, "parallel_block_tied_norm", False), + fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False), + residual_in_fp32=residual_in_fp32, + sequence_parallel=sequence_parallel and process_group is not None, + mark_shared_params=process_group is not None, + ) + block.layer_idx = layer_idx + return block + + +class GPTPreTrainedModel(nn.Module): + """An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__() + if not isinstance(config, GPT2Config): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. 
" + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + self.config = config + + @classmethod + def from_pretrained( + cls, + model_name, + config, + *args, + strict=True, + device=None, + dtype=None, + world_size=1, + rank=0, + **kwargs, + ): + """ + Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + """ + # Instantiate model. + model = cls(config, *args, device=device, dtype=dtype, **kwargs) + # Load state_dict in cpu because we already initialized the model in GPU, and we don't + # want extra stuff taking up more GPU memory + state_dict = state_dict_from_pretrained(model_name, device="cpu", dtype=dtype) + if model_name.startswith("gpt2"): + state_dict = remap_state_dict_hf_gpt2(state_dict, config) + elif model_name.startswith("facebook/opt"): + state_dict = remap_state_dict_hf_opt(state_dict, config) + elif model_name.startswith("EleutherAI/gpt-j-") or model_name.startswith( + "togethercomputer/GPT-JT-" + ): + state_dict = remap_state_dict_hf_gptj(state_dict, config) + elif ( + model_name.startswith("EleutherAI/gpt-neox-") + or model_name.startswith("EleutherAI/pythia-") + or model_name.startswith("togethercomputer/RedPajama-INCITE-") + ): + state_dict = remap_state_dict_hf_gpt_neox(state_dict, config) + elif model_name.startswith("tiiuae/falcon-"): + state_dict = remap_state_dict_hf_falcon(state_dict, config) + elif model_name.startswith("meta-llama/Llama-"): + state_dict = remap_state_dict_hf_llama(state_dict, config) + elif model_name.startswith("bigcode/") or model_name.startswith("WizardLM/"): + state_dict = remap_state_dict_hf_bigcode(state_dict, config) + else: + raise NotImplementedError(f"Model {model_name} not supported") + if world_size > 1: + state_dict = shard_state_dict_tp(state_dict, config, world_size, rank) + load_return = model.load_state_dict(state_dict, strict=strict) + logger.info(load_return) + return model + + +# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454 +def _init_weights( + module, n_layer, initializer_range=0.02, mup_width_scale=1.0, rescale_prenorm_residual=True +): + mup_init_scale = math.sqrt(mup_width_scale) + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, std=initializer_range * mup_init_scale) + optim_cfg = getattr(module.weight, "_optim", {}) + optim_cfg.update({"lr_multiplier": mup_width_scale}) + setattr(module.weight, "_optim", optim_cfg) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=initializer_range) + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight", "fc2.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + nn.init.normal_( + p, mean=0.0, std=initializer_range * mup_init_scale / math.sqrt(2 * n_layer) + ) + + +class GPTModel(GPTPreTrainedModel): + def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None): + super().__init__(config) + factory_kwargs = {"device": device, "dtype": dtype} + self.process_group = process_group + self.sequence_parallel = getattr(config, "sequence_parallel", True) + assert config.activation_function in [ + "gelu", + "gelu_new", + "gelu_fast", + "gelu_approx", + "gelu_pytorch_tanh", + "relu", + "sqrelu", + "glu", + "swiglu", + "geglu", + ] + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + self.embeddings_multiplier = getattr(config, "mup_embeddings_multiplier", 1.0) + # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable + self.residual_in_fp32 = getattr(config, "residual_in_fp32", False) + # These 2 options are for OPT-350m + self.prenorm = getattr(config, "prenorm", True) + use_rms_norm = getattr(config, "rms_norm", False) + word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None) + # For GPT-J, GPT-NeoX + self.parallel_block = getattr(config, "parallel_block", False) + + if process_group is None: + self.embeddings = GPT2Embeddings( + config.hidden_size, + vocab_size, + config.max_position_embeddings, + word_embed_proj_dim=word_embed_proj_dim, + **factory_kwargs, + ) + else: + self.embeddings = ParallelGPT2Embeddings( + config.hidden_size, + vocab_size, + config.max_position_embeddings, + process_group=process_group, + sequence_parallel=self.sequence_parallel, + **factory_kwargs, + ) + + # We change the order of dropout, residual and layer norm: + # Instead of LN -> Attn / MLP -> Dropout -> Add, we do: + # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and + # the main branch (output of MLP). The model definition is unchanged, but the mapping of the + # nn.Dropout probabilities are changed. + # This is for performance reason: we can fuse dropout + add + layer_norm. + self.layers = nn.ModuleList( + [ + create_block(config, layer_idx=i, process_group=process_group, **factory_kwargs) + for i in range(config.num_hidden_layers) + ] + ) + rotary_emb_fraction = getattr(config, "rotary_emb_fraction", 0.0) + if rotary_emb_fraction > 0.0: # Tie all the RotaryEmbedding modules to share the same cos/sin cache + for layer in self.layers[1:]: + layer.mixer.rotary_emb = self.layers[0].mixer.rotary_emb + + self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False) + if self.fused_dropout_add_ln: + if layer_norm_fn is None: + raise ImportError("Triton is not installed") + if self.prenorm: + self.drop_f = nn.Dropout(config.resid_pdrop) + norm_cls = nn.LayerNorm if not use_rms_norm else RMSNorm + self.ln_f = norm_cls( + config.hidden_size, eps=config.layer_norm_epsilon, **factory_kwargs + ) + if process_group is not None: + for p in self.ln_f.parameters(): + # Mark the norm parameters as "shared_params" so that we sync their values at init. 
+ p._shared_params = True + # Mark the norm params as "sequence_parallel" so we run all-reduce on their grads. + if self.sequence_parallel: + p._sequence_parallel = True + + self.apply( + partial( + _init_weights, + n_layer=config.num_hidden_layers, + initializer_range=config.initializer_range, + mup_width_scale=getattr(config, "mup_width_scale", 1.0), + ) + ) + self.tie_weights() + + def tie_weights(self): + if self.process_group is not None: + sync_shared_params(self, self.process_group) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs) + for i, layer in enumerate(self.layers) + } + + def forward(self, input_ids, position_ids=None, inference_params=None): + # If using Tensor Parallel with sequence parallel, we combine the batch and the seqlen + # dimensions so that we can split on it easily, in case of small batch size. + # Only the attention layers need to know the seqlen. + embedding_kwargs = ( + {"combine_batch_seqlen_dim": True} + if self.process_group is not None and self.sequence_parallel + else {} + ) + hidden_states = self.embeddings(input_ids, position_ids=position_ids, **embedding_kwargs) + if self.embeddings_multiplier != 1.0: + hidden_states = hidden_states * self.embeddings_multiplier + if self.parallel_block: + hidden_states2 = None + residual = None + mixer_kwargs = ( + {"seqlen": input_ids.shape[1]} + if self.process_group is not None and self.sequence_parallel + else {} + ) + if inference_params is not None: + mixer_kwargs["inference_params"] = inference_params + for layer in self.layers: + if self.prenorm: + if not self.parallel_block: + hidden_states, residual = layer( + hidden_states, residual, mixer_kwargs=mixer_kwargs + ) + else: + hidden_states, hidden_states2, residual = layer( + hidden_states, hidden_states2, residual, mixer_kwargs=mixer_kwargs + ) + else: + hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs) + if self.prenorm: + if not self.fused_dropout_add_ln: + dropped = self.drop_f(hidden_states) + if not self.parallel_block: + residual = (dropped + residual) if residual is not None else dropped + else: + dropped2 = self.drop_f(hidden_states2) + residual = ( + (residual + dropped + dropped2) + if residual is not None + else dropped + dropped2 + ) + hidden_states = self.ln_f(residual.to(dtype=self.ln_f.weight.dtype)) + else: + # Set prenorm=False here since we don't need the residual + hidden_states = layer_norm_fn( + hidden_states, + self.ln_f.weight, + self.ln_f.bias, + residual=residual, + x1=None if not self.parallel_block else hidden_states2, + eps=self.ln_f.eps, + dropout_p=self.drop_f.p if self.training else 0.0, + prenorm=False, + is_rms_norm=isinstance(self.ln_f, RMSNorm) + ) + return hidden_states + + +class GPTLMHeadModel(GPTPreTrainedModel, GenerationMixin): + def __init__(self, config: GPT2Config, process_group=None, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__(config) + self.process_group = process_group + self.transformer = GPTModel(config, process_group=process_group, **factory_kwargs) + self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True) + lm_head_bias = getattr(config, "lm_head_bias", False) + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + # This option is for OPT-350m + word_embed_proj_dim = 
getattr(config, "word_embed_proj_dim", None) + embed_dim = config.n_embd if word_embed_proj_dim is None else word_embed_proj_dim + if word_embed_proj_dim is not None: + self.project_out = nn.Linear(config.n_embd, embed_dim, bias=False, **factory_kwargs) + else: + self.project_out = None + mup_width_scale = getattr(config, "mup_width_scale", 1.0) + mup_output_multiplier = getattr(config, "mup_output_multiplier", 1.0) + self.output_scale = mup_output_multiplier * mup_width_scale + if process_group is None: + self.lm_head = nn.Linear(embed_dim, vocab_size, bias=lm_head_bias, **factory_kwargs) + else: + if ColumnParallelLinear is None: + raise ImportError("fused_dense_lib is not installed") + self.lm_head = ColumnParallelLinear( + embed_dim, + vocab_size, + process_group, + bias=lm_head_bias, + sequence_parallel=getattr(config, "sequence_parallel", True), + **factory_kwargs, + ) + self.norm_head = getattr(config, "norm_head", False) + # Initialize weights and apply final processing + self.apply( + partial( + _init_weights, + n_layer=config.num_hidden_layers, + initializer_range=config.initializer_range, + mup_width_scale=mup_width_scale, + ) + ) + self.tie_weights() + + def tie_weights(self): + if self.tie_word_embeddings: + self.lm_head.weight = self.transformer.embeddings.word_embeddings.weight + if self.process_group is not None: + sync_shared_params(self, self.process_group) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return self.transformer.allocate_inference_cache( + batch_size, max_seqlen, dtype=dtype, **kwargs + ) + + def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0): + """ + input_ids: (batch, seqlen) int tensor + inference_params: for generation. Adapted from Megatron-LM (and Apex) + https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470 + num_last_tokens: if > 0, only return the logits for the last n tokens + """ + assert ( + input_ids.ndim == 2 + ), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}" + b, slen = input_ids.shape + hidden_states = self.transformer( + input_ids, position_ids=position_ids, inference_params=inference_params + ) + if inference_params is not None: + assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode" + if num_last_tokens > 0: + hidden_states = hidden_states[:, -num_last_tokens:] + if self.project_out is not None: + hidden_states = self.project_out(hidden_states) + if self.output_scale != 1.0: + hidden_states = hidden_states * self.output_scale + if not self.norm_head: + lm_logits = self.lm_head(hidden_states) + else: + lm_head_weight = F.normalize(self.lm_head.weight) + if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel: + hidden_states = all_gather(hidden_states, self.lm_head.process_group) + lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias) + # During inference, we want the full logit for sampling + if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None: + lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group) + lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... 
(n d)", b=b) + CausalLMOutput = namedtuple("CausalLMOutput", ["logits"]) + return CausalLMOutput(logits=lm_logits) + + def load_state_dict(self, state_dict, strict=True): + # Remapping from our checkpoints that used a different ordering of layers in the block + # Previous: Attn / MLP -> Dropout -> Add -> LN + # Current: Dropout -> Add -> LN -> Attn / MLP + if "transformer.ln_0.weight" in state_dict: + n_layers = len(self.transformer.layers) + ln_weight = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.weight") + ln_bias = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.bias") + state_dict["transformer.ln_f.weight"] = ln_weight + state_dict["transformer.ln_f.bias"] = ln_bias + for l in reversed(range(n_layers)): + ln_weight = state_dict.pop(f"transformer.layers.{l}.norm1.weight") + ln_bias = state_dict.pop(f"transformer.layers.{l}.norm1.bias") + state_dict[f"transformer.layers.{l}.norm2.weight"] = ln_weight + state_dict[f"transformer.layers.{l}.norm2.bias"] = ln_bias + if l > 0: + ln_weight = state_dict.pop(f"transformer.layers.{l - 1}.norm2.weight") + ln_bias = state_dict.pop(f"transformer.layers.{l - 1}.norm2.bias") + state_dict[f"transformer.layers.{l}.norm1.weight"] = ln_weight + state_dict[f"transformer.layers.{l}.norm1.bias"] = ln_bias + ln_weight = state_dict.pop("transformer.ln_0.weight") + ln_bias = state_dict.pop("transformer.ln_0.bias") + state_dict[f"transformer.layers.0.norm1.weight"] = ln_weight + state_dict[f"transformer.layers.0.norm1.bias"] = ln_bias + return super().load_state_dict(state_dict, strict=strict) + + +def shard_state_dict_tp(state_dict, config, world_size, rank): + """Convert the state_dict of a standard GPT model to the state_dict of a GPT model + with tensor parallel. + + This function modifies state_dict in place. + """ + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + assert vocab_size % world_size == 0 + assert config.hidden_size % world_size == 0 + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size + assert inner_dim % world_size == 0 + + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", n_head) + + embed_dim = config.hidden_size + head_dim = embed_dim // n_head + + def shard_first_dim(state_dict, key): + if key in state_dict: + x = state_dict[key] + dim = x.shape[0] // world_size + state_dict[key] = x[rank * dim : (rank + 1) * dim] + + def shard_last_dim(state_dict, key, multiple_of=1): + if key in state_dict: + x = state_dict[key] + dim_each_rank = [ + get_dim_for_local_rank(x.size(-1), world_size, local_rank, multiple_of) + for local_rank in range(world_size) + ] + beg, end = tuple(sum(dim_each_rank[:pos]) for pos in (rank, rank + 1)) + state_dict[key] = x[..., beg:end] + + def shard_gatedmlp_fc1_dim(state_dict, key): + if key in state_dict: + x = state_dict[key] + dim = x.shape[0] // world_size // 2 + state_dict[key] = rearrange( + rearrange(x, "(two o) ... -> two o ...", two=2)[:, rank * dim : (rank + 1) * dim], + "two o ... 
-> (two o) ...", + ) + + def shard_qkv_headdim(state_dict, key): + if key in state_dict: + n_head_each_rank = [ + get_dim_for_local_rank(n_head, world_size, local_rank) + for local_rank in range(world_size) + ] + n_head_kv_each_rank = [ + get_dim_for_local_rank(n_head_kv, world_size, local_rank) + for local_rank in range(world_size) + ] + + beg_n_head = sum(n_head_each_rank[:rank]) + end_n_head = sum(n_head_each_rank[: rank + 1]) + + beg_n_head_kv = sum(n_head_kv_each_rank[:rank]) + end_n_head_kv = sum(n_head_kv_each_rank[: rank + 1]) + + if n_head_kv == n_head: + x = rearrange(state_dict[key], "(three d) ... -> three d ...", three=3) + state_dict[key] = rearrange( + x[:, beg_n_head * head_dim : end_n_head * head_dim], + "three d ... -> (three d) ...", + ) + else: + x = rearrange( + state_dict[key], + "(nheadqkv headdim) ... -> nheadqkv headdim ...", + nheadqkv=n_head + 2 * n_head_kv, + ) + state_dict[key] = rearrange( + torch.cat( + [ + x[beg_n_head:end_n_head], + x[n_head + beg_n_head_kv : n_head + end_n_head_kv], + x[ + n_head + + n_head_kv + + beg_n_head_kv : n_head + + n_head_kv + + end_n_head_kv + ], + ], + dim=0, + ), + "nheadqkv headdim ... -> (nheadqkv headdim) ...", + ) + + shard_first_dim(state_dict, "transformer.embeddings.word_embeddings.weight") + if "lm_head.weight" in state_dict: + shard_first_dim(state_dict, "lm_head.weight") + if "transformer.embeddings.position_embeddings.weight" in state_dict: + shard_last_dim(state_dict, "transformer.embeddings.position_embeddings.weight") + for i in range(config.num_hidden_layers): + shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight") + shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias") + shard_last_dim( + state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", multiple_of=head_dim + ) + if rank != 0: + state_dict.pop(f"transformer.layers.{i}.mixer.out_proj.bias", None) + if config.activation_function in ["glu", "swiglu", "geglu"]: + shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight") + shard_gatedmlp_fc1_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias") + else: + shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight") + shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias") + shard_last_dim(state_dict, f"transformer.layers.{i}.mlp.fc2.weight") + if rank != 0: + state_dict.pop(f"transformer.layers.{i}.mlp.fc2.bias", None) + return state_dict + + +def combine_state_dicts_tp(state_dicts: List[Dict[str, torch.Tensor]], config: GPT2Config): + """Convert the list of sharded state_dict of a GPT model with tensor parallel to + the state_dict of a standard GPT model. + + This function is meant to be the "reverse" of shard_state_dict_tp. + + Precondition: + - state_dicts should be ordered in the same way as the shards were created. + """ + world_size = len(state_dicts) + keys = state_dicts[0].keys() + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + assert vocab_size % world_size == 0 + assert config.hidden_size % world_size == 0 + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size + assert inner_dim % world_size == 0 + assert config.hidden_size % config.n_head == 0 + headdim = config.hidden_size // config.n_head + + # Sometimes the word embeddings are sharded on the 0th dim, sometimes on the 1st dim. + # vocab_size // world_size coordinates are nonzero. 
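The helpers below reverse this sharding. As a quick, self-contained illustration of the convention (not code from this module; the names, sizes, and even divisibility by `world_size` are assumptions made for the sketch): column-parallel weights such as `mlp.fc1.weight` are sliced along dim 0, row-parallel weights such as `mlp.fc2.weight` along the last dim, and concatenating the per-rank slices in rank order recovers the original tensors.

```python
import torch

# Toy round-trip check of the shard/combine convention (illustrative only).
world_size, hidden, inner = 4, 8, 32
full = {
    "mlp.fc1.weight": torch.randn(inner, hidden),  # column-parallel: shard dim 0
    "mlp.fc2.weight": torch.randn(hidden, inner),  # row-parallel: shard last dim
}

def shard(sd, rank):
    d0 = sd["mlp.fc1.weight"].shape[0] // world_size
    d1 = sd["mlp.fc2.weight"].shape[-1] // world_size
    return {
        "mlp.fc1.weight": sd["mlp.fc1.weight"][rank * d0 : (rank + 1) * d0],
        "mlp.fc2.weight": sd["mlp.fc2.weight"][..., rank * d1 : (rank + 1) * d1],
    }

shards = [shard(full, rank) for rank in range(world_size)]
combined = {
    "mlp.fc1.weight": torch.cat([s["mlp.fc1.weight"] for s in shards], dim=0),
    "mlp.fc2.weight": torch.cat([s["mlp.fc2.weight"] for s in shards], dim=-1),
}
assert all(torch.equal(full[k], combined[k]) for k in full)
```

The biases follow the same split for fc1, while the fc2 bias is kept only on rank 0 (it is added once after the row-parallel all-reduce), which is why the sharding code above pops `fc2.bias` on all other ranks.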
+ def combine_word_embeddings(state_dicts, state_dict, key): + dim = 0 if state_dicts[0][key].shape[0] == vocab_size // world_size else 1 + state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim) + + def combine_dim(state_dicts, state_dict, key, dim=-1): + if key in state_dict: + state_dict[key] = torch.cat([s[key] for s in state_dicts], dim=dim) + + def combine_qkv_headdim(state_dicts, state_dict, key): + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", n_head) + if key in state_dict: + if n_head_kv == n_head: + xs = [ + rearrange(s[key], "(three d) ... -> three d ...", three=3) for s in state_dicts + ] + state_dict[key] = rearrange(torch.cat(xs, dim=1), "three d ... -> (three d) ...") + else: + n_head_each_rank = [ + get_dim_for_local_rank(n_head, world_size, local_rank) + for local_rank in range(world_size) + ] + n_head_kv_each_rank = [ + get_dim_for_local_rank(n_head_kv, world_size, local_rank) + for local_rank in range(world_size) + ] + xs = [ + rearrange( + s[key], + "(nheadqkv headdim) ... -> nheadqkv headdim ...", + nheadqkv=rank_n_head + 2 * rank_n_head_kv, + headdim=headdim, + ) + for s, rank_n_head, rank_n_head_kv in zip( + state_dicts, n_head_each_rank, n_head_kv_each_rank + ) + ] + wq = torch.cat([x[: n_head_each_rank[rank]] for rank, x in enumerate(xs)], dim=0) + wk = torch.cat( + [ + x[ + n_head_each_rank[rank] : n_head_each_rank[rank] + + n_head_kv_each_rank[rank] + ] + for rank, x in enumerate(xs) + ], + dim=0, + ) + wv = torch.cat( + [ + x[n_head_each_rank[rank] + n_head_kv_each_rank[rank] :] + for rank, x in enumerate(xs) + ], + dim=0, + ) + wqkv = torch.cat( + [wq, wk, wv], + dim=0, + ) + state_dict[key] = rearrange( + wqkv, + "nheadqkv headdim ... -> (nheadqkv headdim) ...", + ) + + def combine_gated_mlp(state_dicts, state_dict, key): + if key in state_dict: + xs = [rearrange(s[key], "(two d) ... -> two d ...", two=2) for s in state_dicts] + state_dict[key] = rearrange(torch.cat(xs, dim=1), "two d ... 
-> (two d) ...") + + state_dict = state_dicts[0].copy() # don't modify state_dict[0] inplace + combine_word_embeddings( + state_dicts, state_dict, "transformer.embeddings.word_embeddings.weight" + ) + if "lm_head.weight" in state_dict: + combine_word_embeddings(state_dicts, state_dict, "lm_head.weight") + if "transformer.embeddings.position_embeddings.weight" in state_dict: + combine_dim( + state_dicts, state_dict, "transformer.embeddings.position_embeddings.weight", -1 + ) + mlp_combine_fn = ( + combine_gated_mlp + if config.activation_function in ["glu", "swiglu", "geglu"] + else partial(combine_dim, dim=0) + ) + for i in range(config.num_hidden_layers): + combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight") + combine_qkv_headdim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias") + combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mixer.out_proj.weight", -1) + mlp_combine_fn(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.weight") + combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc1.bias", 0) + combine_dim(state_dicts, state_dict, f"transformer.layers.{i}.mlp.fc2.weight", -1) + return state_dict + + +def remap_state_dict_hf_gpt2(state_dict, config): + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key) + + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("wte.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^ln_f.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub(r"^h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for d in range(config.num_hidden_layers): + W1 = state_dict.pop(f"h.{d}.mlp.c_fc.weight") + state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = W1.t() + W2 = state_dict.pop(f"h.{d}.mlp.c_proj.weight") + state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t() + + def key_mapping_mlp(key): + key = re.sub(r"^h.(\d+).mlp.c_fc.bias", r"transformer.layers.\1.mlp.fc1.bias", key) + key = re.sub(r"^h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for d in range(config.num_hidden_layers): + state_dict.pop(f"h.{d}.attn.bias", None) # We don't store this bias + Wqkv = state_dict.pop(f"h.{d}.attn.c_attn.weight") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t() + Wout = state_dict.pop(f"h.{d}.attn.c_proj.weight") + state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t() + + def key_mapping_attn(key): + key = re.sub(r"^h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key) + key = re.sub( + r"^h.(\d+).attn.c_proj.bias", r"transformer.layers.\1.mixer.out_proj.bias", key + ) + return key + + state_dict 
= OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def remap_state_dict_megatron(state_dict, config): + def key_mapping_transformer(key): + key = re.sub(r"^language_model.encoder.", "transformer.", key) + key = re.sub(r"^language_model.", "transformer.", key) + return key + + state_dict = OrderedDict((key_mapping_transformer(k), v) for k, v in state_dict.items()) + + # Word embedding and position embedding + def key_mapping_pos_emb(key): + return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key) + + state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embedding.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.final_layernorm.(weight|bias)", r"transformer.ln_f.\1", key) + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.(weight|bias)", + r"transformer.layers.\1.norm1.\2", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.(weight|bias)", + r"transformer.layers.\1.norm2.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_h_to_4h.(weight|bias)", + r"transformer.layers.\1.mlp.fc1.\2", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_4h_to_h.(weight|bias)", + r"transformer.layers.\1.mlp.fc2.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).self_attention.rotary_emb.inv_freq", + r"transformer.layers.\1.mixer.rotary_emb.inv_freq", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attention.query_key_value.(weight|bias)", + r"transformer.layers.\1.mixer.Wqkv.\2", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).self_attention.dense.(weight|bias)", + r"transformer.layers.\1.mixer.out_proj.\2", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + # Megatron stores Wqkv as ((nheads 3 headdim), hidden_dim) + # while we store Wqkv as ((3 nheads headdim), hidden_dim) + headdim = config.hidden_size // config.num_attention_heads + for d in range(config.num_hidden_layers): + Wqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.weight") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = rearrange( + Wqkv, + "(nheads three headdim) ... 
-> (three nheads headdim) ...", + three=3, + headdim=headdim, + ) + bqkv = state_dict.pop(f"transformer.layers.{d}.mixer.Wqkv.bias") + state_dict[f"transformer.layers.{d}.mixer.Wqkv.bias"] = rearrange( + bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim + ) + + return state_dict diff --git a/gpt2-hf.yaml b/gpt2-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6cb22f0a6576b5bd84f809caa0677ba49d16cf1 --- /dev/null +++ b/gpt2-hf.yaml @@ -0,0 +1,13 @@ +defaults: + - _self_ + - gpt2model: gpt2-small + +_target_: transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel +_recursive_: True +config: + _target_: transformers.GPT2Config + # Mistral's config: https://github.com/stanford-crfm/mistral/blob/main/conf/models/gpt2-small.yaml + # However, reorder_and_upcast_attn slows things down + reorder_and_upcast_attn: false + scale_attn_by_inverse_layer_idx: true + n_positions: ${datamodule.max_length} diff --git a/gpt2-large.yaml b/gpt2-large.yaml new file mode 100644 index 0000000000000000000000000000000000000000..434a61eb99141305a7f01b9fa812614a0a1b7109 --- /dev/null +++ b/gpt2-large.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 1280 + n_head: 20 + n_layer: 36 diff --git a/gpt2-medium.yaml b/gpt2-medium.yaml new file mode 100644 index 0000000000000000000000000000000000000000..786091836aa683b26e4d39ab557b1556a5331250 --- /dev/null +++ b/gpt2-medium.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 1024 + n_head: 16 + n_layer: 24 diff --git a/gpt2-small.yaml b/gpt2-small.yaml new file mode 100644 index 0000000000000000000000000000000000000000..039c9180226c1f504b43ce559ad6bd9cbdcc8cf9 --- /dev/null +++ b/gpt2-small.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 768 + n_head: 12 + n_layer: 12 diff --git a/gpt2-xlarge.yaml b/gpt2-xlarge.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d67a0e4185463f178e4f4510e3b753f3c7e0c169 --- /dev/null +++ b/gpt2-xlarge.yaml @@ -0,0 +1,6 @@ +# @package _global_ +model: + config: + n_embd: 1600 + n_head: 25 + n_layer: 48 diff --git a/gpt2.yaml b/gpt2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c3868d06e7f9e873c3146cd85318cc0bc7bc101 --- /dev/null +++ b/gpt2.yaml @@ -0,0 +1,13 @@ +defaults: + - _self_ + - gpt2model: gpt2-small + +_target_: flash_attn.models.gpt.GPTLMHeadModel +_recursive_: True +config: + _target_: transformers.GPT2Config + # Mistral's config: # https://github.com/stanford-crfm/mistral/blob/main/conf/models/mistral-small.yaml + # However, reorder_and_upcast_attn slows things down + reorder_and_upcast_attn: false + scale_attn_by_inverse_layer_idx: true + n_positions: ${datamodule.max_length} diff --git a/gpt2_training_curve.jpg b/gpt2_training_curve.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bcf31e5b101a3856fc68633cca50f50a6a13d26c Binary files /dev/null and b/gpt2_training_curve.jpg differ diff --git a/gpt2_training_efficiency.jpg b/gpt2_training_efficiency.jpg new file mode 100644 index 0000000000000000000000000000000000000000..60be8139c7eabb53e94c3ee64cff8c7339774a61 Binary files /dev/null and b/gpt2_training_efficiency.jpg differ diff --git a/gpt2l-flash.yaml b/gpt2l-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dacda10e5e02227b571c244562cfd15a8c190066 --- /dev/null +++ b/gpt2l-flash.yaml @@ -0,0 +1,41 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m-flash.yaml + - override /model/gpt2model: 
gpt2-large + # TD [2022-08-03] Surprisingly it's faster to use the ZeRO optimizer than just AdamW. + # Still, fairscale is even faster and uses less memory. + # I think it's because Pytorch is using ZeRO stage 1 and fairscale is using ZeRO stage 2? + # However, fairscale has issues with saving checkpoint (either OOM or very + # slow since it goes through the CPU?). Fairscale says Pytorch ZeRO is the + # upstream version of OSS + # https://github.com/facebookresearch/fairscale/issues/937 + # Pytorch ZeRO as also very slow for saving checkpoints due to + # consolidate_state_dict(), but I've fixed it to save separate checkpoint per GPU. + - override /optimizer: adamw-zero + + # FusedAdam doesn't seem to speed things up here, time per global step + # (i.e. batch size 512) on 8 A100s is around 2056ms for both AdamW and FusedAdam. + # This could be because each GPU is only doing the optimizer step for 1 / + # world_size of the parameters. + # Maybe the bottleneck here is the NCCL call to exchange parameters (ZeRO). + # - override /optimizer: adamw-apex-zero + +# Can enable mlp_chekcpoint_lvl to fit batch_size 16 on A100 40GB +# model: +# config: +# # mlp_checkpoint_lvl: ${eval:"[1] * 18 + [2] * 18"} +# mlp_checkpoint_lvl: 1 + +datamodule: + # batch_size: 16 + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +trainer: + # strategy: null + # strategy: ${eval:"None if ${trainer.devices} == 1 else 'ddp_sharded'"} + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True + # TD [2022-08-03] Deepspeed makes the ppl curve go wild + # strategy: deepspeed_stage_1 diff --git a/gpt2l-hf.yaml b/gpt2l-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b8a2924925b95ffedcabd55b8ed075d9063f45dd --- /dev/null +++ b/gpt2l-hf.yaml @@ -0,0 +1,14 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m-hf.yaml + - override /model/gpt2model: gpt2-large + - override /optimizer: adamw-zero + +datamodule: + batch_size: 2 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt2l.yaml b/gpt2l.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83d3ccf256af8c423dbb16aa84a989f7a634fa3a --- /dev/null +++ b/gpt2l.yaml @@ -0,0 +1,14 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m.yaml + - override /model/gpt2model: gpt2-large + - override /optimizer: adamw-zero + +datamodule: + batch_size: 4 # Per GPU + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt2m-flash.yaml b/gpt2m-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bec671e26abc67dc3b8c2ab4a51ccaf710a18cd3 --- /dev/null +++ b/gpt2m-flash.yaml @@ -0,0 +1,17 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2s-flash.yaml + - override /model/gpt2model: gpt2-medium + +# Can enable mlp_checkpoint_lvl to fit batch_size 32 to A100 40GB +# model: +# config: +# mlp_checkpoint_lvl: 1 + +datamodule: + # batch_size: 32 + batch_size: ${eval:"8 if ${train.gpu_mem} < 24 else (16 if ${train.gpu_mem} < 40 else (32 if ${train.gpu_mem} < 80 else 64))"} + +train: + optimizer: + lr: 1.5e-4 diff --git a/gpt2m-hf.yaml b/gpt2m-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e570e21b86ff1dfcc71540867b032c5d2b755a9 --- 
/dev/null +++ b/gpt2m-hf.yaml @@ -0,0 +1,11 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2s-hf.yaml + - override /model/gpt2model: gpt2-medium + +datamodule: + batch_size: 4 + +train: + optimizer: + lr: 1.5e-4 diff --git a/gpt2m.yaml b/gpt2m.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4cc99335b7f5c8bf8d4114becfe19df9450e3c29 --- /dev/null +++ b/gpt2m.yaml @@ -0,0 +1,11 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2s.yaml + - override /model/gpt2model: gpt2-medium + +datamodule: + batch_size: 8 # Per GPU + +train: + optimizer: + lr: 1.5e-4 diff --git a/gpt2s-flash.yaml b/gpt2s-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2dc6956d97e074e55c514053daff39b15295c86 --- /dev/null +++ b/gpt2s-flash.yaml @@ -0,0 +1,19 @@ +# @package _global_ +defaults: + - /experiment/owt/base.yaml + - override /model: gpt2 + - override /model/gpt2model: gpt2-small + +model: + config: + # n_positions is already set to ${datamodule.max_length} + residual_in_fp32: True + use_flash_attn: True + fused_bias_fc: True + fused_mlp: True + fused_dropout_add_ln: True + pad_vocab_size_multiple: 8 + +datamodule: + # batch_size: 64 + batch_size: ${eval:"16 if ${train.gpu_mem} < 24 else (32 if ${train.gpu_mem} < 40 else 64)"} diff --git a/gpt2s-hf.yaml b/gpt2s-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b0f65cabb393e7179416509fcfede1abedeaaa6 --- /dev/null +++ b/gpt2s-hf.yaml @@ -0,0 +1,23 @@ +# @package _global_ +defaults: + - /experiment/owt/base.yaml + - override /model: gpt2-hf + - override /model/gpt2model: gpt2-small + - override /callbacks: [default, norm-monitor, flop-count] + +datamodule: + batch_size: 8 + +train: + # Use the standard torch.nn.CrossEntropyLoss + loss_fn: null + +callbacks: + flop_count: + input_size: + - ${datamodule.max_length} + input_dtype: + # It's surprisingly hard to get hydra to return torch.long since it's not a callable + _target_: torch.__getattribute__ + _args_: + - long diff --git a/gpt2s.yaml b/gpt2s.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9faf60b01a81e59263ce6634c4860b9f7a4b18b --- /dev/null +++ b/gpt2s.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/owt/base.yaml + - override /model: gpt2 + - override /model/gpt2model: gpt2-small + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else 16)"} diff --git a/gpt2xl-flash.yaml b/gpt2xl-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..717847ec20cc167787b203509ea590ded3c3f1b1 --- /dev/null +++ b/gpt2xl-flash.yaml @@ -0,0 +1,21 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2l-flash.yaml + - override /model/gpt2model: gpt2-xlarge + +# Can enable mlp_checkpoint_lvl to fit to A100 40GB +# model: +# config: +# # mlp_checkpoint_lvl: ${eval:"[1] * 18 + [2] * 18"} +# mlp_checkpoint_lvl: 1 + +datamodule: + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else (8 if ${train.gpu_mem} < 80 else 16))"} + # With adamw-zero optimizer, on A100 40GB: + # checkpoint_lvl=1, batch size = 4: mem 37GB, 4650ms / batch of 512 (285ms * 15 + 375ms * 1) + # checkpoint_lvl=1, batch size = 8: mem 46GB, 4330ms / batch of 512 (530ms * 7 + 620ms * 1) + # checkpoint_lvl=2, batch size = 8: mem 41GB, 4570ms / batch of 512 (560ms * 7 + 650ms * 1) + # With adamw-apex-distributed optimizer: + # checkpoint_lvl=1, batch size = 8: mem 41.5GB, 4500ms / batch of 512 (550ms * 7 + 650ms * 1) + 
# checkpoint_lvl=1 for 24 layers and checkpoint_lvl=2 for 24 layers, + # batch size = 8: mem 39GB, 4640ms / batch of 512 (565ms * 7 + 675ms * 1) diff --git a/gpt2xl-hf.yaml b/gpt2xl-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8560bd11d7e3efaed4d607a37bd4fbd9cf3afe8 --- /dev/null +++ b/gpt2xl-hf.yaml @@ -0,0 +1,7 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2l-hf.yaml + - override /model/gpt2model: gpt2-xlarge + +datamodule: + batch_size: 1 diff --git a/gpt2xl.yaml b/gpt2xl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a43db2f85b46e1003aa1d29339060f043e2b6644 --- /dev/null +++ b/gpt2xl.yaml @@ -0,0 +1,14 @@ +# @package _global_ +defaults: + - /experiment/owt/gpt2m.yaml + - override /model/gpt2model: gpt2-xlarge + - override /optimizer: adamw-zero + +datamodule: + batch_size: 2 # Per GPU + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt3-2.7B-flash-8k.yaml b/gpt3-2.7B-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f126d18b769bfa652a649bc3e19c29bdbda00499 --- /dev/null +++ b/gpt3-2.7B-flash-8k.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-8k.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-hdim128-rotary-8k.yaml b/gpt3-2.7B-flash-hdim128-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..09fdee93cc89f1b1b2ad3e4e70681ef325c57723 --- /dev/null +++ b/gpt3-2.7B-flash-hdim128-rotary-8k.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary-8k.yaml + +model: + config: + n_embd: 2560 + n_head: 20 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 24 else (2 if ${train.gpu_mem} < 40 else (4 if ${train.gpu_mem} < 80 else 8))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-hdim128-rotary.yaml b/gpt3-2.7B-flash-hdim128-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5caafd150cd192a103e7c3b4a6e72b33f738fec --- /dev/null +++ b/gpt3-2.7B-flash-hdim128-rotary.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary.yaml + +model: + config: + n_embd: 2560 + n_head: 20 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-hdim128.yaml b/gpt3-2.7B-flash-hdim128.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9fd391b79ce11e674c9c61f95f65756aec706987 --- /dev/null +++ b/gpt3-2.7B-flash-hdim128.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +model: + config: + n_embd: 2560 + n_head: 20 # Headdim 128 is faster than headdim 80 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if 
${train.gpu_mem} < 80 else 4)"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-rotary-8k.yaml b/gpt3-2.7B-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b259a29997e7f9bd39bb3f63d8803ae09a75a764 --- /dev/null +++ b/gpt3-2.7B-flash-rotary-8k.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary-8k.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 24 else (2 if ${train.gpu_mem} < 40 else (4 if ${train.gpu_mem} < 80 else 8))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash-rotary.yaml b/gpt3-2.7B-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e1684c4f5068f42d48cea7eba45b0b612036fa9 --- /dev/null +++ b/gpt3-2.7B-flash-rotary.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-flash.yaml b/gpt3-2.7B-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dbfc21dbe98960c4732491392ab47f166b946f1 --- /dev/null +++ b/gpt3-2.7B-flash.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + initializer_range: ${eval:"(2 / (${.n_embd} * 5)) ** 0.5"} + mlp_checkpoint_lvl: 0 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-hf-hdim128.yaml b/gpt3-2.7B-hf-hdim128.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cc365d9108124bb5f5c15e76dd9a183cedf0148d --- /dev/null +++ b/gpt3-2.7B-hf-hdim128.yaml @@ -0,0 +1,17 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-hf.yaml + +model: + config: + n_embd: 2560 + n_head: 128 + n_layer: 32 + +# OOM on A100 80GB even with batch_size = 1 +datamodule: + batch_size: 1 + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3-2.7B-hf.yaml b/gpt3-2.7B-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff0a7a720f73bae3e0dd4303dbd870d83395fee6 --- /dev/null +++ b/gpt3-2.7B-hf.yaml @@ -0,0 +1,16 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-hf.yaml + +model: + config: + n_embd: 2560 + n_head: 32 + n_layer: 32 + +datamodule: + batch_size: 1 + +train: + optimizer: + lr: 1.6e-4 diff --git a/gpt3_training_curve.jpg b/gpt3_training_curve.jpg new file mode 100644 index 0000000000000000000000000000000000000000..50cc9b51d3e2da1ab843a5e0db4f2b086969306a Binary files /dev/null and b/gpt3_training_curve.jpg differ diff --git a/gpt3_training_efficiency.jpg b/gpt3_training_efficiency.jpg new file mode 100644 index 0000000000000000000000000000000000000000..541fccf8ba24a712a2bcdf5eb5553c32d1b5ca55 Binary files /dev/null and b/gpt3_training_efficiency.jpg differ diff --git a/gpt3l-flash-8k.yaml b/gpt3l-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ccbbebfd00472d1e1fc6e5340c8ffa1137240b8c --- /dev/null +++ b/gpt3l-flash-8k.yaml @@ -0,0 
+1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + global_batch_size: 64 diff --git a/gpt3l-flash-rotary-30B.yaml b/gpt3l-flash-rotary-30B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74c6bb9ce57c7e7aad50f93588f93f976f032774 --- /dev/null +++ b/gpt3l-flash-rotary-30B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3l-flash-rotary-8k.yaml b/gpt3l-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2b3ba3145b0e8bdb40acb30db4e6d86354afc0fb --- /dev/null +++ b/gpt3l-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3l-flash-rotary.yaml b/gpt3l-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2856320273305147bf96633ed445507c0bdcc82 --- /dev/null +++ b/gpt3l-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3l-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3l-flash.yaml b/gpt3l-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eebc19a85338a69c623effc80533cf07c7962bc2 --- /dev/null +++ b/gpt3l-flash.yaml @@ -0,0 +1,24 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + - override /optimizer: adamw-zero + +model: + config: + n_embd: 1536 + n_head: 16 + n_layer: 24 + # mlp_checkpoint_lvl: 1 # To fit batch_size 8 + +datamodule: + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else (8 if ${train.gpu_mem} < 80 else 16))"} + +train: + optimizer: + lr: 2.5e-4 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True diff --git a/gpt3l-hf.yaml b/gpt3l-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f70af10ccb4f81ee11cdb0d977d846708d0e53a1 --- /dev/null +++ b/gpt3l-hf.yaml @@ -0,0 +1,16 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-hf.yaml + +model: + config: + n_embd: 1536 + n_head: 16 + n_layer: 24 + +datamodule: + batch_size: 2 + +train: + optimizer: + lr: 2.5e-4 diff --git a/gpt3m-flash-8k.yaml b/gpt3m-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d75e6d3a3a434213a919eb85280aeb7a2ee34c5d --- /dev/null +++ b/gpt3m-flash-8k.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else 8)"} + +train: + global_batch_size: 64 diff --git a/gpt3m-flash-rotary-30B.yaml b/gpt3m-flash-rotary-30B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04630753e5643f01281d0310b0766f0b28c8e47c --- /dev/null +++ b/gpt3m-flash-rotary-30B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3m-flash-rotary-8k.yaml 
b/gpt3m-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f217ac5211c4c61b3b789f3d572aa0908fe72898 --- /dev/null +++ b/gpt3m-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3m-flash-rotary.yaml b/gpt3m-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..adb0cb6142a2ee3ad5d82f7321d4f5f26be43053 --- /dev/null +++ b/gpt3m-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3m-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3m-flash.yaml b/gpt3m-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..830b2d5df079f2aae91e0ebdb235c753b7a51ba7 --- /dev/null +++ b/gpt3m-flash.yaml @@ -0,0 +1,16 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + - override /model/gpt2model: gpt2-medium + +# Can enable mlp_checkpoint_lvl to fit batch_size 16 to A100 40GB +# model: +# config: +# mlp_checkpoint_lvl: 1 + +datamodule: + batch_size: ${eval:"4 if ${train.gpu_mem} < 24 else (8 if ${train.gpu_mem} < 40 else (16 if ${train.gpu_mem} < 80 else 32))"} + +train: + optimizer: + lr: 3.0e-4 diff --git a/gpt3m-hf.yaml b/gpt3m-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0e09e4e964a6e0aec5096d45badc5f787096d8c --- /dev/null +++ b/gpt3m-hf.yaml @@ -0,0 +1,11 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-hf.yaml + - override /model/gpt2model: gpt2-medium + +datamodule: + batch_size: 4 + +train: + optimizer: + lr: 3.0e-4 diff --git a/gpt3s-flash-8k.yaml b/gpt3s-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06ce6453d103c92c2c141dcd780f2c1aca756f7c --- /dev/null +++ b/gpt3s-flash-8k.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"2 if ${train.gpu_mem} < 24 else (4 if ${train.gpu_mem} < 40 else 8)"} + +train: + global_batch_size: 64 diff --git a/gpt3s-flash-rotary-30B.yaml b/gpt3s-flash-rotary-30B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d434480060acfb240b12c74943a3617bc77a023e --- /dev/null +++ b/gpt3s-flash-rotary-30B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3s-flash-rotary-8k.yaml b/gpt3s-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdee8766f9795fb72eafc7cdaa0e7c6e8712423f --- /dev/null +++ b/gpt3s-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3s-flash-rotary.yaml b/gpt3s-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41176eea179f279c7ac161540d9e67b00bf34d07 --- /dev/null +++ b/gpt3s-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3s-flash.yaml 
b/gpt3s-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45302fd4d3cb01b804ec5d5b99c86d24379b5366 --- /dev/null +++ b/gpt3s-flash.yaml @@ -0,0 +1,18 @@ +# @package _global_ +defaults: + - /experiment/pile/base.yaml + - override /model: gpt2 + - override /model/gpt2model: gpt2-small + +model: + config: + # n_positions is already set to ${datamodule.max_length} + residual_in_fp32: True + use_flash_attn: True + fused_dropout_add_ln: True + fused_mlp: True + fused_bias_fc: True + pad_vocab_size_multiple: 8 + +datamodule: + batch_size: ${eval:"8 if ${train.gpu_mem} < 24 else (16 if ${train.gpu_mem} < 40 else 32)"} diff --git a/gpt3s-hf.yaml b/gpt3s-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..459121759ad1ed25f943ee7857dc70ecf11b60cc --- /dev/null +++ b/gpt3s-hf.yaml @@ -0,0 +1,12 @@ +# @package _global_ +defaults: + - /experiment/pile/base.yaml + - override /model: gpt2-hf + - override /model/gpt2model: gpt2-small + +datamodule: + batch_size: 8 + +train: + # Use the standard torch.nn.CrossEntropyLoss + loss_fn: null diff --git a/gpt3xl-flash-8k.yaml b/gpt3xl-flash-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d411906a136a8c0f41d1dfc4a3128a1c0da92463 --- /dev/null +++ b/gpt3xl-flash-8k.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +datamodule: + max_length: 8192 + batch_size: ${eval:"1 if ${train.gpu_mem} < 40 else (2 if ${train.gpu_mem} < 80 else 4)"} + +train: + global_batch_size: 128 diff --git a/gpt3xl-flash-rotary-60B.yaml b/gpt3xl-flash-rotary-60B.yaml new file mode 100644 index 0000000000000000000000000000000000000000..48e421346257a78663221540b26be47e33b56afe --- /dev/null +++ b/gpt3xl-flash-rotary-60B.yaml @@ -0,0 +1,10 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-rotary.yaml + +trainer: + max_steps: 60000 + +train: + scheduler: + t_initial: ${trainer.max_steps} diff --git a/gpt3xl-flash-rotary-8k.yaml b/gpt3xl-flash-rotary-8k.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4c4cbe2fff8da4ddccced6cec9b0b4107330dfe --- /dev/null +++ b/gpt3xl-flash-rotary-8k.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash-8k.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3xl-flash-rotary.yaml b/gpt3xl-flash-rotary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f05f70595da94de4a42ce78a42db7da338c9a6d4 --- /dev/null +++ b/gpt3xl-flash-rotary.yaml @@ -0,0 +1,8 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3xl-flash.yaml + +model: + config: + max_position_embeddings: 0 # Disable absolute position embedding + rotary_emb_fraction: 0.5 diff --git a/gpt3xl-flash.yaml b/gpt3xl-flash.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f432e35539fe3832e8347f512d17b6418ffb1610 --- /dev/null +++ b/gpt3xl-flash.yaml @@ -0,0 +1,35 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-flash.yaml + - override /optimizer: adamw-zero + +model: + config: + n_embd: 2048 + n_head: 16 + n_layer: 24 + +datamodule: + batch_size: ${eval:"1 if ${train.gpu_mem} < 24 else (2 if ${train.gpu_mem} < 40 else (4 if ${train.gpu_mem} < 80 else 8))"} + +train: + global_batch_size: 512 + optimizer: + lr: 2.0e-4 + scheduler: + t_initial: 300000 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + 
gradient_as_bucket_view: True + max_steps: 400000 + val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}} + +callbacks: + model_checkpoint: + every_n_train_steps: 1000 + model_checkpoint_progress: + every_n_train_steps: 12500 + fault_tolerant: False # Saving takes too long diff --git a/gpt3xl-hf.yaml b/gpt3xl-hf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58f29bd6b865fb3b2d12b22c4ac0577ea78a841a --- /dev/null +++ b/gpt3xl-hf.yaml @@ -0,0 +1,35 @@ +# @package _global_ +defaults: + - /experiment/pile/gpt3s-hf.yaml + - override /optimizer: adamw-zero + +model: + config: + n_embd: 2048 + n_head: 16 + n_layer: 24 + +datamodule: + batch_size: 2 + +train: + global_batch_size: 512 + optimizer: + lr: 2.0e-4 + scheduler: + t_initial: 300000 + +trainer: + strategy: + _target_: src.utils.ddp_zero1.DDPStrategyZero1 + find_unused_parameters: False + gradient_as_bucket_view: True + max_steps: 400000 + val_check_interval: ${eval:1000 * ${.accumulate_grad_batches}} + +callbacks: + model_checkpoint: + every_n_train_steps: 1000 + model_checkpoint_progress: + every_n_train_steps: 12500 + fault_tolerant: False # Saving takes too long diff --git a/gpt_neox.py b/gpt_neox.py new file mode 100644 index 0000000000000000000000000000000000000000..c3894044172260a25c9c561fbaac8add91db5b23 --- /dev/null +++ b/gpt_neox.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023, Tri Dao. + +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from einops import rearrange +from transformers import GPT2Config, GPTNeoXConfig + + +def remap_state_dict_hf_gpt_neox(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^gpt_neox.", "transformer.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + # Word embedding + def key_mapping_emb(key): + return re.sub(r"^transformer.embed_in.", "transformer.embeddings.word_embeddings.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings", False): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("embed_out.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
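Aside on the padding idiom used here and in the other remapping functions: `F.pad` takes its pad widths starting from the last dimension, so `(0, 0, 0, n)` leaves the embedding dimension untouched and appends `n` zero rows along the vocabulary dimension. A small sketch with illustrative sizes (GPT-2's 50257-token vocabulary padded up to a multiple of 8):

```python
import torch
import torch.nn.functional as F

emb = torch.randn(50257, 768)                     # (vocab_size, n_embd), sizes for illustration
padded = F.pad(emb, (0, 0, 0, 50264 - emb.shape[0]))  # pad spec: (dim -1 left/right, dim -2 top/bottom)
assert padded.shape == (50264, 768)
assert torch.equal(padded[:50257], emb) and padded[50257:].eq(0).all()
```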
+ state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).input_layernorm.", r"transformer.layers.\1.norm1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_h_to_4h.", r"transformer.layers.\1.mlp.fc1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.dense_4h_to_h.", r"transformer.layers.\1.mlp.fc2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + # We don't store these biases + state_dict.pop(f"transformer.layers.{l}.attention.bias") + state_dict.pop(f"transformer.layers.{l}.attention.masked_bias") + # We don't store these + state_dict.pop(f"transformer.layers.{l}.attention.rotary_emb.inv_freq", None) + # GPT-NeoX stores Wqkv as ((nheads 3 headdim), hidden_dim) + # while we store Wqkv as ((3 nheads headdim), hidden_dim) + headdim = config.hidden_size // config.num_attention_heads + Wqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.weight") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = rearrange( + Wqkv, + "(nheads three headdim) ... -> (three nheads headdim) ...", + three=3, + headdim=headdim, + ) + bqkv = state_dict.pop(f"transformer.layers.{l}.attention.query_key_value.bias") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = rearrange( + bqkv, "(nheads three headdim) -> (three nheads headdim)", three=3, headdim=headdim + ) + + def key_mapping_attn(key): + key = re.sub( + r"^transformer.layers.(\d+).attention.dense.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def gpt_neox_config_to_gpt2_config(gpt_neox_config: GPTNeoXConfig) -> GPT2Config: + assert gpt_neox_config.rotary_emb_base == 10000 + return GPT2Config( + vocab_size=gpt_neox_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=gpt_neox_config.hidden_size, + n_layer=gpt_neox_config.num_hidden_layers, + n_head=gpt_neox_config.num_attention_heads, + n_inner=gpt_neox_config.intermediate_size, + activation_function=gpt_neox_config.hidden_act, + resid_pdrop=0.0, # No dropout + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=gpt_neox_config.layer_norm_eps, + initializer_range=gpt_neox_config.initializer_range, + bos_token_id=gpt_neox_config.bos_token_id, + eos_token_id=gpt_neox_config.eos_token_id, + # These are new arguments not in the original GPT2Config + prenorm=True, + parallel_block=gpt_neox_config.use_parallel_residual, + parallel_block_tied_norm=False, + rotary_emb_fraction=gpt_neox_config.rotary_pct, + tie_word_embeddings=gpt_neox_config.tie_word_embeddings, + ) diff --git a/gptj.py b/gptj.py new file mode 100644 index 0000000000000000000000000000000000000000..ca2330d79ce5b78a1229351956da20d88e356083 --- /dev/null +++ b/gptj.py @@ -0,0 +1,109 @@ +# Copyright (c) 2023, Tri Dao. 
+ +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from transformers import GPT2Config, GPTJConfig + + +def remap_state_dict_hf_gptj(state_dict, config): + def key_mapping_layers(key): + return re.sub(r"^transformer.h.", "transformer.layers.", key) + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + # Word embedding + def key_mapping_emb(key): + return re.sub(r"^transformer.wte.", "transformer.embeddings.word_embeddings.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + output_embeddings_bias = state_dict.pop("lm_head.bias") + state_dict["lm_head.bias"] = F.pad( + output_embeddings_bias, (0, vocab_size - output_embeddings_bias.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + return re.sub(r"^transformer.layers.(\d+).ln_1.", r"transformer.layers.\1.norm1.", key) + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc_in.", r"transformer.layers.\1.mlp.fc1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).mlp.fc_out.", r"transformer.layers.\1.mlp.fc2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"transformer.layers.{l}.attn.q_proj.weight") + Wk = state_dict.pop(f"transformer.layers.{l}.attn.k_proj.weight") + Wv = state_dict.pop(f"transformer.layers.{l}.attn.v_proj.weight") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + # We don't store these biases + state_dict.pop(f"transformer.layers.{l}.attn.bias") + state_dict.pop(f"transformer.layers.{l}.attn.masked_bias") + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).attn.out_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def gptj_config_to_gpt2_config(gptj_config: GPTJConfig) -> GPT2Config: + headdim = gptj_config.n_embd // gptj_config.n_head + return GPT2Config( + vocab_size=gptj_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=gptj_config.n_embd, + n_layer=gptj_config.n_layer, + n_head=gptj_config.n_head, + n_inner=gptj_config.n_inner, + activation_function=gptj_config.activation_function, + resid_pdrop=gptj_config.resid_pdrop, + embd_pdrop=gptj_config.embd_pdrop, + attn_pdrop=gptj_config.attn_pdrop, + layer_norm_epsilon=gptj_config.layer_norm_epsilon, + 
initializer_range=gptj_config.initializer_range, + bos_token_id=gptj_config.bos_token_id, + eos_token_id=gptj_config.eos_token_id, + # These are new arguments not in the original GPT2Config + prenorm=True, + parallel_block=True, + parallel_block_tied_norm=True, + rotary_emb_fraction=gptj_config.rotary_dim / headdim, + rotary_emb_interleaved=True, + tie_word_embeddings=False, + qkv_proj_bias=False, + out_proj_bias=False, + lm_head_bias=True, + ) diff --git a/gpu-monitor.yaml b/gpu-monitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6780f6d1c1bce7a4792fe15cc45eb57b5488d4bb --- /dev/null +++ b/gpu-monitor.yaml @@ -0,0 +1,11 @@ +defaults: + - default.yaml + +gpu_stats_monitor: + _target_: pytorch_lightning.callbacks.GPUStatsMonitor + # [2021-08-13] TD: I just want the intra_step_size but it'll error if I + # don't have memory_utilization and gpu_utilization. + # Maybe I should write a callback with just the intra_step_size. + memory_utilization: True + gpu_utilization: True + intra_step_time: True diff --git a/interface.cpp b/interface.cpp new file mode 100644 index 0000000000000000000000000000000000000000..41a783fd0fcc661276d34f2cc7a758b80a27d453 --- /dev/null +++ b/interface.cpp @@ -0,0 +1,59 @@ +#include + +// CUDA forward declarations +std::vector softmax_xentropy_cuda( + const at::Tensor &input, + const at::Tensor &labels, + const float smoothing, + const int total_classes); + +at::Tensor softmax_xentropy_backward_cuda( + const at::Tensor &grad_loss, + at::Tensor &logits, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + const bool inplace, + const int total_classes); + +// C++ interface + +#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector softmax_xentropy_forward( + const at::Tensor &input, + const at::Tensor &labels, + const float smoothing, + const int total_classes=-1) { + // For tensor parallel cross entropy with smoothing, we want to pass in the total number + // of classes so that smoothing can be applied correctly. If total_classes=-1, use the + // last dimension of the input tensor. 
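For intuition on why `total_classes` is threaded through to the kernel: with label smoothing under tensor parallelism, each rank only holds `vocab_size / world_size` columns of the logits, but the uniform smoothing mass should still be divided by the global class count. Below is a plain-PyTorch reference for one common smoothing convention (a sketch only; the CUDA kernel's exact formulation and its handling of labels outside the local shard may differ):

```python
import torch
import torch.nn.functional as F

def smoothed_cross_entropy_reference(logits, labels, smoothing=0.1, total_classes=-1):
    # (1 - smoothing) of the target mass goes to the label class; `smoothing` is
    # spread uniformly over `total_classes` classes. If total_classes <= 0, fall
    # back to the local logit dimension, mirroring the total_classes=-1 default.
    if total_classes <= 0:
        total_classes = logits.shape[-1]
    log_probs = F.log_softmax(logits.float(), dim=-1)
    nll = -log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
    smooth = -log_probs.sum(dim=-1) / total_classes
    return (1.0 - smoothing) * nll + smoothing * smooth

logits = torch.randn(4, 50264)
labels = torch.randint(0, 50264, (4,))
loss = smoothed_cross_entropy_reference(logits, labels, smoothing=0.1)
```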
+ CHECK_INPUT(input); + CHECK_INPUT(labels); + + return softmax_xentropy_cuda(input, labels, smoothing, total_classes); +} + +at::Tensor softmax_xentropy_backward( + const at::Tensor &grad_loss, + at::Tensor &logits, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + const bool inplace, + const int total_classes=-1) { + CHECK_INPUT(grad_loss); + CHECK_INPUT(logits); + CHECK_INPUT(max_log_sum_exp); + CHECK_INPUT(labels); + + return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, + smoothing, inplace, total_classes); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)", py::arg("input"), py::arg("labels"), py::arg("smoothing"), py::arg("total_classes")=-1); + m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)", py::arg("grad_loss"), py::arg("logits"), py::arg("max_log_sum_exp"), py::arg("labels"), py::arg("smoothing"), py::arg("inplace"), py::arg("total_classes")=-1); +} diff --git a/invsqrt.yaml b/invsqrt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb16f3c15bfd4f3a337a3b082a12e37f14bafd1b --- /dev/null +++ b/invsqrt.yaml @@ -0,0 +1,3 @@ +# @package train.scheduler +_target_: src.optim.lr_scheduler.InvSqrt +num_warmup_steps: ??? diff --git a/k_activations.py b/k_activations.py new file mode 100644 index 0000000000000000000000000000000000000000..efb83c358eb4a85d069ee340a3c83f418f9a805b --- /dev/null +++ b/k_activations.py @@ -0,0 +1,162 @@ +# Adapted from https://github.com/facebookresearch/xformers/blob/main/xformers/triton/k_activations.py +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + +import math +from enum import Enum +from typing import Optional + +import triton +import triton.language as tl + +_sqrt2pi = math.sqrt(2.0 / math.pi) +_sqrt1_2 = math.sqrt(1.0 / 2) +_gaussian_pdf_normalization = 1.0 / math.sqrt(2 * math.pi) + + +class Activation(str, Enum): + SquaredReLU = "squared_relu" + GeLU = "gelu" + GeLUApprox = "gelu_approx" + LeakyReLU = "leaky_relu" + ReLU = "relu" + + +def get_triton_activation_kernel(activation: Optional[Activation]): + return ( + { + Activation.ReLU: relu, + Activation.LeakyReLU: leaky_relu, + Activation.GeLU: gelu, + Activation.GeLUApprox: gelu_approx, + Activation.SquaredReLU: squared_relu, + }[activation] + if activation + else None + ) + + +def get_triton_activation_bwd_kernel(activation: Optional[Activation]): + return ( + { + Activation.ReLU: relu_grad, + Activation.LeakyReLU: leaky_relu_grad, + Activation.GeLU: gelu_grad, + Activation.GeLUApprox: gelu_approx_grad, + Activation.SquaredReLU: squared_relu_grad, + }[activation] + if activation + else None + ) + + +@triton.jit +def tanh(x): + # Tanh is just a scaled sigmoid + return 2 * tl.sigmoid(2 * x) - 1 + + +@triton.jit +def cosh(x): + exp_x = tl.exp(x) + return (exp_x + 1.0 / exp_x) * 0.5 + + +# a Triton implementation of the most used activations +# See for instance http://arxiv.org/abs/1606.08415 for an overview + +# ReLU +@triton.jit +def relu(x): + """ + ReLU_ activation function + + .. 
_ReLU: https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html + """ + zero = 0.0 + return tl.where(x >= 0, x, zero.to(x.dtype)) + + +@triton.jit +def relu_grad(x): + # ReLU is different from other activations + # in that it does not require the input to retrospectively compute its gradient + # here the input is the downstream gradient, and we return the upstream gradient directly + zero = 0.0 + one = 1.0 + return tl.where(x >= 0, one.to(x.dtype), zero.to(x.dtype)) + + +@triton.jit +def squared_relu(x): + """ + Squared ReLU activation, as proposed in the Primer_ paper. + + .. _Primer: https://arxiv.org/abs/2109.08668 + """ + x_ = relu(x) + return (x_ * x_).to(x.dtype) + + +@triton.jit +def squared_relu_grad(x): + return tl.where(x >= 0, 2.0 * x, 0.0) + + +# Leaky ReLU +@triton.jit +def leaky_relu(x): + """ + LeakyReLU_ activation + + .. _LeakyReLU: https://pytorch.org/docs/stable/generated/torch.nn.LeakyReLU.html + """ + scale = 0.01 + 0.0 + scale = scale.to(x.dtype) + return tl.where(x >= 0, x, scale * x) + + +@triton.jit +def leaky_relu_grad(x): + min_grad = 0.01 + max_grad = 1 + + min_grad = min_grad.to(x.dtype) + max_grad = max_grad.to(x.dtype) + + return tl.where(x >= 0, max_grad, min_grad) + + +@triton.jit +def gelu(x): + """Gaussian Error Linear Unit (GELU)""" + return x * 0.5 * (1.0 + tl.libdevice.erf(x * _sqrt1_2)) + + +@triton.jit +def gelu_grad(x): + cdf = 0.5 * (1.0 + tl.libdevice.erf(x * _sqrt1_2)) + pdf = tl.exp(-0.5 * x * x) * _gaussian_pdf_normalization + return cdf + x * pdf + + +@triton.jit +def gelu_approx(x): + """ + GeLU_ activation - Gaussian error linear unit, with tanh approximation + + .. _GeLU: https://arxiv.org/pdf/1606.08415.pdf + """ + return 0.5 * x * (1.0 + tanh(_sqrt2pi * x * (1.0 + 0.044715 * x * x))) + + +@triton.jit +def gelu_approx_grad(x): + # CREDITS: Fast implementation proposed in + # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/fused_bias_gelu.py#L30 + tanh_out = tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + return 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( + 1 + tanh_out + ) diff --git a/kernel_traits.h b/kernel_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..b75a8c3c14ceaf5f0fe09506d5d39eb6ff6052d7 --- /dev/null +++ b/kernel_traits.h @@ -0,0 +1,952 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cute/algorithm/copy.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "cutlass/cutlass.h" +#include "cutlass/layout/layout.h" +#include "cutlass/numeric_types.h" +#include "cutlass/pipeline/pipeline.hpp" + +using namespace cute; + +template +struct SharedStorageQKVO { + cute::array_aligned> smem_q; + cute::array_aligned> smem_k; + union { + cute::array_aligned> smem_v; + cute::array_aligned> smem_o; + }; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterBarrier barrier_O; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + int tile_count_semaphore; + }; +}; + +template +struct SharedStorageQKVOVt { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + union { + cute::array_aligned> smem_v_out; + cute::array_aligned> smem_o; + }; + }; + struct { + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterBarrier barrier_O; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + typename cutlass::PipelineAsync::SharedStorage pipeline_vt; + int tile_count_semaphore; + }; +}; + +// If Share_Q_K_smem is true, that forces Is_Q_in_regs to be true +template +struct Flash_fwd_kernel_traits { + using Element = elem_type; + using ElementAccum = float; + using OutputType = elem_type; + using index_t = int64_t; + + // The number of threads. + static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarp; + + static constexpr bool Is_Q_in_regs = Is_Q_in_regs_; + static_assert(kNWarps_ == 4 || kNWarps_ == 8 || kNWarps_ == 12 || kNWarps_ == 16); + static constexpr bool Is_WS = kNWarps_ >= 12; + static_assert(!(Is_WS && Is_Q_in_regs), "Warp-specialization does not support Q in registers"); + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterM = kClusterM_; + using ClusterShape_MNK = Shape, _1, _1>; + + static constexpr int kStages = kStages_; + + using AtomLayoutMNK = Layout, _1, _1>>; + using TiledMma0 = decltype(cute::make_tiled_mma( + std::conditional_t< + Is_Q_in_regs, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutMNK{})); + using TiledMma1 = decltype(cute::make_tiled_mma( + cute::GMMA::rs_op_selector(TileShape_MNK{})), + GMMA::Major::K, GMMA::Major::MN>(), + AtomLayoutMNK{})); + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = + decltype(tile_to_shape(SmemLayoutAtomK{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using SmemLayoutAtomV = 
decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = + decltype(tile_to_shape(SmemLayoutAtomV{}, + make_shape(get<1>(TileShape_MNK{}), get<2>(TileShape_MNK{}), Int{}))); + + // Note this is the transpose in terms of the view, not in terms of memory. + using SmemLayoutVt = + decltype(composition(SmemLayoutV{}, + make_ordered_layout( + make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{}), Int{}), + Step<_2, _1, _3>{}))); + + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + + using SmemCopyAtomQ = Copy_Atom; + + using SharedStorage = SharedStorageQKVO; + + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + using MainloopPipelineNoTMA = typename cutlass::PipelineAsync; + using PipelineState = typename cutlass::PipelineState; + // using BarrierType = typename MainloopPipeline::ProducerBarrierType; + +}; + +// Traits struct for fp8 kernel with in-kernel transpose +template +struct Flash_fwd_kernel_traits_fp8 { + using Element = elem_type; + static_assert(cutlass::sizeof_bits_v == 8); + using ElementAccum = float; + using OutputType = cutlass::half_t; + using index_t = int64_t; + + // The number of threads. + static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarpGroup; + + static constexpr bool Is_Q_in_regs = Is_Q_in_regs_; + static_assert(kNWarps_ == 12 || kNWarps_ == 16); + static constexpr bool Is_WS = true; + static_assert(!Is_Q_in_regs, "Warp-specialization does not support Q in registers"); + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterM = kClusterM_; + using ClusterShape_MNK = Shape, _1, _1>; + + static constexpr int kStages = kStages_; + static_assert(kStages > 1); + + using AtomLayoutMNK = Layout, _1, _1>>; + using TiledMma0 = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutMNK{})); + + using TiledMma1 = decltype(cute::make_tiled_mma( + cute::GMMA::rs_op_selector(TileShape_MNK{}))>(), + AtomLayoutMNK{})); + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = + decltype(tile_to_shape(SmemLayoutAtomK{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using TransposeShapeAtomV = Shape<_64, _64>; + using SmemLayoutAtomV = decltype(tile_to_shape(GMMA::Layout_K_SW64_Atom{}, TransposeShapeAtomV{})); + using SmemLayoutV = + decltype(tile_to_shape(SmemLayoutAtomV{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + // for fp8 in-kernel transpose -- src layout + using SmemLayoutDivideV = decltype(tiled_divide(SmemLayoutV{}, TransposeShapeAtomV{})); + using SmemShapeLDSM = Shape, Shape<_16, _4>>; + using 
FactoringShapeV = decltype(make_shape(SmemShapeLDSM{}, + shape<1>(SmemLayoutDivideV{}), shape<2>(SmemLayoutDivideV{}), shape<3>(SmemLayoutDivideV{}))); + using SmemLayoutTransposeV = decltype(composition(SmemLayoutDivideV{}, make_layout(FactoringShapeV{}))); + + // For fp8, this is the memory transpose. + using SmemLayoutAtomVt = decltype(tile_to_shape(GMMA::Layout_K_SW64_Atom{}, TransposeShapeAtomV{})); + using SmemLayoutVt = + decltype(tile_to_shape(SmemLayoutAtomVt{}, + make_shape(shape<2>(TileShape_MNK{}), shape<1>(TileShape_MNK{}), Int{}))); + + // for fp8 in-kernel transpose -- dst layout + using SmemLayoutVtTrans = + decltype(composition(SmemLayoutVt{}, + make_ordered_layout(product_each(shape(SmemLayoutV{})), Step<_2, _1, _3>{}))); + using SmemLayoutDivideVt = decltype(tiled_divide(SmemLayoutVtTrans{}, TransposeShapeAtomV{})); +#ifndef NO_FP8_COLUMN_PERMUTE + using SmemShapeSTSM = Shape, Shape<_8, _8>>; +#else + using SmemShapeSTSM = Shape, Shape<_16, _4>>; +#endif + using FactoringShapeVt = decltype(make_shape(SmemShapeSTSM{}, + shape<1>(SmemLayoutDivideVt{}), shape<2>(SmemLayoutDivideVt{}), shape<3>(SmemLayoutDivideVt{}))); + using SmemLayoutTransposeVt = decltype(composition(SmemLayoutDivideVt{}, make_layout(FactoringShapeVt{}))); + + using SmemLayoutAtomO = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutO = decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{}))); + + // used for rmem -> smem O copy in fp8 kernel to undo column permutation + using ThreadLayoutrO = Layout, _4, _1>, + Stride<_4, _32, _1, _0>>; + using ValueLayoutrO = Layout, Int>, + Stride<_0, _2, Stride<_4, _1>, _8>>; + using TiledCopyrO = decltype(make_tiled_copy(Copy_Atom, OutputType>{}, + ThreadLayoutrO{}, ValueLayoutrO{})); + + using TiledCopyShaperO = Shape<_8, Int, _16, Int>; + using SmemLayoutrO = decltype(composition(SmemLayoutO{}, Layout{})); + + using SmemCopyAtomQ = Copy_Atom; + + using SharedStorage = SharedStorageQKVOVt; + + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + using MainloopPipelineNoTMA = typename cutlass::PipelineAsync; + using PipelineState = typename cutlass::PipelineState; + // using BarrierType = typename MainloopPipeline::ProducerBarrierType; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SharedStorageQKVdOdKV; + +template +struct SharedStorageQKVdOdKV { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. 
+ cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKV { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + union { // Put smem_p in a union just so we can still refer to it in the struct, even if it's not used. + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKVWS; + +template +struct SharedStorageQKVdOdKVWS { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + cute::array_aligned> smem_dqacc; + cute::array_aligned smem_lse; + cute::array_aligned smem_dpsum; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKVWS { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + union { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + }; + struct { + cute::array_aligned> smem_dk; + cute::array_aligned> smem_dv; + }; + }; + union { // Put smem_p in a union just so we can still refer to it in the struct, even if it's not used. + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + cute::array_aligned> smem_dqacc; + cute::array_aligned smem_lse; + cute::array_aligned smem_dpsum; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_K; + cutlass::arch::ClusterTransactionBarrier barrier_V; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_q; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_do; + }; +}; + +template +struct SharedStorageQKVdOdKVSeqqPar; + +template +struct SharedStorageQKVdOdKVSeqqPar { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + union { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + }; + struct { + cute::array_aligned> smem_dq; + }; + }; + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. 
+ cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterTransactionBarrier barrier_dO; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + }; +}; + +template +struct SharedStorageQKVdOdKVSeqqPar { + struct { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + union { + struct { + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + }; + struct { + cute::array_aligned> smem_dq; + }; + }; + union { // Put smem_p in a union just so we can still refer to it in the struct, even if it's not used. + cute::array_aligned> smem_p; + cute::array_aligned> smem_ds; + }; + }; + struct { + cute::uint64_t tma_load_mbar[8]; // 8 TMA barrier pre-allcoated for usage. + cutlass::arch::ClusterTransactionBarrier barrier_Q; + cutlass::arch::ClusterTransactionBarrier barrier_dO; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_k; + typename cutlass::PipelineTmaAsync::SharedStorage pipeline_v; + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Flash_bwd_kernel_traits { + using Element = elem_type; + using ElementAccum = float; + using index_t = int64_t; + + // The number of threads. + static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + static constexpr int kNThreadsNonWS = 8 * cutlass::NumThreadsPerWarp; + // static constexpr int kNThreadsdQ = cutlass::NumThreadsPerWarpGroup; + static constexpr int kNThreadsdQ = 2 * cutlass::NumThreadsPerWarpGroup; + + static_assert(kNWarps_ == 8 || kNWarps_ == 12); + + static constexpr bool Is_WS = kNWarps_ >= 12; + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterN = kClusterN_; + using ClusterShape_MNK = Shape<_1, Int, _1>; + + static constexpr int kStages = 2; + + static constexpr bool SdP_swapAB = SdP_swapAB_; + static constexpr bool dKV_swapAB = dKV_swapAB_; + static constexpr bool dQ_swapAB = dQ_swapAB_; + static_assert(!(SdP_swapAB && dKV_swapAB)); // If SdP_swapAB, then we don't swap for dKV + + static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == 2 && AtomLayoutMdQ == 2 && !SdP_swapAB && !dQ_swapAB; // If dQ_swapAB we can't use RS + + using TileShapeAtomSdP = std::conditional_t< + !SdP_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutSdP = std::conditional_t< + !SdP_swapAB, + Layout, Int<2 / AtomLayoutMSdP>, _1>>, + Layout, Int, _1>> + >; + using TiledMmaSdP = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutSdP{})); + + using TileShapeAtomdKV = std::conditional_t< + !dKV_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdKV = std::conditional_t< + !dKV_swapAB, + Layout, Int<2 / AtomLayoutNdKV>, _1>>, + Layout, Int, _1>> + >; + using TiledMmadKV = decltype(cute::make_tiled_mma( + std::conditional_t< + !SdP_swapAB, + decltype(cute::GMMA::ss_op_selector()), + decltype(cute::GMMA::rs_op_selector()) + >{}, + AtomLayoutdKV{})); + + using TileShapeAtomdQ = std::conditional_t< + !dQ_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + // Shape, Int, Int>, + // Shape, Int, Int> + >; + using AtomLayoutdQ = std::conditional_t< + !dQ_swapAB, + Layout, Int<2 / AtomLayoutMdQ>, _1>>, + Layout, Int, _1>> + // Layout, Int<1>, _1>>, + // Layout, Int<1>, _1>> 
+ >; + static constexpr GMMA::Major MmadQMajorA = !dQ_swapAB ? GMMA::Major::K : GMMA::Major::MN; + static constexpr GMMA::Major MmadQMajorB = !dQ_swapAB ? GMMA::Major::MN : GMMA::Major::K; + using TiledMmadQ = decltype(cute::make_tiled_mma( + std::conditional_t< + !dQ_swapAB, + std::conditional_t< + Mma_dQ_is_RS, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >, + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutdQ{})); + + using GmemTiledCopyQdO = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyKV = cute::SM90_TMA_LOAD; + using GmemTiledCopydKV = cute::SM90_TMA_STORE; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static constexpr bool Has_cp_async = true; +#else + static constexpr bool Has_cp_async = false; +#endif + // For the dot_do_o preprocessing kernel + using Gmem_copy_struct = std::conditional_t< + Has_cp_async, + SM80_CP_ASYNC_CACHEGLOBAL, + DefaultCopy + >; + static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32; + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); + // Using kBlockKSmem instead of kHeadDim here to avoid bank conflicts, but doesn't seem + // to affect speed in practice. + static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad; + static_assert(kNThreadsNonWS % kGmemThreadsPerRow == 0, "kNThreadsNonWS must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemLayoutAtomdQ = Layout, Int>, + Stride, _1>>; + using GmemTiledCopydO = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>{})); // Val layout, 8 vals per store + using GmemTiledCopydQ = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQ{}, + Layout>{})); // Val layout, 8 vals per store + using GmemLayoutAtomdQaccum = std::conditional_t< + kBlockKSmem == 32, + Layout, _8>, // Thread layout, 8 threads per row + Stride< _8, _1>>, + Layout, _16>, // Thread layout, 16 threads per row + Stride< _16, _1>> + >; + using GmemTiledCopydQaccum = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = + decltype(tile_to_shape(SmemLayoutAtomQ{}, + make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + using SmemLayoutdO = SmemLayoutQ; + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomV = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = decltype(tile_to_shape(SmemLayoutAtomV{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomP = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutP = decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{}))); + using SmemLayoutAtomdS = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), 
decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdS = decltype(tile_to_shape(SmemLayoutAtomdS{}, select<0, 1>(TileShape_MNK{}))); + + // using SmemLayoutAtomdQacc = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + // using SmemLayoutdQacc = decltype(tile_to_shape(SmemLayoutAtomdQacc{}, select<0, 2>(TileShape_MNK{}))); + + // Note this is the transpose in terms of the view, not in terms of memory. + using SmemLayoutQt = + decltype(cute::composition(SmemLayoutQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutdOt = + decltype(cute::composition(SmemLayoutdO{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutKt = + decltype(cute::composition(SmemLayoutK{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutPt = + decltype(cute::composition(SmemLayoutP{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdSt = + decltype(cute::composition(SmemLayoutdS{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + + // using SmemLayoutdQacct = + // decltype(cute::composition(SmemLayoutdQacc{}, + // make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + // make_stride(Int{}, _1{})))); + + using SmemLayoutdK = SmemLayoutK; + using SmemLayoutdV = SmemLayoutV; + using SmemLayoutdKt = SmemLayoutKt; + using SmemLayoutdVt = SmemLayoutKt; + + static constexpr int kSwizzle = kBlockKSmem == 32 ? 
2 : 3; + using SmemLayoutAtomdQ = decltype( + // composition(Swizzle{}, + composition(Swizzle<3, 3, 3>{}, + Layout, Int<32>>, + Stride, _1>>{})); + using SmemLayoutdQ = decltype(tile_to_shape( + SmemLayoutAtomdQ{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdQt = + decltype(cute::composition(SmemLayoutdQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + static constexpr int kSmemdQSize = size(SmemLayoutdQ{}) * sizeof(Element); + + using SmemLayoutAtomdQaccTMA = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdQaccTMA = decltype(tile_to_shape(SmemLayoutAtomdQaccTMA{}, select<0, 2>(TileShape_MNK{}))); + using SmemLayoutdQacc = SmemLayoutdQ; + using SmemLayoutdQacct = SmemLayoutdQt; + using SmemLayoutdQacc2 = decltype(tile_to_shape( + SmemLayoutAtomdQ{}, + make_shape(Int{}, Int{}, _2{}))); + // using SmemLayoutdQacc = decltype(tile_to_shape(SmemLayoutAtomdQacc{}, select<0, 2>(TileShape_MNK{}))); + // using SmemLayoutdQacct = + // decltype(cute::composition(SmemLayoutdQacc{}, + // make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + // make_stride(Int{}, _1{})))); + using RmemTiledCopydQacc = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + + // using SmemCopyAtomQ = Copy_Atom; + using SmemCopyAtomPdS = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdKV = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdQ = Copy_Atom< + std::conditional_t, + Element>; + + using SharedStorage = std::conditional_t< + !Is_WS, + SharedStorageQKVdOdKV, + SharedStorageQKVdOdKVWS + // SmemLayoutK, SmemLayoutV, SmemLayoutdS, SmemLayoutdQacc2, SmemLayoutdK, SmemLayoutdV> + >; + + // using MainloopPipeline = typename cutlass::PipelineTmaAsync; + // using PipelineState = typename cutlass::PipelineState; + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Flash_bwd_seqqpar_kernel_traits { + using Element = elem_type; + using ElementAccum = float; + using index_t = int64_t; + + // The number of threads. 
+ static constexpr int kNWarps = kNWarps_; + static constexpr int kNThreads = kNWarps * cutlass::NumThreadsPerWarp; + + static_assert(kNWarps_ == 8); + + static constexpr int kBlockM = kBlockM_; + static constexpr int kBlockN = kBlockN_; + static constexpr int kHeadDim = kHeadDim_; + static_assert(kHeadDim % 32 == 0); + using TileShape_MNK = Shape, Int, Int>; + + static constexpr int kClusterN = kClusterN_; + using ClusterShape_MNK = Shape<_1, Int, _1>; + + static constexpr int kStages = 2; + + static constexpr bool SdP_swapAB = SdP_swapAB_; + static constexpr bool dKV_swapAB = dKV_swapAB_; + static constexpr bool dQ_swapAB = dQ_swapAB_; + static_assert(!(SdP_swapAB && dKV_swapAB)); // If SdP_swapAB, then we don't swap for dKV + + static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == 2 && AtomLayoutMdQ == 2 && !SdP_swapAB && !dQ_swapAB; // If dQ_swapAB we can't use RS + + using TileShapeAtomSdP = std::conditional_t< + !SdP_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutSdP = std::conditional_t< + !SdP_swapAB, + Layout, Int<2 / AtomLayoutMSdP>, _1>>, + Layout, Int, _1>> + >; + using TiledMmaSdP = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutSdP{})); + + using TileShapeAtomdKV = std::conditional_t< + !dKV_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdKV = std::conditional_t< + !dKV_swapAB, + Layout, Int<2 / AtomLayoutNdKV>, _1>>, + Layout, Int, _1>> + >; + using TiledMmadKV = decltype(cute::make_tiled_mma( + std::conditional_t< + !SdP_swapAB, + decltype(cute::GMMA::ss_op_selector()), + decltype(cute::GMMA::rs_op_selector()) + >{}, + AtomLayoutdKV{})); + + using TileShapeAtomdQ = std::conditional_t< + !dQ_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdQ = std::conditional_t< + !dQ_swapAB, + Layout, Int<2 / AtomLayoutMdQ>, _1>>, + Layout, Int, _1>> + >; + static constexpr GMMA::Major MmadQMajorA = !dQ_swapAB ? GMMA::Major::K : GMMA::Major::MN; + static constexpr GMMA::Major MmadQMajorB = !dQ_swapAB ? GMMA::Major::MN : GMMA::Major::K; + using TiledMmadQ = decltype(cute::make_tiled_mma( + std::conditional_t< + !dQ_swapAB, + std::conditional_t< + Mma_dQ_is_RS, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >, + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutdQ{})); + + using GmemTiledCopyQdO = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{}))); + using GmemTiledCopyKV = cute::SM90_TMA_LOAD; + using GmemTiledCopydKV = cute::SM90_TMA_STORE; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static constexpr bool Has_cp_async = true; +#else + static constexpr bool Has_cp_async = false; +#endif + // For the dot_do_o preprocessing kernel + using Gmem_copy_struct = std::conditional_t< + Has_cp_async, + SM80_CP_ASYNC_CACHEGLOBAL, + DefaultCopy + >; + static constexpr int kBlockKSmem = kHeadDim % 64 == 0 ? 64 : 32; + static constexpr int kGmemElemsPerLoad = sizeof(cute::uint128_t) / sizeof(Element); + static_assert(kHeadDim % kGmemElemsPerLoad == 0, "kHeadDim must be a multiple of kGmemElemsPerLoad"); + // Using kBlockKSmem instead of kHeadDim here to avoid bank conflicts, but doesn't seem + // to affect speed in practice. 
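    // Example (illustrative): for a 16-bit Element, kGmemElemsPerLoad = 16 / 2 = 8 elements
    // per 128-bit load, so with kBlockKSmem = 64 the layout below uses
    // kGmemThreadsPerRow = 64 / 8 = 8 threads per row.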
+ static constexpr int kGmemThreadsPerRow = kBlockKSmem / kGmemElemsPerLoad; + static_assert(kNThreads % kGmemThreadsPerRow == 0, "kNThreads must be a multiple of kGmemThreadsPerRow"); + using GmemLayoutAtom = Layout, Int>, + Stride, _1>>; + using GmemTiledCopydO = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>{})); // Val layout, 8 vals per store + using GmemTiledCopydQ = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtom{}, + Layout>{})); // Val layout, 8 vals per store + using GmemLayoutAtomdQaccum = std::conditional_t< + kBlockKSmem == 32, + Layout, // Thread layout, 8 threads per row + Stride< _8, _1>>, + Layout, // Thread layout, 16 threads per row + Stride< _16, _1>> + >; + using GmemTiledCopydQaccum = decltype( + make_tiled_copy(Copy_Atom{}, + GmemLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutQ = decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{}))); + using SmemLayoutdO = SmemLayoutQ; + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutK = decltype(tile_to_shape(SmemLayoutAtomK{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using SmemLayoutAtomV = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = decltype(tile_to_shape(SmemLayoutAtomV{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + using SmemLayoutAtomP = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutP = decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{}))); + using SmemLayoutAtomdS = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdS = decltype(tile_to_shape(SmemLayoutAtomdS{}, select<0, 1>(TileShape_MNK{}))); + + // Note this is the transpose in terms of the view, not in terms of memory. 
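    // i.e. SmemLayoutQt/SmemLayoutKt below reindex the same shared-memory buffers with the
    // head-dim mode placed first; no data is moved, only the stride order seen by the MMA
    // changes. (Contrast with the fp8 forward path above, which performs an actual in-smem
    // transpose of V.)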
+ using SmemLayoutQt = + decltype(cute::composition(SmemLayoutQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdOt = + decltype(cute::composition(SmemLayoutdO{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutKt = + decltype(cute::composition(SmemLayoutK{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutPt = + decltype(cute::composition(SmemLayoutP{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdSt = + decltype(cute::composition(SmemLayoutdS{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + + using SmemLayoutdK = decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{}))); + using SmemLayoutdV = SmemLayoutdK; + using SmemLayoutdKt = SmemLayoutKt; + using SmemLayoutdVt = SmemLayoutKt; + using SmemLayoutdQTMA = decltype(tile_to_shape(SmemLayoutAtomK{}, select<0, 2>(TileShape_MNK{}))); + + static constexpr int kSwizzle = kBlockKSmem == 32 ? 2 : 3; + using SmemLayoutAtomdQ = decltype( + composition(Swizzle{}, + Layout>, + Stride, _1>>{})); + using SmemLayoutdQ = decltype(tile_to_shape( + SmemLayoutAtomdQ{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdQt = + decltype(cute::composition(SmemLayoutdQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + static constexpr int kSmemdQSize = size(SmemLayoutdQ{}) * sizeof(Element); + + using SmemLayoutAtomdKV = decltype( + composition(Swizzle{}, + Layout>, + Stride, _1>>{})); + using SmemLayoutdKV = decltype(tile_to_shape( + SmemLayoutAtomdKV{}, + make_shape(Int{}, Int{}))); + using SmemLayoutdKVt = + decltype(cute::composition(SmemLayoutdKV{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + static constexpr int kSmemdKVSize = size(SmemLayoutdKV{}) * sizeof(Element) * 2; + + // using SmemCopyAtomQ = Copy_Atom; + using SmemCopyAtomPdS = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdKV = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdQ = Copy_Atom< + std::conditional_t, + Element>; + + using SharedStorage = SharedStorageQKVdOdKVSeqqPar; + + // using MainloopPipeline = typename cutlass::PipelineTmaAsync; + // using PipelineState = typename cutlass::PipelineState; + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/layer_norm.py b/layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..6fcf50e1786503a6ab253d13114ce2b56bae1eff --- /dev/null +++ b/layer_norm.py @@ -0,0 +1,1086 @@ +# Copyright (c) 2024, Tri Dao. +# Implement dropout + residual + layer_norm / rms_norm. + +# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html +# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate. +# This is faster for dimensions up to 8k, but after that it's much slower due to register spilling. +# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. 
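For orientation, a minimal usage sketch of the fused API defined later in this file; it assumes the file is importable as `layer_norm` (hypothetical path) and a CUDA device with fp16 inputs:

    import torch
    from layer_norm import layer_norm_fn, rms_norm_fn  # hypothetical import path

    # Fused dropout + residual-add + LayerNorm over the last dimension.
    batch, seqlen, hidden = 2, 1024, 4096
    x = torch.randn(batch, seqlen, hidden, device="cuda", dtype=torch.float16)
    residual = torch.randn(batch, seqlen, hidden, device="cuda", dtype=torch.float32)
    weight = torch.ones(hidden, device="cuda", dtype=torch.float16)
    bias = torch.zeros(hidden, device="cuda", dtype=torch.float16)

    # prenorm=True also returns the updated residual stream
    # (dropout(x) + residual, before normalization).
    out, new_residual = layer_norm_fn(
        x, weight, bias, residual=residual, dropout_p=0.1, prenorm=True
    )

    # RMSNorm variant of the same fused op; the bias argument is optional.
    rms_out = rms_norm_fn(x, weight, None, eps=1e-5)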
+ +import math + +import torch +import torch.nn.functional as F +from torch.cuda.amp import custom_fwd, custom_bwd + +import triton +import triton.language as tl + + +def layer_norm_ref( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + dropout_mask=None, + dropout_mask1=None, + upcast=False, +): + dtype = x.dtype + if upcast: + x = x.float() + weight = weight.float() + bias = bias.float() if bias is not None else None + residual = residual.float() if residual is not None else residual + x1 = x1.float() if x1 is not None else None + weight1 = weight1.float() if weight1 is not None else None + bias1 = bias1.float() if bias1 is not None else None + if x1 is not None: + assert rowscale is None, "rowscale is not supported with parallel LayerNorm" + if rowscale is not None: + x = x * rowscale[..., None] + if dropout_p > 0.0: + if dropout_mask is not None: + x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p) + else: + x = F.dropout(x, p=dropout_p) + if x1 is not None: + if dropout_mask1 is not None: + x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p) + else: + x1 = F.dropout(x1, p=dropout_p) + if x1 is not None: + x = x + x1 + if residual is not None: + x = (x + residual).to(x.dtype) + out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to( + dtype + ) + if weight1 is None: + return out if not prenorm else (out, x) + else: + out1 = F.layer_norm( + x.to(weight1.dtype), x.shape[-1:], weight=weight1, bias=bias1, eps=eps + ).to(dtype) + return (out, out1) if not prenorm else (out, out1, x) + + +def rms_norm_ref( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + dropout_mask=None, + dropout_mask1=None, + upcast=False, +): + dtype = x.dtype + if upcast: + x = x.float() + weight = weight.float() + bias = bias.float() if bias is not None else None + residual = residual.float() if residual is not None else residual + x1 = x1.float() if x1 is not None else None + weight1 = weight1.float() if weight1 is not None else None + bias1 = bias1.float() if bias1 is not None else None + if x1 is not None: + assert rowscale is None, "rowscale is not supported with parallel LayerNorm" + if rowscale is not None: + x = x * rowscale[..., None] + if dropout_p > 0.0: + if dropout_mask is not None: + x = x.masked_fill(~dropout_mask, 0.0) / (1.0 - dropout_p) + else: + x = F.dropout(x, p=dropout_p) + if x1 is not None: + if dropout_mask1 is not None: + x1 = x1.masked_fill(~dropout_mask1, 0.0) / (1.0 - dropout_p) + else: + x1 = F.dropout(x1, p=dropout_p) + if x1 is not None: + x = x + x1 + if residual is not None: + x = (x + residual).to(x.dtype) + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(dtype) + if weight1 is None: + return out if not prenorm else (out, x) + else: + out1 = ((x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)).to( + dtype + ) + return (out, out1) if not prenorm else (out, out1, x) + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"], +) +# 
@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None}) +# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None}) +@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None}) +@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None}) +@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None}) +@triton.jit +def _layer_norm_fwd_1pass_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + RESIDUAL, # pointer to the residual + X1, + W1, + B1, + Y1, + RESIDUAL_OUT, # pointer to the residual + ROWSCALE, + SEEDS, # Dropout seeds for each row + DROPOUT_MASK, + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_res_row, + stride_res_out_row, + stride_x1_row, + stride_y1_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + dropout_p, # Dropout probability + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_RESIDUAL: tl.constexpr, + STORE_RESIDUAL_OUT: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_DROPOUT: tl.constexpr, + STORE_DROPOUT_MASK: tl.constexpr, + HAS_ROWSCALE: tl.constexpr, + HAS_X1: tl.constexpr, + HAS_W1: tl.constexpr, + HAS_B1: tl.constexpr, +): + # Map the program id to the row of X and Y it should compute. + row = tl.program_id(0) + X += row * stride_x_row + Y += row * stride_y_row + if HAS_RESIDUAL: + RESIDUAL += row * stride_res_row + if STORE_RESIDUAL_OUT: + RESIDUAL_OUT += row * stride_res_out_row + if HAS_X1: + X1 += row * stride_x1_row + if HAS_W1: + Y1 += row * stride_y1_row + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32) + if HAS_ROWSCALE: + rowscale = tl.load(ROWSCALE + row).to(tl.float32) + x *= rowscale + if HAS_DROPOUT: + # Compute dropout mask + # 7 rounds is good enough, and reduces register pressure + keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0) + if STORE_DROPOUT_MASK: + tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N) + if HAS_X1: + x1 = tl.load(X1 + cols, mask=cols < N, other=0.0).to(tl.float32) + if HAS_ROWSCALE: + rowscale = tl.load(ROWSCALE + M + row).to(tl.float32) + x1 *= rowscale + if HAS_DROPOUT: + # Compute dropout mask + # 7 rounds is good enough, and reduces register pressure + keep_mask = ( + tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + ) + x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0) + if STORE_DROPOUT_MASK: + tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N) + x += x1 + if HAS_RESIDUAL: + residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32) + x += residual + if STORE_RESIDUAL_OUT: + tl.store(RESIDUAL_OUT + cols, x, mask=cols < N) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * 
w + b if HAS_BIAS else x_hat * w + # Write output + tl.store(Y + cols, y, mask=mask) + if HAS_W1: + w1 = tl.load(W1 + cols, mask=mask).to(tl.float32) + if HAS_B1: + b1 = tl.load(B1 + cols, mask=mask).to(tl.float32) + y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1 + tl.store(Y1 + cols, y1, mask=mask) + + +def _layer_norm_fwd( + x, + weight, + bias, + eps, + residual=None, + x1=None, + weight1=None, + bias1=None, + dropout_p=0.0, + rowscale=None, + out_dtype=None, + residual_dtype=None, + is_rms_norm=False, + return_dropout_mask=False, +): + if residual is not None: + residual_dtype = residual.dtype + M, N = x.shape + assert x.stride(-1) == 1 + if residual is not None: + assert residual.stride(-1) == 1 + assert residual.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + if x1 is not None: + assert x1.shape == x.shape + assert rowscale is None + assert x1.stride(-1) == 1 + if weight1 is not None: + assert weight1.shape == (N,) + assert weight1.stride(-1) == 1 + if bias1 is not None: + assert bias1.shape == (N,) + assert bias1.stride(-1) == 1 + if rowscale is not None: + assert rowscale.is_contiguous() + assert rowscale.shape == (M,) + # allocate output + y = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype) + assert y.stride(-1) == 1 + if weight1 is not None: + y1 = torch.empty_like(y) + assert y1.stride(-1) == 1 + else: + y1 = None + if ( + residual is not None + or (residual_dtype is not None and residual_dtype != x.dtype) + or dropout_p > 0.0 + or rowscale is not None + or x1 is not None + ): + residual_out = torch.empty( + M, N, device=x.device, dtype=residual_dtype if residual_dtype is not None else x.dtype + ) + assert residual_out.stride(-1) == 1 + else: + residual_out = None + mean = torch.empty((M,), dtype=torch.float32, device=x.device) if not is_rms_norm else None + rstd = torch.empty((M,), dtype=torch.float32, device=x.device) + if dropout_p > 0.0: + seeds = torch.randint( + 2**32, (M if x1 is None else 2 * M,), device=x.device, dtype=torch.int64 + ) + else: + seeds = None + if return_dropout_mask and dropout_p > 0.0: + dropout_mask = torch.empty(M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool) + else: + dropout_mask = None + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + with torch.cuda.device(x.device.index): + _layer_norm_fwd_1pass_kernel[(M,)]( + x, + y, + weight, + bias, + residual, + x1, + weight1, + bias1, + y1, + residual_out, + rowscale, + seeds, + dropout_mask, + mean, + rstd, + x.stride(0), + y.stride(0), + residual.stride(0) if residual is not None else 0, + residual_out.stride(0) if residual_out is not None else 0, + x1.stride(0) if x1 is not None else 0, + y1.stride(0) if y1 is not None else 0, + M, + N, + eps, + dropout_p, + is_rms_norm, + BLOCK_N, + residual is not None, + residual_out is not None, + bias is not None, + dropout_p > 0.0, + dropout_mask is not None, + rowscale is not None, + ) + # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0 + if dropout_mask is not None and x1 is not None: + dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0) + else: + dropout_mask1 = None + return ( + y, + y1, + mean, + rstd, + residual_out if residual_out is not None 
else x, + seeds, + dropout_mask, + dropout_mask1, + ) + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + triton.Config({}, num_warps=16), + triton.Config({}, num_warps=32), + ], + key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS", "HAS_DROPOUT"], +) +# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None}) +# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None}) +# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None}) +@triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None}) +@triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None}) +@triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None}) +@triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None}) +@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None}) +@triton.jit +def _layer_norm_bwd_kernel( + X, # pointer to the input + W, # pointer to the weights + B, # pointer to the biases + Y, # pointer to the output to be recomputed + DY, # pointer to the output gradient + DX, # pointer to the input gradient + DW, # pointer to the partial sum of weights gradient + DB, # pointer to the partial sum of biases gradient + DRESIDUAL, + W1, + DY1, + DX1, + DW1, + DB1, + DRESIDUAL_IN, + ROWSCALE, + SEEDS, + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_dy_row, + stride_dx_row, + stride_dres_row, + stride_dy1_row, + stride_dx1_row, + stride_dres_in_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + dropout_p, + rows_per_program, + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_DRESIDUAL: tl.constexpr, + STORE_DRESIDUAL: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_DROPOUT: tl.constexpr, + HAS_ROWSCALE: tl.constexpr, + HAS_DY1: tl.constexpr, + HAS_DX1: tl.constexpr, + HAS_B1: tl.constexpr, + RECOMPUTE_OUTPUT: tl.constexpr, +): + # Map the program id to the elements of X, DX, and DY it should compute. 
+ row_block_id = tl.program_id(0) + row_start = row_block_id * rows_per_program + # Do not early exit if row_start >= M, because we need to write DW and DB + cols = tl.arange(0, BLOCK_N) + mask = cols < N + X += row_start * stride_x_row + if HAS_DRESIDUAL: + DRESIDUAL += row_start * stride_dres_row + if STORE_DRESIDUAL: + DRESIDUAL_IN += row_start * stride_dres_in_row + DY += row_start * stride_dy_row + DX += row_start * stride_dx_row + if HAS_DY1: + DY1 += row_start * stride_dy1_row + if HAS_DX1: + DX1 += row_start * stride_dx1_row + if RECOMPUTE_OUTPUT: + Y += row_start * stride_y_row + w = tl.load(W + cols, mask=mask).to(tl.float32) + if RECOMPUTE_OUTPUT and HAS_BIAS: + b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32) + if HAS_DY1: + w1 = tl.load(W1 + cols, mask=mask).to(tl.float32) + dw = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_BIAS: + db = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_DY1: + dw1 = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_B1: + db1 = tl.zeros((BLOCK_N,), dtype=tl.float32) + row_end = min((row_block_id + 1) * rows_per_program, M) + for row in range(row_start, row_end): + # Load data to SRAM + x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) + if HAS_DY1: + dy1 = tl.load(DY1 + cols, mask=mask, other=0).to(tl.float32) + if not IS_RMS_NORM: + mean = tl.load(Mean + row) + rstd = tl.load(Rstd + row) + # Compute dx + xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + xhat = tl.where(mask, xhat, 0.0) + if RECOMPUTE_OUTPUT: + y = xhat * w + b if HAS_BIAS else xhat * w + tl.store(Y + cols, y, mask=mask) + wdy = w * dy + dw += dy * xhat + if HAS_BIAS: + db += dy + if HAS_DY1: + wdy += w1 * dy1 + dw1 += dy1 * xhat + if HAS_B1: + db1 += dy1 + if not IS_RMS_NORM: + c1 = tl.sum(xhat * wdy, axis=0) / N + c2 = tl.sum(wdy, axis=0) / N + dx = (wdy - (xhat * c1 + c2)) * rstd + else: + c1 = tl.sum(xhat * wdy, axis=0) / N + dx = (wdy - xhat * c1) * rstd + if HAS_DRESIDUAL: + dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32) + dx += dres + # Write dx + if STORE_DRESIDUAL: + tl.store(DRESIDUAL_IN + cols, dx, mask=mask) + if HAS_DX1: + if HAS_DROPOUT: + keep_mask = ( + tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + ) + dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0) + else: + dx1 = dx + tl.store(DX1 + cols, dx1, mask=mask) + if HAS_DROPOUT: + keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p + dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0) + if HAS_ROWSCALE: + rowscale = tl.load(ROWSCALE + row).to(tl.float32) + dx *= rowscale + tl.store(DX + cols, dx, mask=mask) + + X += stride_x_row + if HAS_DRESIDUAL: + DRESIDUAL += stride_dres_row + if STORE_DRESIDUAL: + DRESIDUAL_IN += stride_dres_in_row + if RECOMPUTE_OUTPUT: + Y += stride_y_row + DY += stride_dy_row + DX += stride_dx_row + if HAS_DY1: + DY1 += stride_dy1_row + if HAS_DX1: + DX1 += stride_dx1_row + tl.store(DW + row_block_id * N + cols, dw, mask=mask) + if HAS_BIAS: + tl.store(DB + row_block_id * N + cols, db, mask=mask) + if HAS_DY1: + tl.store(DW1 + row_block_id * N + cols, dw1, mask=mask) + if HAS_B1: + tl.store(DB1 + row_block_id * N + cols, db1, mask=mask) + + +def _layer_norm_bwd( + dy, + x, + weight, + bias, + eps, + mean, + rstd, + dresidual=None, + dy1=None, + weight1=None, + bias1=None, + seeds=None, + dropout_p=0.0, + rowscale=None, + has_residual=False, + has_x1=False, + is_rms_norm=False, + x_dtype=None, + 
recompute_output=False, +): + M, N = x.shape + assert x.stride(-1) == 1 + assert dy.stride(-1) == 1 + assert dy.shape == (M, N) + if dresidual is not None: + assert dresidual.stride(-1) == 1 + assert dresidual.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + if dy1 is not None: + assert weight1 is not None + assert dy1.shape == dy.shape + assert dy1.stride(-1) == 1 + if weight1 is not None: + assert weight1.shape == (N,) + assert weight1.stride(-1) == 1 + if bias1 is not None: + assert bias1.shape == (N,) + assert bias1.stride(-1) == 1 + if seeds is not None: + assert seeds.is_contiguous() + assert seeds.shape == (M if not has_x1 else M * 2,) + if rowscale is not None: + assert rowscale.is_contiguous() + assert rowscale.shape == (M,) + # allocate output + dx = ( + torch.empty_like(x) + if x_dtype is None + else torch.empty(M, N, dtype=x_dtype, device=x.device) + ) + dresidual_in = ( + torch.empty_like(x) + if has_residual + and (dx.dtype != x.dtype or dropout_p > 0.0 or rowscale is not None or has_x1) + else None + ) + dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None + y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None + if recompute_output: + assert weight1 is None, "recompute_output is not supported with parallel LayerNorm" + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count + _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device) + _db = ( + torch.empty((sm_count, N), dtype=torch.float32, device=bias.device) + if bias is not None + else None + ) + _dw1 = torch.empty_like(_dw) if weight1 is not None else None + _db1 = torch.empty_like(_db) if bias1 is not None else None + rows_per_program = math.ceil(M / sm_count) + grid = (sm_count,) + with torch.cuda.device(x.device.index): + _layer_norm_bwd_kernel[grid]( + x, + weight, + bias, + y, + dy, + dx, + _dw, + _db, + dresidual, + weight1, + dy1, + dx1, + _dw1, + _db1, + dresidual_in, + rowscale, + seeds, + mean, + rstd, + x.stride(0), + 0 if not recompute_output else y.stride(0), + dy.stride(0), + dx.stride(0), + dresidual.stride(0) if dresidual is not None else 0, + dy1.stride(0) if dy1 is not None else 0, + dx1.stride(0) if dx1 is not None else 0, + dresidual_in.stride(0) if dresidual_in is not None else 0, + M, + N, + eps, + dropout_p, + rows_per_program, + is_rms_norm, + BLOCK_N, + dresidual is not None, + dresidual_in is not None, + bias is not None, + dropout_p > 0.0, + ) + dw = _dw.sum(0).to(weight.dtype) + db = _db.sum(0).to(bias.dtype) if bias is not None else None + dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None + db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None + # Don't need to compute dresidual_in separately in this case + if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None: + dresidual_in = dx + if has_x1 and dropout_p == 0.0: + dx1 = dx + return ( + (dx, dw, db, dresidual_in, dx1, dw1, db1) + if not recompute_output + else (dx, dw, db, dresidual_in, dx1, dw1, db1, y) + ) + + +class LayerNormFn(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x, + weight, + bias, + residual=None, + 
x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + return_dropout_mask=False, + ): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + if residual.stride(-1) != 1: + residual = residual.contiguous() + if x1 is not None: + assert x1.shape == x_shape_og + assert rowscale is None, "rowscale is not supported with parallel LayerNorm" + x1 = x1.reshape(-1, x1.shape[-1]) + if x1.stride(-1) != 1: + x1 = x1.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + if weight1 is not None: + weight1 = weight1.contiguous() + if bias1 is not None: + bias1 = bias1.contiguous() + if rowscale is not None: + rowscale = rowscale.reshape(-1).contiguous() + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float32 if residual_in_fp32 else None) + ) + y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd( + x, + weight, + bias, + eps, + residual, + x1, + weight1, + bias1, + dropout_p=dropout_p, + rowscale=rowscale, + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm, + return_dropout_mask=return_dropout_mask, + ) + ctx.save_for_backward( + residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd + ) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.dropout_p = dropout_p + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is not None + ctx.has_x1 = x1 is not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + y = y.reshape(x_shape_og) + y1 = y1.reshape(x_shape_og) if y1 is not None else None + residual_out = residual_out.reshape(x_shape_og) if residual_out is not None else None + dropout_mask = dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None + dropout_mask1 = dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None + if not return_dropout_mask: + if weight1 is None: + return y if not prenorm else (y, residual_out) + else: + return (y, y1) if not prenorm else (y, y1, residual_out) + else: + if weight1 is None: + return ( + (y, dropout_mask, dropout_mask1) + if not prenorm + else (y, residual_out, dropout_mask, dropout_mask1) + ) + else: + return ( + (y, y1, dropout_mask, dropout_mask1) + if not prenorm + else (y, y1, residual_out, dropout_mask, dropout_mask1) + ) + + @staticmethod + def backward(ctx, dy, *args): + x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors + dy = dy.reshape(-1, dy.shape[-1]) + if dy.stride(-1) != 1: + dy = dy.contiguous() + assert dy.shape == x.shape + if weight1 is not None: + dy1, args = args[0], args[1:] + dy1 = dy1.reshape(-1, dy1.shape[-1]) + if dy1.stride(-1) != 1: + dy1 = dy1.contiguous() + assert dy1.shape == x.shape + else: + dy1 = None + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + if dresidual.stride(-1) != 1: + dresidual = dresidual.contiguous() + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd( + dy, + x, + weight, + bias, + ctx.eps, + mean, + rstd, + dresidual, + dy1, + weight1, + bias1, + seeds, + ctx.dropout_p, + rowscale, + ctx.has_residual, + ctx.has_x1, + ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + ) + return ( + dx.reshape(ctx.x_shape_og), + dw, + 
db, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + dx1.reshape(ctx.x_shape_og) if dx1 is not None else None, + dw1, + db1, + None, + None, + None, + None, + None, + None, + None, + ) + + +def layer_norm_fn( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + return_dropout_mask=False, +): + return LayerNormFn.apply( + x, + weight, + bias, + residual, + x1, + weight1, + bias1, + eps, + dropout_p, + rowscale, + prenorm, + residual_in_fp32, + is_rms_norm, + return_dropout_mask, + ) + + +def rms_norm_fn( + x, + weight, + bias, + residual=None, + x1=None, + weight1=None, + bias1=None, + eps=1e-6, + dropout_p=0.0, + rowscale=None, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + return LayerNormFn.apply( + x, + weight, + bias, + residual, + x1, + weight1, + bias1, + eps, + dropout_p, + rowscale, + prenorm, + residual_in_fp32, + True, + return_dropout_mask, + ) + + +class RMSNorm(torch.nn.Module): + + def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + if dropout_p > 0.0: + self.drop = torch.nn.Dropout(dropout_p) + else: + self.drop = None + self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + + def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False): + return rms_norm_fn( + x, + self.weight, + self.bias, + residual=residual, + eps=self.eps, + dropout_p=self.drop.p if self.drop is not None and self.training else 0.0, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + ) + + +class LayerNormLinearFn(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward( + ctx, + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + ): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + if residual.stride(-1) != 1: + residual = residual.contiguous() + norm_weight = norm_weight.contiguous() + if norm_bias is not None: + norm_bias = norm_bias.contiguous() + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float32 if residual_in_fp32 else None) + ) + y, _, mean, rstd, residual_out, *rest = _layer_norm_fwd( + x, + norm_weight, + norm_bias, + eps, + residual, + out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(), + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm, + ) + y = y.reshape(x_shape_og) + dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype + linear_weight = linear_weight.to(dtype) + linear_bias = linear_bias.to(dtype) if linear_bias is not None else None + out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias) + # We don't store y, will be recomputed in the backward pass to save memory + ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is 
not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + ctx.linear_bias_is_none = linear_bias is None + return out if not prenorm else (out, residual_out.reshape(x_shape_og)) + + @staticmethod + @custom_bwd + def backward(ctx, dout, *args): + x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors + dout = dout.reshape(-1, dout.shape[-1]) + dy = F.linear(dout, linear_weight.t()) + dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) + if dy.stride(-1) != 1: + dy = dy.contiguous() + assert dy.shape == x.shape + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + if dresidual.stride(-1) != 1: + dresidual = dresidual.contiguous() + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dnorm_weight, dnorm_bias, dresidual_in, _, _, _, y = _layer_norm_bwd( + dy, + x, + norm_weight, + norm_bias, + ctx.eps, + mean, + rstd, + dresidual=dresidual, + has_residual=ctx.has_residual, + is_rms_norm=ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + recompute_output=True, + ) + dlinear_weight = torch.einsum("bo,bi->oi", dout, y) + return ( + dx.reshape(ctx.x_shape_og), + dnorm_weight, + dnorm_bias, + dlinear_weight, + dlinear_bias, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + None, + None, + None, + None, + ) + + +def layer_norm_linear_fn( + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, +): + return LayerNormLinearFn.apply( + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual, + eps, + prenorm, + residual_in_fp32, + is_rms_norm, + ) diff --git a/linear-warmup.yaml b/linear-warmup.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bb6a69896e8f042a9a2ca2b1102c853e863db936 --- /dev/null +++ b/linear-warmup.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: transformers.get_linear_schedule_with_warmup diff --git a/linear.py b/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..a8966dbc345ab0e593df0124451ee7be3dae131a --- /dev/null +++ b/linear.py @@ -0,0 +1,594 @@ +# Adapted from https://github.com/ELS-RD/kernl/blob/main/src/kernl/implementations/linear_layer.py +# and https://github.com/openai/triton/blob/master/python/triton/ops/matmul.py +from typing import Optional + +import torch +import triton +import triton.language as tl +from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time + +from flash_attn.ops.triton.k_activations import ( + gelu, + gelu_approx, + gelu_approx_grad, + gelu_grad, + squared_relu, + squared_relu_grad, +) + +# CREDITS: Initially inspired by the Triton tutorial on matrix multiplications + + +def init_to_zero(name): + return lambda nargs: nargs[name].zero_() + + +def get_configs_io_bound(): + configs = [] + for num_stages in [2, 3, 4, 5, 6]: + for block_m in [16, 32]: + for block_k in [32, 64]: + for block_n in [32, 64, 128, 256]: + num_warps = 2 if block_n <= 64 else 4 + configs.append( + triton.Config( + { + "BLOCK_M": block_m, + "BLOCK_N": block_n, + "BLOCK_K": block_k, + "SPLIT_K": 1, + }, + num_stages=num_stages, + num_warps=num_warps, + ) + ) + # split_k not used + # for split_k in [2, 4, 8, 16]: + # configs.append(triton.Config( + # {'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, + # num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + return configs + + +@triton.autotune( + configs=[ + triton.Config( + 
{"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + # good for int8 + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + ] + + get_configs_io_bound(), + key=["CACHE_KEY_M", "CACHE_KEY_N", "CACHE_KEY_K"], + prune_configs_by={ + "early_config_prune": early_config_prune, + "perf_model": estimate_matmul_time, + "top_k": 10, + }, +) +@triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, + } +) +@triton.jit +def kernel_fwd( + C, # Pointers to matrices + ACT_INPUT, + A, + B, + bias, + # Matrix dimensions + M, + N, + K, + CACHE_KEY_M, + CACHE_KEY_N, + CACHE_KEY_K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. 
stride_am is how much to increase a_ptr + # by to get the element one row down (A has M rows) + stride_cm, + # stride_cn, # Assume that stride_cn == 1 + stride_am, + stride_ak, + stride_bn, + stride_bk, + # Meta-parameters + BLOCK_M: tl.constexpr, + GROUP_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + # split k not used, not performant with activation, kept because early_config_prune is expecting it + SPLIT_K: tl.constexpr, + EVEN_K: tl.constexpr, + A_ROWMAJOR: tl.constexpr, + B_COLMAJOR: tl.constexpr, + BIAS: tl.constexpr, + SAVE_ACT_INPUT: tl.constexpr, + ACTIVATION: tl.constexpr, +): + + """ + Kernel for computing Out = activation(A x W + C) + - Input has shape (M, K) + - Weight has shape (K, N) + - Bias has shape (N,) + - Output has shape (M, N) + - ActInputs (optional) has shape (M, N) + 'ActInputs' optionally saves the A x W + C intermediate for backward computations + This kernel will consolidate over K + """ + + pid = tl.program_id(axis=0) + + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + # now compute the block that each program will go through + # rm (resp. rn) denotes a range of indices + # for rows (resp. col) of C + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + # trick to avoid masking on M and N axis + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + + if A_ROWMAJOR: + A = A + (ram[:, None] * stride_am + rk[None, :]) + else: + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + if B_COLMAJOR: + B = B + (rk[:, None] + rbn[None, :] * stride_bn) + else: + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.0) + b = tl.load(B, mask=rk[:, None] < k, other=0.0) + acc += tl.dot(a, b) + + if A_ROWMAJOR: + A += BLOCK_K + else: + A += BLOCK_K * stride_ak + if B_COLMAJOR: + B += BLOCK_K + else: + B += BLOCK_K * stride_bk + + # Putting bias after the matmul (instead of before) is faster, idk why + if BIAS: + bias = tl.load(bias + rn, mask=rn < N, other=0.0).to(tl.float32) + acc += bias[None, :] + + # optional: save the activation inputs + if SAVE_ACT_INPUT: + # act_in_ptrs = ACT_INPUT + ram[:, None] * stride_cm + rbn[None, :] * stride_cn + act_in_ptrs = ACT_INPUT + ram[:, None] * stride_cm + rbn[None, :] + tl.store(act_in_ptrs, acc) + + # optional: fused activation (while the data is in shared memory) + if ACTIVATION == "gelu": + acc = gelu(acc) + elif ACTIVATION == "gelu_approx": + acc = gelu_approx(acc) + elif ACTIVATION == "squared_relu": + acc = squared_relu(acc) + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + # write back result + # C = C + rm[:, None] * stride_cm + rn[None, :] * stride_cn + C = C + rm[:, None] * stride_cm + rn[None, :] + mask = (rm < M)[:, None] & (rn < N)[None, :] + tl.store(C, acc) + + +def triton_linear_act( + x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + 
activation: str = "id", + save_act_input: bool = False, +) -> torch.Tensor: + """ + Compute e = activation(x @ weight.T + bias). + This wrapper kicks the `kernel_fwd` Triton kernel + :param x: input tensor + :param weight: weight matrix + :param bias: an optional bias tensor + :param activation: Activation name. Needs to be a Triton kernel. + :param act_input: an optional tensor to save the activation inputs (for backward) + :return: result tensor + """ + # if torch.is_autocast_enabled(): + # dtype = torch.get_autocast_gpu_dtype() + # x, weight, bias = [a.to(dtype=dtype) for a in [x, weight, bias]] + + assert activation in ["id", "gelu", "gelu_approx", "squared_relu"] + + batch_shape, n = x.shape[:-1], x.shape[-1] + batch_dim = batch_shape.numel() + x_reshaped = x.reshape(batch_dim, n) + + if x_reshaped.stride(0) > 1 and x_reshaped.stride(1) > 1: + x_reshaped = x_reshaped.contiguous() + if weight.stride(0) > 1 and weight.stride(1) > 1: + weight = weight.contiguous() + bias = bias.contiguous() if bias is not None else None + + assert ( + x.dtype == weight.dtype + ), f"Input and weight must have the same dtype, got {x.dtype} and {weight.dtype}" + if bias is not None: + assert ( + x.dtype == bias.dtype + ), f"Input and bias must have the same dtype, got {x.dtype} and {bias.dtype}" + assert ( + x_reshaped.shape[1] == weight.shape[1] + ), f"Incompatible dimensions: {x_reshaped.shape} - {weight.shape}" + + assert ( + bias is None or bias.shape[0] == weight.shape[0] + ), "Incompatible dimensions in between weight and bias" + + M, K = x_reshaped.shape + N, K = weight.shape + + output = torch.empty((M, N), device=x.device, dtype=x.dtype) + act_input = torch.empty_like(output) if save_act_input else None + + # 1D launch kernel where each block gets its own program. 
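+    # e.g. for M = N = 4096 with an autotuned config of BLOCK_M = BLOCK_N = 128, this launches
+    # cdiv(4096, 128) * cdiv(4096, 128) = 32 * 32 = 1024 programs, each writing one 128 x 128 tile of `output`;
+    # GROUP_M only reorders program ids for better L2 reuse, it does not change the grid size.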
+ grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) # noqa + + kernel_fwd[grid]( + output, + act_input, + x_reshaped, + weight, # data ptrs + bias if bias is not None else x, # auto skip bias if not present + M, # shapes + N, + K, + M // 32, # key for triton cache (limit number of compilations) + N // 32, + K // 32, + stride_cm=output.stride(0), # strides + # stride_cn=output.stride(1), + stride_am=x_reshaped.stride(0), + stride_ak=x_reshaped.stride(1), + stride_bk=weight.stride(1), + stride_bn=weight.stride(0), + BIAS=bias is not None, # optional fused bias + SAVE_ACT_INPUT=save_act_input, # optional save activation inputs + ACTIVATION=activation, # optional fused activation + A_ROWMAJOR=x_reshaped.stride(1) == 1, + B_COLMAJOR=weight.stride(1) == 1, + GROUP_M=8, # speed optimization: group the programs + ) + + if not save_act_input: + return output.reshape(*batch_shape, output.shape[-1]) + else: + return ( + output.reshape(*batch_shape, output.shape[-1]), + act_input.reshape(*batch_shape, act_input.shape[-1]), + ) + + +@triton.autotune( + configs=[ + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8 + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + # good for int8 + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=3, + num_warps=8, + ), + triton.Config( + {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, + num_stages=4, + num_warps=4, + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4 + ), + triton.Config( + {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2 + ), + ] + + get_configs_io_bound(), + key=["CACHE_KEY_M", "CACHE_KEY_N", "CACHE_KEY_K"], + prune_configs_by={ + "early_config_prune": early_config_prune, + "perf_model": estimate_matmul_time, + "top_k": 10, + }, +) +@triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, + } +) +@triton.jit +def kernel_bwd( + C, # Pointers to matrices + 
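+    # pre-activation values saved by kernel_fwd (SAVE_ACT_INPUT=True); only read when ACTIVATION != "id"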
ACT_INPUT, + A, + B, + # Matrix dimensions + M, + N, + K, + CACHE_KEY_M, + CACHE_KEY_N, + CACHE_KEY_K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. stride_am is how much to increase a_ptr + # by to get the element one row down (A has M rows) + stride_cm, + # stride_cn, # Assume that stride_cn == 1 + stride_am, + stride_ak, + stride_bk, + stride_bn, + # Meta-parameters + BLOCK_M: tl.constexpr, + GROUP_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + # split k not used, not performant with activation, kept because early_config_prune is expecting it + SPLIT_K: tl.constexpr, + EVEN_K: tl.constexpr, + ACTIVATION: tl.constexpr, +): + + """ + Kernel for computing Out = activation(A x W + C) + - Input has shape (M, K) + - Weight has shape (K, N) + - Output has shape (M, N) + - ActInputs (optional) has shape (M, N) + 'ActInputs' optionally saves the A x W + C intermediate for backward computations + This kernel will consolidate over K + """ + + pid = tl.program_id(axis=0) + + grid_m = (M + BLOCK_M - 1) // BLOCK_M + grid_n = (N + BLOCK_N - 1) // BLOCK_N + # re-order program ID for better L2 performance + width = GROUP_M * grid_n + group_id = pid // width + group_size = min(grid_m - group_id * GROUP_M, GROUP_M) + pid_m = group_id * GROUP_M + (pid % group_size) + pid_n = (pid % width) // (group_size) + + # now compute the block that each program will go through + # rm (resp. rn) denotes a range of indices + # for rows (resp. col) of C + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + # trick to avoid masking on M and N axis + ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M) + rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N) + rk = tl.arange(0, BLOCK_K) + + A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak) + B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn) + + acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + + for k in range(K, 0, -BLOCK_K): + if EVEN_K: + a = tl.load(A) + b = tl.load(B) + else: + a = tl.load(A, mask=rk[None, :] < k, other=0.0) + b = tl.load(B, mask=rk[:, None] < k, other=0.0) + acc += tl.dot(a, b) + + A += BLOCK_K * stride_ak + B += BLOCK_K * stride_bk + + # optional: fused activation (while the data is in shared memory) + if ACTIVATION != "id": + act_in_ptrs = ACT_INPUT + ram[:, None] * stride_cm + rbn[None, :] + act_input = tl.load(act_in_ptrs).to(acc.dtype) + if ACTIVATION == "gelu": + acc *= gelu_grad(act_input) + elif ACTIVATION == "gelu_approx": + acc *= gelu_approx_grad(act_input) + elif ACTIVATION == "squared_relu": + acc *= squared_relu_grad(act_input) + + # rematerialize rm and rn to save registers + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) + + # write back result + C = C + rm[:, None] * stride_cm + rn[None, :] + mask = (rm < M)[:, None] & (rn < N)[None, :] + tl.store(C, acc, mask=mask) + + +def triton_dgrad_act( + grad_output: torch.Tensor, + weight: torch.Tensor, + activation: str = "id", + act_input: Optional[torch.Tensor] = None, +) -> torch.Tensor: + """ + Compute e = activation(grad_output @ weight + bias). + This wrapper kicks the `kernel_fwd` Triton kernel + :param grad_output: input tensor + :param weight: weight matrix + :param activation: Activation name. Needs to be a Triton kernel. 
+ :param act_input: an optional tensor to save the activation inputs (for backward) + :return: result tensor + """ + assert activation in ["id", "gelu", "gelu_approx", "squared_relu"] + + batch_shape, n = grad_output.shape[:-1], grad_output.shape[-1] + batch_dim = batch_shape.numel() + grad_output_reshaped = grad_output.reshape(batch_dim, n) + + if grad_output_reshaped.stride(0) > 1 and grad_output_reshaped.stride(1) > 1: + grad_output_reshaped = grad_output_reshaped.contiguous() + if weight.stride(0) > 1 and weight.stride(1) > 1: + weight = weight.contiguous() + + assert ( + grad_output.dtype == weight.dtype + ), f"grad_output and weight must have the same dtype, got {grad_output.dtype} and {weight.dtype}" + assert ( + grad_output_reshaped.shape[1] == weight.shape[0] + ), f"Incompatible dimensions: {grad_output_reshaped.shape} - {weight.shape}" + if activation != "id": + assert act_input is not None, f"act_input is required for activation {activation}" + + # M, N, K in bwd are different from M, N, K in fwd + M, K = grad_output_reshaped.shape + K, N = weight.shape + + grad_input = torch.empty((M, N), device=grad_output.device, dtype=grad_output.dtype) + + # 1D launch kernel where each block gets its own program. + grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) # noqa + + kernel_bwd[grid]( + grad_input, + act_input, + grad_output_reshaped, + weight, # data ptrs + M, # shapes + N, + K, + M // 32, # key for triton cache (limit number of compilations) + N // 32, + K // 32, + stride_cm=grad_input.stride(0), # strides + # stride_cn=grad_input.stride(1), + stride_am=grad_output_reshaped.stride(0), + stride_ak=grad_output_reshaped.stride(1), + stride_bk=weight.stride(0), + stride_bn=weight.stride(1), + ACTIVATION=activation, # optional fused activation + GROUP_M=8, # speed optimization: group the programs + ) + + return grad_input.reshape(*batch_shape, grad_input.shape[-1]) diff --git a/llama.py b/llama.py new file mode 100644 index 0000000000000000000000000000000000000000..3bfb51d17e27c1eeb5f09293b773cda8f4d81233 --- /dev/null +++ b/llama.py @@ -0,0 +1,422 @@ +# Copyright (c) 2023, Tri Dao. + +import json +import math +import os +import re +from collections import OrderedDict +from pathlib import Path +from typing import Dict, List, Union + +import torch +import torch.nn.functional as F +from sentencepiece import SentencePieceProcessor +from transformers import GPT2Config, LlamaConfig + +from einops import rearrange + + +def remap_state_dict_meta_llama( + state_dict: Dict[str, torch.Tensor], config: GPT2Config +) -> Dict[str, torch.Tensor]: + """Convert the state_dict in Meta format to standard GPT format. + + This function modifies state_dict in place. + """ + + def key_mapping_layers(key): + return f"transformer.{key}" if not key.startswith("output.") else key + + state_dict = OrderedDict((key_mapping_layers(k), v) for k, v in state_dict.items()) + + # Word embedding + def key_mapping_emb(key): + return re.sub( + r"^transformer.tok_embeddings.", "transformer.embeddings.word_embeddings.", key + ) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
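+    # e.g. with pad_vocab_size_multiple=8, a 32000-row embedding stays at 32000 (already a multiple of 8),
+    # while a 32003-row one would be rounded up to math.ceil(32003 / 8) * 8 = 32008 and zero-padded below.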
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("output.weight") + # Need to recompute vocab_size since LLaMa shards the word embeddings and output embeddings + # differently. + vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).attention_norm.", + r"transformer.layers.\1.norm1.", + key, + ) + key = re.sub(r"^transformer.layers.(\d+).ffn_norm.", r"transformer.layers.\1.norm2.", key) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + for l in range(config.n_layer): + w1 = state_dict.pop(f"transformer.layers.{l}.feed_forward.w1.weight") + w3 = state_dict.pop(f"transformer.layers.{l}.feed_forward.w3.weight") + # Our ordering is different + state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat([w3, w1], dim=0) + + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).feed_forward.w2.", + r"transformer.layers.\1.mlp.fc2.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"transformer.layers.{l}.attention.wq.weight") + Wk = state_dict.pop(f"transformer.layers.{l}.attention.wk.weight") + Wv = state_dict.pop(f"transformer.layers.{l}.attention.wv.weight") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + # We don't store these + state_dict.pop(f"transformer.layers.{l}.attention.inner_attention.rope.freqs", None) + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).attention.wo.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + state_dict.pop("transformer.rope.freqs", None) + + return state_dict + + +def remap_state_dict_hf_llama( + state_dict: Dict[str, torch.Tensor], config: GPT2Config +) -> Dict[str, torch.Tensor]: + """Convert the state_dict in Hugging Face format to standard GPT format. + + This function modifies state_dict in place. + """ + + # Embedding + def key_mapping_emb(key): + return re.sub(r"^model.embed_tokens.", "transformer.embeddings.word_embeddings.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. 
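+    # F.pad with (0, 0, 0, k) leaves the hidden dimension untouched and appends k zero rows
+    # at the bottom of the (vocab, hidden) embedding matrix.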
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + + # LM head + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + # Need to recompute vocab_size since LLaMa shards the word embeddings and output embeddings + # differently. + vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + # It's possible that vocab_size is padded to be a multiple of 8, for example. + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # MLP + for l in range(config.n_layer): + # Fusing weights this way based on difference in the following: + # https://github.com/huggingface/transformers/blob/b42010bb1d3cbf262d27e0a328661885be46dfdb/src/transformers/models/llama/modeling_llama.py#L220 + # https://github.com/Dao-AILab/flash-attention/blob/c60851a8253257eb970e06a022c82517a8033e8c/flash_attn/modules/mlp.py#L115 + w1 = state_dict.pop(f"model.layers.{l}.mlp.gate_proj.weight") + w3 = state_dict.pop(f"model.layers.{l}.mlp.up_proj.weight") + state_dict[f"transformer.layers.{l}.mlp.fc1.weight"] = torch.cat([w3, w1], dim=0) + + def key_mapping_mlp(key): + return re.sub( + r"^model.layers.(\d+).mlp.down_proj.", + r"transformer.layers.\1.mlp.fc2.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^model.norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^model.layers.(\d+).input_layernorm.", + r"transformer.layers.\1.norm1.", + key, + ) + key = re.sub( + r"^model.layers.(\d+).post_attention_layernorm.", + r"transformer.layers.\1.norm2.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + def inv_permute(w): + # Inverse of permute implemented in: + # https://github.com/huggingface/transformers/blob/b42010bb1d3cbf262d27e0a328661885be46dfdb/src/transformers/models/llama/convert_llama_weights_to_hf.py#L114 + return rearrange( + w, "(h two d) n -> (h d two) n", d=config.n_embd // config.n_head // 2, two=2 + ) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"model.layers.{l}.self_attn.q_proj.weight") + Wk = state_dict.pop(f"model.layers.{l}.self_attn.k_proj.weight") + Wv = state_dict.pop(f"model.layers.{l}.self_attn.v_proj.weight") + + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat( + [inv_permute(Wq), inv_permute(Wk), Wv], dim=0 + ) + # We don't store these + state_dict.pop(f"model.layers.{l}.self_attn.rotary_emb.inv_freq", None) + + def key_mapping_attn(key): + return re.sub( + r"^model.layers.(\d+).self_attn.o_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + return state_dict + + +def inv_remap_state_dict_hf_llama( + state_dict: Dict[str, torch.Tensor], config: GPT2Config +) -> Dict[str, torch.Tensor]: + """Convert the state_dict in standard GPT format to Hugging Face format. 
+ + This function is meant to be the inverse of remap_state_dict_hf_llama, up to a + multiplier pad in the embedding and lm_head. That is if the original embedding + isn't a multiple of pad_vocab_size_multiple, then + inv_remap_state_dict_hf_llama(remap_state_dict_hf_llama(state_dict)) != state_dict. + + This function modifies state_dict in place. + """ + + # Embedding + def key_mapping_emb(key): + return re.sub(r"^transformer.embeddings.word_embeddings.", "model.embed_tokens.", key) + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + word_embeddings = state_dict.pop("model.embed_tokens.weight") + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = ( + math.ceil(word_embeddings.shape[0] / pad_vocab_size_multiple) * pad_vocab_size_multiple + ) + state_dict["model.embed_tokens.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + + # LM head + if getattr(config, "tie_word_embeddings"): + state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"] + else: + output_embeddings = state_dict.pop("lm_head.weight") + vocab_size = ( + math.ceil(output_embeddings.shape[0] / pad_vocab_size_multiple) + * pad_vocab_size_multiple + ) + state_dict["lm_head.weight"] = F.pad( + output_embeddings, (0, 0, 0, vocab_size - output_embeddings.shape[0]) + ) + + # MLP + for l in range(config.n_layer): + w3, w1 = torch.chunk( + state_dict.pop(f"transformer.layers.{l}.mlp.fc1.weight"), chunks=2, dim=0 + ) + state_dict[f"model.layers.{l}.mlp.gate_proj.weight"] = w1 + state_dict[f"model.layers.{l}.mlp.up_proj.weight"] = w3 + + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).mlp.fc2.", + r"model.layers.\1.mlp.down_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.ln_f.", r"model.norm.", key) + key = re.sub( + r"^transformer.layers.(\d+).norm1.", + r"model.layers.\1.input_layernorm.", + key, + ) + key = re.sub( + r"^transformer.layers.(\d+).norm2.", + r"model.layers.\1.post_attention_layernorm.", + key, + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + def permute(w): + return rearrange( + w, "(h d two) n -> (h two d) n", d=config.n_embd // config.n_head // 2, two=2 + ) + + n_head = config.n_head + n_head_kv = getattr(config, "n_head_kv", n_head) + + embed_dim = config.hidden_size + head_dim = embed_dim // n_head + + q_dim = n_head * head_dim + k_dim = v_dim = n_head_kv * head_dim + + # Attention + for l in range(config.n_layer): + Wqkv = state_dict.pop(f"transformer.layers.{l}.mixer.Wqkv.weight") + Wq = Wqkv[:q_dim] + Wk = Wqkv[q_dim : q_dim + k_dim] + Wv = Wqkv[q_dim + k_dim : q_dim + k_dim + v_dim] + state_dict[f"model.layers.{l}.self_attn.q_proj.weight"] = permute(Wq) + state_dict[f"model.layers.{l}.self_attn.k_proj.weight"] = permute(Wk) + state_dict[f"model.layers.{l}.self_attn.v_proj.weight"] = Wv + state_dict.pop(f"transformer.layers.{l}.attention.inner_attention.rope.freqs", None) + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).mixer.out_proj.", + r"model.layers.\1.self_attn.o_proj.", + key, + ) + + state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + return state_dict + + +def config_from_meta_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str +) -> LlamaConfig: + """Load a LlamaConfig from a checkpoint 
path.""" + with open(Path(checkpoint_path) / model_name / "params.json") as f: + params = json.load(f) + config = LlamaConfig( + hidden_size=params["dim"], + intermediate_size=None, + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=params.get("n_kv_heads", None), + ) + multiple_of = params.get("multiple_of", 1) + ffn_dim_multiplier = params.get("ffn_dim_multiplier", None) + + # Compute the hidden dimension of the MLP + # https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/model.py#L224 + intermediate_size = 4 * config.hidden_size + # https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/model.py#L195-L199 + intermediate_size = int(2 * intermediate_size / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + intermediate_size = int(ffn_dim_multiplier * intermediate_size) + intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of) + + config.intermediate_size = intermediate_size + if "rope_theta" in params: + config.rotary_emb_base = params["rope_theta"] + config.vocab_size = 32000 + # some CodeLLaMa have vocab_size 32000, some 32016 + # Sadly it's not specified in the `params.json` file :( + tokenizer = Path(checkpoint_path) / model_name / "tokenizer.model" + if tokenizer.is_file(): + config.vocab_size = SentencePieceProcessor(str(tokenizer)).vocab_size() + return config + + +def config_from_hf_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str +) -> LlamaConfig: + return LlamaConfig.from_pretrained(Path(checkpoint_path) / f"{model_name}-hf" / "config.json") + + +def config_from_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str, checkpoint_format="meta" +) -> LlamaConfig: + if checkpoint_format == "meta": + return config_from_meta_checkpoint(checkpoint_path, model_name) + else: + return config_from_hf_checkpoint(checkpoint_path, model_name) + + +def state_dicts_from_checkpoint( + checkpoint_path: Union[str, os.PathLike], model_name: str +) -> List[dict]: + # Need to sort, otherwise we mess up the ordering and the weights are wrong + return [ + torch.load(path, map_location="cpu") + for path in sorted((Path(checkpoint_path) / model_name).glob("consolidated.*.pth")) + ] + + +def llama_config_to_gpt2_config(llama_config: LlamaConfig) -> GPT2Config: + return GPT2Config( + vocab_size=llama_config.vocab_size, + n_positions=0, # No absolute position embedding + n_embd=llama_config.hidden_size, + n_layer=llama_config.num_hidden_layers, + n_head=llama_config.num_attention_heads, + n_inner=llama_config.intermediate_size, + activation_function="swiglu", # Hardcode since HF calls it 'silu' + # Llama doesn't have dropout, idk if it's because they only release the inference code + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=llama_config.rms_norm_eps, + initializer_range=llama_config.initializer_range, + bos_token_id=llama_config.bos_token_id, + eos_token_id=llama_config.eos_token_id, + # These are new arguments not in the original GPT2Config + pad_token_id=llama_config.pad_token_id, # Idk if this does anything + rms_norm=True, + rotary_emb_fraction=1.0, + rotary_emb_interleaved=True, + tie_word_embeddings=False, + qkv_proj_bias=False, + out_proj_bias=False, + mlp_fc1_bias=False, + mlp_fc2_bias=False, + rotary_emb_base=getattr(llama_config, "rotary_emb_base", 10000.0), + n_head_kv=llama_config.num_key_value_heads, + 
) diff --git a/ln.h b/ln.h new file mode 100644 index 0000000000000000000000000000000000000000..9830c092d0aca9f3466154a18d1d3c32d651716e --- /dev/null +++ b/ln.h @@ -0,0 +1,281 @@ +#pragma once + +#include +#include +#include + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + +namespace layer_norm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct LaunchParams{ + + size_t elts_per_thread; + size_t workspace_bytes; + size_t barrier_size; + + cudaDeviceProp * props; + + cudaStream_t stream; + + Params params; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct ParamsBase { + ParamsBase() + : ctas_per_col(0) + , rows(0) + , cols(0) + , x(nullptr) + , mu(nullptr) + , rs(nullptr) + , gamma(nullptr) + , gamma1(nullptr) + , rowscale(nullptr) + , colscale(nullptr) + , dropout_keep_p(1.f) + , dropout_scale(1.f) + , is_rms_norm(false) + , workspace(nullptr) + , barrier(nullptr) + { + } + + // For Multi-CTA, number of different CTA groups. Otherwise same as gridDim.x. + int ctas_per_col; + + // Input is interpreted as matrix. We normalize across columns. + int rows; + int cols; + + // Common data pointers. + void *x0; + void *x1; + void *residual; + void *x; + void *dmask; + void *dmask1; + void *mu; + void *rs; + void *gamma; + void *gamma1; + void *rowscale; + void *colscale; + void *x0_subset; + void *z_subset; + + float inverse_cols; + + float dropout_keep_p; + float dropout_scale; + float rowscale_const; + + bool is_rms_norm; + + // Multi-CTA workspace in gmem. + void *workspace; + + // Multi-CTA sync barriers in gmem. + int *barrier; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct FwdParams : public ParamsBase { + FwdParams() + : ParamsBase() + , z(nullptr) + , z1(nullptr) + , beta(nullptr) + , beta1(nullptr) + , epsilon(0.f) + { + } + + // Output of LN FWD. + void *z; + void *z1; + void *beta; + void *beta1; + float epsilon; + + // Random state. + at::PhiloxCudaState philox_args; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct BwdParams : public ParamsBase { + BwdParams() + : ParamsBase() + , dz(nullptr) + , dz1(nullptr) + , dx(nullptr) + , dbeta_part(nullptr) + , dgamma_part(nullptr) + , dbeta1_part(nullptr) + , dgamma1_part(nullptr) + , dcolscale_part(nullptr) + , dx0(nullptr) + , dx1(nullptr) + , dresidual(nullptr) + , dbeta(nullptr) + , dgamma(nullptr) + , dbeta1(nullptr) + , dgamma1(nullptr) + , dcolscale(nullptr) + { + } + + // Input: gradient wrt. LN FWD output. + void *dz; + void *dz1; + // Input: gradient wrt residual. + void *dx; + + // Workspace for Wgrad pre-reduction. + void *dbeta_part; + void *dgamma_part; + void *dbeta1_part; + void *dgamma1_part; + void *dcolscale_part; + + // Output: Dgrad. + void *dx0; + void *dx1; + void *dresidual; + // Output: Wgrad. 
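+    // ("Wgrad" = gradients of the affine parameters gamma/beta; "Dgrad" above = gradients
+    // flowing back to the inputs x0/x1/residual.)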
+ void *dbeta; + void *dgamma; + void *dbeta1; + void *dgamma1; + void *dcolscale; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using FwdFunction = std::function&, const bool)>; +using BwdFunction = std::function&, const bool)>; +using FunctionKey = uint64_t; +using FwdRegistry = std::unordered_map; +using BwdRegistry = std::unordered_map; + +extern FwdRegistry FWD_FUNCS, PARALLEL_FWD_FUNCS; +extern BwdRegistry BWD_FUNCS, PARALLEL_BWD_FUNCS; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +using fp32 = float; +using fp16 = half; +using bf16 = nv_bfloat16; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct TypeId{}; + +template<> +struct TypeId{ + constexpr static uint32_t Value = 0; +}; + +template<> +struct TypeId{ + constexpr static uint32_t Value = 1; +}; + +template<> +struct TypeId{ + constexpr static uint32_t Value = 2; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Type2Key{ + constexpr static uint32_t Value = TypeId::Value << S; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct WeightType2Key : public Type2Key{}; + +template +struct InputType2Key : public Type2Key{}; + +template +struct ResidualType2Key : public Type2Key{}; + +template +struct OutputType2Key : public Type2Key{}; + +template +struct ComputeType2Key : public Type2Key{}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Types2Key{ + constexpr static uint32_t Value = WeightType2Key::Value | InputType2Key::Value | ResidualType2Key::Value | OutputType2Key::Value | ComputeType2Key::Value; + constexpr static inline uint64_t get(const uint64_t hidden_size){ + constexpr uint64_t type_key = Value; + return (type_key << 32) | hidden_size; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct FwdRegistrar{ + FwdRegistrar(FwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + FWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BwdRegistrar{ + BwdRegistrar(BwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + BWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct FwdParallelRegistrar{ + FwdParallelRegistrar(FwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + PARALLEL_FWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BwdParallelRegistrar{ + BwdParallelRegistrar(BwdFunction f){ + uint64_t key = Types2Key::get(HIDDEN_SIZE); + PARALLEL_BWD_FUNCS.insert({ key, f }); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace layer_norm diff --git a/ln_api.cpp b/ln_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3981bbad58e56023c33ff66b89c130f4d1636a36 --- /dev/null +++ b/ln_api.cpp @@ -0,0 +1,850 @@ +#include +#include "ATen/cuda/CUDAContext.h" +#include + +#include "ln.h" + +/* + 
+Supported Type combinations: + +input residual compute weights output +============================================ +fp32 fp32 fp32 fp32 fp32 +fp16 fp32 fp32 fp32 fp16 +fp16 fp16 fp32 fp32 fp16 +bf16 fp32 fp32 fp32 bf16 +bf16 bf16 fp32 fp32 bf16 +fp16 fp16 fp32 fp16 fp16 +bf16 bf16 fp32 bf16 bf16 + +Remarks: +Output type = Input type +Compute always in FP32 + +*/ + +namespace layer_norm { + +// Create registries and provide runtime versions of config hash functions. + +FwdRegistry FWD_FUNCS, PARALLEL_FWD_FUNCS; +BwdRegistry BWD_FUNCS, PARALLEL_BWD_FUNCS; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +uint32_t get_type_id(torch::Dtype dtype){ + if( dtype == torch::kFloat16 ) { + return TypeId::Value; + } else if( dtype == torch::kBFloat16 ) { + return TypeId::Value; + } else if( dtype == torch::kFloat32 ) { + return TypeId::Value; + } else { + TORCH_CHECK(false, "Type not supported: ", dtype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +uint64_t get_key(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint64_t hidden_size) { + using namespace layer_norm; + uint64_t type_key = get_type_id(wtype) | (get_type_id(itype) << 2) | (get_type_id(rtype) << 4) | (get_type_id(otype) << 6) | (get_type_id(ctype) << 8); + uint64_t launcher_key = (type_key << 32) | hidden_size; + return launcher_key; +} + +} // namespace layer_norm + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::FwdFunction & get_fwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) { + auto iter = layer_norm::FWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::FWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "FWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::BwdFunction & get_bwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) { + auto iter = layer_norm::BWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::BWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "BWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::FwdFunction & get_parallel_fwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype ctype, uint32_t hidden_size) { + auto iter = layer_norm::PARALLEL_FWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::PARALLEL_FWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "FWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +layer_norm::BwdFunction & get_parallel_bwd_launcher(torch::Dtype wtype, torch::Dtype itype, torch::Dtype rtype, torch::Dtype otype, torch::Dtype 
ctype, uint32_t hidden_size) { + auto iter = layer_norm::PARALLEL_BWD_FUNCS.find(layer_norm::get_key(wtype, itype, rtype, otype, ctype, hidden_size)); + if( iter != layer_norm::PARALLEL_BWD_FUNCS.end() ) { + return iter->second; + } else { + TORCH_CHECK(false, "BWD: Unsupported hidden_size or types: ", hidden_size, wtype, itype, rtype, otype, ctype); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector dropout_add_ln_fwd(const at::Tensor &x0, // Input: BxSxhidden_size + c10::optional &residual_, // Residual: BxSxhidden_size + const at::Tensor &gamma, // hidden_size + c10::optional &beta_, // hidden_size + c10::optional &rowscale_, // BxS + c10::optional &colscale_, // hidden_size + c10::optional &x0_subset_, // BxS + c10::optional &z_subset_, // BxS + const float dropout_p, + const float epsilon, + const float rowscale_const, + const int64_t z_numrows, + c10::optional gen_, + bool residual_in_fp32=false, + bool is_rms_norm=false +) { + auto itype = x0.scalar_type(); + auto rtype = residual_.has_value() + ? residual_.value().scalar_type() + : (residual_in_fp32 ? torch::kFloat32 : x0.scalar_type()); + auto wtype = gamma.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + TORCH_CHECK(x0.is_cuda()); + TORCH_CHECK(gamma.is_cuda()); + + TORCH_CHECK(x0.is_contiguous()); + // c10::IntArrayRef does not own the storage, so we need to construct a vector. + // Otherwise just constructing IntArrayRef({blah}) will cause uninitialized memory because + // blah is then deallocated. + std::vector sizes_vec {!x0_subset_.has_value() ? x0.size(0) : x0_subset_.value().size(0), x0.size(1)}; + auto sizes = c10::IntArrayRef(sizes_vec); + TORCH_CHECK(x0.dim() == 2); + TORCH_CHECK(sizes.size() == 2); + + const int rows = sizes[0]; + const int cols = sizes[1]; + auto hidden_size = gamma.numel(); + TORCH_CHECK(hidden_size == cols); + + if (beta_.has_value()) { + auto beta = beta_.value(); + TORCH_CHECK(beta.dtype() == wtype); + TORCH_CHECK(beta.is_cuda()); + TORCH_CHECK(beta.is_contiguous()); + TORCH_CHECK(beta.sizes() == gamma.sizes()); + } + + if (residual_.has_value()) { + auto residual = residual_.value(); + TORCH_CHECK(residual.is_cuda()); + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(residual.sizes() == sizes); + } + + if (rowscale_.has_value()) { + auto rowscale = rowscale_.value(); + TORCH_CHECK(rowscale.is_cuda()); + TORCH_CHECK(rowscale.is_contiguous()); + TORCH_CHECK(rowscale.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(rowscale.dtype() == itype); + } + + if (colscale_.has_value()) { + auto colscale = colscale_.value(); + TORCH_CHECK(colscale.is_cuda()); + TORCH_CHECK(colscale.is_contiguous()); + TORCH_CHECK(colscale.sizes() == c10::IntArrayRef{cols}); + TORCH_CHECK(colscale.dtype() == wtype); + } + + if (x0_subset_.has_value()) { + auto x0_subset = x0_subset_.value(); + TORCH_CHECK(x0_subset.is_cuda()); + TORCH_CHECK(x0_subset.is_contiguous()); + TORCH_CHECK(x0_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(x0_subset.dtype() == torch::kInt32); + + TORCH_CHECK(z_subset_.has_value()); + auto z_subset = z_subset_.value(); + TORCH_CHECK(z_subset.is_cuda()); + TORCH_CHECK(z_subset.is_contiguous()); + TORCH_CHECK(z_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(z_subset.dtype() == torch::kInt32); + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + TORCH_CHECK(epsilon >= 0.f); + + // Otherwise the kernel will be launched from cuda:0 
device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x0.get_device()}; + + auto opts = x0.options(); + + bool save_x = residual_.has_value() || (dropout_p > 0.f) || rowscale_.has_value() || colscale_.has_value() || x0_subset_.has_value() || (itype != rtype); + at::Tensor x; + if (save_x) { x = torch::empty(sizes, opts.dtype(rtype)); } + at::Tensor dmask; + if (dropout_p > 0.f) { dmask = torch::empty(x0.sizes(), opts.dtype(mtype)); }; + auto z = torch::empty(z_subset_.has_value() ? c10::IntArrayRef{z_numrows, cols} : sizes, opts.dtype(otype)); + + auto mu = torch::empty({ rows }, opts.dtype(ctype)); + auto rsigma = torch::empty({ rows }, opts.dtype(ctype)); + + layer_norm::LaunchParams launch_params; + + launch_params.props = at::cuda::getCurrentDeviceProperties(); + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.residual = residual_.has_value() ? residual_.value().data_ptr() : nullptr; + launch_params.params.rowscale = rowscale_.has_value() ? rowscale_.value().data_ptr() : nullptr; + launch_params.params.colscale = colscale_.has_value() ? colscale_.value().data_ptr() : nullptr; + launch_params.params.x0_subset = x0_subset_.has_value() ? x0_subset_.value().data_ptr() : nullptr; + launch_params.params.z_subset = z_subset_.has_value() ? z_subset_.value().data_ptr() : nullptr; + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024); + // Request the kernel launcher. + auto launcher = get_fwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple)); + + // Set the kernel runtime parameters. + layer_norm::FwdParams ¶ms = launch_params.params; + params.rows = rows; + params.cols = cols; + params.x0 = x0.data_ptr(); + params.x = save_x ? x.data_ptr() : nullptr; + params.dmask = dropout_p > 0.f ? dmask.data_ptr() : nullptr; + params.mu = mu.data_ptr(); + params.rs = rsigma.data_ptr(); + params.gamma = gamma.data_ptr(); + params.beta = beta_.has_value() ? beta_.value().data_ptr() : nullptr; + params.z = z.data_ptr(); + params.epsilon = epsilon; + params.dropout_scale = 1.f / (1.f - dropout_p); + params.inverse_cols = 1.f / float(params.cols); + params.rowscale_const = rowscale_const; + params.is_rms_norm = is_rms_norm; + + // Query the kernel-specific launch parameters. + launcher(launch_params, true); + + at::Tensor workspace, barrier; + + if (dropout_p > 0.f) { + // number of times random will be generated per thread, to offset philox counter in thc random + // state + int64_t counter_offset = launch_params.elts_per_thread; + + // See Note [Acquire lock when using random generators] + { + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + } + } + + if( launch_params.barrier_size > 0 ) { + auto options = x0.options(); + barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32)); + workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar)); + params.workspace = workspace.data_ptr(); + params.barrier = barrier.data_ptr(); + } + + // Launch the kernel. 
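+    // The earlier launcher(launch_params, true) call only queried elts_per_thread, workspace_bytes and
+    // barrier_size; this call with `false` performs the actual kernel launch.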
+ launcher(launch_params, false); + + return { z, x, dmask, mu, rsigma }; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector dropout_add_ln_bwd(const at::Tensor &dz, // BxSxhidden_size + c10::optional &dx_, // BxSxhidden_size + const at::Tensor &x, // BxSxhidden_size + c10::optional &x0_, // BxSxhidden_size + c10::optional &dmask_, // BxSxhidden_size + const at::Tensor &mu, // BxS, FP32! + const at::Tensor &rsigma, // BxS, FP32! + const at::Tensor &gamma, // hidden_size + c10::optional &rowscale_, // BxS + c10::optional &colscale_, // hidden_size + c10::optional &x0_subset_, // BxS + c10::optional &z_subset_, // BxS + const float dropout_p, + const float rowscale_const, + const int64_t x0_numrows, + const bool has_residual, + bool is_rms_norm=false +) { + + auto itype = dz.scalar_type(); + auto rtype = x.scalar_type(); + auto wtype = gamma.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + if (dropout_p > 0.f) { TORCH_CHECK(dmask_.has_value()); } + + TORCH_CHECK(dz.dtype() == otype); + TORCH_CHECK(mu.dtype() == ctype); + TORCH_CHECK(rsigma.dtype() == ctype); + + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(dz.is_cuda()); + TORCH_CHECK(mu.is_cuda()); + TORCH_CHECK(rsigma.is_cuda()); + TORCH_CHECK(gamma.is_cuda()); + + TORCH_CHECK(x.is_contiguous()); + TORCH_CHECK(dz.is_contiguous()); + + auto sizes = x.sizes(); + TORCH_CHECK(sizes.size() == 2); + auto rows = sizes[0]; + auto cols = sizes[1]; + TORCH_CHECK(dz.dim() == 2); + TORCH_CHECK(dz.size(1) == cols); + auto hidden_size = gamma.numel(); + TORCH_CHECK(hidden_size == cols); + + // c10::IntArrayRef does not own the storage, so we need to construct a vector. + // Otherwise just constructing IntArrayRef({blah}) will cause uninitialized memory because + // blah is then deallocated. + std::vector x0_sizes_vec {!x0_subset_.has_value() ? 
rows : x0_numrows, cols}; + auto x0_sizes = c10::IntArrayRef(x0_sizes_vec); + + if (dx_.has_value()) { + auto dx = dx_.value(); + TORCH_CHECK(dx.dtype() == rtype); + TORCH_CHECK(dx.is_cuda()); + TORCH_CHECK(dx.is_contiguous()); + TORCH_CHECK(dx.sizes() == sizes); + } + + if (dmask_.has_value()) { + auto dmask = dmask_.value(); + TORCH_CHECK(dmask.dtype() == mtype); + TORCH_CHECK(dmask.is_cuda()); + TORCH_CHECK(dmask.is_contiguous()); + TORCH_CHECK(dmask.sizes() == x0_sizes); + } + + if (rowscale_.has_value()) { + auto rowscale = rowscale_.value(); + TORCH_CHECK(rowscale.is_cuda()); + TORCH_CHECK(rowscale.is_contiguous()); + TORCH_CHECK(rowscale.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(rowscale.dtype() == itype); + } + + if (colscale_.has_value()) { + auto colscale = colscale_.value(); + TORCH_CHECK(colscale.is_cuda()); + TORCH_CHECK(colscale.is_contiguous()); + TORCH_CHECK(colscale.sizes() == c10::IntArrayRef{cols}); + TORCH_CHECK(colscale.dtype() == wtype); + + TORCH_CHECK(x0_.has_value()); + auto x0 = x0_.value(); + TORCH_CHECK(x0.is_cuda()); + TORCH_CHECK(x0.is_contiguous()); + TORCH_CHECK(x0.sizes() == x0_sizes); + TORCH_CHECK(x0.dtype() == itype); + } + + if (x0_subset_.has_value()) { + auto x0_subset = x0_subset_.value(); + TORCH_CHECK(x0_subset.is_cuda()); + TORCH_CHECK(x0_subset.is_contiguous()); + TORCH_CHECK(x0_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(x0_subset.dtype() == torch::kInt32); + + TORCH_CHECK(z_subset_.has_value()); + auto z_subset = z_subset_.value(); + TORCH_CHECK(z_subset.is_cuda()); + TORCH_CHECK(z_subset.is_contiguous()); + TORCH_CHECK(z_subset.sizes() == c10::IntArrayRef{rows}); + TORCH_CHECK(z_subset.dtype() == torch::kInt32); + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + + TORCH_CHECK(mu.numel() == rows); + TORCH_CHECK(mu.sizes() == rsigma.sizes()); + + TORCH_CHECK(gamma.numel() == cols); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)dz.get_device()}; + + auto opts = x.options(); + + auto dx0 = torch::empty(x0_sizes, opts.dtype(itype)); + at::Tensor dresidual; + if (has_residual) { dresidual = torch::empty_like(x, opts.dtype(rtype)); } + auto dgamma = torch::empty_like(gamma); + auto dbeta = torch::empty_like(gamma); + at::Tensor dcolscale; + if (colscale_.has_value()) { + dcolscale = torch::empty_like(colscale_.value()); + } + + layer_norm::LaunchParams launch_params; + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + launch_params.props = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.dresidual = has_residual ? dresidual.data_ptr() : nullptr; + launch_params.params.rowscale = rowscale_.has_value() ? rowscale_.value().data_ptr() : nullptr; + launch_params.params.colscale = colscale_.has_value() ? colscale_.value().data_ptr() : nullptr; + launch_params.params.x0_subset = x0_subset_.has_value() ? x0_subset_.value().data_ptr() : nullptr; + launch_params.params.z_subset = z_subset_.has_value() ? z_subset_.value().data_ptr() : nullptr; + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 
512 : 1024);
+ auto launcher = get_bwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
+
+ launcher(launch_params, true);
+
+ auto dgamma_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ auto dbeta_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ at::Tensor dcolscale_part;
+ if (colscale_.has_value()) {
+ dcolscale_part = torch::empty({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ }
+ at::Tensor workspace, barrier;
+
+ layer_norm::BwdParams &params = launch_params.params;
+ params.rows = rows;
+ params.cols = cols;
+ params.x = x.data_ptr();
+ params.x0 = x0_.has_value() ? x0_.value().data_ptr() : nullptr;
+ params.dmask = dropout_p > 0.f ? dmask_.value().data_ptr() : nullptr;
+ params.mu = mu.data_ptr();
+ params.rs = rsigma.data_ptr();
+ params.gamma = gamma.data_ptr();
+ params.dz = dz.data_ptr();
+ params.dx = dx_.has_value() ? dx_.value().data_ptr() : nullptr;
+ params.dx0 = dx0.data_ptr();
+ params.dbeta = dbeta.data_ptr();
+ params.dgamma = dgamma.data_ptr();
+ params.dcolscale = colscale_.has_value() ? dcolscale.data_ptr() : nullptr;
+ params.dbeta_part = dbeta_part.data_ptr();
+ params.dgamma_part = dgamma_part.data_ptr();
+ params.dcolscale_part = colscale_.has_value() ? dcolscale_part.data_ptr() : nullptr;
+ params.dropout_scale = 1.f / (1.f - dropout_p);
+ params.inverse_cols = 1.f / float(params.cols);
+ params.rowscale_const = rowscale_const;
+ params.is_rms_norm = is_rms_norm;
+
+ if( launch_params.barrier_size > 0 ) {
+ // TODO Any way to avoid this?
+ barrier = torch::zeros(launch_params.barrier_size, opts.dtype(torch::kInt32));
+ workspace = torch::empty(launch_params.workspace_bytes, opts.dtype(torch::kChar));
+ params.workspace = workspace.data_ptr();
+ params.barrier = barrier.data_ptr();
+ }
+
+ launcher(launch_params, false);
+
+ std::vector<at::Tensor> result = { dx0, dresidual, dgamma, dbeta, dgamma_part, dbeta_part };
+ if (colscale_.has_value()) {
+ result.push_back(dcolscale);
+ result.push_back(dcolscale_part);
+ }
+ return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+std::vector<at::Tensor> dropout_add_ln_parallel_residual_fwd(
+ const at::Tensor &x0, // Input: BxSxhidden_size
+ c10::optional<at::Tensor> &x1_, // Input: BxSxhidden_size
+ c10::optional<at::Tensor> &residual_, // Residual: BxSxhidden_size
+ const at::Tensor &gamma0, // hidden_size
+ c10::optional<at::Tensor> &beta0_, // hidden_size
+ c10::optional<at::Tensor> &gamma1_, // hidden_size
+ c10::optional<at::Tensor> &beta1_, // hidden_size
+ const float dropout_p,
+ const float epsilon,
+ c10::optional<at::Generator> gen_,
+ bool residual_in_fp32=false,
+ bool is_rms_norm=false
+) {
+ auto itype = x0.scalar_type();
+ auto rtype = residual_.has_value()
+ ? residual_.value().scalar_type()
+ : (residual_in_fp32 ?
torch::kFloat32 : x0.scalar_type()); + auto wtype = gamma0.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + TORCH_CHECK(x0.is_cuda()); + TORCH_CHECK(gamma0.is_cuda()); + + TORCH_CHECK(x0.is_contiguous()); + const auto sizes = x0.sizes(); + TORCH_CHECK(x0.dim() == 2); + + const int rows = sizes[0]; + const int cols = sizes[1]; + auto hidden_size = gamma0.numel(); + TORCH_CHECK(hidden_size == cols); + + if (x1_.has_value()) { + auto x1 = x1_.value(); + TORCH_CHECK(x1.is_cuda()); + TORCH_CHECK(x1.is_contiguous()); + TORCH_CHECK(x1.sizes() == sizes); + } + + if (residual_.has_value()) { + auto residual = residual_.value(); + TORCH_CHECK(residual.is_cuda()); + TORCH_CHECK(residual.is_contiguous()); + TORCH_CHECK(residual.sizes() == sizes); + } + + if (beta0_.has_value()) { + auto beta0 = beta0_.value(); + TORCH_CHECK(beta0.dtype() == wtype); + TORCH_CHECK(beta0.is_cuda()); + TORCH_CHECK(beta0.is_contiguous()); + TORCH_CHECK(beta0.sizes() == gamma0.sizes()); + } + + if (gamma1_.has_value()) { + auto gamma1 = gamma1_.value(); + TORCH_CHECK(gamma1.dtype() == wtype); + TORCH_CHECK(gamma1.is_cuda()); + TORCH_CHECK(gamma1.is_contiguous()); + TORCH_CHECK(gamma1.sizes() == gamma0.sizes()); + } + + if (beta1_.has_value()) { + auto beta1 = beta1_.value(); + TORCH_CHECK(beta1.dtype() == wtype); + TORCH_CHECK(beta1.is_cuda()); + TORCH_CHECK(beta1.is_contiguous()); + TORCH_CHECK(beta1.sizes() == gamma0.sizes()); + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + TORCH_CHECK(epsilon >= 0.f); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x0.get_device()}; + + auto opts = x0.options(); + + bool save_x = residual_.has_value() || x1_.has_value() || (dropout_p > 0.f) || (itype != rtype); + at::Tensor x; + if (save_x) { x = torch::empty(sizes, opts.dtype(rtype)); } + at::Tensor dmask0, dmask1; + if (dropout_p > 0.f) { + dmask0 = torch::empty(x0.sizes(), opts.dtype(mtype)); + if (x1_.has_value()) { dmask1 = torch::empty(x0.sizes(), opts.dtype(mtype)); } + }; + auto z0 = torch::empty(sizes, opts.dtype(otype)); + at::Tensor z1; + if (gamma1_.has_value()) { z1 = torch::empty(sizes, opts.dtype(otype)); } + + auto mu = torch::empty({ rows }, opts.dtype(ctype)); + auto rsigma = torch::empty({ rows }, opts.dtype(ctype)); + + layer_norm::LaunchParams launch_params; + + launch_params.props = at::cuda::getCurrentDeviceProperties(); + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.residual = residual_.has_value() ? residual_.value().data_ptr() : nullptr; + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 512 : 1024); + // Request the kernel launcher. + auto launcher = get_parallel_fwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple)); + + // Set the kernel runtime parameters. + layer_norm::FwdParams ¶ms = launch_params.params; + params.rows = rows; + params.cols = cols; + params.x0 = x0.data_ptr(); + params.x1 = x1_.has_value() ? x1_.value().data_ptr() : nullptr; + params.x = save_x ? x.data_ptr() : nullptr; + params.dmask = dropout_p > 0.f ? 
dmask0.data_ptr() : nullptr; + params.dmask1 = (dropout_p > 0.f && x1_.has_value()) ? dmask1.data_ptr() : nullptr; + params.mu = mu.data_ptr(); + params.rs = rsigma.data_ptr(); + params.gamma = gamma0.data_ptr(); + params.gamma1 = gamma1_.has_value() ? gamma1_.value().data_ptr() : nullptr; + params.beta = beta0_.has_value() ? beta0_.value().data_ptr() : nullptr; + params.beta1 = beta1_.has_value() ? beta1_.value().data_ptr() : nullptr; + params.z = z0.data_ptr(); + params.z1 = gamma1_.has_value() ? z1.data_ptr() : nullptr; + params.epsilon = epsilon; + params.dropout_scale = 1.f / (1.f - dropout_p); + params.inverse_cols = 1.f / float(params.cols); + params.is_rms_norm = is_rms_norm; + + // Query the kernel-specific launch parameters. + launcher(launch_params, true); + + at::Tensor workspace, barrier; + + if (dropout_p > 0.f) { + // number of times random will be generated per thread, to offset philox counter in thc random + // state + int64_t counter_offset = 2 * launch_params.elts_per_thread; + + // See Note [Acquire lock when using random generators] + { + std::lock_guard lock(gen->mutex_); + params.philox_args = gen->philox_cuda_state(counter_offset); + } + } + + if( launch_params.barrier_size > 0 ) { + auto options = x0.options(); + barrier = torch::zeros(launch_params.barrier_size, options.dtype(torch::kInt32)); + workspace = torch::empty(launch_params.workspace_bytes, options.dtype(torch::kChar)); + params.workspace = workspace.data_ptr(); + params.barrier = barrier.data_ptr(); + } + + // Launch the kernel. + launcher(launch_params, false); + + return { z0, z1, x, dmask0, dmask1, mu, rsigma }; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +std::vector dropout_add_ln_parallel_residual_bwd( + const at::Tensor &dz0, // BxSxhidden_size + c10::optional &dz1_, // BxSxhidden_size + c10::optional &dx_, // BxSxhidden_size + const at::Tensor &x, // BxSxhidden_size + c10::optional &dmask0_, // BxSxhidden_size + c10::optional &dmask1_, // BxSxhidden_size + const at::Tensor &mu, // BxS, FP32! + const at::Tensor &rsigma, // BxS, FP32! 
+ const at::Tensor &gamma0, // hidden_size + c10::optional &gamma1_, // hidden_size + const float dropout_p, + const bool has_x1, + const bool has_residual, + bool is_rms_norm=false +) { + + auto itype = dz0.scalar_type(); + auto rtype = x.scalar_type(); + auto wtype = gamma0.scalar_type(); + auto otype = itype; + auto ctype = torch::kFloat32; + auto mtype = torch::kUInt8; + + if (dropout_p > 0.f) { TORCH_CHECK(dmask0_.has_value()); } + + TORCH_CHECK(dz0.dtype() == otype); + TORCH_CHECK(dz0.dtype() == otype); + TORCH_CHECK(mu.dtype() == ctype); + TORCH_CHECK(rsigma.dtype() == ctype); + + TORCH_CHECK(x.is_cuda()); + TORCH_CHECK(dz0.is_cuda()); + TORCH_CHECK(mu.is_cuda()); + TORCH_CHECK(rsigma.is_cuda()); + TORCH_CHECK(gamma0.is_cuda()); + + TORCH_CHECK(x.is_contiguous()); + TORCH_CHECK(dz0.is_contiguous()); + + auto sizes = x.sizes(); + TORCH_CHECK(sizes.size() == 2); + auto rows = sizes[0]; + auto cols = sizes[1]; + TORCH_CHECK(dz0.dim() == 2); + TORCH_CHECK(dz0.size(1) == cols); + auto hidden_size = gamma0.numel(); + TORCH_CHECK(hidden_size == cols); + + if (dz1_.has_value()) { + auto dz1 = dz1_.value(); + TORCH_CHECK(dz1.dtype() == otype); + TORCH_CHECK(dz1.is_cuda()); + TORCH_CHECK(dz1.is_contiguous()); + TORCH_CHECK(dz1.sizes() == sizes); + + TORCH_CHECK(gamma1_.has_value()); + auto gamma1 = gamma1_.value(); + TORCH_CHECK(gamma1.dtype() == wtype); + TORCH_CHECK(gamma1.is_cuda()); + TORCH_CHECK(gamma1.is_contiguous()); + TORCH_CHECK(gamma1.sizes() == gamma0.sizes()); + } + + if (dx_.has_value()) { + auto dx = dx_.value(); + TORCH_CHECK(dx.dtype() == rtype); + TORCH_CHECK(dx.is_cuda()); + TORCH_CHECK(dx.is_contiguous()); + TORCH_CHECK(dx.sizes() == sizes); + } + + if (dmask0_.has_value()) { + auto dmask0 = dmask0_.value(); + TORCH_CHECK(dmask0.dtype() == mtype); + TORCH_CHECK(dmask0.is_cuda()); + TORCH_CHECK(dmask0.is_contiguous()); + TORCH_CHECK(dmask0.sizes() == sizes); + + if (has_x1) { + TORCH_CHECK(dmask1_.has_value()); + auto dmask1 = dmask1_.value(); + TORCH_CHECK(dmask1.dtype() == mtype); + TORCH_CHECK(dmask1.is_cuda()); + TORCH_CHECK(dmask1.is_contiguous()); + TORCH_CHECK(dmask1.sizes() == sizes); + } + } + + TORCH_CHECK((hidden_size % 8 == 0) && (hidden_size <= 8192)); + + TORCH_CHECK(mu.numel() == rows); + TORCH_CHECK(mu.sizes() == rsigma.sizes()); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)dz0.get_device()}; + + auto opts = x.options(); + + auto dx0 = torch::empty(sizes, opts.dtype(itype)); + at::Tensor dx1; + if (has_x1) { dx1 = torch::empty(sizes, opts.dtype(itype)); } + at::Tensor dresidual; + if (has_residual) { dresidual = torch::empty_like(x, opts.dtype(rtype)); } + auto dgamma0 = torch::empty_like(gamma0); + auto dbeta0 = torch::empty_like(gamma0); + at::Tensor dgamma1, dbeta1; + if (gamma1_.has_value()) { + dgamma1 = torch::empty_like(gamma0); + dbeta1 = torch::empty_like(gamma0); + } + + layer_norm::LaunchParams launch_params; + launch_params.stream = at::cuda::getCurrentCUDAStream().stream(); + launch_params.props = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dropout_p < 1.f); + launch_params.params.dropout_keep_p = 1.f - dropout_p; + launch_params.params.dresidual = has_residual ? dresidual.data_ptr() : nullptr; + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int multiple = hidden_size <= 1536 ? 256 : (hidden_size <= 3072 ? 
512 : 1024);
+ auto launcher = get_parallel_bwd_launcher(wtype, itype, rtype, otype, ctype, round_multiple(hidden_size, multiple));
+
+ launcher(launch_params, true);
+
+ auto dgamma0_part = torch::zeros({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ auto dbeta0_part = torch::zeros({ launch_params.params.ctas_per_col, hidden_size }, opts.dtype(ctype));
+ at::Tensor dgamma1_part, dbeta1_part;
+ if (gamma1_.has_value()) {
+ dgamma1_part = torch::zeros_like(dgamma0_part);
+ dbeta1_part = torch::zeros_like(dbeta0_part);
+ }
+ at::Tensor workspace, barrier;
+
+ layer_norm::BwdParams &params = launch_params.params;
+ params.rows = rows;
+ params.cols = cols;
+ params.x = x.data_ptr();
+ params.dmask = dropout_p > 0.f ? dmask0_.value().data_ptr() : nullptr;
+ params.dmask1 = (dropout_p > 0.f && has_x1) ? dmask1_.value().data_ptr() : nullptr;
+ params.mu = mu.data_ptr();
+ params.rs = rsigma.data_ptr();
+ params.gamma = gamma0.data_ptr();
+ params.gamma1 = gamma1_.has_value() ? gamma1_.value().data_ptr() : nullptr;
+ params.dz = dz0.data_ptr();
+ params.dz1 = dz1_.has_value() ? dz1_.value().data_ptr() : nullptr;
+ params.dx = dx_.has_value() ? dx_.value().data_ptr() : nullptr;
+ params.dx0 = dx0.data_ptr();
+ params.dx1 = has_x1 ? dx1.data_ptr() : nullptr;
+ params.dbeta = dbeta0.data_ptr();
+ params.dgamma = dgamma0.data_ptr();
+ params.dbeta1 = gamma1_.has_value() ? dbeta1.data_ptr() : nullptr;
+ params.dgamma1 = gamma1_.has_value() ? dgamma1.data_ptr() : nullptr;
+ params.dbeta_part = dbeta0_part.data_ptr();
+ params.dgamma_part = dgamma0_part.data_ptr();
+ params.dbeta1_part = gamma1_.has_value() ? dbeta1_part.data_ptr() : nullptr;
+ params.dgamma1_part = gamma1_.has_value() ? dgamma1_part.data_ptr() : nullptr;
+ params.dropout_scale = 1.f / (1.f - dropout_p);
+ params.inverse_cols = 1.f / float(params.cols);
+ params.is_rms_norm = is_rms_norm;
+
+ if( launch_params.barrier_size > 0 ) {
+ // TODO Any way to avoid this?
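+ // Note (inferred from the launcher code later in this file, which sets barrier_size
+ // only when CTAS_PER_ROW > 1): the barrier buffer appears to serve as an inter-CTA
+ // synchronization counter when a single row is split across several CTAs, so it has to
+ // start from a known all-zero state on every call. One possible answer to the TODO
+ // above would be to cache a pre-zeroed barrier between calls; that is only a sketch of
+ // an alternative, not something implemented here.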
+ barrier = torch::zeros(launch_params.barrier_size, opts.dtype(torch::kInt32)); + workspace = torch::empty(launch_params.workspace_bytes, opts.dtype(torch::kChar)); + params.workspace = workspace.data_ptr(); + params.barrier = barrier.data_ptr(); + } + + launcher(launch_params, false); + + std::vector result = { dx0, dx1, dresidual, dgamma0, dbeta0, dgamma1, dbeta1, dgamma0_part, dbeta0_part, dgamma1_part, dbeta1_part }; + return result; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "CUDA DropoutAddLayerNorm"; + m.def("dropout_add_ln_fwd", &dropout_add_ln_fwd, "Run Dropout + Add + LayerNorm forward kernel", + py::arg("x0"), py::arg("residual"), py::arg("gamma"), py::arg("beta_"), + py::arg("rowscale_"), py::arg("colscale_"), py::arg("x0_subset_"), py::arg("z_subset_"), + py::arg("dropout_p"), py::arg("epsilon"), py::arg("rowscale_const"), py::arg("z_numrows"), + py::arg("gen_"), py::arg("residual_in_fp32")=false, py::arg("is_rms_norm")=false); + m.def("dropout_add_ln_bwd", &dropout_add_ln_bwd, "Run Dropout + Add + LayerNorm backward kernel", + py::arg("dz"), py::arg("dx_"), py::arg("x"), py::arg("x0_"), py::arg("dmask_"), py::arg("mu"), + py::arg("rsigma"), py::arg("gamma"), py::arg("rowscale_"), py::arg("colscale_"), + py::arg("x0_subset_"), py::arg("z_subset_"), py::arg("dropout_p"), py::arg("rowscale_const"), + py::arg("x0_numrows"), py::arg("has_residual"), py::arg("is_rms_norm")=false); + m.def("dropout_add_ln_parallel_residual_fwd", &dropout_add_ln_parallel_residual_fwd, "Run Dropout + Add + LayerNorm parallel residual forward kernel", + py::arg("x0"), py::arg("x1_"), py::arg("residual"), py::arg("gamma0"), py::arg("beta0_"), + py::arg("gamma1_"), py::arg("beta1_"), py::arg("dropout_p"), py::arg("epsilon"), + py::arg("gen_"), py::arg("residual_in_fp32")=false, py::arg("is_rms_norm")=false); + m.def("dropout_add_ln_parallel_residual_bwd", &dropout_add_ln_parallel_residual_bwd, "Run Dropout + Add + LayerNorm parallel residual backward kernel", + py::arg("dz0"), py::arg("dz1_"), py::arg("dx_"), py::arg("x"), py::arg("dmask0_"), + py::arg("dmask1_"), py::arg("mu"), py::arg("rsigma"), py::arg("gamma0"), py::arg("gamma1_"), + py::arg("dropout_p"), py::arg("has_x1"), py::arg("has_residual"), py::arg("is_rms_norm")=false); +} diff --git a/ln_bwd_1024.cu b/ln_bwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7101f6450fcdb8baa4ff4e79379d913048696b6 --- /dev/null +++ b/ln_bwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_1280.cu b/ln_bwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..a80a5762a178bd1fd1cd2ef4d0fb2010c1eea22e --- /dev/null +++ b/ln_bwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_1536.cu b/ln_bwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..0c25c088494d52f3b68251235d29c23a46ffc430 --- /dev/null +++ b/ln_bwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_bwd_2048.cu b/ln_bwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..06c0e608a3e48ec7fad2081bc6ff82425ea1c56a --- /dev/null +++ b/ln_bwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_256.cu b/ln_bwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..20945432b8e97be21d80ada73aa0b3e709733a5b --- /dev/null +++ b/ln_bwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_2560.cu b/ln_bwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..309184c37b93e1f90bc1020a47973dae84f0f0c8 --- /dev/null +++ b/ln_bwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_BWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_bwd_3072.cu b/ln_bwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..e156b11cd92f450a6ce8e0c432487bd36d6f9847 --- /dev/null +++ b/ln_bwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_4096.cu b/ln_bwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..b715b0efe48c4111ae4301365018d19f537c7a81 --- /dev/null +++ b/ln_bwd_4096.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_512.cu b/ln_bwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b472118f0a0025917edc4c706492ca5dc8fa205 --- /dev/null +++ b/ln_bwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_5120.cu b/ln_bwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..38f3fbd406db8989f4a9806e64075bf52444c529 --- /dev/null +++ b/ln_bwd_5120.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_BWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_bwd_6144.cu b/ln_bwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..469ed4b6c7691c581bbd1db5b8587de860afcb16 --- /dev/null +++ b/ln_bwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_bwd_7168.cu b/ln_bwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..549eab11aa3c770bea97bda727495f3e141ec24b --- /dev/null +++ b/ln_bwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_BWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); \ No newline at end of file diff --git a/ln_bwd_768.cu b/ln_bwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..5db64d3d7b184f6ffb01ae0e1a26e0acec3bbe3d --- /dev/null +++ b/ln_bwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_BWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_bwd_8192.cu b/ln_bwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..e6514e613fe9cbf444ad4919a5acf9579b216c9e --- /dev/null +++ b/ln_bwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_BWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_BWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_bwd_kernels.cuh b/ln_bwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c7261d218442acbcf60b61ce2e8803556193d8cd --- /dev/null +++ b/ln_bwd_kernels.cuh @@ -0,0 +1,534 @@ +#pragma once + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_bwd_kernel(layer_norm::BwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { COLS = Ktraits::COLS }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Ktraits::THREADS_PER_WARP }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using compute_t = typename Ktraits::compute_t; + using index_t = typename Ktraits::index_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + using Reducer = typename Ktraits::Reducer; + using reduce_t = typename 
Reducer::Type; + + extern __shared__ char smem_[]; + + const bool has_residual = params.dresidual != nullptr; + const bool prenorm = params.dx != nullptr; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / Ktraits::WARPS_N; + const index_t warp_n = warp % Ktraits::WARPS_N; + const index_t tid_r = warp_n * THREADS_PER_WARP + lane; + + const index_t r = bidm * Ktraits::ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW); + + const input_t *rowscale = static_cast(params.rowscale); + const index_t *x0_subset = static_cast(params.x0_subset); + const index_t *z_subset = static_cast(params.z_subset); + + Cvec dzy_sum[LDGS]; + Cvec dz_sum[LDGS]; + Cvec dcolscale_sum[LDGS]; + + memset(dzy_sum, 0, sizeof(dzy_sum)); + memset(dz_sum, 0, sizeof(dz_sum)); + if (Has_colscale) { memset(dcolscale_sum, 0, sizeof(dcolscale_sum)); } + + compute_t * smem_wgrad = reinterpret_cast(smem_); + char *smem_dgrad = smem_ + Ktraits::SMEM_BYTES_WGRAD; + + Reducer reducer(params, bidm, bidn, warp_m, warp_n, lane, smem_dgrad); + + Sum sum; + + const index_t num_valid_ldgs = + ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + Ktraits::VEC_COLS_PER_LDG) / Ktraits::VEC_COLS_PER_LDG; + + Wvec gamma[LDGS]; + Wvec colscale[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma[it].load_from(params.gamma, idx); + if (Has_colscale) { colscale[it].load_from(params.colscale, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + // TODO if ROWS_PER_CTA does not divide rows, we might get divergence in the + // last blocks with syncthreads! + // grid stride over rows + #pragma unroll 1 + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + const compute_t mu_r = static_cast(params.mu)[row]; + const compute_t rs_r = static_cast(params.rs)[row]; + const compute_t rowscale_val = !Has_subset ? (params.rowscale == nullptr ? 1.0f : compute_t(rowscale[row])) : params.rowscale_const; + const int row_z = !Has_subset ? row + 1 : z_subset[row]; + const int row_x0 = !Has_subset ? row + 1 : x0_subset[row]; + const bool load_dz = !Has_subset || row_z > 0; + const bool save_dx0 = !Has_subset || row_x0 > 0; + Mvec dmask[LDGS]; + Rvec dx[LDGS]; + compute_t dy[LDGS * NUM_ELTS]; + compute_t y[LDGS * NUM_ELTS]; + compute_t mdy_local = 0.f; + compute_t mdyy_local = 0.f; + // If dz is not loaded, then dy should be 0 and we don't care about the value of y. + if (load_dz) { + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_z = !Has_subset ? idx_x : (load_dz ? (row_z - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Rvec x; + Ovec dz; + dz.load_from(params.dz, !Has_subset ? idx_x : idx_z); + if (prenorm) { dx[it].load_from(params.dx, idx_x); } + x.load_from(params.x, idx_x); + if (Is_dropout) { dmask[it].load_from(params.dmask, !Has_subset ? 
idx_x : idx_x0); } + idx_x += Ktraits::VEC_COLS_PER_LDG; + idx_z += Ktraits::VEC_COLS_PER_LDG; + idx_x0 += Ktraits::VEC_COLS_PER_LDG; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t x_tmp = x.data.elt[jt]; + compute_t y_tmp = rs_r * (x_tmp - (!params.is_rms_norm ? mu_r : 0.f)); + compute_t dy_tmp = compute_t(gamma[it].data.elt[jt]) * compute_t(dz.data.elt[jt]); + compute_t dz_tmp = dz.data.elt[jt]; + + mdy_local += dy_tmp; + mdyy_local += dy_tmp * y_tmp; + + dy[it * NUM_ELTS + jt] = dy_tmp; + y[it * NUM_ELTS + jt] = y_tmp; + + dzy_sum[it].data.elt[jt] += dz_tmp * y_tmp; + dz_sum[it].data.elt[jt] += dz_tmp; + } + } + } + } else { + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + if (prenorm) { dx[it].load_from(params.dx, idx_x); } + if (Is_dropout) { dmask[it].load_from(params.dmask, !Has_subset ? idx_x : idx_x0); } + idx_x += Ktraits::VEC_COLS_PER_LDG; + idx_x0 += Ktraits::VEC_COLS_PER_LDG; + } + } + } + + reduce_t result = reducer.allreduce({mdy_local, mdyy_local}, sum); + mdy_local = layer_norm::Get<0>::of(result) * params.inverse_cols; + mdyy_local = layer_norm::Get<1>::of(result) * params.inverse_cols; + + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_x0 = !Has_subset ? idx_x : (save_dx0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec dx0; + Rvec dresidual; + Ivec x0; + if (Has_colscale && save_dx0) { x0.load_from(params.x0, !Has_subset ? idx_x : idx_x0); } + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t dx_tmp_res; + if (load_dz) { + compute_t dy_tmp = dy[it * NUM_ELTS + jt]; + compute_t y_tmp = y[it * NUM_ELTS + jt]; + compute_t dx_tmp = rs_r * (dy_tmp - (mdyy_local * y_tmp + (!params.is_rms_norm ? mdy_local : 0.f))); + dx_tmp_res = prenorm ? dx_tmp + compute_t(dx[it].data.elt[jt]) : dx_tmp; + } else { + dx_tmp_res = prenorm ? compute_t(dx[it].data.elt[jt]) : 0.f; + } + if (has_residual) { dresidual.data.elt[jt] = dx_tmp_res; } + if (save_dx0) { + compute_t dx0_tmp_res = dx_tmp_res * rowscale_val; + if (Is_dropout) { + dx0_tmp_res *= params.dropout_scale; + if (Has_colscale) { + dcolscale_sum[it].data.elt[jt] += dmask[it].data.elt[jt] ? dx0_tmp_res * compute_t(x0.data.elt[jt]) : 0.f; + dx0.data.elt[jt] = dmask[it].data.elt[jt] ? dx0_tmp_res * compute_t(colscale[it].data.elt[jt]) : 0.f; + } else { + dx0.data.elt[jt] = dmask[it].data.elt[jt] ? dx0_tmp_res : 0.f; + } + } else { + if (Has_colscale) { + dcolscale_sum[it].data.elt[jt] += dx0_tmp_res * compute_t(x0.data.elt[jt]); + dx0.data.elt[jt] = dx0_tmp_res * compute_t(colscale[it].data.elt[jt]); + } else { + dx0.data.elt[jt] = dx0_tmp_res; + } + } + } + } + if (has_residual) { dresidual.store_to(params.dresidual, idx_x); } + if (save_dx0) { dx0.store_to(params.dx0, !Has_subset ? 
idx_x : idx_x0); } + idx_x += Ktraits::VEC_COLS_PER_LDG; + idx_x0 += Ktraits::VEC_COLS_PER_LDG; + } + } + + } // end: grid stride loop + + if( WARPS_M == 1 ) { + idx = r * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + dz_sum[it].store_to(params.dbeta_part, idx); + dzy_sum[it].store_to(params.dgamma_part, idx); + if (Has_colscale) { dcolscale_sum[it].store_to(params.dcolscale_part, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + } else { + static_assert(WARPS_M == 1 || Ktraits::CTAS_PER_ROW == 1, "Multiple rows per CTA not supported for Multi-CTA."); + // Finalize reduction of part dgamma and dbeta for this CTA + // by reducing over the rows held across the WARPS_M warps + + // Assumption: blockSize divides hidden size. + enum { NUM_RES = COLS / Ktraits::THREADS_PER_CTA }; + static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, ""); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dz_sum[NUM_RES]; + memset(cta_dz_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + __syncthreads(); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dzy_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dzy_sum[NUM_RES]; + memset(cta_dzy_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dzy_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + + compute_t cta_dcolscale_sum[NUM_RES]; + if (Has_colscale) { + __syncthreads(); + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dcolscale_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + memset(cta_dcolscale_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dcolscale_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + } + + const index_t num_valid_writes + = (params.cols - 1 - tidx + Ktraits::THREADS_PER_CTA) / Ktraits::THREADS_PER_CTA; + compute_t *dgamma_part = static_cast(params.dgamma_part) + bidm * params.cols + tidx; + compute_t *dbeta_part = static_cast(params.dbeta_part) + bidm * params.cols + tidx; + compute_t *dcolscale_part = Has_colscale ? 
static_cast(params.dcolscale_part) + bidm * params.cols + tidx : nullptr; + for( int jt = 0; jt < NUM_RES; jt++ ) { + if (Is_even_cols || (jt < num_valid_writes)) { + *dgamma_part = cta_dzy_sum[jt]; + dgamma_part += Ktraits::THREADS_PER_CTA; + *dbeta_part = cta_dz_sum[jt]; + dbeta_part += Ktraits::THREADS_PER_CTA; + if (Has_colscale) { + *dcolscale_part = cta_dcolscale_sum[jt]; + dcolscale_part += Ktraits::THREADS_PER_CTA; + } + } + } + + } +} + +template +__global__ __launch_bounds__(Kernel_traits::THREADS_PER_CTA) +void ln_bwd_finalize_kernel(BwdParams params) +{ + + using compute_t = typename Kernel_traits::compute_t; + using weight_t = typename Kernel_traits::weight_t; + using index_t = typename Kernel_traits::index_t; + using Reducer = typename Kernel_traits::Reducer; + using reduce_t = typename Reducer::Type; + + Sum sum; + enum { NUM_ELT = Kernel_traits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Kernel_traits::THREADS_PER_WARP }; + + __shared__ char smem_[Kernel_traits::SMEM_BYTES_PER_CTA]; + + constexpr uint32_t bidm = 0; + + const uint32_t bidn = blockIdx.x; + const uint32_t tidx = threadIdx.x; + const uint32_t warp = tidx / THREADS_PER_WARP; + const uint32_t lane = tidx % THREADS_PER_WARP; + + Reducer reducer(params, bidm, bidn, 0, 0, lane, smem_); + + const uint32_t c = bidn * THREADS_PER_WARP + lane; + const uint32_t c_out = bidn * THREADS_PER_WARP / 2 + lane; + constexpr uint32_t COL_STRIDE = Kernel_traits::CTAS * THREADS_PER_WARP; + for( uint32_t col = c, col_out = c_out; col < Kernel_traits::COLS; col += COL_STRIDE, col_out += COL_STRIDE / 2 ) { + // Each thread sums over NUM_ELT columns. + Vec dbeta_local, dgamma_local, dcolscale_local; + memset(&dgamma_local, 0, sizeof(dgamma_local)); + memset(&dbeta_local, 0, sizeof(dbeta_local)); + if (Has_colscale) { memset(&dcolscale_local, 0, sizeof(dcolscale_local)); } + if (Is_even_cols || col < params.cols) { + for( uint32_t row = warp; row < params.ctas_per_col; row += Kernel_traits::ROWS_PER_CTA ) { + index_t idx = row * params.cols + col; + + Vec dbeta_part, dgamma_part, dcolscale_part; + dbeta_part.load_from(params.dbeta_part, idx); + dgamma_part.load_from(params.dgamma_part, idx); + if (Has_colscale) { dcolscale_part.load_from(params.dcolscale_part, idx); } + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma_local.data.elt[it] += dgamma_part.data.elt[it]; + dbeta_local.data.elt[it] += dbeta_part.data.elt[it]; + if (Has_colscale) { dcolscale_local.data.elt[it] += dcolscale_part.data.elt[it]; } + } + } + } + void * smem_gamma = smem_; + void * smem_beta = &smem_[Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_colscale = &smem_[2 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + + const int write_row = warp; + const int write_col = lane ^ write_row; + const int write_idx = write_row * THREADS_PER_WARP + write_col; + + dgamma_local.store_to(smem_gamma, write_idx); + dbeta_local.store_to(smem_beta, write_idx); + if (Has_colscale) { dcolscale_local.store_to(smem_colscale, write_idx); } + + __syncthreads(); + + // It would be probably safe to reuse the first row of smem_beta and smem_gamma + void * smem_gamma_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_beta_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE + Kernel_traits::SMEM_BYTES_OUTPUT]; + void * smem_colscale_out = &smem_[Kernel_traits::NUM_FACTORS * Kernel_traits::SMEM_BYTES_TRANSPOSE + 2 * Kernel_traits::SMEM_BYTES_OUTPUT]; + + + // More than one iter iff ROWS_PER_CTA < 32. 
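+ // Illustrative example (configuration values assumed, not taken from this file): with
+ // ROWS_PER_CTA = 8, each warp w handles transposed columns w, w+8, w+16, w+24 of the
+ // 32-wide tile, i.e. four iterations of the loop below; with ROWS_PER_CTA = 32 the
+ // body runs exactly once per warp.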
+ for( int w = warp; w < THREADS_PER_WARP; w += Kernel_traits::ROWS_PER_CTA ) { + const int read_row = lane; + const int read_col = w ^ read_row; + const int read_idx = read_row * THREADS_PER_WARP + read_col; + + memset(&dbeta_local, 0, sizeof(dbeta_local)); + memset(&dgamma_local, 0, sizeof(dgamma_local)); + if (Has_colscale) { memset(&dcolscale_local, 0, sizeof(dcolscale_local)); } + + // Load beta and gamma transposed + if(read_row < Kernel_traits::ROWS_PER_CTA){ + dbeta_local.load_from(smem_beta, read_idx); + dgamma_local.load_from(smem_gamma, read_idx); + if (Has_colscale) { dcolscale_local.load_from(smem_colscale, read_idx); } + } + + // Call reducer on the loaded value(s) and convert. + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + compute_t b_i = dbeta_local.data.elt[it]; + compute_t g_i = dgamma_local.data.elt[it]; + b_i = reducer.allreduce(b_i, sum); + g_i = reducer.allreduce(g_i, sum); + + dgamma_local.data.elt[it] = g_i; + dbeta_local.data.elt[it] = b_i; + if (Has_colscale) { + compute_t cs_i = dcolscale_local.data.elt[it]; + cs_i = reducer.allreduce(cs_i, sum); + dcolscale_local.data.elt[it] = cs_i; + } + } + + // Leader stores the result at the current column. + if(lane == 0){ + dgamma_local.store_to(smem_gamma_out, w); + dbeta_local.store_to(smem_beta_out, w); + if (Has_colscale) { dcolscale_local.store_to(smem_colscale_out, w); } + } + + } + + // All writes done. + __syncthreads(); + + // Pack and store: 2-wide stores with half the threads. + if (Is_even_cols || col_out * 2 < params.cols) { + if( warp == Kernel_traits::ROWS_PER_CTA - 1 && lane < THREADS_PER_WARP / 2 ) { + + using src_t = typename TypeToVec2::Type; + using dst_t = typename TypeToVec2::Type; + Vec dbeta_vec2, dgamma_vec2, dcolscale_vec2; + Vec dbeta_out2, dgamma_out2, dcolscale_out2; + + dgamma_vec2.load_from(smem_gamma_out, lane); + dbeta_vec2.load_from(smem_beta_out, lane); + if (Has_colscale) { dcolscale_vec2.load_from(smem_colscale_out, lane); } + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma_out2.data.elt[it] = Converter::convert(dgamma_vec2.data.elt[it]); + dbeta_out2.data.elt[it] = Converter::convert(dbeta_vec2.data.elt[it]); + if (Has_colscale) { dcolscale_out2.data.elt[it] = Converter::convert(dcolscale_vec2.data.elt[it]); } + } + dgamma_out2.store_to(params.dgamma, col_out); + dbeta_out2.store_to(params.dbeta, col_out); + if (Has_colscale) { dcolscale_out2.store_to(params.dcolscale, col_out); } + } + } + } +} +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG_MAIN, + int BYTES_PER_LDG_FINAL +> +void launch_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool is_dropout = launch_params.params.dropout_keep_p < 1.f; + bool has_colscale = launch_params.params.colscale != nullptr; + bool has_subset = launch_params.params.x0_subset != nullptr; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + BOOL_SWITCH(is_dropout, IsDropoutConst, [&] { + BOOL_SWITCH(has_colscale, HasColscaleConst, [&] { + BOOL_SWITCH(has_subset, HasSubsetConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_bwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, 
Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::reduce_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, Kernel_traits::SMEM_BYTES, stream); + } + + using Kernel_traits_f = layer_norm::Kernel_traits_finalize; + + auto kernel_f = &layer_norm::ln_bwd_finalize_kernel; + kernel_f<<>>(launch_params.params); + }); + }); + }); + }); +} diff --git a/ln_fwd_1024.cu b/ln_fwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..824d86e9fd05920d3e557b42356feec86c904f68 --- /dev/null +++ b/ln_fwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_1280.cu b/ln_fwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..1ff58cbc2889a2c06c51df560d2b35ca4e079201 --- /dev/null +++ b/ln_fwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_1536.cu b/ln_fwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..a8e19d4dba97d91cd246e62ba80a2936ac05755c --- /dev/null +++ b/ln_fwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_2048.cu b/ln_fwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f9794c1e77f91a333d64cc6e461560622b87e12 --- /dev/null +++ b/ln_fwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_256.cu b/ln_fwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..f3a541c6dbf20cd94bb56607bbb23e6a81059bdc --- /dev/null +++ b/ln_fwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
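//
// Note on the registration comments: the type columns map onto the launcher template
// parameters shown in launch_() above:
//   WTYPE -> weight_t (gamma/beta), ITYPE -> input_t (x0), RTYPE -> residual_t,
//   OTYPE -> output_t (z), CTYPE -> compute_t (accumulation type, fp32 in every
//   registration in this diff).
// "RYTPE" in several of the forward headers is a typo for RTYPE (residual type); the
// argument order is unaffected. The remaining columns are tiling knobs:
//   CTAS_PER_ROW (CTAs cooperating on one row), WARPS_M (= rows per CTA),
//   WARPS_N (warps across one row), BYTES_PER_LDG (vector width of each global load).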
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_2560.cu b/ln_fwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..1650671e059ec358f8109c1d592694458e77d489 --- /dev/null +++ b/ln_fwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_3072.cu b/ln_fwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..25bb8691dc9f6a95297301efbd91567a5c22d1c2 --- /dev/null +++ b/ln_fwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_4096.cu b/ln_fwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..b2bffb5831bf1b6eb18cd1e2cd2c4636a06f5736 --- /dev/null +++ b/ln_fwd_4096.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
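//
// Illustrative sketch (not part of this diff): the finalize kernel in ln_bwd_kernels.cuh
// above combines per-warp partial dgamma/dbeta/dcolscale values with reducer.allreduce().
// A common way to build such a warp-wide allreduce is a shuffle butterfly, as sketched
// below; warp_allreduce_sum is hypothetical and is not the Reducer type used here.

// Butterfly allreduce over the 32 lanes of a warp: every lane ends up holding the full sum.
__device__ __forceinline__ float warp_allreduce_sum(float x) {
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        x += __shfl_xor_sync(0xffffffffu, x, offset);
    }
    return x;
}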
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_512.cu b/ln_fwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..a08fe34c55d61eecdbc74caa41dfbec10b3a8126 --- /dev/null +++ b/ln_fwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_5120.cu b/ln_fwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..bebbd69f05b38a5e3c0dae5d248de467118ef8c5 --- /dev/null +++ b/ln_fwd_5120.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_6144.cu b/ln_fwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..4df01ead2f292e255221e6fb0b48e63941a22cab --- /dev/null +++ b/ln_fwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_fwd_7168.cu b/ln_fwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..8343666d10c2788cb2c19ba4f448eef2ccf2b956 --- /dev/null +++ b/ln_fwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_FWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_fwd_768.cu b/ln_fwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..06d5a3b09cdd4941764885f5107bbbfa6b264eef --- /dev/null +++ b/ln_fwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_FWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_fwd_8192.cu b/ln_fwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..bf7cb40252baf820c88dff1337c81dffd934087a --- /dev/null +++ b/ln_fwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_FWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_FWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_fwd_kernels.cuh b/ln_fwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f6bccb8c28a2b3d967dddc3d8b21e1888ed2e29c --- /dev/null +++ b/ln_fwd_kernels.cuh @@ -0,0 +1,272 @@ +#pragma once + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + +#include // For at::cuda::philox::unpack +#include + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_fwd_kernel(FwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { VEC_COLS_PER_LDG = Ktraits::VEC_COLS_PER_LDG }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::NUM_ELTS }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using residual_t = typename Ktraits::residual_t; + using output_t = typename Ktraits::output_t; + using index_t = typename Ktraits::index_t; + using compute_t = typename Ktraits::compute_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + + using Stats = typename Ktraits::Stats; + using stats_t = typename Stats::stats_t; + + const bool has_residual = params.residual != nullptr; + const bool save_x = has_residual || Is_dropout || Has_colscale || (params.rowscale != nullptr) || Has_subset || !(std::is_same::value); + + extern __shared__ char smem_[]; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / WARPS_N; + const index_t warp_n = warp % WARPS_N; + + const index_t r = bidm * ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + Stats stats(params, bidm, bidn, warp_m, warp_n, lane, smem_); + + compute_t *mu_ptr = static_cast(params.mu); + compute_t *rs_ptr = static_cast(params.rs); + + const input_t *rowscale = static_cast(params.rowscale); + const index_t *x0_subset = static_cast(params.x0_subset); + const index_t *z_subset = static_cast(params.z_subset); + + // 
https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Dropout.cu + curandStatePhilox4_32_10_t state; + if (Is_dropout) { + auto seeds = at::cuda::philox::unpack(params.philox_args); + const index_t tidx_global = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(std::get<0>(seeds), tidx_global, std::get<1>(seeds), &state); + } + + const index_t num_valid_ldgs = ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + VEC_COLS_PER_LDG) / VEC_COLS_PER_LDG; + + Wvec gamma[LDGS]; + Wvec beta[LDGS]; + Wvec colscale[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma[it].load_from(params.gamma, idx); + if (params.beta != nullptr) { + beta[it].load_from(params.beta, idx); + } else { + beta[it].zero_(); + } + if (Has_colscale) { colscale[it].load_from(params.colscale, idx); } + idx += VEC_COLS_PER_LDG; + } + } + + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + const compute_t rowscale_val = !Has_subset ? (params.rowscale == nullptr ? 1.0f : compute_t(rowscale[row])) : params.rowscale_const; + const int row_x0 = !Has_subset ? row + 1 : x0_subset[row]; + const int row_z = !Has_subset ? row + 1 : z_subset[row]; + const bool load_x0 = !Has_subset || row_x0 > 0; + index_t idx_x = row * params.cols / Ktraits::ELTS_PER_LDG + c; + index_t idx_x0 = !Has_subset ? idx_x : (load_x0 ? (row_x0 - 1) * params.cols / Ktraits::ELTS_PER_LDG + c : 0); + compute_t xf[LDGS * NUM_ELTS]; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec x0; + Rvec residual; + Rvec x; + Mvec dmask; + if (load_x0) { x0.load_from(params.x0, !Has_subset ? idx_x : idx_x0); } + if (has_residual) { residual.load_from(params.residual, idx_x); } + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + // TD [2022-04-22]: We're memory bound, not compute bound, so we don't need to use + // the more efficient curand_uniform4. + compute_t x_ij; + if (load_x0) { + mask_t keep = !Is_dropout ? true : curand_uniform(&state) <= params.dropout_keep_p; + if (Is_dropout) { dmask.data.elt[jt] = keep; } + compute_t x0_ij = compute_t(x0.data.elt[jt]) * rowscale_val; + x0_ij = keep ? (Is_dropout ? x0_ij * params.dropout_scale : x0_ij) : 0.0f; + if (Has_colscale) { x0_ij *= compute_t(colscale[it].data.elt[jt]); } + x_ij = has_residual ? x0_ij + compute_t(residual.data.elt[jt]) : x0_ij; + } else { + x_ij = has_residual ? compute_t(residual.data.elt[jt]) : 0.f; + } + if (save_x) { x.data.elt[jt] = x_ij; } + xf[it * NUM_ELTS + jt] = x_ij; + } + if (save_x) { x.store_to(params.x, idx_x); } + if (Is_dropout && load_x0) { dmask.store_to(params.dmask, !Has_subset ? idx_x : idx_x0); } + idx_x += VEC_COLS_PER_LDG; + idx_x0 += VEC_COLS_PER_LDG; + } + } + + static_assert(CTAS_PER_ROW == 1, "Don't support multiple CTAs per row for now"); + const index_t num_vecs = params.cols / Ktraits::ELTS_PER_LDG; + const index_t num_full_ldgs = num_vecs / Ktraits::VEC_COLS_PER_LDG; + const index_t remaining_vecs = num_vecs % Ktraits::VEC_COLS_PER_LDG; + auto valid_elts_in_warp_fn = [num_full_ldgs, remaining_vecs] (int warp_n) -> int { + // Need to convert to int, otherwise the subtraction will wrap around. 
+ const index_t valid_partial_vecs_in_warp = + std::min(std::max(int(remaining_vecs) - int(warp_n * THREADS_PER_WARP), int(0)), + int(THREADS_PER_WARP)); + return (num_full_ldgs * THREADS_PER_WARP + valid_partial_vecs_in_warp) * NUM_ELTS; + }; + stats_t s = stats.template compute( + xf, params.inverse_cols, valid_elts_in_warp_fn, num_valid_ldgs * NUM_ELTS + ); + + compute_t mu = layer_norm::Get<0>::of(s); + compute_t m2 = layer_norm::Get<1>::of(s); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + mu_ptr[row] = mu; + } + + compute_t rs = rsqrtf(m2 * params.inverse_cols + params.epsilon + (!params.is_rms_norm ? 0.f : mu * mu)); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + rs_ptr[row] = rs; + } + + const bool save_z = !Has_subset || row_z > 0; + if (save_z) { + index_t idx_z = (!Has_subset ? row : (row_z - 1)) * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ovec z; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t y_ij = compute_t(rs * (xf[it * NUM_ELTS + jt] - (!params.is_rms_norm ? mu : 0.f))); + compute_t g_ij = gamma[it].data.elt[jt]; + compute_t b_ij = beta[it].data.elt[jt]; + z.data.elt[jt] = output_t(g_ij * y_ij + b_ij); + } + z.store_to(params.z, idx_z); + idx_z += VEC_COLS_PER_LDG; + } + } + } + + } +} + +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG +> +void launch_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool has_colscale = launch_params.params.colscale != nullptr; + bool has_subset = launch_params.params.x0_subset != nullptr; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + BOOL_SWITCH(launch_params.params.dropout_keep_p < 1.f, IsDropoutConst, [&] { + BOOL_SWITCH(has_colscale, HasColscaleConst, [&] { + BOOL_SWITCH(has_subset, HasSubsetConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_fwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + const size_t rows_per_loop = launch_params.params.ctas_per_col * Kernel_traits::ROWS_PER_CTA; + launch_params.elts_per_thread = (launch_params.params.rows + rows_per_loop - 1) / rows_per_loop * Kernel_traits::LDGS * Kernel_traits::NUM_ELTS; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::Stats::stats_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 
grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, Kernel_traits::SMEM_BYTES_FWD, stream); + } + }); + }); + }); + }); +} diff --git a/ln_kernel_traits.h b/ln_kernel_traits.h new file mode 100644 index 0000000000000000000000000000000000000000..77de6bf9af60c9ae70427097db26cf4ed130b359 --- /dev/null +++ b/ln_kernel_traits.h @@ -0,0 +1,172 @@ +#pragma once + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace layer_norm { +template< + uint32_t HIDDEN_SIZE_, + typename weight_t_, + typename input_t_, + typename residual_t_, + typename output_t_, + typename compute_t_, + typename index_t_, + uint32_t THREADS_PER_CTA_ +> +struct Kernel_traits_base { + + using weight_t = weight_t_; + using input_t = input_t_; + using residual_t = residual_t_; + using output_t = output_t_; + using compute_t = compute_t_; + using index_t = index_t_; + + enum { HIDDEN_SIZE = HIDDEN_SIZE_ }; + enum { THREADS_PER_CTA = THREADS_PER_CTA_ }; + enum { THREADS_PER_WARP = 32 }; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template< + uint32_t HIDDEN_SIZE_, + typename weight_t_, + typename input_t_, + typename residual_t_, + typename output_t_, + typename compute_t_, + typename index_t_, + bool Has_colscale, + uint32_t THREADS_PER_CTA_, + uint32_t BYTES_PER_LDG_, + typename Base = Kernel_traits_base +> +struct Kernel_traits_finalize : public Base { + enum { ROWS_PER_CTA = Base::THREADS_PER_CTA / Base::THREADS_PER_WARP }; + static_assert((int) ROWS_PER_CTA <= (int) Base::THREADS_PER_WARP); + // Bytes per global load from the input. + enum { BYTES_PER_LDG = BYTES_PER_LDG_ }; + // Number of elements fetched by a global load. + enum { ELTS_PER_LDG = BYTES_PER_LDG / sizeof(compute_t_) }; + // Bytes per global store of the weights. + enum { BYTES_PER_STG = ELTS_PER_LDG * sizeof(weight_t_) }; + static_assert(sizeof(BYTES_PER_LDG) == 4, "Conflict-free smem transpose only implemented for 4B compute type!"); + static_assert(Base::THREADS_PER_CTA == ROWS_PER_CTA * Base::THREADS_PER_WARP, "We assume one warp per row!"); + // The total number of BYTES_PER_LDG-wide words in a hidden vector. + enum { COLS = HIDDEN_SIZE_ * sizeof(compute_t_) / BYTES_PER_LDG }; + static_assert(COLS * BYTES_PER_LDG == HIDDEN_SIZE_ * sizeof(compute_t_)); + + // Shared memory size to transpose the CTA result. + enum { SMEM_BYTES_TRANSPOSE = Base::THREADS_PER_CTA * BYTES_PER_LDG }; + // Shared memory size to coalsece the CTA result. + enum { SMEM_BYTES_OUTPUT = Base::THREADS_PER_WARP * BYTES_PER_LDG }; + // Shared memory requirement per CTA. + static constexpr int NUM_FACTORS = Has_colscale ? 3 : 2; + enum { SMEM_BYTES_PER_CTA = NUM_FACTORS * SMEM_BYTES_TRANSPOSE + NUM_FACTORS * SMEM_BYTES_OUTPUT }; + + // The type of the reducer. + using Reducer = layer_norm::Reducer; + + // Condition for the whole CTA to participate in syncthreads. 
+ static_assert(COLS % Base::THREADS_PER_WARP == 0); + enum { CTAS = COLS / Base::THREADS_PER_WARP }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +template< + typename weight_t_, + typename input_t_, + typename residual_t_, + typename output_t_, + typename compute_t_, + typename index_t_, + uint32_t HIDDEN_SIZE_, + uint32_t CTAS_PER_ROW_, + uint32_t WARPS_M_, + uint32_t WARPS_N_, + uint32_t BYTES_PER_LDG_ = 16, + typename Base = Kernel_traits_base< + HIDDEN_SIZE_, + weight_t_, + input_t_, + residual_t_, + output_t_, + compute_t_, + index_t_, + WARPS_M_*WARPS_N_*THREADS_PER_WARP + > +> +struct Kernel_traits : public Base { + + using input_t = typename Base::input_t; + using residual_t = typename Base::residual_t; + using weight_t = typename Base::weight_t; + using compute_t = typename Base::compute_t; + using output_t = typename Base::output_t; + using index_t = typename Base::index_t; + // using mask_t = unsigned char; + using mask_t = bool; + + enum { CTAS_PER_ROW = CTAS_PER_ROW_ }; + enum { WARPS_M = WARPS_M_ }; + enum { WARPS_N = WARPS_N_ }; + enum { COLS = HIDDEN_SIZE_ }; + enum { HIDDEN_SIZE = HIDDEN_SIZE_ }; + enum { BYTES_PER_LDG = BYTES_PER_LDG_ }; + enum { NUM_ELTS = BYTES_PER_LDG / sizeof(input_t) }; + + enum { THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP }; + enum { THREADS_PER_CTA = WARPS_M * THREADS_PER_ROW }; + enum { ROWS_PER_CTA = WARPS_M }; + + enum { BYTES_PER_ROW = COLS * sizeof(input_t) }; + enum { BYTES_PER_ROW_PER_CTA = THREADS_PER_ROW * BYTES_PER_LDG }; + // Multi-row per CTA not supported for multi-CTA => no smem for WGRAD needed + enum { SMEM_BYTES_WGRAD = CTAS_PER_ROW > 1 ? 0 : ROWS_PER_CTA * COLS * sizeof(compute_t) }; + static_assert(WARPS_M == 1 || CTAS_PER_ROW == 1); + + using reduce_t = typename layer_norm::TypeToVec2::Type; + using Reducer = layer_norm::Reducer; + + enum { SMEM_BYTES_DGRAD = Reducer::SMEM_BYTES }; + enum { SMEM_BYTES = SMEM_BYTES_DGRAD + SMEM_BYTES_WGRAD }; + + using Ivec = layer_norm::Vec; + using Rvec = layer_norm::Vec; + using Ovec = layer_norm::Vec; + using Wvec = layer_norm::Vec; + using Cvec = layer_norm::Vec; + using Mvec = layer_norm::Vec; + enum { ELTS_PER_LDG = BYTES_PER_LDG / sizeof(input_t) }; + + // Assume that each thread can handle the same number of elements in the output and weights as in the input. + static_assert(sizeof(input_t) == sizeof(output_t)); + static_assert(sizeof(input_t) <= sizeof(residual_t)); + // The number of columns fetched per load from input: one per thread. + enum { VEC_COLS_PER_LDG = CTAS_PER_ROW * THREADS_PER_ROW }; + // The total number of vectorized loads/stores per hidden vector. + enum { VEC_COLS = COLS / ELTS_PER_LDG }; + // The number of loads per thread for the input. + enum { LDGS = VEC_COLS / VEC_COLS_PER_LDG }; + static_assert(LDGS * VEC_COLS_PER_LDG == VEC_COLS); + //static_assert(LDGS * BYTES_PER_ROW_PER_CTA * CTAS_PER_ROW == BYTES_PER_ROW, ""); + + using Stats = layer_norm::Stats; + enum { SMEM_BYTES_FWD = Stats::SMEM_BYTES }; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace layer_norm diff --git a/ln_parallel_bwd_1024.cu b/ln_parallel_bwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f4e77466c6c6d5a00275d54f4e68da062a5fc1a --- /dev/null +++ b/ln_parallel_bwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
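//
// Worked example (local to this note, not part of the diff): for the fp16-input
// registrations at HIDDEN_SIZE = 1024 above (CTAS_PER_ROW = 1, WARPS_M = 4, WARPS_N = 1,
// BYTES_PER_LDG = 16), the Kernel_traits arithmetic in ln_kernel_traits.h resolves as
// follows, assuming sizeof(fp16) == 2.

constexpr int kThreadsPerWarp = 32;
constexpr int kHidden         = 1024;                         // COLS
constexpr int kBytesPerLdg    = 16;
constexpr int kWarpsM = 4, kWarpsN = 1, kCtasPerRow = 1;
constexpr int kNumElts        = kBytesPerLdg / 2;             // 8 fp16 elements per 16B load
constexpr int kThreadsPerRow  = kWarpsN * kThreadsPerWarp;    // 32
constexpr int kThreadsPerCta  = kWarpsM * kThreadsPerRow;     // 128, i.e. ROWS_PER_CTA = 4
constexpr int kVecCols        = kHidden / kNumElts;           // 128 vectorized columns per row
constexpr int kVecColsPerLdg  = kCtasPerRow * kThreadsPerRow; // 32 columns covered per pass
constexpr int kLdgs           = kVecCols / kVecColsPerLdg;    // 4 vector loads per thread per row
static_assert(kThreadsPerCta == 128 && kLdgs * kVecColsPerLdg * kNumElts == kHidden,
              "4 loads x 32 threads x 8 halves covers each 1024-wide row exactly once");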
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_1280.cu b/ln_parallel_bwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..2dba3bebf26e99b853e7ef4b9b56421cf483e0bd --- /dev/null +++ b/ln_parallel_bwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_1536.cu b/ln_parallel_bwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..c2ac4b1b0998ca412dea02466f0d8fbe69f48216 --- /dev/null +++ b/ln_parallel_bwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
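//
// Illustrative sketch (not part of this diff): ln_fwd_kernel above seeds one Philox state
// per thread from (seed, global thread id, offset) unpacked out of params.philox_args,
// then draws curand_uniform() per element to build the dropout mask and rescales kept
// values. The standalone kernel below shows just that pattern; dropout_sketch and its
// arguments are hypothetical.

#include <curand_kernel.h>

__global__ void dropout_sketch(const float *x, float *y, unsigned char *mask, int n,
                               float keep_p, unsigned long long seed,
                               unsigned long long offset) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) { return; }
    // One Philox state per thread: (seed, subsequence = global thread id, offset).
    curandStatePhilox4_32_10_t state;
    curand_init(seed, i, offset, &state);
    // Keep with probability keep_p and rescale by 1/keep_p so the expectation is unchanged.
    const bool keep = curand_uniform(&state) <= keep_p;
    mask[i] = keep;
    y[i] = keep ? x[i] / keep_p : 0.f;
}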
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_parallel_bwd_2048.cu b/ln_parallel_bwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..f7f959e2fa785a4df3b6a32506f527e1723d83cc --- /dev/null +++ b/ln_parallel_bwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_256.cu b/ln_parallel_bwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..fa613cf45e1045d046cefc4afd55ded754bc20a4 --- /dev/null +++ b/ln_parallel_bwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
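//
// Reference sketch (not part of this diff): per row, ln_fwd_kernel above computes
// mu = mean(x), m2 = sum((x - mu)^2), then rs = rsqrt(m2/cols + eps) for LayerNorm and
// rs = rsqrt(m2/cols + mu*mu + eps) = rsqrt(E[x^2] + eps) when is_rms_norm is set, and
// finally z = gamma * (rs * (x - mu)) + beta, with RMSNorm skipping the mu subtraction.
// A plain host-side reference of the same formula; layernorm_row_ref is hypothetical.

#include <cmath>
#include <vector>

void layernorm_row_ref(const std::vector<float> &x, const std::vector<float> &gamma,
                       const std::vector<float> &beta, std::vector<float> &z,
                       float eps, bool is_rms_norm) {
    const int cols = static_cast<int>(x.size());
    float mu = 0.f, m2 = 0.f;
    for (float v : x) { mu += v; }
    mu /= cols;
    for (float v : x) { m2 += (v - mu) * (v - mu); }
    // Matches the kernel's rs: for RMSNorm, m2/cols + mu*mu equals E[x^2] up to rounding.
    const float rs = 1.f / std::sqrt(m2 / cols + eps + (is_rms_norm ? mu * mu : 0.f));
    for (int c = 0; c < cols; ++c) {
        const float y = rs * (x[c] - (is_rms_norm ? 0.f : mu));
        z[c] = gamma[c] * y + beta[c];
    }
}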
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_2560.cu b/ln_parallel_bwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..5f5707612df09149885d7883728672dc3a2b751f --- /dev/null +++ b/ln_parallel_bwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 8, 4); diff --git a/ln_parallel_bwd_3072.cu b/ln_parallel_bwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..8fdcb8ffb4d0f0e0fcae6aee930808bd0349ede5 --- /dev/null +++ b/ln_parallel_bwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
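//
// Assumed sketch (not part of this diff): the launchers above nest four BOOL_SWITCH calls
// to turn the runtime flags (dropout, colscale, subset, even-cols) into compile-time
// template parameters of ln_fwd_kernel / ln_bwd_kernel. static_switch.h is included by
// the kernel headers but not shown in this hunk; a typical implementation of such a macro
// looks like the following, which is an assumption rather than the exact file contents.

#define BOOL_SWITCH_SKETCH(COND, CONST_NAME, ...)     \
    [&] {                                             \
        if (COND) {                                   \
            constexpr bool CONST_NAME = true;         \
            return __VA_ARGS__();                     \
        } else {                                      \
            constexpr bool CONST_NAME = false;        \
            return __VA_ARGS__();                     \
        }                                             \
    }()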
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_4096.cu b/ln_parallel_bwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..8decfb085ac8ace1e3694a491bb66a83209027b8 --- /dev/null +++ b/ln_parallel_bwd_4096.cu @@ -0,0 +1,17 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +// Use 8 warps otherwise there's a lot of register spilling + +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_512.cu b/ln_parallel_bwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..178453d3045bfefd95018320d357ea8662018782 --- /dev/null +++ b/ln_parallel_bwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
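//
// Illustrative sketch (not part of this diff): when CTAS_PER_ROW > 1, the launchers above
// switch from an ordinary <<<...>>> launch to cudaLaunchCooperativeKernel, because the
// CTAs sharing one row must grid-synchronize during their inter-CTA reduction. Minimal
// usage of that API and of cooperative_groups grid sync; coop_kernel and launch_coop are
// hypothetical, and the device must report cooperativeLaunch support.

#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

__global__ void coop_kernel(float *buf) {
    cg::grid_group grid = cg::this_grid();
    if (threadIdx.x == 0) { buf[blockIdx.x] = static_cast<float>(blockIdx.x); }  // per-CTA partial
    grid.sync();  // whole-grid barrier; only legal under a cooperative launch
    if (blockIdx.x == 0 && threadIdx.x == 0) {  // one CTA combines all partials
        float s = 0.f;
        for (unsigned i = 0; i < gridDim.x; ++i) { s += buf[i]; }
        buf[0] = s;
    }
}

void launch_coop(float *d_buf, int n_ctas, cudaStream_t stream) {
    dim3 grid(n_ctas), block(32);
    void *args[] = { (void *)&d_buf };  // same &params-style argument packing as above
    cudaLaunchCooperativeKernel((void *)coop_kernel, grid, block, args, /*smem=*/0, stream);
}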
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_5120.cu b/ln_parallel_bwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..815521973da7266534c7e8b167fa0b8baa47fa2c --- /dev/null +++ b/ln_parallel_bwd_5120.cu @@ -0,0 +1,17 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +// Use 8 warps otherwise there's a lot of register spilling + +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_6144.cu b/ln_parallel_bwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..eb8668d8a229d2ec24e5eac57db00f9d650615eb --- /dev/null +++ b/ln_parallel_bwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_7168.cu b/ln_parallel_bwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..0c12dc476678ce7b24c5fcd0b9408eb686bd6825 --- /dev/null +++ b/ln_parallel_bwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 8, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 8, 4); \ No newline at end of file diff --git a/ln_parallel_bwd_768.cu b/ln_parallel_bwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..8beece8ab19cea2baefedd118f5d15c90a646526 --- /dev/null +++ b/ln_parallel_bwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16, 4); diff --git a/ln_parallel_bwd_8192.cu b/ln_parallel_bwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..5ad47c94fdff599dde62574d1c535c4bbacae551 --- /dev/null +++ b/ln_parallel_bwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_bwd_kernels.cuh" + +// Create backward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINAL + +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16, 4); +REGISTER_PARALLEL_BWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16, 4); \ No newline at end of file diff --git a/ln_parallel_fwd_1024.cu b/ln_parallel_fwd_1024.cu new file mode 100644 index 0000000000000000000000000000000000000000..3c64e169302eea0f94ff65641728c35689d7c4ba --- /dev/null +++ b/ln_parallel_fwd_1024.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1024, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_1280.cu b/ln_parallel_fwd_1280.cu new file mode 100644 index 0000000000000000000000000000000000000000..9bbfce5bc6c5e0303d70552bb36cf380601dcd38 --- /dev/null +++ b/ln_parallel_fwd_1280.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1280, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_1536.cu b/ln_parallel_fwd_1536.cu new file mode 100644 index 0000000000000000000000000000000000000000..b57f5edce8eb7b6779475f6eadb8aabba299c802 --- /dev/null +++ b/ln_parallel_fwd_1536.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 1536, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_2048.cu b/ln_parallel_fwd_2048.cu new file mode 100644 index 0000000000000000000000000000000000000000..6fa322d96b4e11aacf5722985672e141f929299b --- /dev/null +++ b/ln_parallel_fwd_2048.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2048, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_256.cu b/ln_parallel_fwd_256.cu new file mode 100644 index 0000000000000000000000000000000000000000..27445a6bc50c98935c7a5093ee5ffdddf52e2494 --- /dev/null +++ b/ln_parallel_fwd_256.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 256, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); \ No newline at end of file diff --git a/ln_parallel_fwd_2560.cu b/ln_parallel_fwd_2560.cu new file mode 100644 index 0000000000000000000000000000000000000000..fdde470c267302adca3d63f2c6b736b67af7ee86 --- /dev/null +++ b/ln_parallel_fwd_2560.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 2560, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_3072.cu b/ln_parallel_fwd_3072.cu new file mode 100644 index 0000000000000000000000000000000000000000..992f71037607066fb4e4d0f1624669f21c2f53b1 --- /dev/null +++ b/ln_parallel_fwd_3072.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 3072, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_4096.cu b/ln_parallel_fwd_4096.cu new file mode 100644 index 0000000000000000000000000000000000000000..381837e60874e44aa5e0efccb8749b2ff41ac3fa --- /dev/null +++ b/ln_parallel_fwd_4096.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 4096, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_512.cu b/ln_parallel_fwd_512.cu new file mode 100644 index 0000000000000000000000000000000000000000..4ba478b01fbdbc2ff5aab0a15fb698eba369f61a --- /dev/null +++ b/ln_parallel_fwd_512.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 512, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_5120.cu b/ln_parallel_fwd_5120.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ada35228cb603ddd26b06e186989746a86926a8 --- /dev/null +++ b/ln_parallel_fwd_5120.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp16, fp32, fp32, fp32, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 5120, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_6144.cu b/ln_parallel_fwd_6144.cu new file mode 100644 index 0000000000000000000000000000000000000000..6f531c881f7f53651c56e3afd1f0f53c580815ec --- /dev/null +++ b/ln_parallel_fwd_6144.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
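// Note on the last tuning parameter: BYTES_PER_LDG = 16 means each global load/store moves
// 128 bits per thread, i.e. 4 fp32 or 8 fp16/bf16 elements, and the kernel traits derive the
// per-thread load count from it (the kernels below assert
// COLS == THREADS_PER_ROW * LDGS * ELTS_PER_LDG * CTAS_PER_ROW).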
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 6144, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_parallel_fwd_7168.cu b/ln_parallel_fwd_7168.cu new file mode 100644 index 0000000000000000000000000000000000000000..c99e752cd484a99e97f8bf7a92e433a817c54d64 --- /dev/null +++ b/ln_parallel_fwd_7168.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp16, fp16, fp32, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, bf16, bf16, fp32, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp32, bf16, bf16, bf16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, fp16, fp16, fp16, fp16, fp32, 1, 1, 4, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 7168, bf16, bf16, bf16, bf16, fp32, 1, 1, 4, 16); diff --git a/ln_parallel_fwd_768.cu b/ln_parallel_fwd_768.cu new file mode 100644 index 0000000000000000000000000000000000000000..f33f519c7fb2934b3b5aabf36a2d9046c4b51ee3 --- /dev/null +++ b/ln_parallel_fwd_768.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. 
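Functionally, every launcher registered in these files instantiates the same fused kernel, ln_parallel_residual_fwd_kernel from the header included at the top of each file: dropout on one or two input branches, an optional residual add, and LayerNorm (or RMSNorm) with either one shared ("tied") or two separate sets of affine weights. Across the instantiations in these files, hidden sizes up to 2560 use WARPS_M=4, WARPS_N=1 (each warp owns a whole row), while 3072 and larger split a row across 4 or 8 warps; BYTES_PER_LDG stays at 16 throughout. The sketch below is a single-threaded fp32 reference for one row, with illustrative names and the dropout masks passed in explicitly; it elides the pre-norm x, mu and rs values that the kernel also saves for the backward pass.

#include <cmath>
#include <vector>

// Single-row fp32 reference for the parallel-residual forward path (sketch only).
// mask0/mask1 stand in for the dropout masks the kernel draws with curand and saves as dmask0/dmask1.
void parallel_residual_ln_row_ref(
        const std::vector<float> &x0, const std::vector<float> &x1,        // x1 may be empty
        const std::vector<float> &residual,                                // may be empty
        const std::vector<char> &mask0, const std::vector<char> &mask1,
        const std::vector<float> &gamma0, const std::vector<float> &beta0,
        const std::vector<float> &gamma1, const std::vector<float> &beta1, // gamma1 empty => tied norm
        std::vector<float> &z0, std::vector<float> &z1,
        float eps, float dropout_keep_p, bool is_rms_norm) {
    const int cols = (int)x0.size();
    const float scale = 1.f / dropout_keep_p;                 // dropout_scale in the kernel
    std::vector<float> x(cols);
    float mu = 0.f;
    for (int j = 0; j < cols; ++j) {
        float v = mask0[j] ? x0[j] * scale : 0.f;             // dropout on branch 0
        if (!x1.empty())       v += mask1[j] ? x1[j] * scale : 0.f;  // optional second branch
        if (!residual.empty()) v += residual[j];                     // optional skip input
        x[j] = v;                                             // the "x" saved for the backward pass
        mu += v;
    }
    mu /= cols;
    float m2 = 0.f;                                           // sum of squared deviations
    for (int j = 0; j < cols; ++j) { const float d = x[j] - mu; m2 += d * d; }
    // LayerNorm: rs = 1/sqrt(var + eps); RMSNorm: rs = 1/sqrt(E[x^2] + eps) = 1/sqrt(var + mu^2 + eps)
    const float rs = 1.f / std::sqrt(m2 / cols + eps + (is_rms_norm ? mu * mu : 0.f));
    for (int j = 0; j < cols; ++j) {
        const float y = rs * (x[j] - (is_rms_norm ? 0.f : mu));
        z0[j] = gamma0[j] * y + beta0[j];
        if (!gamma1.empty()) z1[j] = gamma1[j] * y + beta1[j];   // second ("untied") affine pair
    }
}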
Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp16, fp32, fp32, fp32, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp16, fp16, fp32, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, bf16, bf16, fp32, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp32, bf16, bf16, bf16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, fp16, fp16, fp16, fp16, fp32, 1, 4, 1, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 768, bf16, bf16, bf16, bf16, fp32, 1, 4, 1, 16); diff --git a/ln_parallel_fwd_8192.cu b/ln_parallel_fwd_8192.cu new file mode 100644 index 0000000000000000000000000000000000000000..360e6d4471062cd40bf245ecff22b579f56d4020 --- /dev/null +++ b/ln_parallel_fwd_8192.cu @@ -0,0 +1,15 @@ +#include "ln_parallel_residual_fwd_kernels.cuh" + +// Create forward launch function and register. Macro signature: +// HIDDEN_SIZE, WTYPE, ITYPE, RYTPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG + +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp16, fp32, fp32, fp32, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp16, fp16, fp32, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, bf16, bf16, fp32, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp32, bf16, bf16, bf16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, fp16, fp16, fp16, fp16, fp32, 1, 1, 8, 16); +REGISTER_PARALLEL_FWD_LAUNCHER( 8192, bf16, bf16, bf16, bf16, fp32, 1, 1, 8, 16); diff --git a/ln_parallel_residual_bwd_kernels.cuh b/ln_parallel_residual_bwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..521495724400fde6eaecb27e255154a51d8ddbb0 --- /dev/null +++ b/ln_parallel_residual_bwd_kernels.cuh @@ -0,0 +1,540 @@ +#pragma once + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" +#include "ln_bwd_kernels.cuh" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_parallel_residual_bwd_kernel(layer_norm::BwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { COLS = Ktraits::COLS }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Ktraits::THREADS_PER_WARP }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using compute_t = typename Ktraits::compute_t; + using index_t = typename Ktraits::index_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename 
Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + using Reducer = typename Ktraits::Reducer; + using reduce_t = typename Reducer::Type; + + extern __shared__ char smem_[]; + + const bool has_residual = params.dresidual != nullptr; + const bool has_x1 = params.dx1 != nullptr; + const bool prenorm = params.dx != nullptr; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / Ktraits::WARPS_N; + const index_t warp_n = warp % Ktraits::WARPS_N; + const index_t tid_r = warp_n * THREADS_PER_WARP + lane; + + const index_t r = bidm * Ktraits::ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + static_assert(COLS == THREADS_PER_ROW * LDGS * NUM_ELTS * CTAS_PER_ROW); + + Cvec dz0y_sum[LDGS]; + Cvec dz0_sum[LDGS]; + Cvec dz1y_sum[LDGS]; + Cvec dz1_sum[LDGS]; + + memset(dz0y_sum, 0, sizeof(dz0y_sum)); + memset(dz0_sum, 0, sizeof(dz0_sum)); + if (!Tied_norm) { + memset(dz1y_sum, 0, sizeof(dz1y_sum)); + memset(dz1_sum, 0, sizeof(dz1_sum)); + } + + compute_t * smem_wgrad = reinterpret_cast(smem_); + char *smem_dgrad = smem_ + Ktraits::SMEM_BYTES_WGRAD; + + Reducer reducer(params, bidm, bidn, warp_m, warp_n, lane, smem_dgrad); + + Sum sum; + + const index_t num_valid_ldgs = + ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + Ktraits::VEC_COLS_PER_LDG) / Ktraits::VEC_COLS_PER_LDG; + + Wvec gamma0[LDGS]; + Wvec gamma1[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma0[it].load_from(params.gamma, idx); + if (!Tied_norm) { gamma1[it].load_from(params.gamma1, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + // TODO if ROWS_PER_CTA does not divide rows, we might get divergence in the + // last blocks with syncthreads! + // grid stride over rows + #pragma unroll 1 + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + const compute_t mu_r = static_cast(params.mu)[row]; + const compute_t rs_r = static_cast(params.rs)[row]; + Mvec dmask0[LDGS], dmask1[LDGS]; + Rvec dx[LDGS]; + compute_t dy[LDGS * NUM_ELTS]; + compute_t y[LDGS * NUM_ELTS]; + compute_t mdy_local = 0.f; + compute_t mdyy_local = 0.f; + index_t idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Rvec x; + Ovec dz0, dz1; + dz0.load_from(params.dz, idx); + if (!Tied_norm) { dz1.load_from(params.dz1, idx); } + if (prenorm) { dx[it].load_from(params.dx, idx); } + x.load_from(params.x, idx); + if (Is_dropout) { + dmask0[it].load_from(params.dmask, idx); + if (has_x1) { dmask1[it].load_from(params.dmask1, idx); } + } + idx += Ktraits::VEC_COLS_PER_LDG; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t x_tmp = x.data.elt[jt]; + compute_t y_tmp = rs_r * (x_tmp - (!params.is_rms_norm ? 
mu_r : 0.f)); + compute_t dy_tmp = compute_t(gamma0[it].data.elt[jt]) * compute_t(dz0.data.elt[jt]); + if (!Tied_norm) { + dy_tmp += compute_t(gamma1[it].data.elt[jt]) * compute_t(dz1.data.elt[jt]); + } + compute_t dz0_tmp = dz0.data.elt[jt]; + compute_t dz1_tmp; + if (!Tied_norm) { dz1_tmp = dz1.data.elt[jt]; } + + mdy_local += dy_tmp; + mdyy_local += dy_tmp * y_tmp; + + dy[it * NUM_ELTS + jt] = dy_tmp; + y[it * NUM_ELTS + jt] = y_tmp; + + dz0y_sum[it].data.elt[jt] += dz0_tmp * y_tmp; + dz0_sum[it].data.elt[jt] += dz0_tmp; + if (!Tied_norm) { + dz1y_sum[it].data.elt[jt] += dz1_tmp * y_tmp; + dz1_sum[it].data.elt[jt] += dz1_tmp; + } + } + } + } + + reduce_t result = reducer.allreduce({mdy_local, mdyy_local}, sum); + mdy_local = layer_norm::Get<0>::of(result) * params.inverse_cols; + mdyy_local = layer_norm::Get<1>::of(result) * params.inverse_cols; + + idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec dx0, dx1; + Rvec dresidual; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t dx_tmp_res; + compute_t dy_tmp = dy[it * NUM_ELTS + jt]; + compute_t y_tmp = y[it * NUM_ELTS + jt]; + compute_t dx_tmp = rs_r * (dy_tmp - (mdyy_local * y_tmp + (!params.is_rms_norm ? mdy_local : 0.f))); + dx_tmp_res = prenorm ? dx_tmp + compute_t(dx[it].data.elt[jt]) : dx_tmp; + if (has_residual) { dresidual.data.elt[jt] = dx_tmp_res; } + if (Is_dropout) { + dx0.data.elt[jt] = dmask0[it].data.elt[jt] ? dx_tmp_res * params.dropout_scale : 0.f; + if (has_x1) { dx1.data.elt[jt] = dmask1[it].data.elt[jt] ? dx_tmp_res * params.dropout_scale : 0.f; } + } else { + dx0.data.elt[jt] = dx_tmp_res; + if (has_x1) { dx1.data.elt[jt] = dx_tmp_res; } + } + } + if (has_residual) { dresidual.store_to(params.dresidual, idx); } + dx0.store_to(params.dx0, idx); + if (has_x1) { dx1.store_to(params.dx1, idx); } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + + } // end: grid stride loop + + if( WARPS_M == 1 ) { + idx = r * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + dz0_sum[it].store_to(params.dbeta_part, idx); + dz0y_sum[it].store_to(params.dgamma_part, idx); + if (!Tied_norm) { + dz1_sum[it].store_to(params.dbeta1_part, idx); + dz1y_sum[it].store_to(params.dgamma1_part, idx); + } + idx += Ktraits::VEC_COLS_PER_LDG; + } + } + } else { + static_assert(WARPS_M == 1 || Ktraits::CTAS_PER_ROW == 1, "Multiple rows per CTA not supported for Multi-CTA."); + // Finalize reduction of part dgamma and dbeta for this CTA + // by reducing over the rows held across the WARPS_M warps + + // Assumption: blockSize divides hidden size. 
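 // In outline: each warp has been accumulating per-thread dbeta/dgamma partials in registers
 // (dz0_sum / dz0y_sum and, for the untied case, dz1_sum / dz1y_sum). They are staged into
 // shared memory, effectively a [ROWS_PER_CTA][COLS] array of compute_t, and every thread then
 // sums its NUM_RES columns over the ROWS_PER_CTA rows, i.e.
 //   cta_dz0_sum[jt] += smem_wgrad[it * COLS + tidx + jt * THREADS_PER_CTA]   for it in [0, ROWS_PER_CTA)
 // The per-CTA sums are written to dgamma_part / dbeta_part (one partial row per bidm) and are
 // reduced across CTAs by the separate finalize kernel defined below.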
+ enum { NUM_RES = COLS / Ktraits::THREADS_PER_CTA }; + static_assert(NUM_RES * Ktraits::THREADS_PER_CTA == COLS, ""); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz0_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dz0_sum[NUM_RES]; + memset(cta_dz0_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz0_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + __syncthreads(); + + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz0y_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + compute_t cta_dz0y_sum[NUM_RES]; + memset(cta_dz0y_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz0y_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + + compute_t cta_dz1_sum[NUM_RES], cta_dz1y_sum[NUM_RES]; + if (!Tied_norm) { + __syncthreads(); + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz1_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + memset(cta_dz1_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz1_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + __syncthreads(); + idx = warp_m * Ktraits::VEC_COLS + tid_r; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + dz1y_sum[it].store_to(smem_wgrad, idx); + idx += THREADS_PER_ROW; + } + __syncthreads(); + memset(cta_dz1y_sum, 0, sizeof(compute_t) * NUM_RES); + for( int it = 0; it < ROWS_PER_CTA; it++ ) { + for( int jt = 0; jt < NUM_RES; jt++ ) { + cta_dz1y_sum[jt] += smem_wgrad[it * COLS + tidx + jt * Ktraits::THREADS_PER_CTA]; + } + } + } + + const index_t num_valid_writes + = (params.cols - 1 - tidx + Ktraits::THREADS_PER_CTA) / Ktraits::THREADS_PER_CTA; + compute_t *dgamma0_part = static_cast(params.dgamma_part) + bidm * params.cols + tidx; + compute_t *dbeta0_part = static_cast(params.dbeta_part) + bidm * params.cols + tidx; + compute_t *dgamma1_part = !Tied_norm ? static_cast(params.dgamma1_part) + bidm * params.cols + tidx : nullptr; + compute_t *dbeta1_part = !Tied_norm ? 
static_cast(params.dbeta1_part) + bidm * params.cols + tidx : nullptr; + for( int jt = 0; jt < NUM_RES; jt++ ) { + if (Is_even_cols || (jt < num_valid_writes)) { + *dgamma0_part = cta_dz0y_sum[jt]; + dgamma0_part += Ktraits::THREADS_PER_CTA; + *dbeta0_part = cta_dz0_sum[jt]; + dbeta0_part += Ktraits::THREADS_PER_CTA; + if (!Tied_norm) { + *dgamma1_part = cta_dz1y_sum[jt]; + dgamma1_part += Ktraits::THREADS_PER_CTA; + *dbeta1_part = cta_dz1_sum[jt]; + dbeta1_part += Ktraits::THREADS_PER_CTA; + } + } + } + + } +} + +template +__global__ __launch_bounds__(Kernel_traits::THREADS_PER_CTA) +void ln_parallel_residual_bwd_finalize_kernel(BwdParams params) +{ + + using compute_t = typename Kernel_traits::compute_t; + using weight_t = typename Kernel_traits::weight_t; + using index_t = typename Kernel_traits::index_t; + using Reducer = typename Kernel_traits::Reducer; + using reduce_t = typename Reducer::Type; + + Sum sum; + enum { NUM_ELT = Kernel_traits::ELTS_PER_LDG }; + enum { THREADS_PER_WARP = Kernel_traits::THREADS_PER_WARP }; + + // Multiplying by 2 since we have both gamma0 and gamma1 + __shared__ char smem_[2 * Kernel_traits::SMEM_BYTES_PER_CTA]; + + constexpr uint32_t bidm = 0; + + const uint32_t bidn = blockIdx.x; + const uint32_t tidx = threadIdx.x; + const uint32_t warp = tidx / THREADS_PER_WARP; + const uint32_t lane = tidx % THREADS_PER_WARP; + + Reducer reducer(params, bidm, bidn, 0, 0, lane, smem_); + + const uint32_t c = bidn * THREADS_PER_WARP + lane; + const uint32_t c_out = bidn * THREADS_PER_WARP / 2 + lane; + constexpr uint32_t COL_STRIDE = Kernel_traits::CTAS * THREADS_PER_WARP; + for( uint32_t col = c, col_out = c_out; col < Kernel_traits::COLS; col += COL_STRIDE, col_out += COL_STRIDE / 2 ) { + // Each thread sums over NUM_ELT columns. 
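 // dgamma_part / dbeta_part hold one partial row per main-kernel CTA (ctas_per_col rows of
 // params.cols columns). Each warp strides over those rows for its columns and accumulates in
 // compute_t; the per-warp sums are then transposed through shared memory, reduced across the
 // warps of this CTA, converted to weight_t, and written out as the final dgamma/dbeta (and
 // dgamma1/dbeta1) by the packed 2-wide stores at the bottom of the loop.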
+ Vec dbeta0_local, dgamma0_local, dbeta1_local, dgamma1_local; + memset(&dgamma0_local, 0, sizeof(dgamma0_local)); + memset(&dbeta0_local, 0, sizeof(dbeta0_local)); + memset(&dgamma1_local, 0, sizeof(dgamma1_local)); + memset(&dbeta1_local, 0, sizeof(dbeta1_local)); + if (Is_even_cols || col < params.cols) { + for( uint32_t row = warp; row < params.ctas_per_col; row += Kernel_traits::ROWS_PER_CTA ) { + index_t idx = row * params.cols + col; + + Vec dbeta0_part, dgamma0_part, dbeta1_part, dgamma1_part; + dbeta0_part.load_from(params.dbeta_part, idx); + dgamma0_part.load_from(params.dgamma_part, idx); + dbeta1_part.load_from(params.dbeta1_part, idx); + dgamma1_part.load_from(params.dgamma1_part, idx); + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma0_local.data.elt[it] += dgamma0_part.data.elt[it]; + dbeta0_local.data.elt[it] += dbeta0_part.data.elt[it]; + dgamma1_local.data.elt[it] += dgamma1_part.data.elt[it]; + dbeta1_local.data.elt[it] += dbeta1_part.data.elt[it]; + } + } + } + void * smem_gamma0 = smem_; + void * smem_beta0 = &smem_[Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_gamma1 = &smem_[2 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_beta1 = &smem_[3 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + + const int write_row = warp; + const int write_col = lane ^ write_row; + const int write_idx = write_row * THREADS_PER_WARP + write_col; + + dgamma0_local.store_to(smem_gamma0, write_idx); + dbeta0_local.store_to(smem_beta0, write_idx); + dgamma1_local.store_to(smem_gamma1, write_idx); + dbeta1_local.store_to(smem_beta1, write_idx); + + __syncthreads(); + + // It would be probably safe to reuse the first row of smem_beta0 and smem_gamma0 + void * smem_gamma0_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE]; + void * smem_beta0_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE + Kernel_traits::SMEM_BYTES_OUTPUT]; + void * smem_gamma1_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE + 2 * Kernel_traits::SMEM_BYTES_OUTPUT]; + void * smem_beta1_out = &smem_[4 * Kernel_traits::SMEM_BYTES_TRANSPOSE + 3 * Kernel_traits::SMEM_BYTES_OUTPUT]; + + // More than one iter iff ROWS_PER_CTA < 32. + for( int w = warp; w < THREADS_PER_WARP; w += Kernel_traits::ROWS_PER_CTA ) { + const int read_row = lane; + const int read_col = w ^ read_row; + const int read_idx = read_row * THREADS_PER_WARP + read_col; + + memset(&dbeta0_local, 0, sizeof(dbeta0_local)); + memset(&dgamma0_local, 0, sizeof(dgamma0_local)); + memset(&dbeta1_local, 0, sizeof(dbeta1_local)); + memset(&dgamma1_local, 0, sizeof(dgamma1_local)); + + // Load beta and gamma transposed + if(read_row < Kernel_traits::ROWS_PER_CTA){ + dbeta0_local.load_from(smem_beta0, read_idx); + dgamma0_local.load_from(smem_gamma0, read_idx); + dbeta1_local.load_from(smem_beta1, read_idx); + dgamma1_local.load_from(smem_gamma1, read_idx); + } + + // Call reducer on the loaded value(s) and convert. + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + compute_t b0_i = dbeta0_local.data.elt[it]; + compute_t g0_i = dgamma0_local.data.elt[it]; + compute_t b1_i = dbeta1_local.data.elt[it]; + compute_t g1_i = dgamma1_local.data.elt[it]; + b0_i = reducer.allreduce(b0_i, sum); + g0_i = reducer.allreduce(g0_i, sum); + b1_i = reducer.allreduce(b1_i, sum); + g1_i = reducer.allreduce(g1_i, sum); + + dgamma0_local.data.elt[it] = g0_i; + dbeta0_local.data.elt[it] = b0_i; + dgamma1_local.data.elt[it] = g1_i; + dbeta1_local.data.elt[it] = b1_i; + } + + // Leader stores the result at the current column. 
+ if(lane == 0){ + dgamma0_local.store_to(smem_gamma0_out, w); + dbeta0_local.store_to(smem_beta0_out, w); + dgamma1_local.store_to(smem_gamma1_out, w); + dbeta1_local.store_to(smem_beta1_out, w); + } + + } + + // All writes done. + __syncthreads(); + + // Pack and store: 2-wide stores with half the threads. + if (Is_even_cols || col_out * 2 < params.cols) { + if( warp == Kernel_traits::ROWS_PER_CTA - 1 && lane < THREADS_PER_WARP / 2 ) { + + using src_t = typename TypeToVec2::Type; + using dst_t = typename TypeToVec2::Type; + Vec dbeta0_vec2, dgamma0_vec2, dbeta1_vec2, dgamma1_vec2; + Vec dbeta0_out2, dgamma0_out2, dbeta1_out2, dgamma1_out2; + + dgamma0_vec2.load_from(smem_gamma0_out, lane); + dbeta0_vec2.load_from(smem_beta0_out, lane); + dgamma1_vec2.load_from(smem_gamma1_out, lane); + dbeta1_vec2.load_from(smem_beta1_out, lane); + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + dgamma0_out2.data.elt[it] = Converter::convert(dgamma0_vec2.data.elt[it]); + dbeta0_out2.data.elt[it] = Converter::convert(dbeta0_vec2.data.elt[it]); + dgamma1_out2.data.elt[it] = Converter::convert(dgamma1_vec2.data.elt[it]); + dbeta1_out2.data.elt[it] = Converter::convert(dbeta1_vec2.data.elt[it]); + } + dgamma0_out2.store_to(params.dgamma, col_out); + dbeta0_out2.store_to(params.dbeta, col_out); + dgamma1_out2.store_to(params.dgamma1, col_out); + dbeta1_out2.store_to(params.dbeta1, col_out); + } + } + } +} + +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG_MAIN, + int BYTES_PER_LDG_FINAL +> +void launch_parallel_residual_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool is_dropout = launch_params.params.dropout_keep_p < 1.f; + bool tied_norm = launch_params.params.gamma1 == nullptr; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + BOOL_SWITCH(is_dropout, IsDropoutConst, [&] { + BOOL_SWITCH(tied_norm, TiedNormConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_parallel_residual_bwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::reduce_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, 
Kernel_traits::SMEM_BYTES, stream); + } + + using Kernel_traits_f = layer_norm::Kernel_traits_finalize; + + auto kernel_f = !TiedNormConst + ? &layer_norm::ln_parallel_residual_bwd_finalize_kernel + : &layer_norm::ln_bwd_finalize_kernel; + kernel_f<<>>(launch_params.params); + + }); + }); + }); +} diff --git a/ln_parallel_residual_fwd_kernels.cuh b/ln_parallel_residual_fwd_kernels.cuh new file mode 100644 index 0000000000000000000000000000000000000000..0e55cb4038b4dbe30d9eb47609df3afea4c4f5fb --- /dev/null +++ b/ln_parallel_residual_fwd_kernels.cuh @@ -0,0 +1,281 @@ +#pragma once + +#ifdef OLD_GENERATOR_PATH +#include +#else +#include +#endif + +#include // For at::cuda::philox::unpack +#include + +#include "ln.h" +#include "ln_utils.cuh" +#include "ln_kernel_traits.h" +#include "static_switch.h" + +namespace layer_norm { + +template +__global__ __launch_bounds__(Ktraits::THREADS_PER_CTA) +void ln_parallel_residual_fwd_kernel(FwdParams params) { + + enum { ROWS_PER_CTA = Ktraits::ROWS_PER_CTA }; + enum { WARPS_N = Ktraits::WARPS_N }; + enum { WARPS_M = Ktraits::WARPS_M }; + enum { THREADS_PER_ROW = Ktraits::THREADS_PER_ROW }; + enum { VEC_COLS_PER_LDG = Ktraits::VEC_COLS_PER_LDG }; + enum { BYTES_PER_ROW = Ktraits::BYTES_PER_ROW }; + enum { LDGS = Ktraits::LDGS }; + enum { NUM_ELTS = Ktraits::NUM_ELTS }; + enum { CTAS_PER_ROW = Ktraits::CTAS_PER_ROW }; + + using input_t = typename Ktraits::input_t; + using residual_t = typename Ktraits::residual_t; + using output_t = typename Ktraits::output_t; + using index_t = typename Ktraits::index_t; + using compute_t = typename Ktraits::compute_t; + using mask_t = typename Ktraits::mask_t; + using Ivec = typename Ktraits::Ivec; + using Rvec = typename Ktraits::Rvec; + using Ovec = typename Ktraits::Ovec; + using Wvec = typename Ktraits::Wvec; + using Cvec = typename Ktraits::Cvec; + using Mvec = typename Ktraits::Mvec; + + using Stats = typename Ktraits::Stats; + using stats_t = typename Stats::stats_t; + + const bool has_residual = params.residual != nullptr; + const bool has_x1 = params.x1 != nullptr; + const bool save_x = has_residual || has_x1 || Is_dropout || !(std::is_same::value); + + extern __shared__ char smem_[]; + + const index_t tidx = threadIdx.x; + const index_t bidn = blockIdx.x % CTAS_PER_ROW; + const index_t bidm = blockIdx.x / CTAS_PER_ROW; + const index_t lane = tidx % THREADS_PER_WARP; + const index_t warp = tidx / THREADS_PER_WARP; + const index_t warp_m = warp / WARPS_N; + const index_t warp_n = warp % WARPS_N; + + const index_t r = bidm * ROWS_PER_CTA + warp_m; + const index_t c = bidn * THREADS_PER_ROW + warp_n * THREADS_PER_WARP + lane; + + Stats stats(params, bidm, bidn, warp_m, warp_n, lane, smem_); + + compute_t *mu_ptr = static_cast(params.mu); + compute_t *rs_ptr = static_cast(params.rs); + + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/Dropout.cu + curandStatePhilox4_32_10_t state; + if (Is_dropout) { + auto seeds = at::cuda::philox::unpack(params.philox_args); + const index_t tidx_global = blockIdx.x * blockDim.x + threadIdx.x; + curand_init(std::get<0>(seeds), tidx_global, std::get<1>(seeds), &state); + } + + const index_t num_valid_ldgs = ((params.cols / Ktraits::ELTS_PER_LDG) - 1 - c + VEC_COLS_PER_LDG) / VEC_COLS_PER_LDG; + + Wvec gamma0[LDGS]; + Wvec beta0[LDGS]; + Wvec gamma1[LDGS]; + Wvec beta1[LDGS]; + index_t idx = c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + gamma0[it].load_from(params.gamma, idx); + if (params.beta 
!= nullptr) { + beta0[it].load_from(params.beta, idx); + } else { + beta0[it].zero_(); + } + if (!Tied_norm) { + gamma1[it].load_from(params.gamma1, idx); + if (params.beta1 != nullptr) { + beta1[it].load_from(params.beta1, idx); + } else { + beta1[it].zero_(); + } + } + idx += VEC_COLS_PER_LDG; + } + } + + for( int row = r; row < params.rows; row += params.ctas_per_col * ROWS_PER_CTA ) { + index_t idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + compute_t xf[LDGS * NUM_ELTS]; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ivec x0; + Ivec x1; + Rvec residual; + Rvec x; + Mvec dmask0; + Mvec dmask1; + x0.load_from(params.x0, idx); + if (has_x1) { x1.load_from(params.x1, idx); } + if (has_residual) { residual.load_from(params.residual, idx); } + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + // TD [2022-04-22]: We're memory bound, not compute bound, so we don't need to use + // the more efficient curand_uniform4. + compute_t x_ij; + mask_t keep0 = !Is_dropout ? true : curand_uniform(&state) <= params.dropout_keep_p; + if (Is_dropout) { dmask0.data.elt[jt] = keep0; } + compute_t x0_ij = compute_t(x0.data.elt[jt]); + x0_ij = keep0 ? (Is_dropout ? x0_ij * params.dropout_scale : x0_ij) : 0.0f; + if (has_x1) { + mask_t keep1 = !Is_dropout ? true : curand_uniform(&state) <= params.dropout_keep_p; + if (Is_dropout) { dmask1.data.elt[jt] = keep1; } + compute_t x1_ij = compute_t(x1.data.elt[jt]); + x1_ij = keep1 ? (Is_dropout ? x1_ij * params.dropout_scale : x1_ij) : 0.0f; + x_ij = has_residual ? x0_ij + x1_ij + compute_t(residual.data.elt[jt]) : x0_ij + x1_ij; + } else { + x_ij = has_residual ? x0_ij + compute_t(residual.data.elt[jt]) : x0_ij; + } + if (save_x) { x.data.elt[jt] = x_ij; } + xf[it * NUM_ELTS + jt] = x_ij; + } + if (save_x) { x.store_to(params.x, idx); } + if (Is_dropout) { + dmask0.store_to(params.dmask, idx); + if (has_x1) { dmask1.store_to(params.dmask1, idx); } + } + idx += VEC_COLS_PER_LDG; + } + } + + static_assert(CTAS_PER_ROW == 1, "Don't support multiple CTAs per row for now"); + const index_t num_vecs = params.cols / Ktraits::ELTS_PER_LDG; + const index_t num_full_ldgs = num_vecs / Ktraits::VEC_COLS_PER_LDG; + const index_t remaining_vecs = num_vecs % Ktraits::VEC_COLS_PER_LDG; + auto valid_elts_in_warp_fn = [num_full_ldgs, remaining_vecs] (int warp_n) -> int { + // Need to convert to int, otherwise the subtraction will wrap around. + const index_t valid_partial_vecs_in_warp = + std::min(std::max(int(remaining_vecs) - int(warp_n * THREADS_PER_WARP), int(0)), + int(THREADS_PER_WARP)); + return (num_full_ldgs * THREADS_PER_WARP + valid_partial_vecs_in_warp) * NUM_ELTS; + }; + stats_t s = stats.template compute( + xf, params.inverse_cols, valid_elts_in_warp_fn, num_valid_ldgs * NUM_ELTS + ); + + compute_t mu = layer_norm::Get<0>::of(s); + compute_t m2 = layer_norm::Get<1>::of(s); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + mu_ptr[row] = mu; + } + + compute_t rs = rsqrtf(m2 * params.inverse_cols + params.epsilon + (!params.is_rms_norm ? 0.f : mu * mu)); + + if( bidn == 0 && warp_n == 0 && lane == 0 ) { + rs_ptr[row] = rs; + } + + idx = row * params.cols / Ktraits::ELTS_PER_LDG + c; + #pragma unroll + for( int it = 0; it < LDGS; it++ ) { + if (Is_even_cols || (it < num_valid_ldgs)) { + Ovec z0; + Ovec z1; + #pragma unroll + for( int jt = 0; jt < NUM_ELTS; jt++ ) { + compute_t y_ij = compute_t(rs * (xf[it * NUM_ELTS + jt] - (!params.is_rms_norm ? 
mu : 0.f))); + compute_t g0_ij = gamma0[it].data.elt[jt]; + compute_t b0_ij = beta0[it].data.elt[jt]; + z0.data.elt[jt] = output_t(g0_ij * y_ij + b0_ij); + if (!Tied_norm) { + compute_t g1_ij = gamma1[it].data.elt[jt]; + compute_t b1_ij = beta1[it].data.elt[jt]; + z1.data.elt[jt] = output_t(g1_ij * y_ij + b1_ij); + } + } + z0.store_to(params.z, idx); + if (!Tied_norm) { z1.store_to(params.z1, idx); } + idx += VEC_COLS_PER_LDG; + } + } + + } +} + +} // namespace layer_norm + +using namespace layer_norm; + +template< + typename weight_t, + typename input_t, + typename residual_t, + typename output_t, + typename compute_t, + typename index_t, + int HIDDEN_SIZE, + int CTAS_PER_ROW, + int WARPS_M, + int WARPS_N, + int BYTES_PER_LDG +> +void launch_parallel_residual_(LaunchParams &launch_params, const bool configure_params){ + + using Kernel_traits = Kernel_traits; + bool is_even_cols = launch_params.params.cols == HIDDEN_SIZE; + bool tied_norm = launch_params.params.gamma1 == nullptr; + BOOL_SWITCH(launch_params.params.dropout_keep_p < 1.f, IsDropoutConst, [&] { + BOOL_SWITCH(tied_norm, TiedNormConst, [&] { + BOOL_SWITCH(is_even_cols, IsEvenColsConst, [&] { + auto kernel = &ln_parallel_residual_fwd_kernel; + if( configure_params ) { + int ctas_per_sm; + CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &ctas_per_sm, kernel, Kernel_traits::THREADS_PER_CTA, Kernel_traits::SMEM_BYTES_FWD)); + launch_params.params.ctas_per_col = launch_params.props->multiProcessorCount * ctas_per_sm / Kernel_traits::CTAS_PER_ROW; + const size_t rows_per_loop = launch_params.params.ctas_per_col * Kernel_traits::ROWS_PER_CTA; + launch_params.elts_per_thread = (launch_params.params.rows + rows_per_loop - 1) / rows_per_loop * Kernel_traits::LDGS * Kernel_traits::NUM_ELTS; + launch_params.barrier_size = 0; + launch_params.workspace_bytes = 0; + if(Kernel_traits::CTAS_PER_ROW > 1) { + launch_params.barrier_size = 2 * launch_params.params.ctas_per_col; + launch_params.workspace_bytes = launch_params.params.ctas_per_col + * Kernel_traits::WARPS_M + * Kernel_traits::CTAS_PER_ROW + * sizeof(typename Kernel_traits::Stats::stats_t) + * 2; + } + return; + } + + if( Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024 ) { + CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, Kernel_traits::SMEM_BYTES_FWD)); + } + auto stream = launch_params.stream; + auto ctas_per_col = launch_params.params.ctas_per_col; + + if( Kernel_traits::CTAS_PER_ROW == 1 ) { + kernel<<>>(launch_params.params); + } else { + dim3 grid(Kernel_traits::CTAS_PER_ROW * ctas_per_col); + dim3 block(Kernel_traits::THREADS_PER_CTA); + void *params_ = (void *)&launch_params.params; + cudaLaunchCooperativeKernel((void *)kernel, grid, block, (void **)¶ms_, Kernel_traits::SMEM_BYTES_FWD, stream); + } + }); + }); + }); +} diff --git a/ln_utils.cuh b/ln_utils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..178d6fda895b478ac76e2a77a2b1b35115fcc279 --- /dev/null +++ b/ln_utils.cuh @@ -0,0 +1,783 @@ +#pragma once + +#include + +#include +#include + +#include "ln.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +constexpr uint32_t THREADS_PER_WARP = 32; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline void check_cuda_(cudaError_t status, const char *file, int line) { + if( status != cudaSuccess ) { + fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(status), file, line); + 
exit(status); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define CHECK_CUDA(ans) \ + { check_cuda_((ans), __FILE__, __LINE__); } + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define DIVUP(x, y) (((x) + ((y)-1)) / (y)) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_FWD_LAUNCHER(HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG) \ + void ln_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_( \ + launch_params, configure_params); \ + } \ + static FwdRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_BWD_LAUNCHER( \ + HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINALIZE) \ + void ln_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_(launch_params, configure_params); \ + } \ + static BwdRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_PARALLEL_FWD_LAUNCHER(HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG) \ + void ln_parallel_residual_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_parallel_residual_( \ + launch_params, configure_params); \ + } \ + static FwdParallelRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_parallel_residual_fwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +#define REGISTER_PARALLEL_BWD_LAUNCHER( \ + HIDDEN_SIZE, WTYPE, ITYPE, RTYPE, OTYPE, CTYPE, CTAS_PER_ROW, WARPS_M, WARPS_N, BYTES_PER_LDG, BYTES_PER_LDG_FINALIZE) \ + void ln_parallel_residual_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE(LaunchParams &launch_params, \ + const bool configure_params) { \ + launch_parallel_residual_(launch_params, configure_params); \ + } \ + static BwdParallelRegistrar reg_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE( \ + ln_parallel_residual_bwd_##HIDDEN_SIZE##_##WTYPE##_##ITYPE##_##RTYPE##_##OTYPE##_##CTYPE) + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ float2 operator+(const float2 & a, const float2 & b){ + return {a.x + b.x, a.y + b.y}; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +inline __device__ void operator+=(float2 & a, const float2 & b){ + a.x += b.x; + a.y += b.y; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Sum { + inline __device__ Sum(){} + inline __device__ T 
operator()(const T &a, const T &b){ + return a + b; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ T warp_shuffle_xor(const T & x, uint32_t idx){ + return __shfl_xor_sync(uint32_t(-1), x, idx); +} + +template<> +inline __device__ float2 warp_shuffle_xor(const float2 & x, uint32_t idx){ + return { warp_shuffle_xor(x.x, idx), warp_shuffle_xor(x.y, idx) }; +} + +template +inline __device__ T warp_shuffle_down(const T & x, uint32_t idx){ + return __shfl_down_sync(uint32_t(-1), x, idx); +} + +template<> +inline __device__ float2 warp_shuffle_down(const float2 & x, uint32_t idx){ + return { warp_shuffle_down(x.x, idx), warp_shuffle_down(x.y, idx) }; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace layer_norm { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct uint16 { + uint4 u; + uint4 v; + uint4 s; + uint4 t; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +struct uint8 { + uint4 u; + uint4 v; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BytesToType {}; + +template<> +struct BytesToType<64> { + using Type = uint16; + static_assert(sizeof(Type) == 64); +}; + +template<> +struct BytesToType<32> { + using Type = uint8; + static_assert(sizeof(Type) == 32); +}; + +template<> +struct BytesToType<16> { + using Type = uint4; + static_assert(sizeof(Type) == 16); +}; + +template<> +struct BytesToType<8> { + using Type = uint64_t; + static_assert(sizeof(Type) == 8); +}; + +template<> +struct BytesToType<4> { + using Type = uint32_t; + static_assert(sizeof(Type) == 4); +}; + +template<> +struct BytesToType<2> { + using Type = uint16_t; + static_assert(sizeof(Type) == 2); +}; + +template<> +struct BytesToType<1> { + using Type = uint8_t; + static_assert(sizeof(Type) == 1); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct TypeToVec2 {}; + +template<> +struct TypeToVec2 { + using Type = float2; +}; + +template<> +struct TypeToVec2 { + using Type = half2; +}; + +template<> +struct TypeToVec2 { + using Type = nv_bfloat162; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Get { + template + static inline __device__ R of(const T &vec); +}; + +template<> +template +inline __device__ R Get<0>::of(const T &vec) { + return vec.x; +} + +template<> +template +inline __device__ R Get<1>::of(const T &vec) { + return vec.y; +} + +template<> +template +inline __device__ R Get<2>::of(const T &vec) { + return vec.z; +} + +template<> +template +inline __device__ R Get<3>::of(const T &vec) { + return vec.w; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Converter{ + static inline __device__ Dst convert(const Src &from) { + return Dst(from); + } +}; + +template<> +struct Converter{ + static inline __device__ half2 convert(const float2 &x) { + return __float22half2_rn(x); + } +}; + +template<> +struct Converter{ + static inline __device__ nv_bfloat162 convert(const float2 &x) { +#if __CUDA_ARCH__ >= 800 + return __float22bfloat162_rn(x); +#else + union { + nv_bfloat162 raw; + nv_bfloat16 x; + nv_bfloat16 y; + } tmp; + tmp.x = 
__float2bfloat16_rn(x.x); + tmp.y = __float2bfloat16_rn(x.y); + return tmp.raw; +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Zeros{ + static inline __device__ T get() { + return T(0.f); + } +}; + +template<> +struct Zeros{ + static inline __device__ float2 get() { + return make_float2(0.f, 0.f); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Vec { + + enum { BYTES = NUM_ELT * sizeof(Elt_type) }; + + using Vec_type = typename BytesToType::Type; + + using Alias_type = union { + Vec_type vec; + Elt_type elt[NUM_ELT]; + }; + + Alias_type data; + + template + inline __device__ void to(Vec &other) { + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + other.data.elt[it] = S(this->data.elt[it]); + } + } + + template + inline __device__ void assign(const Op &op) { + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + this->data.elt[it] = op(it); + } + } + + inline __device__ void zero_() { + #pragma unroll + for( int it = 0; it < NUM_ELT; it++ ) { + this->data.elt[it] = Elt_type(0.f); + } + } + + inline __device__ void load_from(const void *base_ptr, const size_t idx) { + this->data.vec = static_cast(base_ptr)[idx]; + } + + inline __device__ void store_to(void *base_ptr, const size_t idx) { + static_cast(base_ptr)[idx] = this->data.vec; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct InterCTASync { + + template + inline __device__ InterCTASync(Params & params, uint32_t bidm, uint32_t bidn) + : phase_counter_(0) + , b0_(params.barrier + bidm) // The barrier for this group of CTAs. + , b1_(params.barrier + bidm + params.ctas_per_col) // The barrier for this group of CTAs. + { + // BARRIERS ARE ASSUMED TO BE INITIALIZED TO 0! + } + + inline __device__ void spin_wait_(int *barrier, int step, int expected) { + asm volatile("red.release.gpu.global.add.s32 [%0], %1;" ::"l"(barrier), "r"(step)); + for( int found = -1; found != expected; ) { + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];" : "=r"(found) : "l"(barrier)); + } + } + + inline __device__ void sync(){ + // ALL THREADS MUST ENTER! + + // We switch barrier every iteration. + int *barrier = phase_counter_ & 0x1 ? b1_ : b0_; + // We decrement every other iteration. + bool dec = phase_counter_ & 0x2; + int step = dec ? -1 : 1; + int expected = dec ? 0 : CTAS_PER_ROW; + // There are only 4 phases: up/down for b0/b1. 
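 // Concretely: phase 0 counts b0 up to CTAS_PER_ROW, phase 1 counts b1 up, phase 2 counts b0
 // back down to 0, phase 3 counts b1 back down. Alternating the barrier and the direction means
 // a barrier is already at the right value for its next use and never has to be re-initialized.
 // spin_wait_ adds `step` with release semantics and then polls with acquire loads until the
 // expected value appears; only thread 0 spins, and __syncthreads() then releases the whole CTA.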
+ phase_counter_ = (phase_counter_ + 1) & 0x3; + + if( threadIdx.x == 0 ) { + spin_wait_(barrier, step, expected); + } + // CTA waits for thread 0 + __syncthreads(); + } + + int phase_counter_; + int * b0_; + int * b1_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Reducer : public Reducer { + + using InterCTASync = InterCTASync; + using Base = Reducer; + using Type = typename Base::Type; + + enum { SMEM_BYTES = Base::SMEM_BYTES }; + + enum { WS_BARRIER_BYTES = 2 * sizeof(int) }; + enum { WS_DATA_BYTES = WARPS_M * CTAS_PER_ROW * sizeof(T) }; + + // size of the barriers + temporary result per CTA (multiply with CTAS_PER_ROW to get total) + enum { WORKSPACE_BYTES_PER_GROUP = Base::WORKSPACE_BYTES_PER_GROUP + WS_BARRIER_BYTES + WS_DATA_BYTES }; + + template + inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : Base(params, bidm, bidn, warp_m, warp_n, lane, smem) + , inter_cta_(params, bidm, bidn) + , bidn_(bidn) // CTA id within the group. + , w0_(static_cast(params.workspace) + (bidm * WARPS_M + warp_m) * CTAS_PER_ROW) + , w1_(w0_ + params.ctas_per_col * WARPS_M * CTAS_PER_ROW) + { + } + + template + inline __device__ T allreduce(T data, Op &op) { + data = Base::reduce(data, op); + // We switch workspace every iteration. + T *workspace = inter_cta_.phase_counter_ & 0x1 ? w1_ : w0_; + + // Warp leaders 0 hold the CTA-local results. + if( this->warp_n_ == 0 && this->lane_ == 0 ) { + workspace[bidn_] = data; + } + inter_cta_.sync(); + static_assert(CTAS_PER_ROW <= 32); + T total = Zeros::get(); + if(this->lane_ < CTAS_PER_ROW){ + total = workspace[this->lane_]; + } + total = Reducer::allreduce_(total, op); + + return total; + } + + InterCTASync inter_cta_; + + T *w0_; + T *w1_; + int bidn_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Reducer { + + using Type = T; + enum { SMEM_BYTES = 0 }; + enum { WORKSPACE_BYTES_PER_GROUP = 0 }; + + enum { THREADS_PER_WARP = 32 }; + + template + inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : warp_n_(warp_n) + , lane_(lane) + { + } + + template + static inline __device__ T allreduce_(T data, Op &op) { + #pragma unroll + for( int it = 1; it < THREADS_PER_WARP; it *= 2 ) { + data = op(data, warp_shuffle_xor(data, it)); + } + return data; + } + + template + inline __device__ T allreduce(T data, Op &op) { + return allreduce_(data, op); + } + + template + inline __device__ T reduce(T data, Op &op){ + // only lane 0 holds the result! 
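 // The loop below is a shuffle-down tree: after log2(THREADS_PER_WARP) = 5 steps lane 0 holds
 // the warp-wide sum while the other lanes hold partial values that must be ignored.
 // allreduce_() above uses XOR (butterfly) shuffles instead, which takes the same number of
 // steps but leaves every lane with the full result.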
+ #pragma unroll + for( int it = THREADS_PER_WARP / 2; it > 0; it /= 2 ) { + data = op(data, warp_shuffle_down(data, it)); + } + return data; + } + int warp_n_; + int lane_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Reducer : public Reducer { + + using Base = Reducer; + + using Type = T; + + enum { SMEM_BYTES = Base::SMEM_BYTES + WARPS_M * WARPS_N * sizeof(T) * 2 }; + enum { WORKSPACE_BYTES_PER_GROUP = 0 }; + + enum { THREADS_PER_WARP = 32 }; + + template + inline __device__ Reducer(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : Base(params, bidm, bidn, warp_m, warp_n, lane, smem) + , use0_(true) + { + smem0_ = &static_cast(smem)[warp_m * WARPS_N]; + smem1_ = smem0_ + WARPS_M * WARPS_N; + } + + template + inline __device__ T allreduce(T data, Op & op) { + T * smem = use0_ ? smem0_ : smem1_; + use0_ = !use0_; + data = Base::reduce(data, op); + if( this->lane_ == 0 ) { + smem[this->warp_n_] = data; + } + __syncthreads(); + T out = Zeros::get(); + #pragma unroll + for( int it = 0; it < WARPS_N; it++ ) { + out = op(out, smem[it]); + } + return out; + } + + template + inline __device__ T reduce(T data, Op &op) { + T * smem = use0_ ? smem0_ : smem1_; + use0_ = !use0_; + // only intra-CTA group leader holds the result! + data = Base::reduce(data, op); + if( this->lane_ == 0 ) { + smem[this->warp_n_] = data; + } + __syncthreads(); + T out = Zeros::get(); + if( this->warp_n_ == 0 && this->lane_ == 0 ) { + #pragma unroll + for( int it = 0; it < WARPS_N; it++ ) { + out = op(out, smem[it]); + } + } + return out; + } + + T * smem0_; + T * smem1_; + bool use0_; + +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +inline __device__ void warp_chan_upd_dynamic(T &m_a, T &m2_a, int_t &n_a, int num_active){ + //Assume at least leftmost is valid and init: step = next_pow2(num_active) / 2 (might get NaN otherwise) + const int highest_bit_set = (8 * sizeof(num_active)) - __clz(num_active - 1); + + #pragma unroll + for( int step = (1 << (highest_bit_set - 1)); step > 0; step /= 2 ) { + // Exchange + int_t n_b = warp_shuffle_down(n_a, step); + T m_b = warp_shuffle_down(m_a, step); + T m2_b = warp_shuffle_down(m2_a, step); + + // Update + const int_t n_ab = n_a + n_b; // We can handle one of them being 0, not both. + const T rn_ab = 1.f / n_ab; // Might have different n per thread, otherwise this would simplify :( + const T delta = m_a - m_b; + const float m2_ab = m2_a + m2_b + delta * delta * n_a * n_b * rn_ab; + const float m_ab = (n_a * m_a + n_b * m_b) * rn_ab; + + n_a = n_ab; + m_a = m_ab; + m2_a = m2_ab; + } + // Intra-warp broadcast (only lane 0 has valid stats). + m_a = __shfl_sync(uint32_t(-1), m_a, 0); + m2_a = __shfl_sync(uint32_t(-1), m2_a, 0); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Stats { + // This could be done generically with the Reducer. But then we would have to exchange 3 instead of 2 fields. 
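 // For reference, the pairwise update that warp_chan_upd_dynamic (above) applies at every
 // shuffle step is the standard parallel mean/variance combination: merging partitions
 // (n_a, m_a, m2_a) and (n_b, m_b, m2_b) gives
 //   n  = n_a + n_b
 //   m  = (n_a * m_a + n_b * m_b) / n
 //   m2 = m2_a + m2_b + (m_a - m_b)^2 * n_a * n_b / n
 // where m is the running mean and m2 the running sum of squared deviations, so the row
 // variance consumed by the kernels is m2 / cols.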
+ + using InterCTASync = InterCTASync; + using BlockStats = Stats; + using stats_t = typename BlockStats::stats_t; + + enum { SMEM_BYTES = BlockStats::SMEM_BYTES }; + + template + inline __device__ Stats(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : inter_cta_(params, bidm, bidn) + , block_stats_(params, bidm, bidn, warp_m, warp_n, lane, smem) + , bidn_(bidn) // CTA id within the group. + , w0_(static_cast(params.workspace) + (bidm * WARPS_M + warp_m) * CTAS_PER_ROW) + , w1_(w0_ + params.ctas_per_col * WARPS_M * CTAS_PER_ROW) + , warp_n_(warp_n) + , lane_(lane) + { + } + + template + inline __device__ stats_t compute(const T (&elts)[N], const T rn) { + constexpr T ELTS_PER_ROW_PER_CTA = N * WARPS_N * THREADS_PER_WARP; + // TODO rn is not really needed here.. + constexpr T block_rn = 1.f / T(ELTS_PER_ROW_PER_CTA); + stats_t block_stats = block_stats_.compute(elts, block_rn); + + stats_t *workspace = inter_cta_.phase_counter_ & 0x1 ? w1_ : w0_; + + if( warp_n_ == 0 && lane_ == 0 ) { + workspace[bidn_] = block_stats; + } + + // Wait for all CTAS_PER_ROW CTAS in the group to have written their result. + inter_cta_.sync(); + + T n = Zeros::get(); + T m = Zeros::get(); + T m2 = Zeros::get(); + + // Assume CTA group size in N less than 32, such that we can finalize with a single warp. + static_assert(CTAS_PER_ROW <= 32); + + // Every warp does the final reduction locally. + if( lane_ < CTAS_PER_ROW ) { + stats_t result = workspace[lane_]; + n = ELTS_PER_ROW_PER_CTA; + m = layer_norm::Get<0>::of(result); + m2 = layer_norm::Get<1>::of(result); + } + + warp_chan_upd_dynamic(m, m2, n, CTAS_PER_ROW); + + return { m, m2 }; + } + + InterCTASync inter_cta_; + BlockStats block_stats_; + + stats_t *w0_; + stats_t *w1_; + int bidn_; + int warp_n_; + int lane_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Stats { + + using WarpStats = Stats; + using stats_t = typename WarpStats::stats_t; + + enum { SMEM_BYTES = WARPS_M * WARPS_N * sizeof(stats_t) * 2 }; + + template + inline __device__ Stats(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : warp_stats_(params, bidm, bidn, warp_m, warp_n, lane, smem) + , use0_(true) + { + smem0_ = static_cast(smem) + warp_m * WARPS_N; + smem1_ = smem0_ + WARPS_M * WARPS_N; + } + + template + inline __device__ stats_t compute(const T (&elts)[N], const T row_norm_factor, + function_t valid_elts_in_warp_fn, const int num_valid_elts = N) { + stats_t * smem = use0_ ? smem0_ : smem1_; + use0_ = !use0_; + // Compute warp local for all WARPS_N + const auto warp_n = warp_stats_.reducer_.warp_n_; + const T warp_norm_factor = 1.f / T(Is_even_cols ? N * THREADS_PER_WARP : valid_elts_in_warp_fn(warp_n)); + stats_t warp_stats = warp_stats_.template compute( + elts, warp_norm_factor, valid_elts_in_warp_fn, num_valid_elts + ); + + //Each warp warp leader stores its stats + const auto lane = warp_stats_.reducer_.lane_; + if( lane == 0 ) { + smem[warp_n] = warp_stats; + } + __syncthreads(); + + int n = 0;; + T m = Zeros::get(); + T m2 = Zeros::get(); + + // Assume that there are less than 32 warps, such that we can finalize with a single warp + static_assert(WARPS_N <= 32); + if(lane < WARPS_N){ + stats_t result = smem[lane]; + n = Is_even_cols ? 
N * THREADS_PER_WARP : valid_elts_in_warp_fn(lane); + m = layer_norm::Get<0>::of(result); + m2 = layer_norm::Get<1>::of(result); + } + + warp_chan_upd_dynamic(m, m2, n, WARPS_N); + + return { m, m2 }; + } + WarpStats warp_stats_; + stats_t * smem0_; + stats_t * smem1_; + bool use0_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Stats { + + using stats_t = typename TypeToVec2::Type; + // The simple Warp reducer. + using Reducer = Reducer; + + enum { SMEM_BYTES = 0 }; + + template + inline __device__ Stats(Params & params, uint32_t bidm, uint32_t bidn, uint32_t warp_m, uint32_t warp_n, uint32_t lane, void * smem) + : reducer_(params, bidm, bidn, warp_m, warp_n, lane, smem) + { + } + + template + inline __device__ stats_t compute(const T (&elts)[N], const T row_norm_factor, + // const int valid_elts_in_warp_ignored_, const int num_valid_elts = N) { + function_t valid_elts_in_warp_fn, const int num_valid_elts = N) { + + auto sum = Sum(); + + T m = Zeros::get(); + #pragma unroll + for( int it = 0; it < N; it++ ) { + if (Is_even_cols || (it < num_valid_elts)) { + m += elts[it]; + } + } + m = reducer_.allreduce(m, sum) * row_norm_factor; + + T m2 = Zeros::get(); + #pragma unroll + for( int it = 0; it < N; it++ ) { + if (Is_even_cols || (it < num_valid_elts)) { + T diff = (elts[it] - m); + m2 += diff * diff; + } + } + m2 = reducer_.allreduce(m2, sum); + + return {m, m2}; + } + + Reducer reducer_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace layer_norm diff --git a/mainloop_bwd_sm90_tma_gmma_ws.hpp b/mainloop_bwd_sm90_tma_gmma_ws.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7483f4efdb5c704115ac154ecd0aa7b00ea6157f --- /dev/null +++ b/mainloop_bwd_sm90_tma_gmma_ws.hpp @@ -0,0 +1,841 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "softmax.h" +#include "utils.h" + +namespace flash { + +using namespace cute; + +template +struct CollectiveMainloopBwd { + + static constexpr int kStages = Stages; + using ClusterShape = ClusterShape_; + using TileShape_MNK = TileShape_MNK_; + using Element = Element_; + using ElementAccum = ElementAccum_; + using ArchTag = ArchTag_; + static constexpr bool Is_causal = Is_causal_; + static constexpr bool Varlen = Varlen_; + static constexpr bool SdP_swapAB = true; + static constexpr bool dKV_swapAB = dKV_swapAB_; + static constexpr bool dQ_swapAB = dQ_swapAB_; + static_assert(!(SdP_swapAB && dKV_swapAB)); // If SdP_swapAB, then we don't swap for dKV + + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + static constexpr int kHeadDim = get<2>(TileShape_MNK{}); + + static constexpr int NumdQWarpGroups = 2; + static constexpr int kNThreadsdQ = NumdQWarpGroups * cutlass::NumThreadsPerWarpGroup; + + static_assert(ArchTag::kMinComputeCapability >= 90); + static_assert(get<0>(ClusterShape{}) == 1 && get<2>(ClusterShape{}) == 1); + + static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == 2 && AtomLayoutMdQ == 2 && !SdP_swapAB && !dQ_swapAB; // If dQ_swapAB we can't use RS + using TileShapeAtomSdP = std::conditional_t< + !SdP_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutSdP = std::conditional_t< + !SdP_swapAB, + Layout, Int<2 / AtomLayoutMSdP>, _1>>, + Layout, Int, _1>> + >; + using TiledMmaSdP = decltype(cute::make_tiled_mma( + cute::GMMA::ss_op_selector(), + AtomLayoutSdP{})); + + using TileShapeAtomdKV = std::conditional_t< + !dKV_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdKV = std::conditional_t< + !dKV_swapAB, + Layout, Int<2 / AtomLayoutNdKV>, _1>>, + Layout, Int, _1>> + >; + using TiledMmadKV = decltype(cute::make_tiled_mma( + std::conditional_t< + !SdP_swapAB, + decltype(cute::GMMA::ss_op_selector()), + decltype(cute::GMMA::rs_op_selector()) + >{}, + AtomLayoutdKV{})); + + using TileShapeAtomdQ = std::conditional_t< + !dQ_swapAB, + Shape, Int, Int>, + Shape, Int, Int> + >; + using AtomLayoutdQ = std::conditional_t< + !dQ_swapAB, + Layout, Int, _1>>, + Layout, Int, _1>> + >; + static constexpr GMMA::Major MmadQMajorA = !dQ_swapAB ? GMMA::Major::K : GMMA::Major::MN; + static constexpr GMMA::Major MmadQMajorB = !dQ_swapAB ? 
GMMA::Major::MN : GMMA::Major::K; + using TiledMmadQ = decltype(cute::make_tiled_mma( + std::conditional_t< + !dQ_swapAB, + std::conditional_t< + Mma_dQ_is_RS, + decltype(cute::GMMA::rs_op_selector()), + decltype(cute::GMMA::ss_op_selector()) + >, + decltype(cute::GMMA::ss_op_selector()) + >{}, + AtomLayoutdQ{})); + + using SmemLayoutAtomQ = decltype(cutlass::gemm::collective::detail::ss_smem_selector, Int>()); + using SmemLayoutQ = + decltype(tile_to_shape(SmemLayoutAtomQ{}, + make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + using SmemLayoutdO = SmemLayoutQ; + + using SmemLayoutAtomK = decltype(cutlass::gemm::collective::detail::ss_smem_selector, Int>()); + using SmemLayoutK = decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomV = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); + using SmemLayoutV = decltype(tile_to_shape(SmemLayoutAtomV{}, select<1, 2>(TileShape_MNK{}))); + + using SmemLayoutAtomP = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutP = decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{}))); + using SmemLayoutAtomdS = decltype(cutlass::gemm::collective::detail::ss_smem_selector(TileShape_MNK{})), decltype(cute::get<1>(TileShape_MNK{}))>()); + using SmemLayoutdS = decltype(tile_to_shape(SmemLayoutAtomdS{}, make_shape(Int{}, Int{}, Int{}))); + + // Need stride to be multiple of 32, otherwise we get error (misaligned address) when doing TMA if e.g. kBlockM=80 + using SmemLayoutLSE = cute::Layout, Int>, cute::Stride<_1, Int>>; + using SmemLayoutLSEMma = cute::Layout, Int, Int>, cute::Stride<_0, _1, Int>>; + + // Note this is the transpose in terms of the view, not in terms of memory. + using SmemLayoutQt = + decltype(cute::composition(SmemLayoutQ{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutdOt = + decltype(cute::composition(SmemLayoutdO{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<0>(TileShape_MNK{}), Int{}), + make_stride(Int{}, _1{}, Int{})))); + using SmemLayoutKt = + decltype(cute::composition(SmemLayoutK{}, + make_layout(make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutPt = + decltype(cute::composition(SmemLayoutP{}, + make_layout(make_shape(get<1>(TileShape_MNK{}), get<0>(TileShape_MNK{})), + make_stride(Int{}, _1{})))); + using SmemLayoutdSt = + decltype(cute::composition(SmemLayoutdS{}, + make_layout(make_shape(Int{}, Int{}, Int{}), + make_stride(Int{}, _1{}, Int{})))); + + // Thread layout, 256 threads per row + using R2SLayoutAtomdQaccum = Layout>, Stride<_1>>; + using R2STiledCopydQaccum = decltype(make_tiled_copy(Copy_Atom{}, R2SLayoutAtomdQaccum{}, + Layout>{})); // Val layout, 4 vals per store + using SmemLayoutdQaccum = Layout>, Stride<_1>>; + // We want dQaccum smem to have last dimension 32, so that we only need to do 1 TMA instruction. + // The layout Layout_K_SW128_Atom has 32 elements per row. + // // TMA limit is that each dimension in smem must be <= 256. + // static constexpr int ElemsPerRowTMA = (kBlockM * kHeadDim) / 32 <= 256 ? 32 : 64; + static constexpr int ElemsPerRowTMA = 32; // If we change this, we'll also need to change the dQ shape in host. 
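// [Illustrative aside, not part of this patch.] SmemLayoutQt / SmemLayoutdOt / SmemLayoutKt
// above are, as the comment says, transposes "in terms of the view, not in terms of memory":
// cute::composition re-indexes the same smem bytes with swapped shape and stride, so no data
// is moved. A tiny host-side model of that idea with plain strides (names hypothetical):
#include <cassert>
#include <cstdio>

int main() {
    constexpr int M = 4, K = 8;
    float buf[M * K];
    for (int i = 0; i < M * K; ++i) buf[i] = float(i);

    // Original view: shape (M, K), strides (K, 1) -> buf[m * K + k].
    auto orig = [&](int m, int k) { return buf[m * K + k]; };
    // Transposed view: shape (K, M), strides (1, K) -> same memory, indices swapped.
    auto xposed = [&](int k, int m) { return buf[k * 1 + m * K]; };

    for (int m = 0; m < M; ++m)
        for (int k = 0; k < K; ++k)
            assert(orig(m, k) == xposed(k, m));
    std::printf("transposed view matches the original; no copy was performed\n");
    return 0;
}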
+ static_assert((kBlockM * kHeadDim) % ElemsPerRowTMA == 0); + using TileShape_dQaccum = cute::Shape, Int>; + // using TileShape_dQaccum = cute::Shape, Int>; + using SmemLayoutdQaccumTMA = + decltype(tile_to_shape(GMMA::Layout_K_SW128_Atom{}, TileShape_dQaccum{})); + using SmemLayoutdQaccumTMANoSwizzle = decltype(get_nonswizzle_portion(SmemLayoutdQaccumTMA{})); + + using SmemCopyAtomPdS = Copy_Atom< + std::conditional_t, + Element>; + using SmemCopyAtomdKV = Copy_Atom< + std::conditional_t, + Element>; + + using GmemTiledCopyQdO = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape{}))); + using GmemTiledCopyKV = cute::SM90_TMA_LOAD; + using GmemTiledCopydQaccum = cute::SM90_TMA_REDUCE_ADD; + using GmemTiledCopyLSE = cute::SM90_TMA_LOAD; + + using ShapeQKV = cute::Shape; // (seqlen, d, head, batch) + using StrideQKV = cute::Stride; + using ShapeLSE = cute::Shape; // (seqlen, head, batch) + using StrideLSE = cute::Stride<_1, int64_t, int64_t>; // (seqlen, head, batch) + + using TMA_QdO = decltype(make_tma_copy( + GmemTiledCopyQdO{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + take<0, 2>(SmemLayoutQ{}), + select<0, 2>(TileShape_MNK{}), + size<1>(ClusterShape{}))); // mcast along N mode for this M load, if any + + using TMA_K = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + SmemLayoutK{}, + select<1, 2>(TileShape_MNK{}), + _1{})); // no mcast for KV + + using TMA_V = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + SmemLayoutV{}, + select<1, 2>(TileShape_MNK{}), + _1{})); // no mcast for KV + + using TMA_add_dQ = decltype(make_tma_copy( + GmemTiledCopydQaccum{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeQKV{}, StrideQKV{}), + SmemLayoutdQaccumTMA{}, + TileShape_dQaccum{}, + _1{})); // no mcast for dQ + + using TMA_LSE = decltype(make_tma_copy( + GmemTiledCopyLSE{}, + make_tensor(make_gmem_ptr(static_cast(nullptr)), ShapeLSE{}, StrideLSE{}), + select<0>(SmemLayoutLSE{}), + select<0>(TileShape_MNK{}), + _1{})); // no mcast for LSE + + static constexpr int NumMmaThreads = size(TiledMmaSdP{}); + + using MainloopPipeline = typename cutlass::PipelineTmaAsync; + using PipelineState = typename MainloopPipeline::PipelineState; + + // Set the bytes transferred in this TMA transaction (may involve multiple issues) + static constexpr uint32_t TmaTransactionBytesQ = static_cast(size(take<0, 2>(SmemLayoutQ{})) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesK = static_cast(size(SmemLayoutK{}) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesV = static_cast(size(SmemLayoutV{}) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesLSE = static_cast(size(select<0>(SmemLayoutLSE{})) * cutlass::sizeof_bits_v / 8); + + struct TensorStorage : cute::aligned_struct<1024> { + cute::array_aligned> smem_k; + cute::array_aligned> smem_v; + // It's important that smem_dqacc is aligned to 1024 bytes for the TMA, so that the 1st row + // has no swizzle. + // If the address is only 128 bytes aligned, it's possible that the 1st row has swizzle + // and when we read it back in the postprocess kernel, the swizzle will not match. 
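// [Illustrative aside, not part of this patch.] The TmaTransactionBytes* constants above are
// simply "elements in the smem tile times bytes per element"; the producer later arms the K/V
// barrier with the K + V sum (arrive_and_expect_tx) before issuing the two TMA copies, and the
// barrier flips only once the TMA engine has delivered exactly that many bytes. A quick
// host-side version of the arithmetic for a hypothetical 128 x 128 bf16 K/V tile:
#include <cstdint>
#include <cstdio>

int main() {
    constexpr int kBlockN = 128, kHeadDim = 128;   // hypothetical tile shape
    constexpr int bits_per_elem = 16;              // bf16 / fp16
    constexpr uint32_t bytes_K = uint32_t(kBlockN) * kHeadDim * bits_per_elem / 8;
    constexpr uint32_t bytes_V = bytes_K;          // same tile shape and element type
    std::printf("expect_tx for the K/V barrier: %u + %u = %u bytes\n",
                bytes_K, bytes_V, bytes_K + bytes_V);
    return 0;
}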
+ cute::array_aligned, 1024> smem_dqacc; + cute::array_aligned> smem_q; + cute::array_aligned> smem_do; + cute::array_aligned> smem_ds; + cute::array_aligned, 128> smem_lse; + cute::array_aligned, 128> smem_dpsum; + }; + + static constexpr int SharedStorageQdOSize = sizeof(decltype((TensorStorage{}).smem_q)) + sizeof(decltype((TensorStorage{}).smem_do)) + sizeof(decltype((TensorStorage{}).smem_ds)) + sizeof(decltype((TensorStorage{}).smem_dqacc)); + + // Host side kernel arguments + struct Arguments { + Element const* ptr_Q; + ShapeQKV const shape_Q; + StrideQKV const stride_Q; + Element const* ptr_K; + ShapeQKV const shape_K; + StrideQKV const stride_K; + Element const* ptr_V; + StrideQKV const stride_V; + Element const* ptr_dO; + StrideQKV const stride_dO; + ElementAccum* ptr_dQaccum; + ShapeQKV const shape_dQaccum; + StrideQKV const stride_dQaccum; + float const* ptr_LSE_log2; + ShapeLSE const shape_LSE; + StrideLSE const stride_LSE_log2; + float const* ptr_dPsum; + StrideLSE const stride_dPsum; + float const softmax_scale; + int num_batch; + int* dq_semaphore; + int const* cu_seqlens_q = nullptr; + int const* cu_seqlens_k = nullptr; + }; + + // Device side kernel params + struct Params { + ShapeQKV const shape_Q; + ShapeQKV const shape_K; + ShapeQKV const shape_dQaccum; + cutlass::FastDivmod qhead_per_khead_divmod; + TMA_QdO tma_load_Q, tma_load_dO; + TMA_K tma_load_K; + TMA_V tma_load_V; + TMA_add_dQ tma_add_dQ; + TMA_LSE tma_load_LSE, tma_load_dPsum; + float const* ptr_LSE_log2; + ShapeLSE const shape_LSE; + StrideLSE const stride_LSE_log2; + float const* ptr_dPsum; + StrideLSE const stride_dPsum; + float const softmax_scale; + float const softmax_scale_log2; + int num_batch; + int* dq_semaphore; + int const* cu_seqlens_q = nullptr; + int const* cu_seqlens_k = nullptr; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + Tensor mQ = make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q); + TMA_QdO tma_load_Q = make_tma_copy( + GmemTiledCopyQdO{}, + mQ, + SmemLayoutQ{}(_, _, _0{}), + select<0, 2>(TileShape_MNK{}), + size<1>(ClusterShape{})); // mcast along N mode for this M load, if any + Tensor mdO = make_tensor(make_gmem_ptr(args.ptr_dO), args.shape_Q, args.stride_dO); + TMA_QdO tma_load_dO = make_tma_copy( + GmemTiledCopyQdO{}, + mdO, + SmemLayoutdO{}(_, _, _0{}), + select<0, 2>(TileShape_MNK{}), + size<1>(ClusterShape{})); // mcast along N mode for this M load, if any + Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K); + TMA_K tma_load_K = make_tma_copy( + GmemTiledCopyKV{}, + mK, + SmemLayoutK{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for KV + Tensor mV = make_tensor(make_gmem_ptr(args.ptr_V), args.shape_K, args.stride_V); + TMA_V tma_load_V = make_tma_copy( + GmemTiledCopyKV{}, + mV, + SmemLayoutV{}, + select<1, 2>(TileShape_MNK{}), + _1{}); // no mcast for KV + Tensor mdQaccum = make_tensor(make_gmem_ptr(args.ptr_dQaccum), args.shape_dQaccum, args.stride_dQaccum); + TMA_add_dQ tma_add_dQ = make_tma_copy( + GmemTiledCopydQaccum{}, + mdQaccum, + SmemLayoutdQaccumTMA{}, + TileShape_dQaccum{}, + _1{}); // no mcast for dQaccum + Tensor mLSE = make_tensor(make_gmem_ptr(args.ptr_LSE_log2), args.shape_LSE, args.stride_LSE_log2); + TMA_LSE tma_load_LSE = make_tma_copy( + GmemTiledCopyLSE{}, + mLSE, + select<0>(SmemLayoutLSE{}), + select<0>(TileShape_MNK{}), + _1{}); // no mcast for LSE + Tensor mdPsum = make_tensor(make_gmem_ptr(args.ptr_dPsum), args.shape_LSE, args.stride_dPsum); + TMA_LSE tma_load_dPsum 
= make_tma_copy( + GmemTiledCopyLSE{}, + mdPsum, + select<0>(SmemLayoutLSE{}), + select<0>(TileShape_MNK{}), + _1{}); // no mcast for dPsum + if constexpr (Deterministic) { assert(args.dq_semaphore != nullptr); } + return {args.shape_Q, args.shape_K, args.shape_dQaccum, + cutlass::FastDivmod(cute::ceil_div(get<2>(args.shape_Q), get<2>(args.shape_K))), + tma_load_Q, tma_load_dO, tma_load_K, tma_load_V, tma_add_dQ, tma_load_LSE, tma_load_dPsum, + args.ptr_LSE_log2, args.shape_LSE, args.stride_LSE_log2, args.ptr_dPsum, args.stride_dPsum, + args.softmax_scale, float(args.softmax_scale * M_LOG2E), + args.num_batch, args.dq_semaphore, args.cu_seqlens_q, args.cu_seqlens_k}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& params) { + cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_dO.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_LSE.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_dPsum.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_add_dQ.get_tma_descriptor()); + } + + CUTLASS_DEVICE + int get_seqlen_q(Params const& params, int bidb) { + if constexpr (!Varlen) { + return get<0>(params.shape_Q); + } else { + return params.cu_seqlens_q == nullptr + ? get<0>(params.shape_Q) + : params.cu_seqlens_q[bidb + 1] - params.cu_seqlens_q[bidb]; + } + } + + CUTLASS_DEVICE + int get_seqlen_k(Params const& params, int bidb) { + if constexpr (!Varlen) { + return get<0>(params.shape_K); + } else { + return params.cu_seqlens_k == nullptr + ? 
get<0>(params.shape_K) + : params.cu_seqlens_k[bidb + 1] - params.cu_seqlens_k[bidb]; + } + } + + CUTLASS_DEVICE + int get_m_block_min(Params const& params, int n_block, int bidb) { + if constexpr (Is_causal) { + int const seqlen_q = get_seqlen_q(params, bidb); + int const seqlen_k = get_seqlen_k(params, bidb); + return std::max(0, (n_block * kBlockN + seqlen_q - seqlen_k) / kBlockM); + } else { + return 0; + } + } + + template + CUTLASS_DEVICE void + load(Params const& params, + MainloopPipeline pipeline_q, + MainloopPipeline pipeline_do, + PipelineState& smem_pipe_write, + SharedStorage &shared_storage, + SchedulerPrefetch const& scheduler_prefetch, + cute::tuple block_coord, + int work_idx + ) { + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_q.data()), SmemLayoutQ{}); + Tensor sdO = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_do.data()), SmemLayoutdO{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_v.data()), SmemLayoutV{}); + Tensor sLSE = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_lse.data()), SmemLayoutLSE{}); + Tensor sdPsum = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dpsum.data()), SmemLayoutLSE{}); + + auto [n_block, bidh, bidb] = block_coord; + int bidh_kv = params.qhead_per_khead_divmod.divide(bidh); + + // Prepare the TMA loads + uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + bool const is_varlen_q = Varlen && params.cu_seqlens_q != nullptr; + bool const is_varlen_k = Varlen && params.cu_seqlens_k != nullptr; + Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(_, _, bidh, !is_varlen_q ? bidb : 0); + Tensor mdO = params.tma_load_dO.get_tma_tensor(params.shape_Q)(_, _, bidh, !is_varlen_q ? bidb : 0); + Tensor mK = params.tma_load_K.get_tma_tensor(params.shape_K)(_, _, bidh_kv, !is_varlen_k ? bidb : 0); + Tensor mV = params.tma_load_V.get_tma_tensor(params.shape_K)(_, _, bidh_kv, !is_varlen_k ? bidb : 0); + Tensor mLSE = params.tma_load_LSE.get_tma_tensor(params.shape_LSE)(_, bidh, !is_varlen_q ? bidb : 0); + Tensor mdPsum = params.tma_load_dPsum.get_tma_tensor(params.shape_LSE)(_, bidh, !is_varlen_q ? bidb : 0); + + int const offset_q = !is_varlen_q ? 0 : params.cu_seqlens_q[bidb]; + int const offset_k = !is_varlen_k ? 0 : params.cu_seqlens_k[bidb]; + int const offset_padded = !is_varlen_q ? 
0 : (params.cu_seqlens_q[bidb] + bidb * 128) / 128 * 128; + Tensor gQ = local_tile(domain_offset(make_coord(offset_q, _0{}), mQ), select<0, 2>(TileShape_MNK{}), make_coord(_, _0{})); // (M, K, _) + Tensor gdO = local_tile(domain_offset(make_coord(offset_q, _0{}), mdO), select<0, 2>(TileShape_MNK{}), make_coord(_, _0{})); // (M, K, _) + Tensor gK = local_tile(domain_offset(make_coord(offset_k, _0{}), mK), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (N, K) + Tensor gV = local_tile(domain_offset(make_coord(offset_k, _0{}), mV), select<1, 2>(TileShape_MNK{}), make_coord(n_block, _0{})); // (N, K) + Tensor gLSE = local_tile(domain_offset(make_coord(offset_padded), mLSE), select<0>(TileShape_MNK{}), make_coord(_)); // (M, _) + Tensor gdPsum = local_tile(domain_offset(make_coord(offset_padded), mdPsum), select<0>(TileShape_MNK{}), make_coord(_)); // (M, _) + + Tensor sK_x = make_tensor(sK.data(), make_layout(sK.layout(), Layout<_1>{})); + Tensor gK_x = make_tensor(gK.data(), make_layout(gK.layout(), Layout<_1>{})); + Tensor sV_x = make_tensor(sV.data(), make_layout(sV.layout(), Layout<_1>{})); + Tensor gV_x = make_tensor(gV.data(), make_layout(gV.layout(), Layout<_1>{})); + auto [tQgQ, tQsQ] = tma_partition(params.tma_load_Q, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sQ), group_modes<0, 2>(gQ)); // (TMA, k), (TMA, PIPE) + auto [tdOgdO, tdOsdO] = tma_partition(params.tma_load_dO, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sdO), group_modes<0, 2>(gdO)); // (TMA, k), (TMA, PIPE) + auto [tKgK, tKsK] = tma_partition(params.tma_load_K, _0{}, Layout<_1>{}, + group_modes<0, 2>(sK_x), group_modes<0, 2>(gK_x)); // (TMA), (TMA) + auto [tVgV, tVsV] = tma_partition(params.tma_load_V, _0{}, Layout<_1>{}, + group_modes<0, 2>(sV_x), group_modes<0, 2>(gV_x)); // (TMA), (TMA) + auto [tLSEgLSE, tLSEsLSE] = tma_partition(params.tma_load_LSE, _0{}, Layout<_1>{}, + sLSE, gLSE); // (TMA, k), (TMA, PIPE) + auto [tLSEgdPsum, tLSEsdPsum] = tma_partition(params.tma_load_dPsum, _0{}, Layout<_1>{}, + sdPsum, gdPsum); // (TMA, k), (TMA, PIPE) + + uint16_t mcast_mask_qdo = 0; + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int n = 0; n < size<1>(block_layout); ++n) { + mcast_mask_qdo |= (uint16_t(1) << block_layout(n, cluster_local_block_id.x, _0{})); + } + } + + int m_block_max = cute::ceil_div(get_seqlen_q(params, bidb), get<0>(TileShape_MNK{})); + int m_block_min = get_m_block_min(params, n_block, bidb); + int m_block = m_block_min; + + int lane_predicate = cute::elect_one_sync(); + + // // Wait for the MMA warpgroups to say that smem_q is ready + // cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::QueryEmpty) /*id*/); + + if (lane_predicate) { + // Copy K tile and V tile from GMEM to SMEM. 
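// [Illustrative aside, not part of this patch.] For varlen batches, offset_padded computed
// above maps each batch's starting position (taken from cu_seqlens_q) to a 128-aligned row in
// the padded LSE / dPsum buffers, so per-batch regions stay disjoint and aligned. A small
// host-side sketch of that index math (the cumulative lengths are hypothetical; the constant
// 128 is the one that appears in the expression above):
#include <cstdio>

int main() {
    const int cu_seqlens_q[] = {0, 70, 325, 400};              // hypothetical prefix sums
    for (int bidb = 0; bidb < 3; ++bidb) {
        int offset_q      = cu_seqlens_q[bidb];
        int offset_padded = (cu_seqlens_q[bidb] + bidb * 128) / 128 * 128;
        std::printf("batch %d: token offset %3d -> padded row offset %3d\n",
                    bidb, offset_q, offset_padded);
    }
    // With these lengths this prints 0 -> 0, 70 -> 128, 325 -> 512.
    return 0;
}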
+ shared_storage.barrier_KV.arrive_and_expect_tx(TmaTransactionBytesK + TmaTransactionBytesV); + copy(params.tma_load_K.with(reinterpret_cast(shared_storage.barrier_KV), 0 /*mcast_mask*/), tKgK, tKsK); + copy(params.tma_load_V.with(reinterpret_cast(shared_storage.barrier_KV), 0 /*mcast_mask*/), tVgV, tVsV); + + pipeline_q.producer_acquire(smem_pipe_write); + copy(params.tma_load_Q.with(*pipeline_q.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tQgQ(_, m_block), tQsQ(_, smem_pipe_write.index())); + copy(params.tma_load_LSE.with(*pipeline_q.producer_get_barrier(smem_pipe_write), 0), tLSEgLSE(_, m_block), tLSEsLSE(_, smem_pipe_write.index())); + #pragma unroll 2 + for (; m_block < m_block_max - 1; ++m_block) { + pipeline_do.producer_acquire(smem_pipe_write); + copy(params.tma_load_dO.with(*pipeline_do.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tdOgdO(_, m_block), tdOsdO(_, smem_pipe_write.index())); + copy(params.tma_load_dPsum.with(*pipeline_do.producer_get_barrier(smem_pipe_write), 0), tLSEgdPsum(_, m_block), tLSEsdPsum(_, smem_pipe_write.index())); + ++smem_pipe_write; + pipeline_q.producer_acquire(smem_pipe_write); + copy(params.tma_load_Q.with(*pipeline_q.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tQgQ(_, m_block + 1), tQsQ(_, smem_pipe_write.index())); + copy(params.tma_load_LSE.with(*pipeline_q.producer_get_barrier(smem_pipe_write), 0), tLSEgLSE(_, m_block + 1), tLSEsLSE(_, smem_pipe_write.index())); + } + } + scheduler_prefetch(); + if (lane_predicate) { + pipeline_do.producer_acquire(smem_pipe_write); + copy(params.tma_load_dO.with(*pipeline_do.producer_get_barrier(smem_pipe_write), mcast_mask_qdo), tdOgdO(_, m_block), tdOsdO(_, smem_pipe_write.index())); + copy(params.tma_load_dPsum.with(*pipeline_do.producer_get_barrier(smem_pipe_write), 0), tLSEgdPsum(_, m_block), tLSEsdPsum(_, smem_pipe_write.index())); + ++smem_pipe_write; + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline_q, MainloopPipeline pipeline_do, + PipelineState& smem_pipe_write) { + // Need to copy since pipeline_q.producer_tail(smem_pipe_write) will increment smem_pipe_write + PipelineState smem_pipe_write_do = smem_pipe_write; + int lane_predicate = cute::elect_one_sync(); + // Issue the epilogue waits + if (lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was still inverted from make_producer_start_state + */ + pipeline_q.producer_tail(smem_pipe_write); + pipeline_do.producer_tail(smem_pipe_write_do); + } + } + + template + CUTLASS_DEVICE void + store_dq(Params const& params, + SharedStorage &shared_storage, + cute::tuple block_coord + ) { + + Tensor sdQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dqacc.data()), SmemLayoutdQaccumTMA{}); + Tensor sdQnoswizzle = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dqacc.data()), SmemLayoutdQaccumTMANoSwizzle{}); + auto [n_block, bidh, bidb] = block_coord; + + bool const is_varlen_q = Varlen && params.cu_seqlens_q != nullptr; + // We reshaped dQaccum to have last dimension 32, so the offset needs to be multiplied by kHeadDim / 32 + int const offset_padded = !is_varlen_q ? 
0 : ((params.cu_seqlens_q[bidb] + bidb * 128) / 128 * 128) * (kHeadDim / ElemsPerRowTMA); + // Prepare the TMA loads + Tensor mdQaccum = params.tma_add_dQ.get_tma_tensor(params.shape_dQaccum)(_, _, bidh, !is_varlen_q ? bidb : 0); + Tensor gdQaccum = local_tile(domain_offset(make_coord(offset_padded, _0{}), mdQaccum), TileShape_dQaccum{}, make_coord(_, _0{})); // (M, K, _) + auto block_tma_dQ = params.tma_add_dQ.get_slice(_0{}); + Tensor tdQgdQ = block_tma_dQ.partition_D(gdQaccum); // (TMA, TMA_M, TMA_K) + Tensor tdQsdQ = block_tma_dQ.partition_S(sdQ); // (TMA, TMA_M, TMA_K) + + int m_block_max = cute::ceil_div(get_seqlen_q(params, bidb), get<0>(TileShape_MNK{})); + int m_block_min = get_m_block_min(params, n_block, bidb); + int m_block = m_block_min; + int const num_batch = params.num_batch; + int const num_head = get<2>(params.shape_Q); + int *lock_ptr = !Deterministic ? nullptr : params.dq_semaphore + bidb * num_head + bidh; + using Barrier = cutlass::GenericBarrier; + int lane_predicate = cute::elect_one_sync(); + #pragma unroll 2 + for (; m_block < m_block_max; ++m_block) { + if constexpr (Deterministic) { + Barrier::wait_eq(lock_ptr, threadIdx.x % cutlass::NumThreadsPerWarp, m_block * num_batch * num_head, n_block); + } + cutlass::arch::NamedBarrier::sync(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQFull) /*id*/); // sdQ full, to be written to gmem + if (lane_predicate) { + cute::copy(params.tma_add_dQ, tdQsdQ, tdQgdQ(_, _, _, m_block)); + tma_store_arrive(); + } + tma_store_wait<0>(); + if constexpr (Deterministic) { + Barrier::arrive_inc(lock_ptr, threadIdx.x % cutlass::NumThreadsPerWarp, m_block * num_batch * num_head); + } + cutlass::arch::NamedBarrier::arrive(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQEmpty) /*id*/); // sdQ empty, ready to be written to + } + } + + CUTLASS_DEVICE void + mma_init() { + // // Tell producer (warp 0) that smem_q is ready + // cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::QueryEmpty) /*id*/); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (cutlass::canonical_warp_group_idx() == 1 && warp_idx_in_warpgroup == 0) { + cutlass::arch::NamedBarrier::arrive(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQEmpty) /*id*/); // sdQ empty, ready to be written to + } + } + + template + CUTLASS_DEVICE void + mma(Params const& params, + MainloopPipeline pipeline_q, + MainloopPipeline pipeline_do, + PipelineState& smem_pipe_read, + FrgTensordKV& tdKrdK, + FrgTensordKV& tdVrdV, + int thread_idx, + int work_idx, + cute::tuple block_coord, + SharedStorage& shared_storage + ) { + static_assert(is_rmem::value, "dK and dV tensor must be rmem resident."); + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_q.data()), SmemLayoutQ{}); + Tensor sdO = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_do.data()), SmemLayoutdO{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_v.data()), SmemLayoutV{}); + Tensor sQt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_q.data()), SmemLayoutQt{}); + Tensor sdOt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_do.data()), SmemLayoutdOt{}); + Tensor sKt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_k.data()), SmemLayoutKt{}); + Tensor sdS = 
make_tensor(make_smem_ptr(shared_storage.mainloop.smem_ds.data()), SmemLayoutdS{}); + Tensor sdSt = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_ds.data()), SmemLayoutdSt{}); + Tensor sdQ = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dqacc.data()), SmemLayoutdQaccum{}); + Tensor sLSEMma = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_lse.data()), SmemLayoutLSEMma{}); + Tensor sdPsumMma = make_tensor(make_smem_ptr(shared_storage.mainloop.smem_dpsum.data()), SmemLayoutLSEMma{}); + + static_assert(stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and + stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and + size<0>(typename TiledMmaSdP::ALayout{}) == cutlass::NumThreadsPerWarpGroup and + size<0>(typename TiledMmaSdP::BLayout{}) == cutlass::NumThreadsPerWarpGroup, + "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); + constexpr int MmaWarpGroups = NumMmaThreads / cutlass::NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(make_shape(Int{}), + make_stride(Int{})); + Layout warp_group_thread_layout_dq = make_layout(make_shape(Int{}), + make_stride(Int{})); + + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0); + TiledMmaSdP tiled_mma_SdP; + TiledMmadKV tiled_mma_dKV; + TiledMmadQ tiled_mma_dQ; + static_assert(!dKV_swapAB); + + auto wg_mma_SdP = tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx)); + auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx); + auto wg_mma_dKV = tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx)); + auto wg_mma_dQ = tiled_mma_dQ.get_slice(!Varlen ? warp_group_thread_layout_dq(NumdQWarpGroups == 2 ? warp_group_idx : 0) : thread_idx); + // auto wg_mma_dQ = tiled_mma_dQ.get_thread_slice(thread_idx); + + auto smem_tiled_copy_PdS = make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP); + auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx); + Tensor tdSsdS = smem_thr_copy_PdS.partition_D(sdSt); // ((Atom,AtomNum),PIPE_M,PIPE_N) + + R2STiledCopydQaccum r2s_tiled_copy_dQaccum; + // auto r2s_thr_copy_dQaccum = r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx); + auto r2s_thr_copy_dQaccum = r2s_tiled_copy_dQaccum.get_thread_slice(NumdQWarpGroups == 2 ? thread_idx : thread_idx % cutlass::NumThreadsPerWarpGroup); + Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ); + + // Allocate "fragments/descriptors" + Tensor tSrQ = wg_mma_SdP.partition_fragment_B(sQ); + Tensor tSrK = wg_mma_SdP.partition_fragment_A(sK); + Tensor tdPrdO = wg_mma_SdP.partition_fragment_B(sdO); + Tensor tdPrV = wg_mma_SdP.partition_fragment_A(sV); + Tensor tdVrdO = wg_mma_dKV.partition_fragment_B(sdOt); + Tensor tdKrQ = wg_mma_dKV.partition_fragment_B(sQt); + + int n_block = get<0>(block_coord); + int bidh = get<1>(block_coord); + int bidb = get<2>(block_coord); + int const seqlen_q = get_seqlen_q(params, bidb); + int const seqlen_k = get_seqlen_k(params, bidb); + + int m_block_max = cute::ceil_div(get_seqlen_q(params, bidb), get<0>(TileShape_MNK{})); + int m_block_min = get_m_block_min(params, n_block, bidb); + int m_block = m_block_min; + + // thread_mma_SdP.partition_C(sLSEMma) has shape ((2, 2, V), MMA_M, MMA_N, PIPE), we only take the row indices. 
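// [Illustrative aside, not part of this patch.] In the causal backward pass a K/V block only
// receives gradient from query rows at or after its diagonal, which is why the m_block loop
// starts at max(0, (n_block * kBlockN + seqlen_q - seqlen_k) / kBlockM) (get_m_block_min above)
// rather than at 0. A host-side sketch of those loop bounds (tile sizes hypothetical):
#include <algorithm>
#include <cstdio>

int main() {
    constexpr int kBlockM = 64, kBlockN = 128;       // hypothetical tile sizes
    const int seqlen_q = 512, seqlen_k = 512;
    const int m_block_max = (seqlen_q + kBlockM - 1) / kBlockM;
    const int n_blocks    = (seqlen_k + kBlockN - 1) / kBlockN;
    for (int n_block = 0; n_block < n_blocks; ++n_block) {
        int m_block_min = std::max(0, (n_block * kBlockN + seqlen_q - seqlen_k) / kBlockM);
        std::printf("n_block %d: visits m_blocks [%d, %d)\n", n_block, m_block_min, m_block_max);
    }
    // With equal sequence lengths: n_block 0 -> [0, 8), 1 -> [2, 8), 2 -> [4, 8), 3 -> [6, 8).
    return 0;
}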
+ Tensor tLSEsLSE = thread_mma_SdP.partition_C(sLSEMma)(make_coord(_, _0{}, _), _0{}, _0{}, _); // (2, V, PIPE) + Tensor tLSEsdPsum = thread_mma_SdP.partition_C(sdPsumMma)(make_coord(_, _0{}, _), _0{}, _0{}, _); + + + clear(tdKrdK); + clear(tdVrdV); + // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero; + + cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_KV.try_wait(work_idx % 2)); + if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_KV.wait(work_idx % 2); } + + auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) { + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + }; + + auto compute_dQ = [&]() { + static_assert(!Mma_dQ_is_RS); + // SMEM fence to make sure sP is written before it's read by WGMMA + cutlass::arch::fence_view_async_shared(); + cutlass::arch::NamedBarrier::sync(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQEmpty) /*id*/); // sdQ empty, ready to be written to + Tensor tdQrdQ = partition_fragment_C(tiled_mma_dQ, select(TileShape_MNK{})); + if constexpr (!dQ_swapAB) { + Tensor tdQrdS = wg_mma_dQ.partition_fragment_A(sdS); + Tensor tdQrK = wg_mma_dQ.partition_fragment_B(sKt); + flash::gemm(tiled_mma_dQ, tdQrdS(_, _, _, smem_pipe_read.index()), tdQrK, tdQrdQ); + } else { + Tensor tdQrdS = wg_mma_dQ.partition_fragment_B(sdS); + Tensor tdQrK = wg_mma_dQ.partition_fragment_A(sKt); + flash::gemm(tiled_mma_dQ, tdQrK, tdQrdS(_, _, _, smem_pipe_read.index()), tdQrdQ); + } + pipeline_q.consumer_release(smem_pipe_read); // release Q + warpgroup_wait<0>(); + Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ); // ((Atom,AtomNum), MMA_M, MMA_N) + cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum); + cutlass::arch::fence_view_async_shared(); + cutlass::arch::NamedBarrier::arrive(kNThreadsdQ + cutlass::NumThreadsPerWarp, static_cast(BwdNamedBarriers::dQFull) /*id*/); // sdQ full, to be written to gmem + }; + + // We have separate iterations with causal masking. Not necessary for hdim 128 but for hdim 64 + // this helps quite a bit to not have to do causal masking for most of the iterations. 
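// [Illustrative aside, not part of this patch.] Only the first ceil_div(kBlockN, kBlockM) + 1
// query blocks per key block can intersect the causal diagonal; every later m_block lies
// entirely below it, so the main loop below can drop the per-element mask. A host-side check
// of that count for equal query/key lengths (tile sizes hypothetical):
#include <cstdio>

int main() {
    constexpr int kBlockM = 64, kBlockN = 128;
    constexpr int n_masking_steps = (kBlockN + kBlockM - 1) / kBlockM + 1;   // 3 here
    const int n_block = 3;
    const int m_block_min = n_block * kBlockN / kBlockM;                     // 6 here
    for (int m_block = m_block_min; m_block < m_block_min + n_masking_steps + 2; ++m_block) {
        // A block contains masked entries iff its first query row is above the
        // block's last key index, i.e. query_global < key_global somewhere in the tile.
        bool has_masked = m_block * kBlockM < n_block * kBlockN + kBlockN - 1;
        std::printf("m_block %d: masked entries? %d (counted as masking step? %d)\n",
                    m_block, int(has_masked), int(m_block - m_block_min < n_masking_steps));
    }
    // Blocks past the first n_masking_steps never report masked entries.
    return 0;
}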
+ if constexpr (Is_causal) { + static constexpr int n_masking_steps = cute::ceil_div(kBlockN, kBlockM) + 1; + CUTLASS_PRAGMA_NO_UNROLL + for (; m_block < std::min(m_block_max, m_block_min + n_masking_steps); ++m_block) { + Tensor tSrS = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_q.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tSrK, tSrQ(_, _, _, smem_pipe_read.index()), tSrS); + Tensor tLSErLSE = make_fragment_like(tLSEsLSE(_, _, _0{})); + cute::copy(tLSEsLSE(_, _, smem_pipe_read.index()), tLSErLSE); + + Tensor tdPrdP = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_do.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tdPrV, tdPrdO(_, _, _, smem_pipe_read.index()), tdPrdP); + warpgroup_wait<1>(); + Tensor cS = cute::make_identity_tensor(select<1, 0>(TileShape_MNK{})); + Tensor taccScS = thread_mma_SdP.partition_C(cS); + int causal_row_offset = 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM; + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<0>(taccScS(i))) >= std::min(int(get<1>(taccScS(i))) + causal_row_offset, + seqlen_k - n_block * kBlockN)) { + tSrS(i) = -INFINITY; + } + } + // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor scores = make_tensor(tSrS.data(), flash::convert_layout_acc_transposed_rowcol(tSrS.layout())); + flash::scale_apply_exp2(scores, group_modes<0, 2>(tLSErLSE), params.softmax_scale_log2); + + Tensor tLSErdPsum = make_fragment_like(tLSEsdPsum(_, _, _0{})); + cute::copy(tLSEsdPsum(_, _, smem_pipe_read.index()), tLSErdPsum); + + // Convert scores from fp32 to fp16/bf16 + Tensor rP = flash::convert_type(tSrS); + + warpgroup_wait<0>(); + // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor dS = make_tensor(tdPrdP.data(), scores.layout()); + for (int mi = 0; mi < size<0>(dS); ++mi) { + #pragma unroll + for (int ni = 0; ni < size<1>(dS); ++ni) { dS(mi, ni) = scores(mi, ni) * (dS(mi, ni) - tLSErdPsum(mi)); } + } + Tensor rdS = flash::convert_type(tdPrdP); + + // Because of double buffering on dS, we don't need to sync here. + // Otherwise we might have WG1 writing to dS before WG2 is done reading from it during MmadQ. + // But because both WGs have to sync at the end of the loop and double buffering, this race condition + // is not possible. 
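// [Illustrative aside, not part of this patch.] The update dS(mi, ni) = scores(mi, ni) *
// (dS(mi, ni) - tLSErdPsum(mi)) above is the row-wise softmax backward: with P = softmax(S)
// and dP = dO * V^T, the gradient is dS = P o (dP - rowsum(P o dP)), and rowsum(P o dP)
// equals rowsum(dO o O), which is the quantity the precomputed dPsum holds. A tiny numerical
// check of that identity for one row (values hypothetical):
#include <cmath>
#include <cstdio>

int main() {
    const int N = 4;
    double S[N]  = {0.3, -1.2, 0.7, 0.1};
    double dP[N] = {0.5, 0.2, -0.4, 0.9};

    double P[N], denom = 0.0;
    for (int i = 0; i < N; ++i) denom += std::exp(S[i]);
    for (int i = 0; i < N; ++i) P[i] = std::exp(S[i]) / denom;

    double dsum = 0.0;                             // rowsum(P o dP), i.e. the dPsum term
    for (int i = 0; i < N; ++i) dsum += P[i] * dP[i];

    // Closed form used by the kernel vs. the full softmax Jacobian-vector product.
    for (int i = 0; i < N; ++i) {
        double dS_kernel = P[i] * (dP[i] - dsum);
        double dS_jvp = 0.0;
        for (int j = 0; j < N; ++j)
            dS_jvp += (double(i == j) * P[i] - P[i] * P[j]) * dP[j];
        std::printf("i=%d: kernel %.6f  jacobian %.6f\n", i, dS_kernel, dS_jvp);
    }
    return 0;
}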
+ Tensor tdSadS = smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N) + cute::copy(smem_tiled_copy_PdS, tdSadS, tdSsdS(_, _, _, smem_pipe_read.index())); + + Tensor tdVrP = make_tensor(rP.data(), convert_layout_acc_Aregs(tSrS.layout())); + flash::gemm(tiled_mma_dKV, tdVrP, tdVrdO(_, _, _, smem_pipe_read.index()), tdVrdV); + + Tensor tdKrdS = make_tensor(rdS.data(), convert_layout_acc_Aregs(tdPrdP.layout())); + flash::gemm(tiled_mma_dKV, tdKrdS, tdKrQ(_, _, _, smem_pipe_read.index()), tdKrdK); + pipeline_do.consumer_release(smem_pipe_read); // release dO + + compute_dQ(); + ++smem_pipe_read; + } + } + + CUTLASS_PRAGMA_NO_UNROLL + for (; m_block < m_block_max; ++m_block) { + Tensor tSrS = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_q.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tSrK, tSrQ(_, _, _, smem_pipe_read.index()), tSrS); + Tensor tLSErLSE = make_fragment_like(tLSEsLSE(_, _, _0{})); + cute::copy(tLSEsLSE(_, _, smem_pipe_read.index()), tLSErLSE); + + Tensor tdPrdP = partition_fragment_C(tiled_mma_SdP, select<1, 0>(TileShape_MNK{})); + pipeline_do.consumer_wait(smem_pipe_read); + flash::gemm(tiled_mma_SdP, tdPrV, tdPrdO(_, _, _, smem_pipe_read.index()), tdPrdP); + warpgroup_wait<1>(); + Tensor cS = cute::make_identity_tensor(select<1, 0>(TileShape_MNK{})); + Tensor taccScS = thread_mma_SdP.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<0>(taccScS(i))) >= int(seqlen_k - n_block * kBlockN)) { tSrS(i) = -INFINITY; } + } + // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor scores = make_tensor(tSrS.data(), flash::convert_layout_acc_transposed_rowcol(tSrS.layout())); + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tLSErLSE); } + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(scores); } + flash::scale_apply_exp2(scores, group_modes<0, 2>(tLSErLSE), params.softmax_scale_log2); + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(scores); } + + Tensor tLSErdPsum = make_fragment_like(tLSEsdPsum(_, _, _0{})); + cute::copy(tLSEsdPsum(_, _, smem_pipe_read.index()), tLSErdPsum); + + // Convert scores from fp32 to fp16/bf16 + Tensor rP = flash::convert_type(tSrS); + + warpgroup_wait<0>(); + // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) + Tensor dS = make_tensor(tdPrdP.data(), scores.layout()); + #pragma unroll + for (int mi = 0; mi < size<0>(dS); ++mi) { + #pragma unroll + for (int ni = 0; ni < size<1>(dS); ++ni) { dS(mi, ni) = scores(mi, ni) * (dS(mi, ni) - tLSErdPsum(mi)); } + } + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(dS); } + Tensor rdS = flash::convert_type(tdPrdP); + + Tensor tdSadS = smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N) + cute::copy(smem_tiled_copy_PdS, tdSadS, tdSsdS(_, _, _, smem_pipe_read.index())); + + Tensor tdVrP = make_tensor(rP.data(), convert_layout_acc_Aregs(tSrS.layout())); + flash::gemm(tiled_mma_dKV, tdVrP, tdVrdO(_, _, _, smem_pipe_read.index()), tdVrdV); + + Tensor tdKrdS = make_tensor(rdS.data(), convert_layout_acc_Aregs(tdPrdP.layout())); + flash::gemm(tiled_mma_dKV, tdKrdS, tdKrQ(_, _, _, smem_pipe_read.index()), tdKrdK); + pipeline_do.consumer_release(smem_pipe_read); // release dO + + compute_dQ(); + ++smem_pipe_read; + } + // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); } + #pragma unroll + for (int i = 0; i < size(tdKrdK); ++i) { tdKrdK(i) *= params.softmax_scale; } + 
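// [Illustrative aside, not part of this patch.] scale_apply_exp2 works in base 2:
// exp(s * scale - lse) == exp2(s * (scale * log2(e)) - lse * log2(e)). That is why the params
// carry softmax_scale_log2 = softmax_scale * M_LOG2E and why the LSE pointer is named
// ptr_LSE_log2 (already stored in base-2 units); exp2 maps straight onto the GPU's EX2
// instruction, and the plain softmax_scale is then applied to dK only once, in the loop-exit
// multiply above. A quick numerical check of the identity (values hypothetical):
#include <cmath>
#include <cstdio>

int main() {
    const double log2e = 1.4426950408889634;       // log2(e), i.e. M_LOG2E
    const double s = 1.7, softmax_scale = 0.125, lse = 2.3;
    const double p_exp  = std::exp(s * softmax_scale - lse);
    const double p_exp2 = std::exp2(s * softmax_scale * log2e - lse * log2e);
    std::printf("exp form: %.12f   exp2 form: %.12f\n", p_exp, p_exp2);   // identical up to rounding
    return 0;
}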
} + +}; + +} // namespace flash + diff --git a/mainloop_fwd_sm90_tma_gmma_ws.hpp b/mainloop_fwd_sm90_tma_gmma_ws.hpp new file mode 100644 index 0000000000000000000000000000000000000000..7c83f72a5aa38cfa00732b6f0b93ea65485ed5cd --- /dev/null +++ b/mainloop_fwd_sm90_tma_gmma_ws.hpp @@ -0,0 +1,1025 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include "cutlass/pipeline/pipeline.hpp" + +#include "cute/tensor.hpp" + +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "named_barrier.hpp" +#include "utils.h" + +namespace flash { + +using namespace cute; + +// 4 warps +struct SmemTransposeFp8_64x64 { + + using Element = cutlass::float_e4m3_t; + + using ldsm_thread_shape = Shape<_4, _1, _8, _4>; + using ldsm_value_shape = Shape<_2, _8, _2, _1>; + using ldsm_value_stride = Stride<_2, _4, _1, _0>; + using TiledCopyLDSM = decltype(make_tiled_copy( + Copy_Atom{}, Layout{}, + Layout{})); + TiledCopyLDSM tiled_copy_ldsm; + + using stsm_thread_shape = Shape<_4, _1, _8, _4>; + // using stsm_thread_stride = Stride<_1, _0, _4, _32>; +#ifndef NO_FP8_COLUMN_PERMUTE + using stsm_value_shape = Shape<_4, _4, _1, _2>; + using stsm_value_stride = Stride<_1, _8, _0, _4>; +#else + using stsm_value_shape = Shape<_4, _4, _2, _1>; + using stsm_value_stride = Stride<_1, _8, _4, _0>; +#endif + + using TiledCopySTSM = + decltype(make_tiled_copy(Copy_Atom{}, + Layout{}, + Layout{})); + TiledCopySTSM tiled_copy_stsm; + + template + CUTLASS_DEVICE void operator()(SmemTensor &&s_in, SmemTensorOut &&s_out) { + using namespace cute; + + auto tid = threadIdx.x; + auto thr_copy_ldsm = tiled_copy_ldsm.get_thread_slice(tid); + auto thr_copy_stsm = tiled_copy_stsm.get_thread_slice(tid); + + auto tXsX = thr_copy_ldsm.partition_S(s_in); + auto tXrX = make_tensor(shape(tXsX)); + auto tXsX_out = thr_copy_stsm.partition_D(s_out); + + cute::copy(tiled_copy_ldsm, tXsX, tXrX); + + auto data = tXrX.data(); + // size(tXrX) == 32 + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < size(tXrX); n += 8) { + uint32_t *data_32bit = reinterpret_cast(&data[n]); + auto upper = data_32bit[0]; + auto lower = data_32bit[1]; + data_32bit[0] = __byte_perm(upper, lower, 0x6420); + data_32bit[1] = __byte_perm(upper, lower, 0x7531); + } + + cute::copy(tiled_copy_stsm, tXrX, tXsX_out); + } +}; + +template +struct CollectiveMainloopFwd { + + using Element = typename Ktraits::Element; + using TileShape_MNK = typename Ktraits::TileShape_MNK; + using ClusterShape = typename Ktraits::ClusterShape_MNK; + + static constexpr int kStages = Ktraits::kStages; + static constexpr int kHeadDim = Ktraits::kHeadDim; + + using GmemTiledCopyQ = cute::SM90_TMA_LOAD; + using GmemTiledCopyKV = decltype(cutlass::gemm::collective::detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape{}))); + + using SmemLayoutQ = typename Ktraits::SmemLayoutQ; + using SmemLayoutK = typename Ktraits::SmemLayoutK; + using SmemLayoutV = typename Ktraits::SmemLayoutV; + using SmemLayoutVt = typename Ktraits::SmemLayoutVt; + + using TMA_Q = decltype(make_tma_copy( + GmemTiledCopyQ{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), + typename Seqlen_traits::StrideT{} + ), + SmemLayoutQ{}, + select<0, 2>(TileShape_MNK{}), + _1{})); // no 
mcast for Q + + using TMA_K = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), + typename Seqlen_traits::StrideT{} + ), + take<0, 2>(SmemLayoutK{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any + + // TMA_V may differ from TMA_K for fp8 kernel (e.g. swizzling mode) + using TMA_V = decltype(make_tma_copy( + GmemTiledCopyKV{}, + make_tensor( + make_gmem_ptr(static_cast(nullptr)), + repeat_like(typename Seqlen_traits::StrideT{}, int32_t(0)), + typename Seqlen_traits::StrideT{} + ), + take<0, 2>(SmemLayoutV{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any + + static constexpr int NumMmaThreads = size(typename Ktraits::TiledMma0{}); + using MainloopPipeline = typename Ktraits::MainloopPipeline; + using MainloopPipelineNoTMA = typename Ktraits::MainloopPipelineNoTMA; + using PipelineParams = typename MainloopPipeline::Params; + using PipelineState = typename MainloopPipeline::PipelineState; + + // Set the bytes transferred in this TMA transaction (may involve multiple issues) + static constexpr uint32_t TmaTransactionBytesQ = static_cast(size(SmemLayoutQ{}) * cutlass::sizeof_bits_v / 8); + static constexpr uint32_t TmaTransactionBytesK = static_cast(size(take<0, 2>(SmemLayoutK{})) * cutlass::sizeof_bits_v / 8); + + // static constexpr bool UseSchedulerBarrier = kHeadDim <= 128; + static constexpr bool UseSchedulerBarrier = + cutlass::sizeof_bits_v == 8 ? kHeadDim >= 128 + : kHeadDim <= 128; + + // Host side kernel arguments + struct Arguments { + Element const* ptr_Q; + typename Seqlen_traits::LayoutT layout_Q; + Element const* ptr_K; + typename Seqlen_traits::LayoutT layout_K; + Element const* ptr_V; + typename Seqlen_traits::LayoutT layout_V; + float const softmax_scale_log2; + }; + + // Device side kernel params + struct Params { + typename Seqlen_traits::LayoutT layout_Q; + typename Seqlen_traits::LayoutT layout_K; + typename Seqlen_traits::LayoutT layout_V; + cutlass::FastDivmod qhead_per_khead_divmod; + TMA_Q tma_load_Q; + TMA_K tma_load_K; + TMA_V tma_load_V; + float const softmax_scale_log2; + }; + + + static Params + to_underlying_arguments(Arguments const& args) { + Tensor mQ = make_tensor(make_gmem_ptr(args.ptr_Q), args.layout_Q); + TMA_Q tma_load_Q = make_tma_copy( + GmemTiledCopyQ{}, + mQ, + SmemLayoutQ{}, + select<0, 2>(TileShape_MNK{}), + _1{}); // no mcast for Q + Tensor mK = make_tensor(make_gmem_ptr(args.ptr_K), args.layout_K); + TMA_K tma_load_K = make_tma_copy( + GmemTiledCopyKV{}, + mK, + SmemLayoutK{}(_, _, _0{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{})); // mcast along M mode for this N load, if any + Tensor mV = make_tensor(make_gmem_ptr(args.ptr_V), args.layout_V); + TMA_V tma_load_V = make_tma_copy( + GmemTiledCopyKV{}, + mV, + SmemLayoutV{}(_, _, _0{}), + select<1, 2>(TileShape_MNK{}), + size<0>(ClusterShape{})); // mcast along M mode for this N load, if any + return {args.layout_Q, args.layout_K, args.layout_V, + cutlass::FastDivmod(cute::ceil_div(get<2>(args.layout_Q.shape()), get<2>(args.layout_K.shape()))), + tma_load_Q, tma_load_K, tma_load_V, + args.softmax_scale_log2}; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE + static void prefetch_tma_descriptors(Params const& mainloop_params) { + 
cute::prefetch_tma_descriptor(mainloop_params.tma_load_Q.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_K.get_tma_descriptor()); + cute::prefetch_tma_descriptor(mainloop_params.tma_load_V.get_tma_descriptor()); + } + + CUTLASS_DEVICE + int get_n_block_max( + Params const& mainloop_params, int m_block, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + int const seqlen_q = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_q.actual_seq_len : shape<0>(mainloop_params.layout_Q); + int const seqlen_k = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_k.actual_seq_len : shape<0>(mainloop_params.layout_K); + int n_block_max = cute::ceil_div(seqlen_k, kBlockN); + if constexpr (Is_causal) { + n_block_max = std::min(n_block_max, + cute::ceil_div((m_block + 1) * kBlockM + seqlen_k - seqlen_q, kBlockN)); + } + return n_block_max; + } + + template + CUTLASS_DEVICE void + load(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipeline pipeline_v, + PipelineState& smem_pipe_write_k, + PipelineState& smem_pipe_write_v, + SharedStorage &shared_storage, + Scheduler& scheduler, + typename Scheduler::Params const& scheduler_params, + typename Scheduler::WorkTileInfo& work_tile_info, + cute::tuple block_coord, + int work_idx, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{}); + + Tensor mQ = mainloop_params.tma_load_Q.get_tma_tensor(mainloop_params.layout_Q.shape()); + Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape()); + Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape()); + + auto [m_block, bidh, bidb] = block_coord; + int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); + + // Prepare the TMA loads + uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + Tensor gQ = seqlen_traits_q.get_local_tile_tensor( + mQ, select<0, 2>(TileShape_MNK{}), bidh, bidb)(_, _, m_block); // (M, K) + Tensor gK = seqlen_traits_k.get_local_tile_tensor( + mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + Tensor gV = seqlen_traits_k.get_local_tile_tensor( + mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + + Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{})); + Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{})); + auto [tQgQ, tQsQ] = tma_partition(mainloop_params.tma_load_Q, _0{}, Layout<_1>{}, + group_modes<0, 2>(sQ_x), group_modes<0, 2>(gQ_x)); // (TMA), (TMA) + auto [tKgK, tKsK] = tma_partition(mainloop_params.tma_load_K, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sK), group_modes<0, 2>(gK)); // (TMA, k), (TMA, PIPE) + auto [tVgV, tVsV] = tma_partition(mainloop_params.tma_load_V, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sV), group_modes<0, 2>(gV)); // (TMA, k), (TMA, PIPE) + + uint16_t mcast_mask_kv = 0; + if constexpr (cute::is_same_v) { + auto block_layout = 
Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_kv |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, _0{})); + } + } + + int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + int n_block = n_block_max - 1; + + int lane_predicate = cute::elect_one_sync(); + if (lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write_k); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write_k), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write_k.index())); + ++smem_pipe_write_k; + } + + // Wait for the MMA warpgroups to say that smem_q is ready + cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + + if (lane_predicate) { + shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); + copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + } + + // Wait for warp 1 to signal that smem_v are ready and V can be copied from gmem + // Need ClusterBarrier, not just NamedBarrier. Otherwise we might have CTA 0 finishing the + // TMA store on O first, call TMA multicast load on V, before CTA 1 can finishing TMA store on O. + shared_storage.barrier_O.wait((work_idx + 1) % 2); + + if (lane_predicate) { + // CUTLASS_PRAGMA_NO_UNROLL + #pragma unroll 2 + for (; n_block > 0; --n_block) { + pipeline_k.producer_acquire(smem_pipe_write_k); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write_k), mcast_mask_kv), + tKgK(_, n_block - 1), tKsK(_, smem_pipe_write_k.index())); + ++smem_pipe_write_k; + pipeline_v.producer_acquire(smem_pipe_write_v); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write_v), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write_v.index())); + ++smem_pipe_write_v; + } + } + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + if (lane_predicate) { + pipeline_v.producer_acquire(smem_pipe_write_v); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write_v), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write_v.index())); + ++smem_pipe_write_v; + } + scheduler.broadcast_next_work(work_tile_info); + } + + template + CUTLASS_DEVICE void + load_fp8(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipeline pipeline_v, + MainloopPipelineNoTMA pipeline_vt, + PipelineState& smem_pipe_write, + PipelineState& smem_pipe_read, + SharedStorage &shared_storage, + Scheduler& scheduler, + typename Scheduler::Params const& scheduler_params, + typename Scheduler::WorkTileInfo& work_tile_info, + cute::tuple block_coord, + int work_idx, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + + using SmemLayoutTransposeV = typename Ktraits::SmemLayoutTransposeV; + using SmemLayoutTransposeVt = typename Ktraits::SmemLayoutTransposeVt; + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sV = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutV{}); + + Tensor sV_divide = as_position_independent_swizzle_tensor(make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutTransposeV{})); + Tensor sVt_divide = 
as_position_independent_swizzle_tensor(make_tensor(make_smem_ptr(shared_storage.smem_v_out.data()), SmemLayoutTransposeVt{})); + + auto smem_transpose_V = SmemTransposeFp8_64x64(); + auto do_transpose_V = [&](int stage) { + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < shape<2>(SmemLayoutTransposeV{}); ++j) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < shape<1>(SmemLayoutTransposeV{}); ++i) { + smem_transpose_V(flatten(sV_divide(_, i, j, stage)), + flatten(sVt_divide(_, i, j, stage))); + } + } + }; + + Tensor mQ = mainloop_params.tma_load_Q.get_tma_tensor(mainloop_params.layout_Q.shape()); + Tensor mK = mainloop_params.tma_load_K.get_tma_tensor(mainloop_params.layout_K.shape()); + Tensor mV = mainloop_params.tma_load_V.get_tma_tensor(mainloop_params.layout_V.shape()); + + auto [m_block, bidh, bidb] = block_coord; + int bidh_kv = mainloop_params.qhead_per_khead_divmod.divide(bidh); + + // Prepare the TMA loads + uint32_t block_rank_in_cluster = cute::block_rank_in_cluster(); + constexpr uint32_t cluster_shape_x = get<0>(ClusterShape()); + uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x}; + Tensor gQ = seqlen_traits_q.get_local_tile_tensor( + mQ, select<0, 2>(TileShape_MNK{}), bidh, bidb)(_, _, m_block); // (M, K) + Tensor gK = seqlen_traits_k.get_local_tile_tensor( + mK, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + Tensor gV = seqlen_traits_k.get_local_tile_tensor( + mV, select<1, 2>(TileShape_MNK{}), bidh_kv, bidb); // (N, K, _) + + Tensor sQ_x = make_tensor(sQ.data(), make_layout(sQ.layout(), Layout<_1>{})); + Tensor gQ_x = make_tensor(gQ.data(), make_layout(gQ.layout(), Layout<_1>{})); + auto [tQgQ, tQsQ] = tma_partition(mainloop_params.tma_load_Q, _0{}, Layout<_1>{}, + group_modes<0, 2>(sQ_x), group_modes<0, 2>(gQ_x)); // (TMA), (TMA) + auto [tKgK, tKsK] = tma_partition(mainloop_params.tma_load_K, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sK), group_modes<0, 2>(gK)); // (TMA, k), (TMA, PIPE) + auto [tVgV, tVsV] = tma_partition(mainloop_params.tma_load_V, block_rank_in_cluster, Layout{}, + group_modes<0, 2>(sV), group_modes<0, 2>(gV)); // (TMA, k), (TMA, PIPE) + + uint16_t mcast_mask_kv = 0; + if constexpr (cute::is_same_v) { + auto block_layout = Layout{}; // (m,n) -> block_id + for (int m = 0; m < size<0>(block_layout); ++m) { + mcast_mask_kv |= (uint16_t(1) << block_layout(m, cluster_local_block_id.y, _0{})); + } + } + + int n_block_max = get_n_block_max(mainloop_params, m_block, seqlen_traits_q, seqlen_traits_k); + int n_block = n_block_max - 1; + + int lane_predicate = cute::elect_one_sync(); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); + } + + // Wait for the MMA warpgroups to say that smem_q is ready + // for fp8, change from NumThreadsPerWarp to NumThreadsPerWarpGroup + cutlass::arch::NamedBarrier::sync(NumMmaThreads + cutlass::NumThreadsPerWarpGroup, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + + if constexpr(Is_causal) { + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); + copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + 
pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + + shared_storage.barrier_O.wait((work_idx + 1) % 2); + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kStages && n_block > 0; ++iter, --n_block) { + pipeline_v.consumer_wait(smem_pipe_read); + // pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block-1), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block-1), tVsV(_, smem_pipe_write.index())); + } + } + + #pragma unroll 2 + for (; n_block > 0; --n_block) { + pipeline_v.consumer_wait(smem_pipe_read); + pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block-1), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block-1), tVsV(_, smem_pipe_write.index())); + } + } + + scheduler.prefetch_next_work(scheduler_params, work_tile_info); + scheduler.broadcast_next_work(work_tile_info); + + pipeline_v.consumer_wait(smem_pipe_read); + if (n_block_max > kStages) + pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + } else { + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + shared_storage.barrier_Q.arrive_and_expect_tx(TmaTransactionBytesQ); + copy(mainloop_params.tma_load_Q.with(reinterpret_cast(shared_storage.barrier_Q), 0 /*mcast_mask*/), tQgQ, tQsQ); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + // With fp8 kernel, smem_o is in union with smem_v_out, + // so could use NamedBarrier instead of ClusterBarrier. + // But, this doesn't appear to have any benefit. 
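// barrier_O is reused across work tiles, so the phase waited on alternates with work_idx:
// the producer waits on phase (work_idx + 1) % 2 before overwriting V's shared memory, and
// the MMA/epilogue side arrives on the matching phase only after tma_store_wait<0>() has
// confirmed that the O tile has drained out of smem (see the arrive loop in mma()/mma_fp8()).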
+ shared_storage.barrier_O.wait((work_idx + 1) % 2); + + pipeline_v.consumer_wait(smem_pipe_read); + // pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + --n_block; + + constexpr int extra_iterations = kStages - 1; + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter) { + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + + pipeline_v.consumer_wait(smem_pipe_read); + // pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + --n_block; + } + + // CUTLASS_PRAGMA_NO_UNROLL + #pragma unroll 2 + for (; n_block >= 0; --n_block) { + + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + pipeline_k.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_K.with(*pipeline_k.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tKgK(_, n_block), tKsK(_, smem_pipe_write.index())); + pipeline_v.producer_acquire(smem_pipe_write); + copy(mainloop_params.tma_load_V.with(*pipeline_v.producer_get_barrier(smem_pipe_write), mcast_mask_kv), + tVgV(_, n_block), tVsV(_, smem_pipe_write.index())); + } + + pipeline_v.consumer_wait(smem_pipe_read); + pipeline_vt.producer_acquire(smem_pipe_write); + do_transpose_V(smem_pipe_read.index()); + pipeline_vt.producer_commit(smem_pipe_write); + pipeline_v.consumer_release(smem_pipe_read); + + ++smem_pipe_write; + ++smem_pipe_read; + } + // scheduler.prefetch_next_work(scheduler_params, work_tile_info); + // scheduler.broadcast_next_work(work_tile_info); + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail(MainloopPipeline pipeline_k, MainloopPipeline pipeline_v, + PipelineState& smem_pipe_write_k, PipelineState& smem_pipe_write_v) { + int lane_predicate = cute::elect_one_sync(); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + // Issue the epilogue waits + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for all stages to either be released (all Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was still inverted from make_producer_start_state + */ + pipeline_k.producer_tail(smem_pipe_write_k); + pipeline_v.producer_tail(smem_pipe_write_v); + } + } + + /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster + CUTLASS_DEVICE void + load_tail_one_write(MainloopPipeline pipeline_k, MainloopPipeline pipeline_v, + PipelineState& smem_pipe_write) { + int lane_predicate = cute::elect_one_sync(); + int warp_idx_in_warpgroup = __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0); + // Issue the epilogue waits + if (warp_idx_in_warpgroup == 0 && lane_predicate) { + /* This helps avoid early exit of blocks in Cluster + * Waits for 
all stages to either be released (all Consumer UNLOCKs), or if the stage was never used + * then would just be acquired since the phase was still inverted from make_producer_start_state + */ + pipeline_k.producer_tail(smem_pipe_write); + pipeline_v.producer_tail(smem_pipe_write); + } + } + + CUTLASS_DEVICE void + warp_scheduler_barrier_sync() { + if constexpr (UseSchedulerBarrier) { + cutlass::arch::NamedBarrier::sync(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + cutlass::canonical_warp_group_idx() /*id*/); + } + } + + CUTLASS_DEVICE void + warp_scheduler_barrier_arrive() { + if constexpr (!UseSchedulerBarrier) { return; } + static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup); + if constexpr (NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (3 - cutlass::canonical_warp_group_idx()) /*id*/); + } else { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 2 ? cutlass::canonical_warp_group_idx() + 1 : cutlass::canonical_warp_group_idx() + 1 - 3) /*id*/); + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + (cutlass::canonical_warp_group_idx() <= 1 ? cutlass::canonical_warp_group_idx() + 2 : cutlass::canonical_warp_group_idx() + 2 - 3) /*id*/); + } + } + + CUTLASS_DEVICE void + mma_init() { + // Tell producer (warp 0) that smem_q is ready + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + Ktraits::NumProducerThreads, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + if constexpr (!UseSchedulerBarrier) { return; } + static_assert(NumMmaThreads == 2 * cutlass::NumThreadsPerWarpGroup || NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup); + if (cutlass::canonical_warp_group_idx() > 1) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 1 /*id*/); + } + if constexpr (NumMmaThreads == 3 * cutlass::NumThreadsPerWarpGroup) { + if (cutlass::canonical_warp_group_idx() > 2) { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads, static_cast(FwdNamedBarriers::WarpSchedulerWG1) - 1 + 2 /*id*/); + } + } + + } + + template + CUTLASS_DEVICE void + mma(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipeline pipeline_v, + PipelineState& smem_pipe_read_k, + PipelineState& smem_pipe_read_v, + FrgTensorO& tOrO, + Softmax& softmax, + int n_block_count, + int thread_idx, + int work_idx, + int m_block, + SharedStorage& shared_storage, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + static_assert(is_rmem::value, "O tensor must be rmem resident."); + + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sVt = make_tensor(make_smem_ptr(shared_storage.smem_v.data()), SmemLayoutVt{}); + + typename Ktraits::TiledMma0 tiled_mma0; + typename Ktraits::TiledMma1 tiled_mma1; + auto threadMma0 = tiled_mma0.get_thread_slice(thread_idx); + auto threadMma1 = tiled_mma1.get_thread_slice(thread_idx); + + // Allocate "fragments/descriptors" for first matmul. 
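// (On Hopper these "fragments" are typically GMMA descriptors into shared memory rather than
// register copies: partition_fragment_A/B builds per-thread views that the warpgroup MMA
// reads directly from smem.)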
+ Tensor tSrQ = threadMma0.partition_fragment_A(sQ); + Tensor tSrK = threadMma0.partition_fragment_B(sK); + // Allocate "fragments/descriptors" for second matmul. + // Note: S becomes P. + Tensor tOrV = threadMma1.partition_fragment_B(sVt); + + auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) { + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + }; + + tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero; + int const seqlen_q = seqlen_traits_q.actual_seq_len; + int const seqlen_k = seqlen_traits_k.actual_seq_len; + int n_block = n_block_count - 1; + + cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_Q.try_wait(work_idx % 2)); + if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_Q.wait(work_idx % 2); } + + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read_k); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); + warp_scheduler_barrier_arrive(); + + if (work_idx != 0) { + int lane_predicate = cute::elect_one_sync(); + if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { + tma_store_wait<0>(); + #pragma unroll + for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { + shared_storage.barrier_O.arrive(cta_id, lane_predicate); + } + } + } + warpgroup_wait<0>(); + pipeline_k.consumer_release(smem_pipe_read_k); + ++smem_pipe_read_k; + + auto col_limit_causal = [&](int row, int n_block) { + return row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM; + }; + { + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if constexpr (!Is_causal) { // Just masking based on col + if (int(get<1>(tScS(i))) >= int(seqlen_k - n_block * kBlockN)) { tSrS(i) = -INFINITY; } + } else { // mask based on both row and col + // using std::min is faster than doing col >= limit0 or col >= limit1 + // Need to cast get<1>(tScS(i)) to (signed) int since by default it's unsigned, and the + // right hand side can be negative and might be converted to a very large unsigned integer. + if (int(get<1>(tScS(i))) >= std::min(seqlen_k - n_block * kBlockN, + col_limit_causal(int(get<0>(tScS(i))), n_block))) { + tSrS(i) = -INFINITY; + } + } + } + } + + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs(tSrS.layout())); + Tensor scores_scale = make_fragment_like(softmax.row_max); + clear(scores_scale); + + constexpr int n_masking_steps = !Is_causal ? 
1 : cute::ceil_div(kBlockM, kBlockN) + 1; + // Only go through these if Is_causal, since n_masking_steps = 1 when !Is_causal + #pragma unroll + for (int masking_step = 0; masking_step < n_masking_steps - 1 && n_block > 0; ++masking_step, --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read_k); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); + if (masking_step > 0) { softmax.rescale_o(tOrO, scores_scale); } + consumer_wait(pipeline_v, smem_pipe_read_v); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); + warp_scheduler_barrier_arrive(); + warpgroup_wait<1>(); + pipeline_k.consumer_release(smem_pipe_read_k); // release K + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<1>(tScS(i))) >= col_limit_causal(int(get<0>(tScS(i))), n_block - 1)) { + tSrS(i) = -INFINITY; + } + } + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + warpgroup_wait<0>(); + pipeline_v.consumer_release(smem_pipe_read_v); // release V + ++smem_pipe_read_k; + ++smem_pipe_read_v; + cute::copy(make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs(tSrS.layout())), tOrP); + } + + #pragma unroll 1 + for (; n_block > 0; --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read_k); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read_k.index()), tSrS); + softmax.rescale_o(tOrO, scores_scale); + consumer_wait(pipeline_v, smem_pipe_read_v); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); + warp_scheduler_barrier_arrive(); + warpgroup_wait<1>(); + pipeline_k.consumer_release(smem_pipe_read_k); // release K + // auto scores_scale = softmax.template max(tSrS); + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + warpgroup_wait<0>(); + pipeline_v.consumer_release(smem_pipe_read_v); // release V + ++smem_pipe_read_k; + ++smem_pipe_read_v; + // softmax.rescale_o(tOrO, scores_scale); + cute::copy(make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs(tSrS.layout())), tOrP); + } + // Tell warp 0 that smem_q is ready + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarp, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + softmax.rescale_o(tOrO, scores_scale); + consumer_wait(pipeline_v, smem_pipe_read_v); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); + cute::copy(softmax.template finalize(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + warpgroup_wait<0>(); + pipeline_v.consumer_release(smem_pipe_read_v); // release V, otherwise producers will hang + ++smem_pipe_read_v; + + softmax.rescale_o(tOrO, scores_scale); + return; + } + + template + CUTLASS_DEVICE void + mma_fp8(Params const& mainloop_params, + MainloopPipeline pipeline_k, + MainloopPipelineNoTMA pipeline_vt, + PipelineState& smem_pipe_read, + PipelineState& smem_pipe_release, + FrgTensorO& tOrO, + Softmax& softmax, + int n_block_count, + int thread_idx, + int work_idx, + int m_block, + 
SharedStorage& shared_storage, + const Seqlen_traits& seqlen_traits_q, + const Seqlen_traits& seqlen_traits_k + ) { + static_assert(is_rmem::value, "O tensor must be rmem resident."); + + static constexpr int kBlockM = get<0>(TileShape_MNK{}); + static constexpr int kBlockN = get<1>(TileShape_MNK{}); + + Tensor sQ = make_tensor(make_smem_ptr(shared_storage.smem_q.data()), SmemLayoutQ{}); + Tensor sK = make_tensor(make_smem_ptr(shared_storage.smem_k.data()), SmemLayoutK{}); + Tensor sVt = make_tensor(make_smem_ptr(shared_storage.smem_v_out.data()), SmemLayoutVt{}); + + typename Ktraits::TiledMma0 tiled_mma0; + typename Ktraits::TiledMma1 tiled_mma1; + auto threadMma0 = tiled_mma0.get_thread_slice(thread_idx); + auto threadMma1 = tiled_mma1.get_thread_slice(thread_idx); + + // Allocate "fragments/descriptors" for first matmul. + Tensor tSrQ = threadMma0.partition_fragment_A(sQ); + Tensor tSrK = threadMma0.partition_fragment_B(sK); + // Allocate "fragments/descriptors" for second matmul. + Tensor tOrV = threadMma1.partition_fragment_B(sVt); + + auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) { + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + }; + + tiled_mma1.accumulate_ = GMMA::ScaleOut::Zero; + // workaround for fp8 only perf regression pending change to seqlen traits class + int const seqlen_q = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_q.actual_seq_len : shape<0>(mainloop_params.layout_Q); + int const seqlen_k = Seqlen_traits::kUseVarSeqLen ? seqlen_traits_k.actual_seq_len : shape<0>(mainloop_params.layout_K); + int n_block = n_block_count - 1; + + cutlass::ConsumerToken barrier_token = static_cast(shared_storage.barrier_Q.try_wait(work_idx % 2)); + if (barrier_token == cutlass::BarrierStatus::WaitAgain) { shared_storage.barrier_Q.wait(work_idx % 2); } + + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + + consumer_wait(pipeline_k, smem_pipe_read); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + if (work_idx != 0) { + int lane_predicate = cute::elect_one_sync(); + if (cutlass::canonical_warp_idx_sync() == Ktraits::kNWarps - 1 && lane_predicate) { + tma_store_wait<0>(); + #pragma unroll + for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) { + shared_storage.barrier_O.arrive(cta_id, lane_predicate); + } + } + } + warpgroup_wait<0>(); + warp_scheduler_barrier_arrive(); + pipeline_k.consumer_release(smem_pipe_read); + + auto col_limit_causal = [&](int row, int n_block) { + return row + 1 + seqlen_k - n_block * kBlockN - seqlen_q + m_block * kBlockM; + }; + { + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if constexpr (!Is_causal) { // Just masking based on col + if (int(get<1>(tScS(i))) >= int(seqlen_k - n_block * kBlockN)) { tSrS(i) = -INFINITY; } + } else { // mask based on both row and col + if (int(get<1>(tScS(i))) >= std::min(seqlen_k - n_block * kBlockN, + col_limit_causal(int(get<0>(tScS(i))), n_block))) { + tSrS(i) = -INFINITY; + } + } + } + } + + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + Tensor scores_scale = make_fragment_like(softmax.row_max); + clear(scores_scale); + + 
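// scores_scale carries the online-softmax rescaling factor between iterations. Roughly:
//   m_new = max(m_old, rowmax(S));   P = exp2(S * scale_log2 - m_new * scale_log2)
//   O    <- O * exp2((m_old - m_new) * scale_log2) + P @ V
// softmax.max() returns that exp2 correction factor and rescale_o() applies it to the
// O accumulator before the next P @ V GEMM below.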
consumer_wait(pipeline_vt, smem_pipe_read); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } + + ++smem_pipe_read; + --n_block; + constexpr int extra_iterations = !Is_causal ? kStages - 1 : cute::ceil_div(kBlockM, kBlockN); + + if constexpr(Is_causal) { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter, --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + + Tensor cS = cute::make_identity_tensor(select<0, 1>(TileShape_MNK{})); + Tensor tScS = threadMma0.partition_C(cS); + #pragma unroll + for (int i = 0; i < size(tSrS); ++i) { + if (int(get<1>(tScS(i))) >= col_limit_causal(int(get<0>(tScS(i))), n_block)) { + tSrS(i) = -INFINITY; + } + } + + warp_scheduler_barrier_arrive(); + pipeline_k.consumer_release(smem_pipe_read); + consumer_wait(pipeline_vt, smem_pipe_read); + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + if constexpr(Delay_V_release) { + pipeline_vt.consumer_release(smem_pipe_release); + ++smem_pipe_release; + } + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } + ++smem_pipe_read; + } + } else { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < extra_iterations && n_block >= 0; ++iter, --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + if constexpr(Delay_V_release) { + pipeline_vt.consumer_release(smem_pipe_release); + ++smem_pipe_release; + } + warp_scheduler_barrier_sync(); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + warp_scheduler_barrier_arrive(); + if constexpr(!Delay_V_release) { pipeline_k.consumer_release(smem_pipe_read); } + else { consumer_wait(pipeline_vt, smem_pipe_read); } + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + if constexpr (Delay_V_release) { pipeline_k.consumer_release(smem_pipe_read); } + else { consumer_wait(pipeline_vt, smem_pipe_read); } + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + if constexpr(!Delay_V_release) { pipeline_vt.consumer_release(smem_pipe_read); } + ++smem_pipe_read; + } + } + + if constexpr(Delay_V_release) { + warp_scheduler_barrier_sync(); + CUTLASS_PRAGMA_NO_UNROLL + for (; n_block >= 0; --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + pipeline_vt.consumer_release(smem_pipe_release); + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + warp_scheduler_barrier_arrive(); + warpgroup_wait<0>(); + consumer_wait(pipeline_vt, 
smem_pipe_read); + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + pipeline_k.consumer_release(smem_pipe_read); + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + warp_scheduler_barrier_sync(); + warpgroup_wait<0>(); + ++smem_pipe_read; + ++smem_pipe_release; + } + warp_scheduler_barrier_arrive(); + pipeline_vt.consumer_release(smem_pipe_release); + ++smem_pipe_release; + } else { + if constexpr (kHeadDim == 128) { warp_scheduler_barrier_sync(); } + CUTLASS_PRAGMA_NO_UNROLL + for (; n_block >= 0; --n_block) { + Tensor tSrS = partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{})); + consumer_wait(pipeline_k, smem_pipe_read); + if constexpr (kHeadDim == 256) { warp_scheduler_barrier_sync(); } + flash::gemm(tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS); + warp_scheduler_barrier_arrive(); + pipeline_k.consumer_release(smem_pipe_read); + + cute::copy(softmax.template max(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + softmax.template online_softmax(tSrS, mainloop_params.softmax_scale_log2); + Tensor tOrP = make_tensor(convert_type(tSrS).data(), convert_layout_acc_Aregs_fp8(tSrS.layout())); + permute_regs_A_to_C(tOrP); + + consumer_wait(pipeline_vt, smem_pipe_read); + if constexpr (kHeadDim == 128) { warp_scheduler_barrier_sync(); } + flash::gemm(tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read.index()), tOrO); + pipeline_vt.consumer_release(smem_pipe_read); + ++smem_pipe_read; + } + if constexpr (kHeadDim == 128) { warp_scheduler_barrier_arrive(); } + } + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + cutlass::NumThreadsPerWarpGroup, static_cast(FwdNamedBarriers::QueryEmpty) /*id*/); + + cute::copy(softmax.template finalize(tSrS, mainloop_params.softmax_scale_log2), scores_scale); + softmax.rescale_o(tOrO, scores_scale); + return; + } + +}; + +} // namespace flash diff --git a/many_loggers.yaml b/many_loggers.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7bc3d6762674a8220d771e36ca10791ae6447e06 --- /dev/null +++ b/many_loggers.yaml @@ -0,0 +1,9 @@ +# train with many loggers at once + +defaults: + # - comet.yaml + - csv.yaml + # - mlflow.yaml + # - neptune.yaml + # - tensorboard.yaml + - wandb.yaml diff --git a/mask.h b/mask.h new file mode 100644 index 0000000000000000000000000000000000000000..7ba435a37bb65fffca9d3f227113a657919bde07 --- /dev/null +++ b/mask.h @@ -0,0 +1,213 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include + +namespace flash { + +using namespace cute; + +template +__forceinline__ __device__ void apply_mask(Tensor &tensor, const int max_seqlen_k, + const int col_idx_offset_ = 0) { + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout::rank == 2, "Only support 2D Tensor"); + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + if (col_idx >= max_seqlen_k) { + // Without the "make_coord" we get wrong results + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + tensor(mi, make_coord(j, nj)) = -INFINITY; + } + } + } + } +} + +template +__forceinline__ __device__ void apply_mask_local(Tensor &tensor, const int col_idx_offset_, + const int max_seqlen_k, const int row_idx_offset, + const int max_seqlen_q, const int warp_row_stride, + const int window_size_left, const int window_size_right) { + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout::rank == 2, "Only support 2D Tensor"); + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + #pragma unroll + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; + #pragma unroll + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + const int col_idx_limit_left = std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_size_left); + const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_size_right); + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + } + // if (cute::thread0()) { + // printf("mi = %d, i = %d, row_idx = %d, max_seqlen_k = %d\n", mi, i, row_idx, max_seqlen_k); + // print(tensor(make_coord(i, mi), _)); + // // print(tensor(_, j + nj * size<1, 0>(tensor))); + // } + } + } +} + +template +__forceinline__ __device__ void apply_mask_causal(Tensor &tensor, const int col_idx_offset_, + const int max_seqlen_k, const int row_idx_offset, + const int max_seqlen_q, const int warp_row_stride) { + // Causal masking is equivalent to local masking with window_size_left = infinity and window_size_right = 0 + apply_mask_local(tensor, col_idx_offset_, max_seqlen_k, row_idx_offset, + max_seqlen_q, warp_row_stride, -1, 0); +} + +template +__forceinline__ __device__ void apply_mask_causal_w_idx( + Tensor &tensor, Tensor const &idx_rowcol, + const int col_idx_offset_, const int max_seqlen_k, const int row_idx_offset) +{ + // tensor has shape (nrow=(2, MMA_M), ncol=(2, MMA_N)) + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); + static_assert(Layout1::rank == 2, "Only support 2D Tensor"); + CUTE_STATIC_ASSERT_V(size<0>(tensor) == size<0>(idx_rowcol)); + CUTE_STATIC_ASSERT_V(size<1>(tensor) == size<1>(idx_rowcol)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); 
++mi) { + const int col_idx_limit = std::min(max_seqlen_k, 1 + row_idx_offset + get<0>(idx_rowcol(mi, 0))); + #pragma unroll + for (int ni = 0; ni < size<1, 1>(tensor); ++ni) { + if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) { + tensor(mi, ni) = -INFINITY; + } + } + // if (cute::thread0()) { + // printf("ni = %d, j = %d, col_idx = %d, max_seqlen_k = %d\n", ni, j, col_idx, max_seqlen_k); + // print(tensor(_, make_coord(j, ni))); + // // print(tensor(_, j + ni * size<1, 0>(tensor))); + // } + } +} + +template +struct Mask { + + const int max_seqlen_k, max_seqlen_q; + const int window_size_left, window_size_right; + const float alibi_slope; + + __forceinline__ __device__ Mask(const int max_seqlen_k, const int max_seqlen_q, + const int window_size_left, const int window_size_right, + const float alibi_slope=0.f) + : max_seqlen_k(max_seqlen_k) + , max_seqlen_q(max_seqlen_q) + , window_size_left(window_size_left) + , window_size_right(window_size_right) + , alibi_slope(!Has_alibi ? 0.0 : alibi_slope) { + }; + + // Causal_mask: whether this particular iteration needs causal masking + template + __forceinline__ __device__ void apply_mask(Tensor &tensor_, + const int col_idx_offset_, + const int row_idx_offset, + const int warp_row_stride) { + static_assert(!(Causal_mask && Is_local), "Cannot be both causal and local"); + static_assert(Layout::rank == 3, "Only support 3D Tensor"); + static_assert(decltype(size<0>(tensor_))::value == 4, "First dimension must be 4"); + static constexpr bool Need_masking = Has_alibi || Causal_mask || Is_local || !Is_even_MN; + // if (cute::thread0()) { printf("Has_alibi = %d, Causal_mask=%d, Is_local=%d, Is_even_MN = %d, Need_masking = %d\n", Has_alibi, Causal_mask, Is_local, Is_even_MN, Need_masking); } + if constexpr (Need_masking) { + // Reshape tensor_ from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N)) + Tensor tensor = make_tensor(tensor_.data(), flash::convert_layout_acc_rowcol(tensor_.layout())); + // Do we need both row and column indices, or just column incides? 
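// Column-only indexing suffices when the mask/bias depends only on the key position:
// plain seqlen_k (padding) masking, and ALiBi in the causal case where the bias reduces to
// alibi_slope * col. Causal or local masks, and ALiBi without causality, also need the row
// index, hence the second branch below.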
+ static constexpr bool Col_idx_only = !(Has_alibi && !Is_causal) && !Is_local && !Causal_mask; + const int lane_id = threadIdx.x % 32; + const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2; + if constexpr (Col_idx_only) { + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + // No causal, no local + if constexpr (Has_alibi) { + tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx; + } + if constexpr (!Is_even_MN) { + if (col_idx >= max_seqlen_k) { tensor(mi, make_coord(j, nj)) = -INFINITY; } + } + } + } + } + } else { + #pragma unroll + for (int mi = 0; mi < size<0, 1>(tensor); ++mi) { + const int row_idx_base = row_idx_offset + mi * warp_row_stride; + #pragma unroll + for (int i = 0; i < size<0, 0>(tensor); ++i) { + const int row_idx = row_idx_base + i * 8; + const int col_idx_limit_left = std::max(0, row_idx + max_seqlen_k - max_seqlen_q - window_size_left); + const int col_idx_limit_right = std::min(max_seqlen_k, row_idx + 1 + max_seqlen_k - max_seqlen_q + window_size_right); + #pragma unroll + for (int nj = 0; nj < size<1, 1>(tensor); ++nj) { + const int col_idx_base = col_idx_offset + nj * 8; + #pragma unroll + for (int j = 0; j < size<1, 0>(tensor); ++j) { + const int col_idx = col_idx_base + j; + if constexpr (Has_alibi) { + if constexpr (Is_causal) { + tensor(make_coord(i, mi), make_coord(j, nj)) += alibi_slope * col_idx; + } else { + tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx); + + } + } + if constexpr (Causal_mask) { + if (col_idx >= col_idx_limit_right) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + if constexpr (Is_local) { + if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + if constexpr (!Causal_mask && !Is_local && !Is_even_MN) { + // Causal and Local already handles MN masking + if (col_idx >= max_seqlen_k) { + tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + } + } + } + } + } + } + } + } + }; + +}; + +} // namespace flash diff --git a/mha.py b/mha.py new file mode 100644 index 0000000000000000000000000000000000000000..77640c2b239ac729cad79ce3b2504e0eeacb5f73 --- /dev/null +++ b/mha.py @@ -0,0 +1,1020 @@ +# Copyright (c) 2023, Tri Dao. 
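# Illustrative usage of the MHA module defined in this file (a sketch, not authoritative;
# argument names are taken from MHA.__init__ below, shapes and import path assumed):
#
#   import torch
#   from flash_attn.modules.mha import MHA
#
#   mha = MHA(embed_dim=1024, num_heads=16, causal=True, use_flash_attn=True,
#             rotary_emb_dim=64, device="cuda", dtype=torch.bfloat16)
#   x = torch.randn(2, 512, 1024, device="cuda", dtype=torch.bfloat16)
#   out = mha(x)  # (batch, seqlen, embed_dim) -> (2, 512, 1024)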
+ +import math +from functools import partial + +import torch +import torch.nn as nn +from einops import rearrange, repeat + +from flash_attn.utils.distributed import get_dim_for_local_rank + +try: + from flash_attn import ( + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, + flash_attn_with_kvcache, + ) +except ImportError: + flash_attn_varlen_qkvpacked_func, flash_attn_varlen_kvpacked_func = None, None + flash_attn_qkvpacked_func, flash_attn_kvpacked_func = None, None + flash_attn_with_kvcache = None + +try: + from flash_attn.ops.fused_dense import ColumnParallelLinear, FusedDense, RowParallelLinear +except ImportError: + FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None + +try: + from flash_attn.layers.rotary import RotaryEmbedding +except ImportError: + RotaryEmbedding = None + + +# From https://github.com/ofirpress/attention_with_linear_biases/blob/4b92f28a005ead2567abe2359f633e73e08f3833/fairseq/models/transformer.py#L742 +def get_alibi_slopes(nheads): + def get_slopes_power_of_2(nheads): + start = 2 ** (-(2 ** -(math.log2(nheads) - 3))) + ratio = start + return [start * ratio**i for i in range(nheads)] + + if math.log2(nheads).is_integer(): + return get_slopes_power_of_2(nheads) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(nheads)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_alibi_slopes(2 * closest_power_of_2)[0::2][: nheads - closest_power_of_2] + ) + + +class FlashSelfAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + window_size=(-1, -1), + alibi_slopes=None, + deterministic=False, + ): + super().__init__() + assert flash_attn_varlen_qkvpacked_func is not None, "FlashAttention is not installed" + assert flash_attn_qkvpacked_func is not None, "FlashAttention is not installed" + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + self.window_size = window_size + self.deterministic = deterministic + + def forward(self, qkv, causal=None, cu_seqlens=None, max_seqlen=None): + """Implements the multihead softmax attention. + Arguments + --------- + qkv: The tensor containing the query, key, and value. + If cu_seqlens is None and max_seqlen is None, then qkv has shape (B, S, 3, H, D). + If cu_seqlens is not None and max_seqlen is not None, then qkv has shape + (total, 3, H, D), where total is the sum of the sequence lengths in the batch. + causal: if passed, will override self.causal + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into qkv. + max_seqlen: int. Maximum sequence length in the batch. + Returns: + -------- + out: (total, H, D) if cu_seqlens is not None and max_seqlen is not None, + else (B, S, H, D). 
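        Example (illustrative sketch; assumes a CUDA device and fp16/bf16 inputs):
            attn = FlashSelfAttention(causal=True)
            qkv = torch.randn(2, 1024, 3, 16, 64, device="cuda", dtype=torch.bfloat16)
            out = attn(qkv)  # (2, 1024, 16, 64)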
+ """ + assert qkv.dtype in [torch.float16, torch.bfloat16] + assert qkv.is_cuda + causal = self.causal if causal is None else causal + unpadded = cu_seqlens is not None + if self.alibi_slopes is not None: + self.alibi_slopes = self.alibi_slopes.to(torch.float32) + if unpadded: + assert cu_seqlens.dtype == torch.int32 + assert max_seqlen is not None + assert isinstance(max_seqlen, int) + return flash_attn_varlen_qkvpacked_func( + qkv, + cu_seqlens, + max_seqlen, + self.drop.p if self.training else 0.0, + softmax_scale=self.softmax_scale, + causal=causal, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + else: + return flash_attn_qkvpacked_func( + qkv, + self.drop.p if self.training else 0.0, + softmax_scale=self.softmax_scale, + causal=causal, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + + +class FlashCrossAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__( + self, + causal=False, + softmax_scale=None, + attention_dropout=0.0, + alibi_slopes=None, + window_size=(-1, -1), + deterministic=False, + ): + super().__init__() + assert flash_attn_varlen_kvpacked_func is not None, "FlashAttention is not installed" + assert flash_attn_kvpacked_func is not None, "FlashAttention is not installed" + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + self.window_size = window_size + self.deterministic = deterministic + + def forward( + self, + q, + kv, + causal=None, + cu_seqlens=None, + max_seqlen=None, + cu_seqlens_k=None, + max_seqlen_k=None, + ): + """Implements the multihead softmax attention. + Arguments + --------- + q: The tensor containing the query. (B, Sq, H, D) + kv: The tensor containing the key and value. (B, Sk, 2, H_k, D) + causal: if passed, will override self.causal + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into q. + max_seqlen: int. Maximum sequence length in the batch of q. + cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into kv. + max_seqlen_k: int. Maximum sequence length in the batch of k and v. 
+ """ + assert q.dtype in [torch.float16, torch.bfloat16] + assert q.is_cuda and kv.is_cuda + causal = self.causal if causal is None else causal + unpadded = cu_seqlens is not None + if self.alibi_slopes is not None: + self.alibi_slopes = self.alibi_slopes.to(torch.float32) + if unpadded: + assert cu_seqlens.dtype == torch.int32 + assert max_seqlen is not None + assert isinstance(max_seqlen, int) + assert cu_seqlens_k is not None + assert cu_seqlens_k.dtype == torch.int32 + assert max_seqlen_k is not None + assert isinstance(max_seqlen_k, int) + return flash_attn_varlen_kvpacked_func( + q, + kv, + cu_seqlens, + cu_seqlens_k, + max_seqlen, + max_seqlen_k, + self.drop.p if self.training else 0.0, + softmax_scale=self.softmax_scale, + causal=causal, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + else: + batch_size, seqlen_q = q.shape[0], q.shape[1] + seqlen_k = kv.shape[1] + assert kv.shape[0] == batch_size and kv.shape[4] == q.shape[3] + return flash_attn_kvpacked_func( + q, + kv, + self.drop.p if self.training else 0.0, + causal=causal, + softmax_scale=self.softmax_scale, + alibi_slopes=self.alibi_slopes, + window_size=self.window_size, + deterministic=self.deterministic, + ) + + +class SelfAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0): + super().__init__() + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + + def forward(self, qkv, causal=None, key_padding_mask=None): + """Implements the multihead softmax attention. + Arguments + --------- + qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) + causal: if passed, will override self.causal + key_padding_mask: boolean mask to apply to the attention weights. True means to keep, + False means to mask out. (B, S) + """ + batch_size, seqlen = qkv.shape[0], qkv.shape[1] + causal = self.causal if causal is None else causal + q, k, v = qkv.unbind(dim=2) + softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1]) + scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale) + if key_padding_mask is not None: + padding_mask = torch.full( + (batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device + ) + padding_mask.masked_fill_(key_padding_mask, 0.0) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + rearrange(padding_mask, "b s -> b 1 1 s") + if causal: + # "triu_tril_cuda_template" not implemented for 'BFloat16' + # So we have to construct the mask in float + causal_mask = torch.triu( + torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1 + ) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + causal_mask.to(dtype=scores.dtype) + attention = torch.softmax(scores, dim=-1, dtype=v.dtype) + attention_drop = self.drop(attention) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + return output + + +class CrossAttention(nn.Module): + """Implement the scaled dot product attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. 
+ (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0): + super().__init__() + self.causal = causal + self.softmax_scale = softmax_scale + self.drop = nn.Dropout(attention_dropout) + + def forward(self, q, kv, causal=None, key_padding_mask=None): + """Implements the multihead softmax attention. + Arguments + --------- + q: The tensor containing the query. (B, Sq, H, D) + kv: The tensor containing the key and value. (B, Sk, 2, H_k, D) + causal: if passed, will override self.causal + key_padding_mask: boolean mask to apply to the attention weights. True means to keep, + False means to mask out. (B, Sk) + """ + batch_size, seqlen_q = q.shape[0], q.shape[1] + causal = self.causal if causal is None else causal + seqlen_k = kv.shape[1] + assert kv.shape[0] == batch_size and kv.shape[4] == q.shape[3] + if kv.shape[3] != q.shape[2]: # MQA/GQA + kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3]) + k, v = kv.unbind(dim=2) + softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1]) + scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale) + if key_padding_mask is not None: + padding_mask = torch.full( + (batch_size, seqlen_k), -10000.0, dtype=scores.dtype, device=scores.device + ) + padding_mask.masked_fill_(key_padding_mask, 0.0) + # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess) + scores = scores + rearrange(padding_mask, "b s -> b 1 1 s") + if causal: + # causal mask needs to take into account the difference between seqlen_q and seqlen_k + row_idx = rearrange( + torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1" + ) + col_idx = torch.arange(seqlen_k, device=kv.device, dtype=torch.long) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + causal_mask = col_idx > row_idx + sk - seqlen_q + scores = scores.masked_fill(causal_mask, -10000.0) + attention = torch.softmax(scores, dim=-1, dtype=v.dtype) + attention_drop = self.drop(attention) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + return output + + +class LinearResidual(nn.Linear): + """Wrap nn.Linear to return the residual as well. For compatibility with FusedDense.""" + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input), input + + +def _update_kv_cache(kv, inference_params, layer_idx): + """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)""" + # Pre-allocate memory for key-values for inference. + num_heads, head_dim = kv.shape[-2:] + if layer_idx not in inference_params.key_value_memory_dict: + kv_cache = torch.empty( + inference_params.max_batch_size, + inference_params.max_seqlen, + 2, + num_heads, + head_dim, + dtype=kv.dtype, + device=kv.device, + ) + inference_params.key_value_memory_dict[layer_idx] = kv_cache + else: + kv_cache = inference_params.key_value_memory_dict[layer_idx] + # Adjust key and value for inference + batch_start = inference_params.batch_size_offset + batch_end = batch_start + kv.shape[0] + sequence_start = inference_params.seqlen_offset + sequence_end = sequence_start + kv.shape[1] + assert batch_end <= kv_cache.shape[0] + assert sequence_end <= kv_cache.shape[1] + assert kv_cache is not None + kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] 
= kv + return kv_cache[batch_start:batch_end, :sequence_end, ...] + + +class MHA(nn.Module): + """Multi-head self-attention and cross-attention""" + + def __init__( + self, + embed_dim, + num_heads, + num_heads_kv=None, + cross_attn=False, + qkv_proj_bias=True, + out_proj_bias=True, + dropout=0.0, + softmax_scale=None, + causal=False, + layer_idx=None, + dwconv=False, + rotary_emb_dim=0, + rotary_emb_base=10000.0, + rotary_emb_scale_base=None, + rotary_emb_interleaved=False, + use_alibi=False, + window_size=(-1, -1), + fused_bias_fc=False, + use_flash_attn=False, + return_residual=False, + checkpointing=False, + device=None, + dtype=None, + ) -> None: + """ + num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads. + return_residual: whether to return the input x along with the output. This is for + performance reason: for post-norm architecture, returning the input allows us + to fuse the backward of nn.Linear with the residual connection. + """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.cross_attn = cross_attn + self.causal = causal + self.layer_idx = layer_idx + self.dwconv = dwconv + self.rotary_emb_dim = rotary_emb_dim + self.use_flash_attn = use_flash_attn + self.return_residual = return_residual + self.checkpointing = checkpointing + if use_alibi: + assert use_flash_attn, "ALiBi code path requires flash_attn" + alibi_slopes = torch.tensor(get_alibi_slopes(num_heads), device=device) + else: + alibi_slopes = None + if window_size != (-1, -1): + assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn" + + self.num_heads = num_heads + self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads + assert ( + self.num_heads % self.num_heads_kv == 0 + ), "num_heads must be divisible by num_heads_kv" + assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads" + self.head_dim = self.embed_dim // num_heads + qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv) + kv_dim = 2 * self.head_dim * self.num_heads_kv + + if self.rotary_emb_dim > 0: + assert not cross_attn, "MHA with rotary embedding does not support cross-attention yet" + assert RotaryEmbedding is not None, "rotary_emb is not installed" + self.rotary_emb = RotaryEmbedding( + self.rotary_emb_dim, + base=rotary_emb_base, + scale_base=rotary_emb_scale_base, + interleaved=rotary_emb_interleaved, + device=device, + ) + + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + linear_cls = nn.Linear if not fused_bias_fc else FusedDense + linear_resid_cls = ( + LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True) + ) + wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls + inner_attn_cls = ( + partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else SelfAttention + ) + inner_cross_attn_cls = ( + partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else CrossAttention + ) + if not self.cross_attn: + self.Wqkv = wqkv_cls(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs) + else: + self.Wq = linear_cls(embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs) + self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs) + if self.dwconv: + if self.num_heads_kv == self.num_heads: + self.dwconv_qkv = nn.Conv1d( + qkv_dim, qkv_dim, kernel_size=3, padding=2, groups=qkv_dim + 
) + else: + self.dwconv_q = nn.Conv1d( + embed_dim, embed_dim, kernel_size=3, padding=2, groups=embed_dim + ) + self.dwconv_kv = nn.Conv1d(kv_dim, kv_dim, kernel_size=3, padding=2, groups=kv_dim) + self.inner_attn = inner_attn_cls( + causal=causal, + softmax_scale=softmax_scale, + attention_dropout=dropout, + ) + self.inner_cross_attn = inner_cross_attn_cls( + causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout + ) + self.out_proj = linear_cls(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): + dtype = self.out_proj.weight.dtype if dtype is None else dtype + device = self.out_proj.weight.device + return torch.empty( + batch_size, + max_seqlen, + 2, + self.num_heads_kv, + self.head_dim, + dtype=dtype, + device=device, + ) + + def _update_kv_cache(self, kv, inference_params): + """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)""" + assert not self.dwconv, "Generation does not support dwconv yet" + assert self.layer_idx is not None, "Generation requires layer_idx in the constructor" + return _update_kv_cache(kv, inference_params, self.layer_idx) + + def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params): + """ + Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention. + q: (batch_size, seqlen_q, nheads, head_dim) + kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim) + """ + assert inference_params is not None and inference_params.seqlen_offset > 0 + assert self.use_flash_attn + if self.rotary_emb_dim > 0: + assert self.rotary_emb.scale is None, "This code path does not support xPos" + self.rotary_emb._update_cos_sin_cache( + inference_params.max_seqlen, device=q.device, dtype=q.dtype + ) + rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached + else: + rotary_cos, rotary_sin = None, None + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + context = flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + rotary_cos=rotary_cos, + rotary_sin=rotary_sin, + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False, + alibi_slopes=alibi_slopes, + ) + return context + + def _update_kvcache_attention(self, q, kv, inference_params): + """Write kv to inference_params, then do attention""" + if ( + inference_params.seqlen_offset == 0 + or flash_attn_with_kvcache is None + or not self.use_flash_attn + ): + # TODO: this only uses seqlen_offset and not lengths_per_sample. 
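            # Fallback path (prefill, kernel unavailable, or not using flash-attn): write the
            # new keys/values into the cache with plain indexing, then run regular
            # cross-attention over the cached prefix. The else-branch is the decode fast path,
            # where flash_attn_with_kvcache appends to the cache and attends in one fused kernel.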
+ kv = self._update_kv_cache(kv, inference_params) + return self.inner_cross_attn(q, kv) + else: + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + return flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + alibi_slopes=alibi_slopes, + ) + + def forward( + self, + x, + x_kv=None, + key_padding_mask=None, + cu_seqlens=None, + max_seqlen=None, + mixer_subset=None, + inference_params=None, + **kwargs, + ): + """ + Arguments: + x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if + cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total + is the is the sum of the sequence lengths in the batch. + x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x. + cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths + of the sequences in the batch, used to index into x. Only applicable when using + FlashAttention. + max_seqlen: int. Maximum sequence length in the batch. + key_padding_mask: boolean mask, True means to keep, False means to mask out. + (batch, seqlen). Only applicable when not using FlashAttention. + mixer_subset: for cross-attention only. If not None, will take a subset of x + before applying the query projection. Useful for e.g., ViT where we only care + about the CLS token in the last layer. + inference_params: for generation. Adapted from Megatron-LM (and Apex) + https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470 + """ + if cu_seqlens is not None: + assert max_seqlen is not None + assert key_padding_mask is None + assert self.use_flash_attn + assert not self.dwconv + assert self.rotary_emb_dim == 0 + if key_padding_mask is not None: + assert cu_seqlens is None + assert max_seqlen is None + assert not self.use_flash_attn + if inference_params is not None: + assert key_padding_mask is None + assert cu_seqlens is None and max_seqlen is None + assert not self.dwconv + + kwargs = ( + {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen, **kwargs} + if self.use_flash_attn + else {"key_padding_mask": key_padding_mask, **kwargs} + ) + seqlen_offset = ( + 0 + if inference_params is None + else ( + inference_params.lengths_per_sample + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + ) + rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None + batch, seqlen = x.shape[:2] + if not self.cross_attn and self.num_heads_kv == self.num_heads: + assert x_kv is None and mixer_subset is None + if not self.return_residual: + qkv = self.Wqkv(x) + else: + qkv, x = self.Wqkv(x) + if self.dwconv: + qkv = rearrange( + self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2], "b d s -> b s d" + ).contiguous() + qkv = rearrange(qkv, "... (three h d) -> ... 
three h d", three=3, d=self.head_dim) + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + qkv = self.rotary_emb( + qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_attn(qkv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs) + else: + context = self._update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + context = self._apply_rotary_update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + if self.cross_attn: + if not self.return_residual: + q = self.Wq(x if mixer_subset is None else x[:, mixer_subset]) + kv = self.Wkv(x_kv if x_kv is not None else x) + else: + if x_kv is not None: + kv, x_kv = self.Wkv(x_kv) + else: + kv, x = self.Wkv(x) + q = self.Wq(x if mixer_subset is None else x[:, mixer_subset]) + else: + assert self.num_heads_kv != self.num_heads + if not self.return_residual: + qkv = self.Wqkv(x) + else: + qkv, x = self.Wqkv(x) + q = qkv[..., : self.num_heads * self.head_dim] + kv = qkv[..., self.num_heads * self.head_dim :] + q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim) + kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim) + if self.dwconv: + q = rearrange( + self.dwconv_q(rearrange(q, "b s d -> b d s"))[..., :-2], "b d s -> b s d" + ).contiguous() + kv = rearrange( + self.dwconv_kv(rearrange(kv, "b s d -> b d s"))[..., :-2], "b d s -> b s d" + ).contiguous() + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + q, kv = self.rotary_emb( + q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_cross_attn(q, kv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint( + self.inner_cross_attn, q, kv, **kwargs + ) + else: + context = self._update_kvcache_attention(q, kv, inference_params) + else: + context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params) + out = self.out_proj(rearrange(context, "... h d -> ... 
(h d)")) + return out if not self.return_residual else (out, x) + + +class ParallelMHA(nn.Module): + """Multi-head self-attention and cross-attention""" + + def __init__( + self, + embed_dim, + num_heads, + process_group, + num_heads_kv=None, + qkv_proj_bias=True, + out_proj_bias=True, + dropout=0.0, + softmax_scale=None, + causal=False, + layer_idx=None, + rotary_emb_dim=0, + rotary_emb_base=10000.0, + rotary_emb_scale_base=None, + rotary_emb_interleaved=False, + use_alibi=False, + window_size=(-1, -1), + use_flash_attn=False, + checkpointing=False, + sequence_parallel=True, + device=None, + dtype=None, + ) -> None: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.causal = causal + self.layer_idx = layer_idx + self.rotary_emb_dim = rotary_emb_dim + self.use_flash_attn = use_flash_attn + self.checkpointing = checkpointing + self.process_group = process_group + self.world_size = process_group.size() + self.local_rank = torch.distributed.get_rank(process_group) + + self.num_heads = num_heads + assert self.embed_dim % self.num_heads == 0, "embed_dim must be divisible by num_heads" + + self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads + assert ( + self.num_heads % self.num_heads_kv == 0 + ), "num_heads must be divisible by num_heads_kv" + + self.num_heads_per_rank = get_dim_for_local_rank( + self.num_heads, self.world_size, self.local_rank + ) + self.num_heads_kv_per_rank = get_dim_for_local_rank( + self.num_heads_kv, self.world_size, self.local_rank + ) + self.head_dim = self.embed_dim // num_heads + qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv) + + if use_alibi: + assert use_flash_attn, "ALiBi code path requires flash_attn" + num_heads_local = math.ceil(self.num_heads / self.world_size) + alibi_slopes = torch.tensor( + get_alibi_slopes(num_heads)[ + self.local_rank * num_heads_local : (self.local_rank + 1) * num_heads_local + ], + device=device, + ) + else: + alibi_slopes = None + if window_size != (-1, -1): + assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn" + + if self.rotary_emb_dim > 0: + assert RotaryEmbedding is not None, "rotary_emb is not installed" + self.rotary_emb = RotaryEmbedding( + self.rotary_emb_dim, + base=rotary_emb_base, + scale_base=rotary_emb_scale_base, + interleaved=rotary_emb_interleaved, + device=device, + ) + + if ColumnParallelLinear is None or RowParallelLinear is None: + raise ImportError("fused_dense is not installed") + self.Wqkv = ColumnParallelLinear( + embed_dim, + qkv_dim, + process_group, + bias=qkv_proj_bias, + sequence_parallel=sequence_parallel, + multiple_of=self.head_dim * (self.num_heads // self.num_heads_kv + 2), + **factory_kwargs, + ) + inner_attn_cls = ( + partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else SelfAttention + ) + inner_cross_attn_cls = ( + partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size) + if use_flash_attn + else CrossAttention + ) + self.inner_attn = inner_attn_cls( + causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout + ) + self.inner_cross_attn = inner_cross_attn_cls( + causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout + ) + self.out_proj = RowParallelLinear( + embed_dim, + embed_dim, + process_group, + bias=out_proj_bias, + sequence_parallel=sequence_parallel, + multiple_of=self.head_dim, + **factory_kwargs, + ) + + def allocate_inference_cache(self, 
batch_size, max_seqlen, dtype=None): + dtype = self.out_proj.weight.dtype if dtype is None else dtype + device = self.out_proj.weight.device + return torch.empty( + batch_size, + max_seqlen, + 2, + self.num_heads_kv_per_rank, + self.head_dim, + dtype=dtype, + device=device, + ) + + def _update_kv_cache(self, kv, inference_params): + """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)""" + assert self.layer_idx is not None, "Generation requires layer_idx in the constructor" + return _update_kv_cache(kv, inference_params, self.layer_idx) + + def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params): + """ + Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention. + q: (batch_size, seqlen_q, nheads, head_dim) + kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim) + """ + assert inference_params is not None and inference_params.seqlen_offset > 0 + assert self.use_flash_attn + if self.rotary_emb_dim > 0: + assert self.rotary_emb.scale is None, "This code path does not support xPos" + self.rotary_emb._update_cos_sin_cache( + inference_params.max_seqlen, device=q.device, dtype=q.dtype + ) + rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached + else: + rotary_cos, rotary_sin = None, None + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + context = flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + rotary_cos=rotary_cos, + rotary_sin=rotary_sin, + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False, + alibi_slopes=alibi_slopes, + ) + return context + + def _update_kvcache_attention(self, q, kv, inference_params): + """Write kv to inference_params, then do attention""" + if inference_params.seqlen_offset == 0 or not self.use_flash_attn: + # TODO: this only uses seqlen_offset and not lengths_per_sample. + kv = self._update_kv_cache(kv, inference_params) + return self.inner_cross_attn(q, kv) + else: + batch = q.shape[0] + kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch] + cache_seqlens = ( + inference_params.lengths_per_sample[:batch] + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None) + context = flash_attn_with_kvcache( + q, + kv_cache[:, :, 0], + kv_cache[:, :, 1], + kv[:, :, 0], + kv[:, :, 1], + cache_seqlens=cache_seqlens, + softmax_scale=self.inner_cross_attn.softmax_scale, + causal=self.inner_cross_attn.causal, + alibi_slopes=alibi_slopes, + ) + return context + + def forward(self, x, seqlen=None, inference_params=None, **kwargs): + """ + Arguments: + x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None. + If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we + split x during sequence parallel, we split the batch * seqlen dimension + (in case batch is small). + """ + qkv = self.Wqkv(x) + if seqlen is not None: + qkv = rearrange(qkv, "(b s) ... 
-> b s ...", s=seqlen) + seqlen_offset = ( + 0 + if inference_params is None + else ( + inference_params.lengths_per_sample + if inference_params.lengths_per_sample is not None + else inference_params.seqlen_offset + ) + ) + rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None + if self.num_heads_kv == self.num_heads: + qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, d=self.head_dim) + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + qkv = self.rotary_emb( + qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_attn(qkv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs) + else: + context = self._update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + context = self._apply_rotary_update_kvcache_attention( + qkv[:, :, 0], qkv[:, :, 1:], inference_params + ) + else: + q = rearrange( + qkv[..., : self.num_heads_per_rank * self.head_dim], + "... (h d) -> ... h d", + d=self.head_dim, + ) + kv = rearrange( + qkv[..., self.num_heads_per_rank * self.head_dim :], + "... (two hkv d) -> ... two hkv d", + two=2, + d=self.head_dim, + ) + if ( + inference_params is None + or inference_params.seqlen_offset == 0 + or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0) + or not self.use_flash_attn + ): + if self.rotary_emb_dim > 0: + q, kv = self.rotary_emb( + q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen + ) + if inference_params is None: + if not self.checkpointing: + context = self.inner_cross_attn(q, kv, **kwargs) + else: + context = torch.utils.checkpoint.checkpoint( + self.inner_cross_attn, q, kv, **kwargs + ) + else: + context = self._update_kvcache_attention(q, kv, inference_params) + else: + context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params) + context = rearrange(context, "b s h d -> b s (h d)") + if seqlen is not None: + context = rearrange(context, "b s d -> (b s) d") + out = self.out_proj(context) + return out diff --git a/mha_bwd.cpp b/mha_bwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..884215adf0c53cdda06189a68d2a1cfca5250f75 --- /dev/null +++ b/mha_bwd.cpp @@ -0,0 +1,379 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_bwd.hpp" +#include "mask.hpp" + +fmha_bwd_traits get_ck_fmha_bwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool enable_alibi) +{ + return fmha_bwd_traits{head_size, + head_size, + dtype, + false, // is_group_mode + mask.type, + enable_alibi ? 
bias_enum::alibi : bias_enum::no_bias, + false, // has_dbias + has_dropout}; +} + +fmha_bwd_args get_ck_fmha_bwd_args(const mask_info &mask, + // sizes + const int b, + const int seqlen_q, + const int seqlen_k, + const int h, + const int h_k, + const int hdim, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + c10::optional &alibi_slopes_, + const at::Tensor out, + const at::Tensor softmax_lse, + const at::Tensor dout, + at::Tensor d, + at::Tensor dq, + at::Tensor dk, + at::Tensor dv, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (batch_size, seqlen_q, nheads, hdim) + // k: (batch_size, seqlen_k, nheads_k, hdim) + // v: (batch_size, seqlen_k, nheads_k, hdim) + // o: (batch_size, seqlen_q, nheads, hdim) + // dq: (batch_size, seqlen_q, nheads, hdim) + // dk_expanded: (batch_size, seqlen_k, nheads, hdim) + // dv_expanded: (batch_size, seqlen_k, nheads, hdim) + // do: (batch_size, seqlen_q, nheads, hdim) + + // alibi_slopes:(batch_size, nheads) or (nhead) + // lse: (batch_size, nheads, seqlen_q) + // d: (batch_size, nheads, seqlen_q) + + ck_tile::index_t stride_q = q.stride(1); + ck_tile::index_t stride_k = k.stride(1); + ck_tile::index_t stride_v = v.stride(1); + ck_tile::index_t stride_o = out.stride(1); + ck_tile::index_t stride_do = dout.stride(1); + ck_tile::index_t stride_dk = dk.stride(1); + ck_tile::index_t stride_dv = dv.stride(1); + + ck_tile::index_t nhead_stride_q = q.stride(2); + ck_tile::index_t nhead_stride_k = k.stride(2); + ck_tile::index_t nhead_stride_v = v.stride(2); + ck_tile::index_t nhead_stride_o = out.stride(2); + ck_tile::index_t nhead_stride_do = dout.stride(2); + ck_tile::index_t nhead_stride_lse = softmax_lse.stride(1); + + ck_tile::index_t batch_stride_q = q.stride(0); + ck_tile::index_t batch_stride_k = k.stride(0); + ck_tile::index_t batch_stride_v = v.stride(0); + ck_tile::index_t batch_stride_o = out.stride(0); + ck_tile::index_t batch_stride_do = dout.stride(0); + ck_tile::index_t batch_stride_lse = softmax_lse.stride(0); + ck_tile::index_t batch_stride_dk = dk.stride(0); + ck_tile::index_t batch_stride_dv = dv.stride(0); + + float p_undrop = 1.0 - p_dropout; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? 
alibi_slopes.stride(0) : 0; + } + + return fmha_bwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + out.data_ptr(), + softmax_lse.data_ptr(), + dout.data_ptr(), + d.data_ptr(), + nullptr, // rand_val + dq.data_ptr(), + dk.data_ptr(), + dv.data_ptr(), + nullptr, // dbias + nullptr, // seqstart_q + nullptr, // seqstart_k + nullptr, // seqlen_k_ptr + seqlen_q, + seqlen_k, + b, + seqlen_q, // max_seqlen_q + seqlen_k, // max_seqlen_k + hdim, // hdim_q + hdim, // hdim_v + h, // nhead + h_k, // nhead_k + softmax_scale, + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_o, + 0, // stride_randval + stride_do, + stride_dk, + stride_dv, + 0, // stride_dbias, FA without bias + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_o, + 0, // nhead_stride_randval + nhead_stride_do, + nhead_stride_lse, + 0, // nhead_stride_dbias, FA without dbias + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0 , // batch_stride_bias, FA without bias + batch_stride_o, + 0, // batch_stride_randval + batch_stride_do, + batch_stride_lse, + batch_stride_dk, + batch_stride_dv, + 0 , // batch_stride_dbias, FA without dbias + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + p_undrop, + false, // s_randval + {drop_seed, drop_offset}}; +} + +std::vector +mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + c10::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool deterministic, + c10::optional gen_, + c10::optional &rng_state) +{ +#ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); +#endif + if (is_causal) { window_size_right = 0; } + + bool is_dropout = p_dropout > 0.0; + auto stream = at::cuda::getCurrentHIPStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? 
"fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + const int seqlen_q = sizes[1]; + const int num_heads = sizes[2]; + const int head_size_og = dout.size(3); // unpadded hdim + const int head_size_8x = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_8x % 8 == 0, "head_size_8x should be a multiple of 8"); + TORCH_CHECK(head_size_8x <= 128, "CK FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + TORCH_CHECK(head_size_8x == round_multiple(head_size_og, 8), "head_size_8x must be head_size_og rounded to a multiple of 8"); + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + mask_info mask; + if (is_causal) { + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", seqlen_q, seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
+ std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // local + } + + // q, k, v, out had been padded in mha_fwd + // dq_, dk_, dv_ are also padded tensor + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_8x); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_8x); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_8x); + CHECK_SHAPE(out, batch_size, seqlen_q, num_heads, head_size_8x); + CHECK_SHAPE(dout, batch_size, seqlen_q, num_heads, head_size_og); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, batch_size, seqlen_q, num_heads, head_size_8x); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, batch_size, seqlen_k, num_heads_k, head_size_8x); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, batch_size, seqlen_k, num_heads_k, head_size_8x); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_d = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(at::kFloat)); + // TODO - CK does not support dq_accum + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size_8x}, opts); + dv_expanded = torch::empty({batch_size, seqlen_k, num_heads, head_size_8x}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + + if (rng_state.has_value()) { + uint64_t* d = reinterpret_cast(rng_state.value().data_ptr()); + drop_seed = d[0]; + drop_offset = d[1]; + } else if(is_dropout) { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + if (seqlen_q > 0) { + ck_tile::stream_config stream_config{stream}; + dq.zero_(); // ck use atomic operation on dq + + auto traits = + get_ck_fmha_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_bwd_args( + mask, + batch_size, + seqlen_q, + seqlen_k, + num_heads, + num_heads_k, + head_size_8x, + q, + k, + v, + alibi_slopes_, + out, + softmax_lse, + dout_padded, + softmax_d, + dq, + dk_expanded, + dv_expanded, + softmax_scale, + p_dropout, + 
drop_seed, + drop_offset); + + fmha_bwd(traits, args, stream_config); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {3}); + at::sum_out(dv, at::reshape(dv_expanded, {batch_size, seqlen_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {3}); + } + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d }; +} \ No newline at end of file diff --git a/mha_fwd.cpp b/mha_fwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c1eeba5070885af479abb2e97e8c737afa91fba3 --- /dev/null +++ b/mha_fwd.cpp @@ -0,0 +1,348 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_fwd.hpp" +#include "mask.hpp" + +fmha_fwd_traits get_ck_fmha_fwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool has_lse, + bool enable_alibi) +{ + return fmha_fwd_traits{head_size, + head_size, + dtype, + false, // is_group_mode + true, // is_v_rowmajor + mask.type, + enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + has_lse, + has_dropout, + false}; // do_fp8_static_quant +} + +fmha_fwd_args get_ck_fmha_fwd_args(bool has_lse, + bool has_dropout_randval, + const mask_info &mask, + // sizes + const int b, + const int seqlen_q, + const int seqlen_k, + const int h, + const int h_k, + const int d, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + c10::optional &alibi_slopes_, + at::Tensor out, + at::Tensor softmax_lse, + at::Tensor dropout_randval, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (batch_size, seqlen_q, nheads, d) + // k: (batch_size, seqlen_k, nheads_k, d) + // v: (batch_size, seqlen_k, nheads_k, d) + // o: (batch_size, seqlen_q, nheads, d) + + // alibi_slopes:(batch_size, nheads) or (nhead) + // lse: (batch_size, nheads, seqlen_q) + // randval: (batch_size, nheads, seqlen_q, seqlen_k) + + ck_tile::index_t stride_q = q.stride(1); + ck_tile::index_t stride_k = k.stride(1); + ck_tile::index_t stride_v = v.stride(1); + ck_tile::index_t stride_o = out.stride(1); + ck_tile::index_t stride_randval = has_dropout_randval ? dropout_randval.stride(2) : 0; + + ck_tile::index_t nhead_stride_q = q.stride(2); + ck_tile::index_t nhead_stride_k = k.stride(2); + ck_tile::index_t nhead_stride_v = v.stride(2); + ck_tile::index_t nhead_stride_o = out.stride(2); + ck_tile::index_t nhead_stride_lse = has_lse ? softmax_lse.stride(1) : 0; + ck_tile::index_t nhead_stride_randval = has_dropout_randval ? dropout_randval.stride(1) : 0; + + ck_tile::index_t batch_stride_q = q.stride(0); + ck_tile::index_t batch_stride_k = k.stride(0); + ck_tile::index_t batch_stride_v = v.stride(0); + ck_tile::index_t batch_stride_o = out.stride(0); + + ck_tile::index_t batch_stride_lse = has_lse ? 
softmax_lse.stride(0) : 0; + ck_tile::index_t batch_stride_randval = has_dropout_randval ? dropout_randval.stride(0) : 0; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + } + + return fmha_fwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + has_dropout_randval ? dropout_randval.data_ptr() : nullptr, + nullptr, // lse_acc + nullptr, // o_acc + has_lse ? softmax_lse.data_ptr() : nullptr, + out.data_ptr(), + nullptr, // seqstart_q + nullptr, // seqstart_k + nullptr, + seqlen_q, + seqlen_k, + b, + seqlen_q, // max_seqlen_q + d, // hdim_q + d, // hdim_v + h, // nhead + h_k, // nhead_k + 1, // num_splits + softmax_scale, // scale_s + 1, // scale_p + 1, // scale_o + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_randval, + 0, // stride_o_acc, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_randval, + nhead_stride_lse, + 0, // nhead_stride_lse_acc + 0, // nhead_stride_o_acc + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0, // batch_stride_bias, FA without bias + batch_stride_randval, + batch_stride_lse, + 0, // batch_stride_lse_acc + 0, // batch_stride_o_acc + batch_stride_o, + 0, // split_stride_lse_acc + 0, // split_stride_o_acc + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + has_dropout_randval, + {drop_seed, drop_offset}}; +} + +std::vector +mha_fwd(at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + c10::optional &out_, // batch_size x seqlen_q x num_heads x head_size + c10::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, + const float softmax_scale, + bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool return_dropout_randval, + c10::optional gen_) +{ + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? 
"fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + + const auto sizes = q.sizes(); + + const int batch_size = sizes[0]; + int seqlen_q = sizes[1]; + int num_heads = sizes[2]; + const int head_size_og = sizes[3]; + const int seqlen_k = k.size(1); + const int num_heads_k = k.size(2); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_og <= 256, "CK only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (window_size_left >= seqlen_k) { window_size_left = -1; } + if (window_size_right >= seqlen_k) { window_size_right = -1; } + + // causal=true is the same as causal=false in this case + if (seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } + + mask_info mask; + if (is_causal) { + // Causal is the special case where window_size_right == 0 and window_size_left < 0. + window_size_right = 0; + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", seqlen_q, seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, seqlen_q, seqlen_k); // local + } + + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && window_size_left < 0 && window_size_right < 0 && p_dropout == 0.f && head_size_og % 8 == 0 && !alibi_slopes_.has_value(); + const int ngroups = num_heads / num_heads_k; + if (seqlenq_ngroups_swapped) { + q = q.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + seqlen_q = ngroups; + num_heads = num_heads_k; + } + + CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size_og); + CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size_og); + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } + else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, batch_size, sizes[1], sizes[2], head_size_og); + if (seqlenq_ngroups_swapped) { + out = out.reshape({batch_size, num_heads_k, ngroups, head_size_og}).transpose(1, 2); + } + if (head_size_og % 8 != 0) { out = 
torch::empty_like(q_padded); } + } + else { + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_8x = round_multiple(head_size_og, 8); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + bool has_lse = true; + bool has_dropout = p_dropout > 0.0f; + + at::Tensor softmax_lse; + // TODO - check gradient, only training require lse + softmax_lse = torch::empty({batch_size, num_heads, seqlen_q}, opts.dtype(torch::kFloat32)); + + at::Tensor p; + if (return_dropout_randval) { + TORCH_CHECK(has_dropout, "return_dropout_randval require p_dropout > 0"); + p = torch::empty({batch_size, num_heads, seqlen_q, seqlen_k}, opts.dtype(torch::kUInt8)); + } + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + + if (p_dropout > 0.0) { + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + rng_state[0] = *(reinterpret_cast(&drop_seed)); + rng_state[1] = *(reinterpret_cast(&drop_offset)); + + if (seqlen_k > 0) { + auto stream = at::cuda::getCurrentHIPStream().stream(); + ck_tile::stream_config stream_config{stream}; + + auto traits = + get_ck_fmha_fwd_traits(mask, q_dtype_str, head_size_8x, has_dropout, has_lse, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_fwd_args( + has_lse, + return_dropout_randval, + mask, + batch_size, + seqlen_q, + seqlen_k, + num_heads, + num_heads_k, + head_size_8x, + q_padded, + k_padded, + v_padded, + alibi_slopes_, + out, + softmax_lse, + p, + softmax_scale, + p_dropout, + drop_seed, + drop_offset); + + fmha_fwd(traits, args, stream_config); + } + else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. + out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + if (seqlenq_ngroups_swapped) { + out = out.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + out_padded = out_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og}); + softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1}); + } + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; +} diff --git a/mha_varlen_bwd.cpp b/mha_varlen_bwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8eabab154746c4b063df9aabba23b391705ff47 --- /dev/null +++ b/mha_varlen_bwd.cpp @@ -0,0 +1,406 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_bwd.hpp" +#include "mask.hpp" + +fmha_bwd_traits get_ck_fmha_varlen_bwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool enable_alibi) +{ + return fmha_bwd_traits{head_size, + head_size, + dtype, + true, // is_group_mode + mask.type, + enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + false, // has_dbias + has_dropout}; +} + +fmha_bwd_args get_ck_fmha_varlen_bwd_args(const mask_info &mask, + // sizes + const int b, + const int max_seqlen_q, + const int max_seqlen_k, + const int h, + const int h_k, + const int hdim, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + const at::Tensor seqlens_q, + const at::Tensor seqlens_k, + c10::optional &alibi_slopes_, + const at::Tensor out, + const at::Tensor softmax_lse, + const at::Tensor dout, + at::Tensor d, + at::Tensor dq, + at::Tensor dk, + at::Tensor dv, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (total_q, nheads, hdim) + // k: (total_k, nheads_k, hdim) + // v: (total_k, nheads_k, hdim) + // o: (total_q, nheads, hdim) + // dq: (total_q, nheads, hdim) + // dk_expanded: (total_k, nheads, hdim) + // dv_expanded: (total_k, nheads, hdim) + // do: (total_q, nheads, hdim) + + // alibi_slopes:(batch_size, nheads) or (nhead) + // lse: (batch_size, nheads, max_seqlen_q) + // d: (batch_size, nheads, max_seqlen_q) + + ck_tile::index_t total_q = q.size(0); + ck_tile::index_t total_k = k.size(0); + + ck_tile::index_t stride_q = q.stride(0); + ck_tile::index_t stride_k = k.stride(0); + ck_tile::index_t stride_v = v.stride(0); + ck_tile::index_t stride_o = out.stride(0); + ck_tile::index_t stride_do = dout.stride(0); + ck_tile::index_t stride_dk = dk.stride(0); + ck_tile::index_t stride_dv = dv.stride(0); + + ck_tile::index_t nhead_stride_q = q.stride(1); + ck_tile::index_t nhead_stride_k = k.stride(1); + ck_tile::index_t nhead_stride_v = v.stride(1); + ck_tile::index_t nhead_stride_o = out.stride(1); + ck_tile::index_t nhead_stride_do = dout.stride(1); + ck_tile::index_t nhead_stride_lse = softmax_lse.stride(1); + + ck_tile::index_t batch_stride_q = 0; + ck_tile::index_t batch_stride_k = 0; + ck_tile::index_t batch_stride_v = 0; + ck_tile::index_t batch_stride_o = 0; + ck_tile::index_t batch_stride_do = 0; + ck_tile::index_t batch_stride_lse = softmax_lse.stride(0);; + ck_tile::index_t batch_stride_dk = 0; + ck_tile::index_t batch_stride_dv = 0; + + float p_undrop = 1.0 - p_dropout; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? 
alibi_slopes.stride(0) : 0; + } + + return fmha_bwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + out.data_ptr(), + softmax_lse.data_ptr(), + dout.data_ptr(), + d.data_ptr(), + nullptr, // rand_val + dq.data_ptr(), + dk.data_ptr(), + dv.data_ptr(), + nullptr, // dbias + seqlens_q.data_ptr(), // seqstart_q + seqlens_k.data_ptr(), // seqstart_k + nullptr, // seqlen_k_ptr + total_q, + total_k, + b, + max_seqlen_q, // max_seqlen_q + max_seqlen_k, // max_seqlen_k + hdim, // hdim_q + hdim, // hdim_v + h, // nhead + h_k, // nhead_k + softmax_scale, + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_o, + 0, // stride_randval + stride_do, + stride_dk, + stride_dv, + 0, // stride_dbias, FA without bias + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_o, + 0, // nhead_stride_randval + nhead_stride_do, + nhead_stride_lse, + 0, // nhead_stride_dbias, FA without dbias + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0 , // batch_stride_bias, FA without bias + batch_stride_o, + 0, // batch_stride_randval + batch_stride_do, + batch_stride_lse, + batch_stride_dk, + batch_stride_dv, + 0 , // batch_stride_dbias, FA without dbias + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + p_undrop, + false, // s_randval + {drop_seed, drop_offset}}; +} + +std::vector +mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads x head_size + const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &out, // total_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x s softmax logsumexp + c10::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + c10::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + c10::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional &alibi_slopes_, // num_heads or b x num_heads + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool deterministic, + c10::optional gen_, + c10::optional &rng_state) +{ +#ifdef FLASHATTENTION_DISABLE_BACKWARD + TORCH_CHECK(false, "This flash attention build does not support backward."); +#endif + if (is_causal) { window_size_right = 0; } + + bool is_dropout = p_dropout > 0.0; + auto stream = at::cuda::getCurrentCUDAStream().stream(); + + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(out.dtype() == q_dtype, "query and out must have the same dtype"); + TORCH_CHECK(dout.dtype() == q_dtype, "query and dout must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == 
torch::kInt32, "cu_seqlens_k must have dtype int32"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? "fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(out); CHECK_DEVICE(dout); CHECK_DEVICE(softmax_lse); + CHECK_DEVICE(cu_seqlens_q); CHECK_DEVICE(cu_seqlens_k); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension"); + TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int total_q = sizes[0]; + const int batch_size = cu_seqlens_q.numel() - 1; + const int num_heads = sizes[1]; + const int head_size_og = dout.size(2); + const int head_size_8x = sizes[2]; + const int total_k = k.size(0); + const int num_heads_k = k.size(1); + TORCH_CHECK(batch_size > 0, "batch size must be positive"); + TORCH_CHECK(head_size_8x % 8 == 0, "head_size should be a multiple of 8"); + TORCH_CHECK(head_size_8x <= 128, "CK FlashAttention backward only supports head dimension at most 128"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + TORCH_CHECK(head_size_8x == round_multiple(head_size_og, 8), "head_size_8x must be head_size_og rounded to a multiple of 8"); + + if (window_size_left >= max_seqlen_k) { window_size_left = -1; } + if (window_size_right >= max_seqlen_k) { window_size_right = -1; } + + mask_info mask; + if (is_causal) { + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", max_seqlen_q, max_seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
+ std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // local + } + + // q, k, v, out had been padded in mha_fwd + // dq_, dk_, dv_ are also padded tensor + CHECK_SHAPE(q, total_q, num_heads, head_size_8x); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_8x); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_8x); + CHECK_SHAPE(out, total_q, num_heads, head_size_8x); + CHECK_SHAPE(dout, total_q, num_heads, head_size_og); + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + + at::Tensor dq, dk, dv; + if (dq_.has_value()) { + dq = dq_.value(); + TORCH_CHECK(dq.dtype() == q_dtype, "dq must have the same dtype as q"); + CHECK_DEVICE(dq); + TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension"); + CHECK_SHAPE(dq, total_q, num_heads, head_size_8x); + } else { + dq = torch::empty_like(q); + } + if (dk_.has_value()) { + dk = dk_.value(); + TORCH_CHECK(dk.dtype() == q_dtype, "dk must have the same dtype as q"); + CHECK_DEVICE(dk); + TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension"); + CHECK_SHAPE(dk, total_k, num_heads_k, head_size_8x); + } else { + dk = torch::empty_like(k); + } + if (dv_.has_value()) { + dv = dv_.value(); + TORCH_CHECK(dv.dtype() == q_dtype, "dv must have the same dtype as q"); + CHECK_DEVICE(dv); + TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension"); + CHECK_SHAPE(dv, total_k, num_heads_k, head_size_8x); + } else { + dv = torch::empty_like(v); + } + + at::Tensor dout_padded; + if (head_size_og % 8 != 0) { + dout_padded = torch::nn::functional::pad(dout, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } else { + dout_padded = dout; + } + + + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + auto softmax_d = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(at::kFloat)); + // TODO - CK does not support dq_accum + + at::Tensor dk_expanded, dv_expanded; + if (num_heads_k != num_heads) { // MQA / GQA + dk_expanded = torch::empty({total_k, num_heads, head_size_8x}, opts); + dv_expanded = torch::empty({total_k, num_heads, head_size_8x}, opts); + } else { + dk_expanded = dk; + dv_expanded = dv; + } + + if(zero_tensors) { + dq.zero_(); + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + + if (rng_state.has_value()) { + uint64_t* d = reinterpret_cast(rng_state.value().data_ptr()); + drop_seed = d[0]; + drop_offset = d[1]; + } else if(is_dropout) { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + if (max_seqlen_q > 0) { + ck_tile::stream_config stream_config{stream}; + dq.zero_(); // ck use atomic operation on dq + + auto traits = + get_ck_fmha_varlen_bwd_traits(mask, q_dtype_str, head_size_8x, is_dropout, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_varlen_bwd_args( + mask, + batch_size, + max_seqlen_q, + max_seqlen_k, + num_heads, + num_heads_k, + head_size_8x, + q, + k, + v, + cu_seqlens_q, + 
cu_seqlens_k, + alibi_slopes_, + out, + softmax_lse, + dout_padded, + softmax_d, + dq, + dk_expanded, + dv_expanded, + softmax_scale, + p_dropout, + drop_seed, + drop_offset); + + fmha_bwd(traits, args, stream_config); + } else { + // If seqlen_q == 0, then we have an empty tensor. We need to set the output to 0. + dk_expanded.zero_(); + dv_expanded.zero_(); + softmax_d.zero_(); + } + + // For MQA/GQA we need to sum dK and dV across the groups + if (num_heads_k != num_heads) { + at::sum_out(dk, at::reshape(dk_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {2}); + at::sum_out(dv, at::reshape(dv_expanded, {total_k, num_heads_k, num_heads / num_heads_k, head_size_8x}), {2}); + } + if (head_size_og % 8 != 0) { + dq = dq.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dk = dk.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + dv = dv.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + } + + return { dq, dk, dv, softmax_d }; +} \ No newline at end of file diff --git a/mha_varlen_fwd.cpp b/mha_varlen_fwd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2d2f4cfef1643bc796add7884d5a57a255d6218d --- /dev/null +++ b/mha_varlen_fwd.cpp @@ -0,0 +1,371 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#include "flash_common.hpp" + +#include "fmha_fwd.hpp" +#include "mask.hpp" + +fmha_fwd_traits get_ck_fmha_varlen_fwd_traits(const mask_info &mask, + std::string dtype, + int head_size, + bool has_dropout, + bool has_lse, + bool enable_alibi) +{ + return fmha_fwd_traits{head_size, + head_size, + dtype, + true, // is_group_mode + true, // is_v_rowmajor + mask.type, + enable_alibi ? bias_enum::alibi : bias_enum::no_bias, + has_lse, + has_dropout, + false}; // do_fp8_static_quant +} + +fmha_fwd_args get_ck_fmha_varlen_fwd_args(bool has_lse, + bool has_dropout_randval, + const mask_info &mask, + // sizes + const int b, + const int max_seqlen_q, + const int h, + const int h_k, + const int d, + // device pointers + const at::Tensor q, + const at::Tensor k, + const at::Tensor v, + const at::Tensor seqlens_q, + const at::Tensor seqlens_k, + c10::optional &alibi_slopes_, + at::Tensor out, + at::Tensor softmax_lse, + at::Tensor dropout_randval, + float softmax_scale, + float p_dropout, + uint64_t drop_seed, + uint64_t drop_offset) +{ + // q: (total_q, nheads, d) + // k: (total_k, nheads_k, d) + // v: (total_k, nheads_k, d) + // o: (total_q, nheads, d) + + // alibi_slopes:(batch, nheads) or (nhead) + // lse: (batch, nheads, max_seqlen_q) + // randval: (nheads, total_q, max_seqlen_k) + + ck_tile::index_t total_q = q.size(0); + ck_tile::index_t total_k = k.size(0); + + ck_tile::index_t stride_q = q.stride(0); + ck_tile::index_t stride_k = k.stride(0); + ck_tile::index_t stride_v = v.stride(0); + ck_tile::index_t stride_o = out.stride(0); + ck_tile::index_t stride_randval = has_dropout_randval ? dropout_randval.stride(1) : 0; + + ck_tile::index_t nhead_stride_q = q.stride(1); + ck_tile::index_t nhead_stride_k = k.stride(1); + ck_tile::index_t nhead_stride_v = v.stride(1); + ck_tile::index_t nhead_stride_o = out.stride(1); + ck_tile::index_t nhead_stride_lse = has_lse ? softmax_lse.stride(1) : 0; + ck_tile::index_t nhead_stride_randval = has_dropout_randval ? 
dropout_randval.stride(0) : 0; + + ck_tile::index_t batch_stride_q = 0; + ck_tile::index_t batch_stride_k = 0; + ck_tile::index_t batch_stride_v = 0; + ck_tile::index_t batch_stride_o = 0; + + ck_tile::index_t batch_stride_lse = has_lse ? softmax_lse.stride(0) : 0; + ck_tile::index_t batch_stride_randval = 0; + + void *alibi_slopes_ptr = nullptr; + ck_tile::index_t stride_alibi_slopes = 0; + + if (alibi_slopes_.has_value()) { + auto alibi_slopes = alibi_slopes_.value(); + CHECK_DEVICE(alibi_slopes); + TORCH_CHECK(alibi_slopes.stride(-1) == 1, "ALiBi slopes tensor must have contiguous last dimension"); + TORCH_CHECK(alibi_slopes.sizes() == torch::IntArrayRef({h}) || alibi_slopes.sizes() == torch::IntArrayRef({b, h})); + alibi_slopes_ptr = alibi_slopes.data_ptr(); + stride_alibi_slopes = alibi_slopes.dim() == 2 ? alibi_slopes.stride(0) : 0; + } + + return fmha_fwd_args{q.data_ptr(), + k.data_ptr(), + v.data_ptr(), + alibi_slopes_ptr, // bias + has_dropout_randval ? dropout_randval.data_ptr() : nullptr, + nullptr, // lse_acc + nullptr, // o_acc + has_lse ? softmax_lse.data_ptr() : nullptr, + out.data_ptr(), + seqlens_q.data_ptr(), // seqstart_q + seqlens_k.data_ptr(), // seqstart_k + nullptr, // seqlen_kpads + total_q, + total_k, + b, + max_seqlen_q, + d, // hdim_q + d, // hdim_v + h, // nhead + h_k, // nhead_k + 1, // num_splits + softmax_scale, // scale_s + 1, // scale_p + 1, // scale_o + stride_q, + stride_k, + stride_v, + stride_alibi_slopes, + stride_randval, + 0, // stride_o_acc, + stride_o, + nhead_stride_q, + nhead_stride_k, + nhead_stride_v, + 0, // nhead_stride_bias, FA without bias + nhead_stride_randval, + nhead_stride_lse, + 0, // nhead_stride_lse_acc + 0, // nhead_stride_o_acc + nhead_stride_o, + batch_stride_q, + batch_stride_k, + batch_stride_v, + 0, // batch_stride_bias, FA without bias + batch_stride_randval, + batch_stride_lse, + 0, // batch_stride_lse_acc + 0, // batch_stride_o_acc + batch_stride_o, + 0, // split_stride_lse_acc + 0, // split_stride_o_acc + mask.left, + mask.right, + static_cast(mask.type), + p_dropout, + has_dropout_randval, + {drop_seed, drop_offset}}; +} + +std::vector +mha_varlen_fwd(at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i or num_blocks x page_block_size x num_heads_k x head_size if there's a block_table. 
+ c10::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + c10::optional & /*seqused_k*/, + c10::optional &/*leftpad_k_*/, // batch_size + c10::optional &block_table_, // batch_size x max_num_blocks_per_seq + c10::optional &alibi_slopes_, // num_heads or b x num_heads + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + int window_size_left, + int window_size_right, + const float /*softcap*/, + const bool return_dropout_randval, + c10::optional gen_) +{ + auto q_dtype = q.dtype(); + TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, + "FlashAttention only support fp16 and bf16 data type"); + + TORCH_CHECK(k.dtype() == q_dtype, "query and key must have the same dtype"); + TORCH_CHECK(v.dtype() == q_dtype, "query and value must have the same dtype"); + TORCH_CHECK(cu_seqlens_q.dtype() == torch::kInt32, "cu_seqlens_q must have dtype int32"); + TORCH_CHECK(cu_seqlens_k.dtype() == torch::kInt32, "cu_seqlens_k must have dtype int32"); + + std::string q_dtype_str = q_dtype == torch::kFloat16 ? "fp16" : "bf16"; + + CHECK_DEVICE(q); CHECK_DEVICE(k); CHECK_DEVICE(v); + CHECK_DEVICE(cu_seqlens_q); + CHECK_DEVICE(cu_seqlens_k); + + // TODO - Support paged_KV + const bool paged_KV = block_table_.has_value(); + TORCH_CHECK(!paged_KV, "CK does not support paged_KV yet"); + + TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension"); + CHECK_CONTIGUOUS(cu_seqlens_q); + CHECK_CONTIGUOUS(cu_seqlens_k); + + const auto sizes = q.sizes(); + + const int batch_size = cu_seqlens_q.numel() - 1; + int num_heads = sizes[1]; + const int head_size_og = sizes[2]; + const int num_heads_k = k.size(1); + + const int max_num_blocks_per_seq = 0; + const int num_blocks = 0; + + if (max_seqlen_q == 1 && !alibi_slopes_.has_value()) { is_causal = false; } // causal=true is the same as causal=false in this case + + // TODO + // Faster to transpose q from (b, 1, (nheads_kv ngroups), d) to (b, ngroups, nheads_kv, d) in this case + // H/t Daniel Haziza + + const int total_q = q.size(0); + const int total_k = k.size(0); + + TORCH_CHECK(batch_size > 0, "batch size must be postive"); + TORCH_CHECK(head_size_og <= 256, "CK only supports head dimension at most 256"); + TORCH_CHECK(num_heads % num_heads_k == 0, "Number of heads in key/value must divide number of heads in query"); + + if (window_size_left >= max_seqlen_k) { window_size_left = -1; } + if (window_size_right >= max_seqlen_k) { window_size_right = -1; } + + mask_info mask; + + if (is_causal) { + // Causal is the special case where window_size_right == 0 and window_size_left < 0. + window_size_right = 0; + std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + "0"; + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // casual + } + else if (window_size_left == -1 && window_size_right == -1) { + mask = mask_info::decode("0", max_seqlen_q, max_seqlen_k); // no mask + } + else { + // Local is the more general case where window_size_right >= 0 or window_size_left >= 0. 
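+        // mask_info::decode parses the "b:left,right" string into the mask type and window
+        // bounds (mask.left / mask.right) that are forwarded to the CK kernel arguments below.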
+ std::string mask_identify = "b:" + std::to_string(window_size_left) + "," + std::to_string(window_size_right); + mask = mask_info::decode(mask_identify, max_seqlen_q, max_seqlen_k); // local + } + + CHECK_SHAPE(q, total_q, num_heads, head_size_og); + CHECK_SHAPE(k, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(v, total_k, num_heads_k, head_size_og); + CHECK_SHAPE(cu_seqlens_q, batch_size + 1); + CHECK_SHAPE(cu_seqlens_k, batch_size + 1); + + at::Tensor q_padded, k_padded, v_padded; + if (head_size_og % 8 != 0) { + q_padded = torch::nn::functional::pad(q, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + k_padded = torch::nn::functional::pad(k, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + v_padded = torch::nn::functional::pad(v, torch::nn::functional::PadFuncOptions({0, 8 - head_size_og % 8})); + } + else { + q_padded = q; + k_padded = k; + v_padded = v; + } + + at::Tensor out; + if (out_.has_value()) { + out = out_.value(); + TORCH_CHECK(out.dtype() == q_dtype, "Output must have the same dtype as inputs"); + CHECK_DEVICE(out); + TORCH_CHECK(out.stride(-1) == 1, "Output tensor must have contiguous last dimension"); + CHECK_SHAPE(out, total_q, num_heads, head_size_og); + + if (head_size_og % 8 != 0) { out = torch::empty_like(q_padded); } + } + else { + out = torch::empty_like(q_padded); + } + + auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; }; + const int head_size_8x = round_multiple(head_size_og, 8); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)q.get_device()}; + + auto opts = q.options(); + bool has_lse = true; + bool has_dropout = p_dropout > 0.0f; + + at::Tensor softmax_lse; + // TODO - check gradient, only training require lse + softmax_lse = torch::empty({batch_size, num_heads, max_seqlen_q}, opts.dtype(torch::kFloat32)); + + at::Tensor p; + if (return_dropout_randval) { + TORCH_CHECK(has_dropout, "return_dropout_randval require p_dropout > 0"); + p = torch::empty({num_heads, total_q, max_seqlen_k}, opts.dtype(torch::kUInt8)); + } + + if (zero_tensors) + { + out.zero_(); + softmax_lse.fill_(-std::numeric_limits::infinity()); + if (return_dropout_randval) {p.zero_();} + } + + uint64_t drop_seed = 1, drop_offset = 0; + int64_t counter_offset = batch_size * num_heads * ck_tile::get_warp_size(); + auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + auto rng_state = torch::empty({2}, options.dtype(torch::kInt64)); + + if (p_dropout > 0.0) { + auto gen = at::get_generator_or_default( + gen_, at::cuda::detail::getDefaultCUDAGenerator()); + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + auto philox_args = gen->philox_cuda_state(counter_offset); + std::tie(drop_seed, drop_offset) = flash::unpack(philox_args); + } + + rng_state[0] = *(reinterpret_cast(&drop_seed)); + rng_state[1] = *(reinterpret_cast(&drop_offset)); + + if (max_seqlen_k > 0) { + auto stream = at::cuda::getCurrentHIPStream().stream(); + ck_tile::stream_config stream_config{stream}; + + auto traits = + get_ck_fmha_varlen_fwd_traits(mask, q_dtype_str, head_size_8x, has_dropout, has_lse, alibi_slopes_.has_value()); + + auto args = + get_ck_fmha_varlen_fwd_args( + has_lse, + return_dropout_randval, + mask, + batch_size, + max_seqlen_q, + num_heads, + num_heads_k, + head_size_8x, + q_padded, + k_padded, + v_padded, + cu_seqlens_q, + cu_seqlens_k, + alibi_slopes_, + 
out, + softmax_lse, + p, + softmax_scale, + p_dropout, + drop_seed, + drop_offset); + + fmha_fwd(traits, args, stream_config); + } + else { + // If seqlen_k == 0, then we have an empty tensor. We need to set the output to 0. + out.zero_(); + softmax_lse.fill_(std::numeric_limits::infinity()); + } + + at::Tensor out_padded = out; + if (head_size_og % 8 != 0) { + out = out.index({"...", torch::indexing::Slice(torch::indexing::None, head_size_og)}); + if (out_.has_value()) { out_.value().copy_(out); } + } + + return {out, q_padded, k_padded, v_padded, out_padded, softmax_lse, p, rng_state}; +} diff --git a/mlflow.yaml b/mlflow.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfb3781b1591fc1d3190101da06dbce160396130 --- /dev/null +++ b/mlflow.yaml @@ -0,0 +1,10 @@ +# https://mlflow.org + +mlflow: + _target_: pytorch_lightning.loggers.mlflow.MLFlowLogger + experiment_name: ${name} + tracking_uri: null + tags: null + save_dir: ./mlruns + prefix: "" + artifact_location: null diff --git a/mlp.py b/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..b795310f1c8afc8203124597bb6ca70f1af7ed11 --- /dev/null +++ b/mlp.py @@ -0,0 +1,149 @@ +# The triton fused matmul + sqrelu is faster for fp16 but slower for bf16, compared +# to naive implementation. +import fused_dense_lib as fused_dense_cuda +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.cuda.amp import custom_bwd, custom_fwd + +from flash_attn.ops.activations import sqrelu_bwd, sqrelu_fwd +from flash_attn.ops.triton.linear import triton_dgrad_act, triton_linear_act + + +class FusedDenseSqreluDenseFunc(torch.autograd.Function): + @staticmethod + @custom_fwd + def forward(ctx, x, weight1, bias1, weight2, bias2, checkpoint_lvl=0): + """checkpoint_lvl: + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute act_input and gelu_out in the bwd + """ + if torch.is_autocast_enabled(): + dtype = torch.get_autocast_gpu_dtype() + x, weight1, bias1, weight2, bias2 = [ + a.to(dtype=dtype) for a in [x, weight1, bias1, weight2, bias2] + ] + is_bf16 = x.dtype == torch.bfloat16 + assert checkpoint_lvl in [0, 1, 2] + x = x.contiguous() + weight1 = weight1.contiguous() + bias1 = bias1.contiguous() + weight2 = weight2.contiguous() + bias2 = bias2.contiguous() + batch_shape, n = x.shape[:-1], x.shape[-1] + batch_dim = batch_shape.numel() + if is_bf16: + act_input = fused_dense_cuda.linear_bias_forward( + x.reshape(batch_dim, n), weight1, bias1 + ) + output1 = sqrelu_fwd(act_input) + else: + save_act_input = checkpoint_lvl != 2 + result = triton_linear_act( + x.reshape(batch_dim, n), + weight1, + bias1, + activation="squared_relu", + save_act_input=save_act_input, + ) + if save_act_input: + output1, act_input = result + else: + output1 = result + output2 = fused_dense_cuda.linear_bias_forward(output1, weight2, bias2) + ctx.checkpoint_lvl = checkpoint_lvl + if checkpoint_lvl == 0: + ctx.save_for_backward(x, weight1, bias1, weight2, act_input, output1) + elif checkpoint_lvl == 1: + ctx.save_for_backward(x, weight1, bias1, weight2, act_input) + elif checkpoint_lvl == 2: + ctx.save_for_backward(x, weight1, bias1, weight2) + return output2.reshape(*batch_shape, output2.shape[-1]) + + @staticmethod + @custom_bwd + def backward(ctx, grad_output): + grad_output = grad_output.contiguous() + checkpoint_lvl = ctx.checkpoint_lvl + x, weight1, bias1, weight2, *rest = ctx.saved_tensors + batch_shape, n = x.shape[:-1], x.shape[-1] + batch_dim = batch_shape.numel() + is_bf16 = 
x.dtype == torch.bfloat16 + if checkpoint_lvl == 0: + act_input, output1 = rest + elif checkpoint_lvl == 1: + (act_input,) = rest + output1 = sqrelu_fwd(act_input) + elif checkpoint_lvl == 2: + if is_bf16: + act_input = fused_dense_cuda.linear_bias_forward( + x.reshape(batch_dim, n), weight1, bias1 + ) + output1 = sqrelu_fwd(act_input) + else: + output1, act_input = triton_linear_act( + x.reshape(batch_dim, n), + weight1, + bias1, + activation="squared_relu", + save_act_input=True, + ) + + if is_bf16: + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad(output1, grad_output) + grad_output1 = grad_output @ weight2 + grad_act_input = sqrelu_bwd(grad_output1, act_input) + grad_input, grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_backward( + x.reshape(batch_dim, n), weight1, grad_act_input + ) + else: + grad_output = grad_output.reshape(batch_dim, grad_output.shape[-1]) + grad_weight2, grad_bias2 = fused_dense_cuda.linear_bias_wgrad(output1, grad_output) + grad_act_input = triton_dgrad_act( + grad_output, weight2, activation="squared_relu", act_input=act_input + ) + grad_input, grad_weight1, grad_bias1 = fused_dense_cuda.linear_bias_backward( + x.reshape(batch_dim, n), weight1, grad_act_input + ) + return grad_input.reshape_as(x), grad_weight1, grad_bias1, grad_weight2, grad_bias2, None + + +fused_dense_sqrelu_dense_function = FusedDenseSqreluDenseFunc.apply + + +class FusedDenseSqreluDense(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + bias1=True, + bias2=True, + checkpoint_lvl=0, + device=None, + dtype=None, + ): + """ + checkpoint_lvl (increasing lvl means slower but more memory saving): + 0: no recomputation in the bwd + 1: recompute gelu_out in the bwd + 2: recompute gelu_in and gelu_out in the bwd + """ + assert checkpoint_lvl in [0, 1, 2] + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features * 4 + assert bias1 == True, "DenseSqreluDense module without bias is currently not supported" + assert bias2 == True, "DenseSqreluDense module without bias is currently not supported" + self.checkpoint_lvl = checkpoint_lvl + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs) + + def forward(self, x): + assert x.is_cuda + return fused_dense_sqrelu_dense_function( + x, self.fc1.weight, self.fc1.bias, self.fc2.weight, self.fc2.bias, self.checkpoint_lvl + ) diff --git a/model-summary.yaml b/model-summary.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3dba049adc7820fe47b84d7eaeb660e5d0254cdb --- /dev/null +++ b/model-summary.yaml @@ -0,0 +1,2 @@ +model_summary: + _target_: pytorch_lightning.callbacks.RichModelSummary diff --git a/mse.yaml b/mse.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50b0484d9d0213bfa18b3e5b3e40047b990d0e02 --- /dev/null +++ b/mse.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +mse: + _target_: torchmetrics.MeanSquaredError diff --git a/multi-step.yaml b/multi-step.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42cd60716a4654f469aa7d6384098bc1068f381c --- /dev/null +++ b/multi-step.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: torch.optim.lr_scheduler.MultiStepLR diff --git a/named_barrier.hpp b/named_barrier.hpp new file 
mode 100644 index 0000000000000000000000000000000000000000..58c080f4ab151f6096fdb07f4b3d320d69545027 --- /dev/null +++ b/named_barrier.hpp @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cutlass/arch/barrier.h" + +namespace flash { + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Enumerates the reserved named barriers to avoid potential conflicts + +enum class FwdNamedBarriers { + QueryEmpty = 0, + ValueEmpty = 1, + TileCountSmemEmpty = 2, + TileCountSmemFull = 3, + WarpSchedulerWG1 = 4, + WarpSchedulerWG2 = 5, + WarpSchedulerWG3 = 6, + ProducerWG = 7 +}; + +enum class BwdNamedBarriers { + QueryEmpty = 0, + KVEmpty = 1, + TileCountSmemEmpty = 2, + TileCountSmemFull = 3, + // WarpSchedulerWG1 = 4, + // WarpSchedulerWG2 = 5, + dQEmptyWG1 = 4, + dQEmptyWG2 = 5, + dSFull = 6, + // dSEmptyWG1 = 7, + // dSEmptyWG2 = 8, + dQEmpty = 7, + dQFull = 8, +}; + +} // flash diff --git a/neptune.yaml b/neptune.yaml new file mode 100644 index 0000000000000000000000000000000000000000..117af9379319ff9b000311b863ddf42fe08b1b67 --- /dev/null +++ b/neptune.yaml @@ -0,0 +1,11 @@ +# https://neptune.ai + +neptune: + _target_: pytorch_lightning.loggers.neptune.NeptuneLogger + api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable + project_name: your_name/template-tests + close_after_fit: True + offline_mode: False + experiment_name: ${name} + experiment_id: null + prefix: "" diff --git a/none.yaml b/none.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/norm-monitor.yaml b/norm-monitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4c6e2ccb0c6418f083c387f887c01a0702b7fc5 --- /dev/null +++ b/norm-monitor.yaml @@ -0,0 +1,2 @@ +norm_monitor: + _target_: src.callbacks.norm_monitor.NormMonitor diff --git a/num-tokens.yaml b/num-tokens.yaml new file mode 100644 index 0000000000000000000000000000000000000000..047d423541e83ee8c8bc7debf3893507e89a4afb --- /dev/null +++ b/num-tokens.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +num-tokens: + _target_: src.metrics.num_tokens.NumTokens diff --git a/openwebtext.yaml b/openwebtext.yaml new file mode 100644 index 0000000000000000000000000000000000000000..327decbd86fff635e83ce45bfc7f2f088ab3c257 --- /dev/null +++ b/openwebtext.yaml @@ -0,0 +1,15 @@ +_target_: src.datamodules.language_modeling_hf.LMDataModule +dataset_name: openwebtext +dataset_config_name: null +tokenizer_name: gpt2 +cache_dir: ${oc.env:DATA_DIR,${data_dir}}/openwebtext/cache +max_length: 1024 +val_ratio: 0.0005 +val_split_seed: 2357 +add_eos: True +batch_size: 8 # per GPU +batch_size_eval: ${eval:${.batch_size} * 2} +num_workers: 32 # For preprocessing only +shuffle: True +pin_memory: True +__train_len: ${div_up:9035582198, ${.max_length}} diff --git a/opt.py b/opt.py new file mode 100644 index 0000000000000000000000000000000000000000..501f9eb6cf44be86aeb77a4e0f35048255850c30 --- /dev/null +++ b/opt.py @@ -0,0 +1,116 @@ +# Copyright (c) 2023, Tri Dao. 
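+#
+# Illustrative usage sketch (hypothetical example, not taken from this file): the two helpers
+# below are typically combined to convert a Hugging Face OPT checkpoint into the GPT-2-style
+# layout used here; "facebook/opt-125m" is only a placeholder model name.
+#
+#     from transformers import OPTConfig, OPTForCausalLM
+#
+#     opt_config = OPTConfig.from_pretrained("facebook/opt-125m")
+#     gpt2_config = opt_config_to_gpt2_config(opt_config)
+#     hf_state_dict = OPTForCausalLM.from_pretrained("facebook/opt-125m").state_dict()
+#     state_dict = remap_state_dict_hf_opt(hf_state_dict, gpt2_config)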
+ +import math +import re +from collections import OrderedDict + +import torch +import torch.nn.functional as F +from transformers import GPT2Config, OPTConfig + + +def remap_state_dict_hf_opt(state_dict, config): + def key_mapping_model(key): + key = re.sub(r"^model.decoder.", "transformer.", key) + # The OPT-350m model uses '^decoder' instead of '^model.decoder' + key = re.sub(r"^decoder.", "transformer.", key) + return key + + state_dict = OrderedDict((key_mapping_model(k), v) for k, v in state_dict.items()) + # Word embedding and position embedding + def key_mapping_emb(key): + key = re.sub(r"^transformer.embed_tokens.", "transformer.embeddings.word_embeddings.", key) + # The OPT-350m model uses has project_in and project_out + key = re.sub(r"^transformer.project_in.", "transformer.embeddings.project_in.", key) + key = re.sub(r"^transformer.project_out.", "project_out.", key) + key = re.sub( + r"^transformer.embed_positions.", "transformer.embeddings.position_embeddings.", key + ) + return key + + state_dict = OrderedDict((key_mapping_emb(k), v) for k, v in state_dict.items()) + # OPT uses the first 2 indices of pos_emb for padding tokens + pos_embeddings = state_dict.pop("transformer.embeddings.position_embeddings.weight") + state_dict["transformer.embeddings.position_embeddings.weight"] = pos_embeddings[2:] + word_embeddings = state_dict.pop("transformer.embeddings.word_embeddings.weight") + # It's possible that vocab_size is padded to be a multiple of 8, for example. + pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1) + vocab_size = math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple + state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad( + word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0]) + ) + state_dict["lm_head.weight"] = state_dict["transformer.embeddings.word_embeddings.weight"] + + # LayerNorm + def key_mapping_ln(key): + key = re.sub(r"^transformer.final_layer_norm.", r"transformer.ln_f.", key) + # The OPT-175B checkpoint calls this 'decoder.layer_norm' instead of 'decoder.final_layer_norm' + key = re.sub(r"^transformer.layer_norm.", r"transformer.ln_f.", key) + key = re.sub( + r"^transformer.layers.(\d+).self_attn_layer_norm.", r"transformer.layers.\1.norm1.", key + ) + key = re.sub( + r"^transformer.layers.(\d+).final_layer_norm.", r"transformer.layers.\1.norm2.", key + ) + return key + + state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items()) + + # MLP + def key_mapping_mlp(key): + return re.sub( + r"^transformer.layers.(\d+).fc(1|2).", r"transformer.layers.\1.mlp.fc\2.", key + ) + + state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items()) + + # Attention + for l in range(config.n_layer): + Wq = state_dict.pop(f"transformer.layers.{l}.self_attn.q_proj.weight") + Wk = state_dict.pop(f"transformer.layers.{l}.self_attn.k_proj.weight") + Wv = state_dict.pop(f"transformer.layers.{l}.self_attn.v_proj.weight") + bq = state_dict.pop(f"transformer.layers.{l}.self_attn.q_proj.bias") + bk = state_dict.pop(f"transformer.layers.{l}.self_attn.k_proj.bias") + bv = state_dict.pop(f"transformer.layers.{l}.self_attn.v_proj.bias") + state_dict[f"transformer.layers.{l}.mixer.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0) + state_dict[f"transformer.layers.{l}.mixer.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0) + + def key_mapping_attn(key): + return re.sub( + r"^transformer.layers.(\d+).self_attn.out_proj.", + r"transformer.layers.\1.mixer.out_proj.", + key, + ) + + 
state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items()) + + return state_dict + + +def opt_config_to_gpt2_config(opt_config: OPTConfig) -> GPT2Config: + assert opt_config.layerdrop == 0.0 + assert opt_config.layer_norm_elementwise_affine + word_embed_proj_dim = ( + None + if opt_config.word_embed_proj_dim == opt_config.hidden_size + else opt_config.word_embed_proj_dim + ) + return GPT2Config( + vocab_size=opt_config.vocab_size, + n_positions=opt_config.max_position_embeddings, + n_embd=opt_config.hidden_size, + n_layer=opt_config.num_hidden_layers, + n_head=opt_config.num_attention_heads, + n_inner=opt_config.ffn_dim, + activation_function=opt_config.activation_function, + resid_pdrop=opt_config.dropout, + # HF's implementation of OPT doesn't seem to have embedding dropout + embd_pdrop=opt_config.dropout, + attn_pdrop=opt_config.attention_dropout, + initializer_range=opt_config.init_std, + bos_token_id=opt_config.bos_token_id, + eos_token_id=opt_config.eos_token_id, + # These are new arguments not in the original GPT2Config + prenorm=opt_config.do_layer_norm_before, + word_embed_proj_dim=word_embed_proj_dim, + ) diff --git a/params-log.yaml b/params-log.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2a49dd8d7ade6be883b52f9e335fb574180da30 --- /dev/null +++ b/params-log.yaml @@ -0,0 +1,5 @@ +params_log: + _target_: src.callbacks.params_log.ParamsLog + total_params_log: True + trainable_params_log: True + non_trainable_params_log: True diff --git a/patch_embed.py b/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..05562f8e8bcdb58e947c6f402a49eacd2d031871 --- /dev/null +++ b/patch_embed.py @@ -0,0 +1,67 @@ +# We use the same API as https://github.com/rwightman/pytorch-image-models/blob/v0.6.11/timm/models/layers/patch_embed.py +# But we use nn.Linear instead of Conv2d and it's about 8x faster. 
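+#
+# Minimal usage sketch (illustrative; shapes follow the default arguments below):
+#
+#     import torch
+#     patch_embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
+#     x = torch.randn(2, 3, 224, 224)
+#     tokens = patch_embed(x)  # (2, 196, 768): 224 / 16 = 14 patches per side, 14 * 14 = 196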
+ +from functools import partial + +import torch.nn as nn +from einops import rearrange +from torch import _assert +from torch.nn.modules.utils import _pair + +try: + from flash_attn.ops.fused_dense import FusedDense +except ImportError: + FusedDense = None + + +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + bias=True, + fused_bias_fc=False, + ): + super().__init__() + img_size = _pair(img_size) + patch_size = _pair(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + if fused_bias_fc and FusedDense is None: + raise ImportError("fused_dense is not installed") + + linear_cls = nn.Linear if not fused_bias_fc or not bias else FusedDense + self.proj = linear_cls(in_chans * patch_size[0] * patch_size[1], embed_dim, bias=bias) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + _, _, H, W = x.shape + _assert( + H == self.img_size[0], + f"Input image height ({H}) doesn't match model ({self.img_size[0]}).", + ) + _assert( + W == self.img_size[1], + f"Input image width ({W}) doesn't match model ({self.img_size[1]}).", + ) + x = self.proj( + rearrange( + x, + "b c (h p1) (w p2) -> b h w (c p1 p2)", + p1=self.patch_size[0], + p2=self.patch_size[1], + ) + ) + if self.flatten: + x = rearrange(x, "b h w c -> b (h w) c") + x = self.norm(x) + return x diff --git a/perplexity.yaml b/perplexity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2edd2178828315901be9448710f30f145d874375 --- /dev/null +++ b/perplexity.yaml @@ -0,0 +1,3 @@ +# @package eval.metrics +ppl: + _target_: src.metrics.perplexity.Perplexity diff --git a/philox.cuh b/philox.cuh new file mode 100644 index 0000000000000000000000000000000000000000..cd7e4d2fae660d568546c4348bb26b6da7f6c297 --- /dev/null +++ b/philox.cuh @@ -0,0 +1,51 @@ +// Pytorch also has an implementation of Philox RNG: https://github.com/pytorch/pytorch/blob/8ca3c881db3e3510fcb7725389f6a0633c9b992c/torch/csrc/jit/tensorexpr/cuda_random.h +#pragma once +// Philox CUDA. 
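+// Counter-based RNG: philox(seed, subsequence, offset) below deterministically expands the
+// (seed, offset) pair saved by the host code in rng_state into four 32-bit random words,
+// which is what allows the dropout mask to be recomputed rather than stored.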
+ +namespace flash { + +struct ull2 { + unsigned long long x; + unsigned long long y; +}; + +__forceinline__ __device__ uint2 mulhilo32(const unsigned int a, const unsigned int b) { + uint2 *res; + unsigned long long tmp; + asm ("mul.wide.u32 %0, %1, %2;\n\t" + : "=l"(tmp) + : "r"(a), "r"(b)); + res = (uint2*)(&tmp); + return *res; +} + +__forceinline__ __device__ uint4 philox_single_round(const uint4 ctr, const uint2 key) { + constexpr unsigned long kPhiloxSA = 0xD2511F53; + constexpr unsigned long kPhiloxSB = 0xCD9E8D57; + uint2 res0 = mulhilo32(kPhiloxSA, ctr.x); + uint2 res1 = mulhilo32(kPhiloxSB, ctr.z); + uint4 ret = {res1.y ^ ctr.y ^ key.x, res1.x, res0.y ^ ctr.w ^ key.y, res0.x}; + return ret; +} + +__forceinline__ __device__ uint4 philox(unsigned long long seed, + unsigned long long subsequence, + unsigned long long offset) { + constexpr unsigned long kPhilox10A = 0x9E3779B9; + constexpr unsigned long kPhilox10B = 0xBB67AE85; + uint2 key = reinterpret_cast(seed); + uint4 counter; + ull2 *tmp = reinterpret_cast(&counter); + tmp->x = offset; + tmp->y = subsequence; + #pragma unroll + for (int i = 0; i < 6; i++) { + counter = philox_single_round(counter, key); + key.x += (kPhilox10A); + key.y += (kPhilox10B); + } + uint4 output = philox_single_round(counter, key); + return output; +} + +} // namespace flash diff --git a/plateau.yaml b/plateau.yaml new file mode 100644 index 0000000000000000000000000000000000000000..436c264dc2acd534a333e0bc4b28f93139d8ec7c --- /dev/null +++ b/plateau.yaml @@ -0,0 +1,9 @@ +# @package _global_ +train: + scheduler_interval: epoch + scheduler_monitor: ??? + scheduler: + _target_: torch.optim.lr_scheduler.ReduceLROnPlateau + factor: 0.2 # Decay factor when ReduceLROnPlateau is used + patience: 20 + min_lr: 0.0 # Minimum learning rate during annealing diff --git a/poly-warmup.yaml b/poly-warmup.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79808ea42c6c1d851ef94da812f7c2d448ec6ca0 --- /dev/null +++ b/poly-warmup.yaml @@ -0,0 +1,2 @@ +# @package train.scheduler +_target_: transformers.get_polynomial_decay_schedule_with_warmup diff --git a/pretrained.py b/pretrained.py new file mode 100644 index 0000000000000000000000000000000000000000..40e76bd2692335c7f474f6b6479be67eb95f8d20 --- /dev/null +++ b/pretrained.py @@ -0,0 +1,79 @@ +import os +from functools import partial + +import torch +from safetensors.torch import load_file as safe_load_file +from transformers.utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, +) +from transformers.utils.hub import cached_file, get_checkpoint_shard_files + + +def state_dict_from_pretrained(model_name, device=None, dtype=None): + # If not fp32, then we don't want to load directly to the GPU + mapped_device = "cpu" if dtype not in [torch.float32, None] else device + is_sharded = False + load_safe = False + resolved_archive_file = None + + weights_path = os.path.join(model_name, WEIGHTS_NAME) + weights_index_path = os.path.join(model_name, WEIGHTS_INDEX_NAME) + safe_weights_path = os.path.join(model_name, SAFE_WEIGHTS_NAME) + safe_weights_index_path = os.path.join(model_name, SAFE_WEIGHTS_INDEX_NAME) + + if os.path.isfile(weights_path): + resolved_archive_file = cached_file( + model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False + ) + elif os.path.isfile(weights_index_path): + resolved_archive_file = cached_file( + model_name, WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False + ) + is_sharded = True + elif 
os.path.isfile(safe_weights_path): + resolved_archive_file = cached_file( + model_name, SAFE_WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False + ) + load_safe = True + elif os.path.isfile(safe_weights_index_path): + resolved_archive_file = cached_file( + model_name, SAFE_WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False + ) + is_sharded = True + load_safe = True + else: # Try loading from HF hub instead of from local files + resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, + _raise_exceptions_for_missing_entries=False) + if resolved_archive_file is None: + resolved_archive_file = cached_file(model_name, WEIGHTS_INDEX_NAME, + _raise_exceptions_for_missing_entries=False) + if resolved_archive_file is not None: + is_sharded = True + + if resolved_archive_file is None: + raise EnvironmentError(f"Model name {model_name} was not found.") + + if load_safe: + loader = partial(safe_load_file, device=mapped_device) + else: + loader = partial(torch.load, map_location=mapped_device) + + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different + # checkpoint shards in this case. + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + model_name, resolved_archive_file + ) + state_dict = {} + for sharded_file in resolved_archive_file: + state_dict.update(loader(sharded_file)) + else: + state_dict = loader(resolved_archive_file) + # Convert dtype before moving to GPU to save memory + if dtype is not None: + state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()} + state_dict = {k: v.to(device=device) for k, v in state_dict.items()} + return state_dict diff --git a/profile.yaml b/profile.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6c547a43fc7932cb9aa657a6f1a8e492a09ebbe --- /dev/null +++ b/profile.yaml @@ -0,0 +1,31 @@ +# @package _global_ +# Run the Pytorch profiler + +trainer: + profiler: + _target_: pytorch_lightning.profilers.PyTorchProfiler + dirpath: ${hydra.run.dir} + schedule: + _target_: torch.profiler.schedule + wait: 5 + warmup: 5 + active: 5 + use_cuda: True + max_steps: 20 + +logger: + wandb: + mode: disabled + +callbacks: + model_checkpoint: null + model_checkpoint_progress: null + early_stopping: null + +hydra: + # sets output paths for all file logs to 'logs/profile/' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/profile/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/profile/multirun_${now:%Y-%m-%d_%H-%M-%S} + subdir: ${hydra.job.num} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..3201555763efa5ab9159a3e58d0dd43ff79daffb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.black] +line-length = 100 +target-version = ['py38'] \ No newline at end of file diff --git a/rms_norm.py b/rms_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..068348d61290e3839dd082b540d898578ba1e8e2 --- /dev/null +++ b/rms_norm.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022, Tri Dao. 
+# Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/layer_norm/layer_norm.py + +import torch +from torch.nn import init + +from flash_attn.ops.layer_norm import ( + DropoutAddLayerNormFn, + DropoutAddLayerNormParallelResidualFn, + DropoutAddLayerNormSubsetFn, +) + + +def rms_norm(x, weight, epsilon): + return DropoutAddLayerNormFn.apply( + x, None, weight, None, None, None, 0.0, epsilon, False, False, True + ) + + +def dropout_add_rms_norm( + x0, + residual, + weight, + bias, + dropout_p, + epsilon, + rowscale=None, + layerscale=None, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + """residual_in_fp32 only has an effect if residual is None. + Otherwise residual dtype is residual.dtype. + """ + return DropoutAddLayerNormFn.apply( + x0, + residual, + weight, + bias, + rowscale, + layerscale, + dropout_p, + epsilon, + residual_in_fp32, + prenorm, + True, + return_dropout_mask, + ) + + +def dropout_add_rms_norm_subset( + x0, + residual, + weight, + bias, + dropout_p, + epsilon, + layerscale=None, + x0_subset=None, + out_subset=None, + rowscale_const=1.0, + out_numrows=0, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + """residual_in_fp32 only has an effect if residual is None. + Otherwise residual dtype is residual.dtype. + """ + return DropoutAddLayerNormSubsetFn.apply( + x0, + residual, + weight, + bias, + layerscale, + x0_subset, + out_subset, + dropout_p, + epsilon, + rowscale_const, + out_numrows, + residual_in_fp32, + prenorm, + True, + return_dropout_mask, + ) + + +def dropout_add_rms_norm_parallel_residual( + x0, + x1, + residual, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + prenorm=False, + residual_in_fp32=False, + return_dropout_mask=False, +): + """residual_in_fp32 only has an effect if residual is None. + Otherwise residual dtype is residual.dtype. 
+ """ + return DropoutAddLayerNormParallelResidualFn.apply( + x0, + x1, + residual, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + residual_in_fp32, + prenorm, + True, + return_dropout_mask, + ) + + +class RMSNorm(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-5, device=None, dtype=None): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + init.ones_(self.weight) + + def forward(self, x): + return rms_norm(x, self.weight, self.eps) + + +class DropoutAddRMSNorm(torch.nn.Module): + def __init__( + self, + hidden_size, + prenorm=False, + p=0.0, + eps=1e-5, + residual_in_fp32=False, + device=None, + dtype=None, + ): + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.prenorm = prenorm + self.p = p + self.eps = eps + self.residual_in_fp32 = residual_in_fp32 + self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.reset_parameters() + + def reset_parameters(self): + init.ones_(self.weight) + + def forward(self, x0, residual=None): + return dropout_add_rms_norm( + x0, + residual, + self.weight, + None, + self.p if self.training else 0.0, + self.eps, + prenorm=self.prenorm, + residual_in_fp32=self.residual_in_fp32, + ) diff --git a/rotary.cpp b/rotary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b2a3cf0f75010dd2988f3f173b4be6379c166277 --- /dev/null +++ b/rotary.cpp @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. + ******************************************************************************/ + +#include +#include + +#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA") +#define CHECK_SHAPE(x, ...) 
TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")") + +void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2, + const torch::Tensor cos, const torch::Tensor sin, + torch::Tensor out1, torch::Tensor out2, + const bool conj); + +void apply_rotary(const torch::Tensor x1, const torch::Tensor x2, + const torch::Tensor cos, const torch::Tensor sin, + torch::Tensor out1, torch::Tensor out2, + const bool conj) { + CHECK_DEVICE(x1); CHECK_DEVICE(x2); + CHECK_DEVICE(cos); CHECK_DEVICE(sin); + CHECK_DEVICE(out1); CHECK_DEVICE(out1); + TORCH_CHECK(x1.dtype() == x2.dtype()); + TORCH_CHECK(cos.dtype() == sin.dtype()); + TORCH_CHECK(out1.dtype() == out2.dtype()); + TORCH_CHECK(x1.dtype() == cos.dtype()); + TORCH_CHECK(x1.dtype() == out1.dtype()); + TORCH_CHECK(x1.sizes() == x2.sizes()); + TORCH_CHECK(cos.sizes() == sin.sizes()); + TORCH_CHECK(out1.sizes() == out2.sizes()); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)x1.get_device()}; + + apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("apply_rotary", &apply_rotary, "Apply rotary embedding"); +} diff --git a/rotary.h b/rotary.h new file mode 100644 index 0000000000000000000000000000000000000000..7f1614ad24872f3edc1798a008cc94cb14dff5a0 --- /dev/null +++ b/rotary.h @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include + +#include "utils.h" + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void copy_rotary_interleaved(Tensor const &S, + Tensor &D, + Tensor const &Cos, + Tensor const &Sin, + Tensor const &identity_MN, + const int max_MN, const int min_MN, + const int dim, const int rotary_dim) { + CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{}); + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Cos)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Cos)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Sin)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Sin)); // MMA_K + CUTE_STATIC_ASSERT_V(size<0>(Cos) == size<0>(Sin)); // MMA_K + static_assert(decltype(size<0>(S))::value == decltype(size<0>(Cos))::value * 2); + static_assert(decltype(size<0>(Cos))::value % 2 == 0); // Since we do fast conversion from fp16/bf16 to fp32 + Tensor rCos = make_fragment_like(Cos); + Tensor rSin = make_fragment_like(Sin); + Tensor rS = make_fragment_like(S); + #pragma unroll + for (int m = 0; m < size<1>(S); ++m) { + if (get<0>(identity_MN(0, m, 0)) >= min_MN && get<0>(identity_MN(0, m, 0)) < max_MN) { + #pragma unroll + for (int k = 0; k < size<2>(S); ++k) { + if (Is_even_K || get<1>(identity_MN(0, 0, k)) < dim) { + cute::copy(S(_, m, k), rS(_, m, k)); + if (get<1>(identity_MN(0, 0, k)) < rotary_dim) { + cute::copy(Cos(_, m, k), rCos(_, m, k)); + cute::copy(Sin(_, m, k), rSin(_, m, 
k)); + Tensor S_fp32 = convert_type(rS(_, m, k)); + Tensor cos_fp32 = convert_type(rCos(_, m, k)); + Tensor sin_fp32 = convert_type(rSin(_, m, k)); + #pragma unroll + for (int i = 0; i < size<0>(rS) / 2; ++i) { + float real = S_fp32(2 * i) * cos_fp32(i) - S_fp32(2 * i + 1) * sin_fp32(i); + float imag = S_fp32(2 * i) * sin_fp32(i) + S_fp32(2 * i + 1) * cos_fp32(i); + S_fp32(2 * i) = real; + S_fp32(2 * i + 1) = imag; + } + // Idk but I need to copy for the convert_type to work + Tensor S_fp32_copy = make_fragment_like(S_fp32); + cute::copy(S_fp32, S_fp32_copy); + using T = typename Engine0::value_type; + Tensor S_og_type = convert_type(S_fp32_copy); + cute::copy(S_og_type, rS(_, m, k)); + } + cute::copy(rS(_, m, k), D(_, m, k)); + } else if (Clear_OOB_K) { + cute::clear(D(_, m, k)); + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void copy_rotary_contiguous(Tensor const &S, + Tensor &D, + Tensor const &Cos, + Tensor const &Sin, + Tensor const &identity_MN, + const int max_MN, const int min_MN, + const int dim, const int rotary_dim) { + CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{}); + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Cos)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Cos)); // MMA_K + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(Sin)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(Sin)); // MMA_K + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(Cos)); // MMA + CUTE_STATIC_ASSERT_V(size<0>(Cos) == size<0>(Sin)); + static_assert(decltype(size<0>(Cos))::value % 2 == 0); // Since we do fast conversion from fp16/bf16 to fp32 + Tensor rCos = make_fragment_like(Cos); + Tensor rSin = make_fragment_like(Sin); + Tensor rS = make_fragment_like(S); + Tensor rS_other = make_fragment_like(rS(_, 0, 0)); + #pragma unroll + for (int m = 0; m < size<1>(S); ++m) { + if (get<0>(identity_MN(0, m, 0)) >= min_MN && get<0>(identity_MN(0, m, 0)) < max_MN) { + #pragma unroll + for (int k = 0; k < size<2>(S); ++k) { + if (Is_even_K || get<1>(identity_MN(0, 0, k)) < dim) { + cute::copy(S(_, m, k), rS(_, m, k)); + if (get<1>(identity_MN(0, 0, k)) < rotary_dim) { + const bool is_left = get<1>(identity_MN(0, 0, k)) < rotary_dim / 2; + Tensor gS_other = make_tensor(S(_, m, k).data() + (is_left ? rotary_dim / 2 : -rotary_dim / 2), S(_, m, k).layout()); + cute::copy(gS_other, rS_other); + // if (cute::thread0()) { print_tensor(rS(_, m, k)); print_tensor(rS_other); } + Tensor gCos = make_tensor(Cos(_, m, k).data() + (is_left ? 0 : -rotary_dim / 2), Cos(_, m, k).layout()); + Tensor gSin = make_tensor(Sin(_, m, k).data() + (is_left ? 0 : -rotary_dim / 2), Sin(_, m, k).layout()); + cute::copy(gCos, rCos(_, m, k)); + cute::copy(gSin, rSin(_, m, k)); + // if (cute::thread0()) { print_tensor(rCos(_, m, k)); print_tensor(rSin(_, m, k)); } + Tensor S_fp32 = convert_type(rS(_, m, k)); + Tensor S_other_fp32 = convert_type(rS_other); + Tensor cos_fp32 = convert_type(rCos(_, m, k)); + Tensor sin_fp32 = convert_type(rSin(_, m, k)); + #pragma unroll + for (int i = 0; i < size<0>(rS); ++i) { + S_fp32(i) = S_fp32(i) * cos_fp32(i) + S_other_fp32(i) * (is_left ? 
-sin_fp32(i) : sin_fp32(i)); + } + // Idk but I need to copy for the convert_type to work + Tensor S_fp32_copy = make_fragment_like(S_fp32); + cute::copy(S_fp32, S_fp32_copy); + using T = typename Engine0::value_type; + Tensor S_og_type = convert_type(S_fp32_copy); + cute::copy(S_og_type, rS(_, m, k)); + // if (cute::thread0()) { print_tensor(rS(_, m, k)); } + } + cute::copy(rS(_, m, k), D(_, m, k)); + } else if (Clear_OOB_K) { + cute::clear(D(_, m, k)); + } + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/rotary.py b/rotary.py new file mode 100644 index 0000000000000000000000000000000000000000..6c04a523ede814ea075e6773572cb56cac8bff64 --- /dev/null +++ b/rotary.py @@ -0,0 +1,227 @@ +# Copyright (c) 2023, Tri Dao. + +from typing import Optional, Union + +import torch + +import triton +import triton.language as tl + + +@triton.jit +def rotary_kernel( + OUT, # Pointers to matrices + X, + COS, + SIN, + CU_SEQLENS, + SEQLEN_OFFSETS, # this could be int or a pointer + # Matrix dimensions + seqlen, + rotary_dim, + seqlen_ro, + # strides + stride_out_batch, + stride_out_seqlen, + stride_out_nheads, + stride_out_headdim, + stride_x_batch, + stride_x_seqlen, + stride_x_nheads, + stride_x_headdim, + # Meta-parameters + BLOCK_K: tl.constexpr, + IS_SEQLEN_OFFSETS_TENSOR: tl.constexpr, + IS_VARLEN: tl.constexpr, + INTERLEAVED: tl.constexpr, + CONJUGATE: tl.constexpr, + BLOCK_M: tl.constexpr, +): + pid_m = tl.program_id(axis=0) + pid_batch = tl.program_id(axis=1) + pid_head = tl.program_id(axis=2) + rotary_dim_half = rotary_dim // 2 + + if not IS_VARLEN: + X = X + pid_batch * stride_x_batch + pid_head * stride_x_nheads + OUT = OUT + pid_batch * stride_out_batch + pid_head * stride_out_nheads + else: + start_idx = tl.load(CU_SEQLENS + pid_batch) + seqlen = tl.load(CU_SEQLENS + pid_batch + 1) - start_idx + X = X + start_idx * stride_x_seqlen + pid_head * stride_x_nheads + OUT = OUT + start_idx * stride_out_seqlen + pid_head * stride_out_nheads + + if pid_m * BLOCK_M >= seqlen: + return + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + if not IS_SEQLEN_OFFSETS_TENSOR: + rm_cs = rm + SEQLEN_OFFSETS + else: + rm_cs = rm + tl.load(SEQLEN_OFFSETS + pid_batch) + rk = tl.arange(0, BLOCK_K) + rk_half = tl.arange(0, BLOCK_K // 2) + + if not INTERLEAVED: + # Load the 1st and 2nd halves of X, do calculation, then store to 1st and 2nd halves of OUT + X = X + (rm[:, None] * stride_x_seqlen + rk_half[None, :] * stride_x_headdim) + COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :]) + SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :]) + cos = tl.load( + COS, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=1.0 + ).to(tl.float32) + sin = tl.load( + SIN, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=0.0 + ).to(tl.float32) + x0 = tl.load( + X, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), other=0.0 + ).to(tl.float32) + x1 = tl.load( + X + rotary_dim_half * stride_x_headdim, + mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), + other=0.0, + ).to(tl.float32) + if CONJUGATE: + sin = -sin + o0 = x0 * cos - x1 * sin + o1 = x0 * sin + x1 * cos + # write back result + OUT = OUT + (rm[:, None] * stride_out_seqlen + rk_half[None, :] * stride_out_headdim) + tl.store(OUT, o0, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half)) + tl.store( + OUT + rotary_dim_half * 
stride_out_headdim, + o1, + mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), + ) + else: + # We don't want to load X[0, 2, 4, ...] and X[1, 3, 5, ...] separately since both are slow. + # Instead, we load x0 = X[0, 1, 2, 3, ...] and x1 = X[1, 0, 3, 2, ...]. + # Loading x0 will be fast but x1 will be slow. + # Then we load cos = COS[0, 0, 1, 1, ...] and sin = SIN[0, 0, 1, 1, ...]. + # Then we do the calculation and use tl.where to pick put the right outputs for the even + # and for the odd indices. + rk_swap = rk + ((rk + 1) % 2) * 2 - 1 # 1, 0, 3, 2, 5, 4, ... + rk_repeat = tl.arange(0, BLOCK_K) // 2 + X0 = X + (rm[:, None] * stride_x_seqlen + rk[None, :] * stride_x_headdim) + X1 = X + (rm[:, None] * stride_x_seqlen + rk_swap[None, :] * stride_x_headdim) + COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :]) + SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :]) + cos = tl.load( + COS, + mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half), + other=1.0, + ).to(tl.float32) + sin = tl.load( + SIN, + mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half), + other=0.0, + ).to(tl.float32) + x0 = tl.load(X0, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim), other=0.0).to( + tl.float32 + ) + x1 = tl.load( + X1, mask=(rm[:, None] < seqlen) & (rk_swap[None, :] < rotary_dim), other=0.0 + ).to(tl.float32) + if CONJUGATE: + sin = -sin + x0_cos = x0 * cos + x1_sin = x1 * sin + out = tl.where(rk[None, :] % 2 == 0, x0_cos - x1_sin, x0_cos + x1_sin) + OUT = OUT + (rm[:, None] * stride_out_seqlen + rk[None, :] * stride_out_headdim) + tl.store(OUT, out, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim)) + + +def apply_rotary( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + seqlen_offsets: Union[int, torch.Tensor] = 0, + cu_seqlens: Optional[torch.Tensor] = None, + max_seqlen: Optional[int] = None, + interleaved=False, + inplace=False, + conjugate=False, +) -> torch.Tensor: + """ + Arguments: + x: (batch, seqlen, nheads, headdim) if cu_seqlens is None + else (total_seqlen, nheads, headdim). 
+ cos: (seqlen_ro, rotary_dim / 2) + sin: (seqlen_ro, rotary_dim / 2) + seqlen_offsets: integer or integer tensor of size (batch,) + cu_seqlens: (batch + 1,) or None + max_seqlen: int + Returns: + y: (batch, seqlen, nheads, headdim) + """ + is_varlen = cu_seqlens is not None + if not is_varlen: + batch, seqlen, nheads, headdim = x.shape + else: + assert max_seqlen is not None, "If cu_seqlens is passed in, then max_seqlen must be passed" + total_seqlen, nheads, headdim = x.shape + batch_p_1 = cu_seqlens.shape[0] + batch = batch_p_1 - 1 + seqlen = max_seqlen + seqlen_ro, rotary_dim = cos.shape + assert sin.shape == cos.shape + rotary_dim *= 2 + assert rotary_dim <= headdim, "rotary_dim must be <= headdim" + assert headdim <= 256, "Only support headdim <= 256" + assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen" + + assert ( + cos.dtype == sin.dtype + ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}" + assert ( + x.dtype == cos.dtype + ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}" + + cos, sin = cos.contiguous(), sin.contiguous() + if isinstance(seqlen_offsets, torch.Tensor): + assert seqlen_offsets.shape == (batch,) + assert seqlen_offsets.dtype in [torch.int32, torch.int64] + seqlen_offsets = seqlen_offsets.contiguous() + else: + assert seqlen_offsets + seqlen <= seqlen_ro + + output = torch.empty_like(x) if not inplace else x + if rotary_dim < headdim and not inplace: + output[..., rotary_dim:].copy_(x[..., rotary_dim:]) + + BLOCK_K = ( + 32 + if rotary_dim <= 32 + else (64 if rotary_dim <= 64 else (128 if rotary_dim <= 128 else 256)) + ) + grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads) # noqa + BLOCK_M = 4 if interleaved else (8 if rotary_dim <= 64 else 4) + + # Need this, otherwise Triton tries to launch from cuda:0 and we get + # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?) + with torch.cuda.device(x.device.index): + rotary_kernel[grid]( + output, # data ptrs + x, + cos, + sin, + cu_seqlens, + seqlen_offsets, + seqlen, # shapes + rotary_dim, + seqlen_ro, + output.stride(0) if not is_varlen else 0, # batch_strides if not varlen else 0 + output.stride(-3), # seqlen_stride or total_seqlen_stride + output.stride(-2), # nheads_stride + output.stride(-1), # headdim_stride + x.stride(0) if not is_varlen else 0, # batch_strides if not varlen else 0 + x.stride(-3), # seqlen stride or total_seqlen_stride + x.stride(-2), # nheads stride + x.stride(-1), # headdim stride + BLOCK_K, + isinstance(seqlen_offsets, torch.Tensor), + is_varlen, + interleaved, + conjugate, + BLOCK_M, + ) + return output diff --git a/rotary_cuda.cu b/rotary_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..2dd0ff3f6e248183a03e4988f7032a0fac90b1dd --- /dev/null +++ b/rotary_cuda.cu @@ -0,0 +1,45 @@ +/****************************************************************************** + * Copyright (c) 2023, Tri Dao. 
+ ******************************************************************************/ + +#include +#include +#include + +void apply_rotary_cuda(const torch::Tensor x1, const torch::Tensor x2, + const torch::Tensor cos, const torch::Tensor sin, + torch::Tensor out1, torch::Tensor out2, + const bool conj) { + auto iter = at::TensorIteratorConfig() + .add_output(out1) + .add_output(out2) + .add_input(x1) + .add_input(x2) + .add_input(cos) + .add_input(sin) + .check_all_same_dtype(false) + .promote_inputs_to_common_dtype(false) + .build(); + + if (!conj) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] { + at::native::gpu_kernel_multiple_outputs( + iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos, + scalar_t sin) -> thrust::tuple { + scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin); + scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos); + return {out1, out2}; + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] { + at::native::gpu_kernel_multiple_outputs( + iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos, + scalar_t sin) -> thrust::tuple { + scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin); + scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos); + return {out1, out2}; + }); + }); + } +} \ No newline at end of file diff --git a/run.py b/run.py new file mode 100644 index 0000000000000000000000000000000000000000..2b22d8e2cc560e8446063e406fd6540fc688292c --- /dev/null +++ b/run.py @@ -0,0 +1,68 @@ +from typing import Callable + +import dotenv +import hydra +from omegaconf import OmegaConf, DictConfig + +# load environment variables from `.env` file if it exists +# recursively searches for `.env` in all folders starting from work dir +dotenv.load_dotenv(override=True) + +OmegaConf.register_new_resolver('eval', eval) +OmegaConf.register_new_resolver('div_up', lambda x, y: (x + y - 1) // y) +# Delay the evaluation until we have the datamodule +# So we want the resolver to yield the same string. +OmegaConf.register_new_resolver('datamodule', lambda attr: '${datamodule:' + str(attr) + '}') + +# Turn on TensorFloat32 +import torch.backends +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True + + +def dictconfig_filter_key(d: DictConfig, fn: Callable) -> DictConfig: + """Only keep keys where fn(key) is True. Support nested DictConfig. + """ + # Using d.items_ex(resolve=False) instead of d.items() since we want to keep the + # ${datamodule:foo} unresolved for now. + return DictConfig({k: dictconfig_filter_key(v, fn) if isinstance(v, DictConfig) else v + # for k, v in d.items_ex(resolve=False) if fn(k)}) + for k, v in d.items() if fn(k)}) + + +@hydra.main(config_path="configs/", config_name="config.yaml") +def main(config: DictConfig): + + # Remove config keys that start with '__'. These are meant to be used only in computing + # other entries in the config. 
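+    # For example, the openwebtext datamodule config defines `__train_len` purely so that other
+    # entries can interpolate it; keys like that are filtered out before instantiation:
+    #     dictconfig_filter_key(DictConfig({"lr": 1e-3, "__train_len": 8}),
+    #                           lambda k: not k.startswith("__"))   # -> {"lr": 1e-3}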
+ config = dictconfig_filter_key(config, lambda k: not k.startswith('__')) + + # Imports should be nested inside @hydra.main to optimize tab completion + # Read more here: https://github.com/facebookresearch/hydra/issues/934 + from src.train import train + from src.eval import evaluate + from src.utils import utils + + # A couple of optional utilities: + # - disabling python warnings + # - forcing debug-friendly configuration + # - verifying experiment name is set when running in experiment mode + # You can safely get rid of this line if you don't want those + utils.extras(config) + + # Pretty print config using Rich library + if config.get("print_config"): + utils.print_config(config, resolve=True) + + # Train model + mode = config.get('mode', 'train') + if mode not in ['train', 'eval']: + raise NotImplementedError(f'mode {mode} not supported') + if mode == 'train': + return train(config) + elif mode == 'eval': + return evaluate(config) + + +if __name__ == "__main__": + main() diff --git a/scaled_masked_softmax.h b/scaled_masked_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..14b9f6e4242cf027aebf4d14637aecd1a0c17901 --- /dev/null +++ b/scaled_masked_softmax.h @@ -0,0 +1,528 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? 
b : a; + } +}; + +template +__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Explicit masking + */ +template +__global__ void scaled_masked_softmax_warp_forward( + output_t *dst, + const input_t *src, + const uint8_t *mask, + const acc_t scale, + int micro_batch_size, + int element_count, + int pad_batches) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; + int pad_first_batch = 0; + if (pad_batches != 1) { // bert style + pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH; + } else { // gpt2 style + pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + } + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + mask += pad_first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; + uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 
0 : element_count; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + int itr_idx = i*element_count+it*WARP_SIZE; + copy_vector(temp_data, src + itr_idx); + copy_vector(temp_mask, mask + itr_idx); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (temp_mask[element] != 1) { + elements[i][it + element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -10000.0; + } + } + } else { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + // compute scale value to account for full mask + acc_t scale_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + scale_value[i] = (max_value[i] == -10000.0) ? 0.0 : 1.0; + } + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = elements[i][it + element] * scale_value[i]/ sum[i]; + } + copy_vector(dst + i * element_count + it * WARP_SIZE, out); + } else { + break; + } + } + } +} + +template +__global__ void scaled_masked_softmax_warp_backward( + output_t *gradInput, + input_t *grad, + const input_t *output, + acc_t scale, + int micro_batch_size, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. 
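To make the forward kernel's row-wise math concrete: masked positions are pushed to -10000 before the scaled scores are exponentiated, the row maximum is subtracted for numerical stability, and a row whose maximum is still -10000 (i.e. fully masked) is zeroed through scale_value rather than producing 0/0. The following is a minimal single-row CPU sketch of that computation; the function name and container types are our own illustration, not part of this header.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Single-row CPU reference for the fused forward pass (illustration only):
// out = softmax(scale * x with masked positions set to -10000); fully masked rows -> 0.
std::vector<float> scaled_masked_softmax_row_ref(const std::vector<float>& x,
                                                 const std::vector<uint8_t>& mask,  // 1 means "masked out"
                                                 float scale) {
    const size_t n = x.size();  // assumes a non-empty row
    std::vector<float> e(n);
    for (size_t j = 0; j < n; ++j) {
        e[j] = (mask[j] != 1) ? x[j] * scale : -10000.0f;
    }
    float max_val = e[0];
    for (size_t j = 1; j < n; ++j) {
        max_val = std::max(max_val, e[j]);
    }
    // Mirror of scale_value in the kernel: if the whole row was masked, output zeros rather than NaNs.
    const float scale_value = (max_val == -10000.0f) ? 0.0f : 1.0f;
    float sum = 0.0f;
    for (size_t j = 0; j < n; ++j) {
        e[j] = std::exp(e[j] - max_val);  // subtract the row max before exponentiating
        sum += e[j];
    }
    for (size_t j = 0; j < n; ++j) {
        e[j] = e[j] * scale_value / sum;
    }
    return e;
}
```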
compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * element_count + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector(temp_grad, grad + i * element_count + it * WARP_SIZE); + copy_vector(temp_output, output + i * element_count + it * WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + + acc_t sum[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + sum[i] += grad_reg[i][it]; + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); + } + copy_vector(gradInput + i * element_count + it * WARP_SIZE, out); + } + } + } +} +} // end of anonymous namespace + +int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int attn_heads){ + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + constexpr int threads_per_block = 128; + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + + return batches_per_block; +} + +template +void dispatch_scaled_masked_softmax_forward( + output_t *dst, + const input_t *src, + const uint8_t *mask, + const input_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads, + int pad_batches) +{ + TORCH_INTERNAL_ASSERT(key_seq_len >= 0 && key_seq_len <= 8192 ); + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(query_seq_len%batches_per_block == 0); + dim3 blocks(query_seq_len/batches_per_block, attn_heads, batches); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 1: // 2 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 2: // 4 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 3: // 8 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 4: // 16 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 5: // 32 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 6: // 64 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 7: // 128 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 8: // 256 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 9: // 512 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 10: // 1024 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 11: // 2048 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 12: // 4096 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + case 13: // 8192 + scaled_masked_softmax_warp_forward + <<>>(dst, src, mask, scale, batch_count, key_seq_len, pad_batches); + break; + default: + break; + } + } +} + +template +void dispatch_scaled_masked_softmax_backward( + output_t *grad_input, + input_t *grad, + const input_t *output, + const acc_t scale, + int query_seq_len, + int key_seq_len, + int batches, + int attn_heads) +{ + TORCH_INTERNAL_ASSERT( key_seq_len >= 0 && key_seq_len <= 8192 ); + if (key_seq_len == 0) { + return; + } else { + int log2_elements = log2_ceil(key_seq_len); + const int next_power_of_two = 1 << log2_elements; + int batch_count = batches * attn_heads * query_seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = batch_count/batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 1: // 2 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 2: // 4 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 3: // 8 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 4: // 16 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 5: // 32 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 6: // 64 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 7: // 128 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 8: // 256 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 9: // 512 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 10: // 1024 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 11: // 2048 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 12: // 4096 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + case 13: // 8192 + scaled_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, key_seq_len); + break; + default: + break; + } + } +} diff --git a/scaled_masked_softmax_cuda.cu b/scaled_masked_softmax_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..a08e752699c239fc6867ebdac434d3687dabbad8 --- /dev/null +++ b/scaled_masked_softmax_cuda.cu @@ -0,0 +1,121 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
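Both dispatch functions derive their launch shape the same way: key_seq_len is rounded up to a power of two, a warp covers at most 32 of those elements, short rows (at most 128 elements) pack two batches per warp, and a fixed 128-thread block then yields warps_per_block and batches_per_block (the same arithmetic get_batch_per_block exposes). A small standalone sketch of that sizing, with hypothetical names, compiled as ordinary host code:

```cpp
#include <cstdio>

// Hypothetical helper mirroring how the dispatch functions size their launch.
struct LaunchShape {
    int warp_size;
    int batches_per_warp;
    int warps_per_block;
    int batches_per_block;
};

LaunchShape launch_shape_for(int key_seq_len, int threads_per_block = 128) {
    int log2_elements = 0;
    while ((1 << log2_elements) < key_seq_len) ++log2_elements;    // log2_ceil
    const int next_power_of_two = 1 << log2_elements;
    LaunchShape s;
    s.warp_size = next_power_of_two < 32 ? next_power_of_two : 32; // C10_WARP_SIZE is 32 on NVIDIA GPUs
    s.batches_per_warp = next_power_of_two <= 128 ? 2 : 1;
    s.warps_per_block = threads_per_block / s.warp_size;
    s.batches_per_block = s.warps_per_block * s.batches_per_warp;
    return s;
}

int main() {
    const int lens[] = {16, 128, 1000, 8192};
    for (int len : lens) {
        LaunchShape s = launch_shape_for(len);
        printf("key_seq_len=%4d -> warp_size=%2d batches/warp=%d warps/block=%d batches/block=%d\n",
               len, s.warp_size, s.batches_per_warp, s.warps_per_block, s.batches_per_block);
    }
    return 0;
}
```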
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "scaled_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_masked_softmax { + +int get_batch_per_block_cuda(int query_seq_len, int key_seq_len, int batches, int attn_heads){ + return get_batch_per_block(query_seq_len, key_seq_len, batches, attn_heads); +} + + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + torch::Tensor const& mask, + float scale_factor) +{ + // input is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = input.size(0); + const int pad_batches = mask.size(0); + const int attn_heads = input.size(1); + const int query_seq_len = input.size(2); + const int key_seq_len = input.size(3); + TORCH_INTERNAL_ASSERT(key_seq_len <= 8192); + TORCH_INTERNAL_ASSERT(query_seq_len > 1); + TORCH_INTERNAL_ASSERT(pad_batches == 1 || pad_batches == batches); + TORCH_INTERNAL_ASSERT(mask.size(1) == 1); + TORCH_INTERNAL_ASSERT(mask.size(2) == query_seq_len); + TORCH_INTERNAL_ASSERT(mask.size(3) == key_seq_len); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = + torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* mask_ptr = static_cast(mask.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT( + input.scalar_type(), + "dispatch_scaled_masked_softmax_forward", + dispatch_scaled_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + reinterpret_cast(mask_ptr), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads, + pad_batches + ); + ); + return softmax_results; +} + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) { + + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + //output grads is a 4d tensor with dimensions [batches, attn_heads, seq_len, seq_len] + const int batches = output_grads.size(0); + const int attn_heads = output_grads.size(1); + const int query_seq_len = output_grads.size(2); + const int key_seq_len = output_grads.size(3); + + auto act_options = output_grads.options().requires_grad(false); + torch::Tensor input_grads = + torch::empty({batches, attn_heads, query_seq_len, key_seq_len}, act_options); + void* input_grads_ptr = static_cast(input_grads.data_ptr()); + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + //Softmax Grad + DISPATCH_HALF_AND_BFLOAT( + output_grads_.scalar_type(), + "dispatch_scaled_masked_softmax_backward", + dispatch_scaled_masked_softmax_backward( + reinterpret_cast(input_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + query_seq_len, + key_seq_len, + batches, + attn_heads + ); + ); + return input_grads; +} +} +} +} diff --git a/scaled_upper_triang_masked_softmax.h b/scaled_upper_triang_masked_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..21e93fb313a00d2bc51ecc09f4a87f02ece5a4f6 --- /dev/null +++ b/scaled_upper_triang_masked_softmax.h @@ -0,0 +1,529 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
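The backward kernels and their CUDA wrappers above compute the standard softmax Jacobian-vector product: with y = softmax(scale * x) and upstream gradient dy, the result is dx = scale * y * (dy - sum_k y_k * dy_k). A minimal per-row CPU sketch of that formula (a helper of our own, not part of the source) looks like this:

```cpp
#include <vector>

// Per-row reference for the fused softmax backward (illustration only):
// dx[j] = scale * y[j] * (dy[j] - sum_k(y[k] * dy[k]))
std::vector<float> scaled_softmax_backward_row_ref(const std::vector<float>& y,   // forward softmax output
                                                   const std::vector<float>& dy,  // upstream gradient
                                                   float scale) {
    float dot = 0.0f;
    for (size_t k = 0; k < y.size(); ++k) {
        dot += y[k] * dy[k];  // the per-row sum that warp_reduce(sum) produces in the kernel
    }
    std::vector<float> dx(y.size());
    for (size_t j = 0; j < y.size(); ++j) {
        dx[j] = scale * y[j] * (dy[j] - dot);
    }
    return dx;
}
```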
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace { + +template +__device__ __inline__ void copy_vector(Datatype *dst, const Datatype *src); + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::BFloat16 *dst, const c10::BFloat16 *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(c10::Half *dst, const c10::Half *src) { *((float2*) dst) = *((float2*) src); } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) { *dst = *src; } + +template <> +__device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } + +template +__device__ __inline__ void copy_zero_vector(Datatype *dst); + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template +__device__ __forceinline__ T WARP_SHFL_XOR_NATIVE(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR_NATIVE(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +/* + * Extended softmax (from native aten pytorch) with following additional features + * 1) input scaling + * 2) Implicit time (diagonal masking) + */ +template +__global__ void scaled_upper_triang_masked_softmax_warp_forward( + output_t *dst, + const input_t *src, + const acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_forward_kernel. 
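warp_reduce above performs a butterfly (XOR-shuffle) reduction: each of log2(WARP_SIZE) rounds halves the shuffle stride, and after the last round every lane holds the reduced value, with no shared memory or extra synchronization. A self-contained kernel illustrating the same pattern for a 32-lane max reduction is sketched below; the kernel and names are our own and it is compiled with nvcc, it is not part of this header.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative butterfly max-reduction over one warp, mirroring warp_reduce with the Max functor.
__global__ void warp_max_demo(const float* in, float* out) {
    float v = in[threadIdx.x];
    // XOR shuffle: after log2(32) = 5 rounds every lane holds the warp-wide max.
    #pragma unroll
    for (int offset = 16; offset > 0; offset /= 2) {
        float other = __shfl_xor_sync(0xffffffffu, v, offset);
        v = v < other ? other : v;  // same comparison as the Max functor
    }
    if (threadIdx.x == 0) *out = v;
}

int main() {
    float h_in[32], h_out, *d_in, *d_out;
    for (int i = 0; i < 32; ++i) h_in[i] = static_cast<float>((i * 7) % 13);
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    warp_max_demo<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp max = %f\n", h_out);  // expect 12
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```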
+ constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + src += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + dst += first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + input_t temp_data[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : local_seq; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < batch_element_count) { + copy_vector(temp_data, src + i*element_count*stride + it*WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if ((element_index + element) < batch_element_count) { + elements[i][it+element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } else { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + max_value[i] = (max_value[i] > elements[i][it]) ? 
max_value[i] : elements[i][it]; + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (it < warp_iteration_limit) { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + sum[i] += elements[i][it]; + } + } + } + warp_reduce(sum); + + // store result + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + + if (element_index < local_seq) { + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < local_seq) { + out[element] = elements[i][it + element] / sum[i]; + } else { + out[element] = 0; + } + } + copy_vector(dst + i * element_count * stride + it * WARP_SIZE, out); + } else if (element_index < element_count) { + copy_zero_vector(dst + i * element_count * stride + it * WARP_SIZE); + } else { + break; + } + } + } +} + +template +__global__ void scaled_upper_triang_masked_softmax_warp_backward( + output_t *gradInput, + input_t *grad, + const input_t *output, + acc_t scale, + int micro_batch_size, + int stride, + int element_count) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; + + int first_batch = (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * WARP_BATCH + blockIdx.x; + int local_seq = blockIdx.x + 1; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = micro_batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + ELEMENTS_PER_LDG_STG * local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS] { 0.0f }; + input_t temp_grad[ELEMENTS_PER_LDG_STG]; + input_t temp_output[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 
0 : local_seq; + + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + copy_vector(temp_grad, grad + i * element_count * stride + it * WARP_SIZE); + copy_vector(temp_output, output + i * element_count * stride + it * WARP_SIZE); + + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + output_reg[i][it + element] = (acc_t)temp_output[element]; + } + } + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (element_index + element < batch_element_count) { + grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; + } + } + } + } + } + + acc_t sum[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + sum[i] = grad_reg[i][0]; + #pragma unroll + for (int it = 1; it < WARP_ITERATIONS; ++it) { + sum[i] += grad_reg[i][it]; + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { + int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; + if (element_index < element_count) { + // compute gradients + output_t out[ELEMENTS_PER_LDG_STG]; + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); + } + copy_vector(gradInput + i * element_count * stride + it * WARP_SIZE, out); + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_scaled_upper_triang_masked_softmax_forward( + output_t *dst, + const input_t *src, + const input_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 
2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + default: + break; + } + } +} + +template +void dispatch_scaled_upper_triang_masked_softmax_backward( + output_t *grad_input, + input_t *grad, + const input_t *output, + const acc_t scale, + int softmax_elements, + int softmax_elements_stride, + int attn_batches) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + int seq_len = softmax_elements; + int batch_count = attn_batches * seq_len; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximimize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + TORCH_INTERNAL_ASSERT(attn_batches % batches_per_block == 0); + + int blocks_per_seq = attn_batches / batches_per_block; + dim3 blocks(seq_len, blocks_per_seq, 1); + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + case 0: // 1 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 1: // 2 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 2: // 4 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 3: // 8 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 4: // 16 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 5: // 32 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 6: // 64 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 7: // 128 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 8: // 256 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 9: // 512 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 10: // 1024 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 11: // 2048 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + default: + break; + } + } +} diff --git a/scaled_upper_triang_masked_softmax_cuda.cu b/scaled_upper_triang_masked_softmax_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..79ec30be364317580742e0297ea86145789a38e7 --- /dev/null +++ b/scaled_upper_triang_masked_softmax_cuda.cu @@ -0,0 +1,98 @@ +/* coding=utf-8 + * Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "scaled_upper_triang_masked_softmax.h" +#include "type_shim.h" + +namespace multihead_attn { +namespace fused_softmax { +namespace scaled_upper_triang_masked_softmax { + +torch::Tensor fwd_cuda( + torch::Tensor const& input, + float scale_factor) +{ + // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = input.size(0); + const int seq_len = input.size(1); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); + + // Output + auto act_options = input.options().requires_grad(false); + torch::Tensor softmax_results = + torch::empty({attn_batches, seq_len, seq_len}, act_options); + + // Softmax Intermediate Result Ptr + void* input_ptr = static_cast(input.data_ptr()); + void* softmax_results_ptr = static_cast(softmax_results.data_ptr()); + + DISPATCH_HALF_AND_BFLOAT( + input.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_forward", + dispatch_scaled_upper_triang_masked_softmax_forward( + reinterpret_cast(softmax_results_ptr), + reinterpret_cast(input_ptr), + scale_factor, + seq_len, + seq_len, + attn_batches); + ); + return softmax_results; +} + + +torch::Tensor bwd_cuda( + torch::Tensor const& output_grads_, + torch::Tensor const& softmax_results_, + float scale_factor) { + + auto output_grads = output_grads_.contiguous(); + auto softmax_results = softmax_results_.contiguous(); + + //output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] + const int attn_batches = output_grads.size(0); + const int seq_len = output_grads.size(1); + TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2)); + + void* output_grads_ptr = static_cast(output_grads.data_ptr()); + + //Softmax Grad + DISPATCH_HALF_AND_BFLOAT( + output_grads_.scalar_type(), + "dispatch_scaled_upper_triang_masked_softmax_backward", + dispatch_scaled_upper_triang_masked_softmax_backward( + reinterpret_cast(output_grads_ptr), + reinterpret_cast(output_grads_ptr), + reinterpret_cast(softmax_results.data_ptr()), + scale_factor, + seq_len, + seq_len, + attn_batches); + ); + + //backward pass is completely in-place + return output_grads; +} +} +} +} diff --git a/seq_len.h b/seq_len.h new file mode 100644 index 0000000000000000000000000000000000000000..76c4d08a317c6288321ec72a67eef4dda959a21a --- /dev/null +++ b/seq_len.h @@ -0,0 +1,168 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + ******************************************************************************/ + +#pragma once + +#include +#include + +namespace flash { + +static constexpr int kMaxTileSize = 128; + +template class SeqLenTraits { +public: + // Total number of queries / keys. Unpadded. + int sum_s = 0; + // seq len offsets. + int *cu_seq_len = nullptr; + // actual seq len array. 
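SeqLenTraits supports both fixed and variable sequence lengths; in the variable case the per-batch length is recovered either from an explicit seq_used array or from consecutive differences of the cumulative offsets in cu_seq_len, as the var-seq-len specializations further down implement. A small host-side sketch of that bookkeeping, with hypothetical names, is:

```cpp
#include <cstdio>
#include <vector>

// Host-side illustration of var-seq-len bookkeeping (not part of seq_len.h):
// cu_seq_len holds cumulative token offsets, so batch b spans
// [cu_seq_len[b], cu_seq_len[b + 1]) and its length is the difference.
int actual_seq_len_of(const std::vector<int>& cu_seq_len,
                      const int* seq_used,  // optional per-batch override, may be nullptr
                      int bidb) {
    return seq_used ? seq_used[bidb]
                    : cu_seq_len[bidb + 1] - cu_seq_len[bidb];
}

int main() {
    // Three sequences of lengths 5, 3 and 9 packed back to back (total sum_s = 17).
    std::vector<int> cu_seq_len = {0, 5, 8, 17};
    for (int b = 0; b < 3; ++b) {
        printf("batch %d: seq len %d\n", b, actual_seq_len_of(cu_seq_len, nullptr, b));
    }
    return 0;
}
```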
+ int *seq_used = nullptr; + // seq len of the current batch. + int actual_seq_len = -1; + + // Whether this is for fixed-seq-len or var-seq-len. + static constexpr bool kUseVarSeqLen = UseVarSeqLen; + + using ShapeT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using StrideT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using LayoutT = cute::Layout; + + using ShapeLseT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using StrideLseT = std::conditional_t< + UseVarSeqLen, + cute::Shape, + cute::Shape + >; + using LayoutLseT = cute::Layout; + + CUTLASS_HOST SeqLenTraits() {} + + CUTLASS_HOST SeqLenTraits( + int sum_s, int max_seq_len, int *cu_seq_len = nullptr, int *seq_used = nullptr): + sum_s(sum_s), cu_seq_len(cu_seq_len), seq_used(seq_used), actual_seq_len(max_seq_len) {} + + // Returns the layout of a tensor in MKHB format in global memory. + // padded: only useful for var-seq-len for dq_accum and softmax_d. + CUTLASS_HOST_DEVICE auto get_gmem_layout( + int m, int k, int h, int b, + int64_t m_stride, int64_t h_stride, int64_t b_stride, + bool padded = false) const { + static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen."); + return make_layout(make_shape(m, k, h, b), + make_stride(m_stride, cute::_1{}, h_stride, b_stride)); + } + + // Returns the layout of a tensor in MKHB format in global memory. + // padded: only useful for var-seq-len for dq_accum and softmax_d. + CUTLASS_HOST_DEVICE auto get_lse_gmem_layout( + int m, int h, int b, bool padded = false) const { + static_assert(!UseVarSeqLen, "Default implementation is for FixedSeqLen."); + return make_layout(make_shape(b, h, m), + make_stride(int64_t(h * m), int64_t(m), cute::_1())); + } + + CUTLASS_DEVICE void init(int bidb) {} + + template + CUTLASS_DEVICE auto get_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded = false) const { + auto g_tensor = local_tile( + m_tensor(_, _, bidh, bidb), tile_shape, make_coord(_, _0{})); + return g_tensor; + } + + template + CUTLASS_DEVICE auto get_lse_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded = false) const { + auto g_tensor = local_tile(m_tensor(bidb, bidh, _), tile_shape, make_coord(_)); + return g_tensor; + } +}; + +using FixedSeqLenTraits = SeqLenTraits; + +using VarSeqLenTraits = SeqLenTraits; + +// Returns the static layout of a var-seq-len tensor in global memory based on +// max_seq_len and max_batch_size. +// padded: only useful for var-seq-len for dq_accum and softmax_d. +// When padded is True, use B_M + kMaxTileSize * B as the total B_M. +template <> +CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_gmem_layout( + int m, int k, int h, int b, + int64_t m_stride, int64_t h_stride, int64_t b_stride, + bool padded) const { + return make_layout( + make_shape(sum_s + (padded ? kMaxTileSize * b : 0), k, h), + make_stride(m_stride, cute::_1{}, h_stride)); +} + +// padded: only useful for var-seq-len for dq_accum and softmax_d. +// When padded is True, use B_M + kMaxTileSize * B as the total B_M. +template <> +CUTLASS_HOST_DEVICE auto VarSeqLenTraits::get_lse_gmem_layout( + int m, int h, int b, bool padded) const { + return make_layout( + make_shape(h, sum_s + (padded ? kMaxTileSize * b : 0)), + make_stride(int64_t(sum_s + (padded ? kMaxTileSize * b : 0)), cute::_1())); +} + +template <> +CUTLASS_DEVICE void VarSeqLenTraits::init(int bidb) { + actual_seq_len = + seq_used ? 
seq_used[bidb] : (cu_seq_len[bidb + 1] - cu_seq_len[bidb]); +} + +template <> +template +CUTLASS_DEVICE auto VarSeqLenTraits::get_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded) const { + auto g_offset = local_tile( + m_tensor(_, _, bidh), + cute::make_shape(1, get<1>(tile_shape)), + make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0), _0{})); + auto g_sequence = make_tensor( + g_offset.data(), + make_layout( + cute::make_shape(actual_seq_len, get<1>(tile_shape)), + g_offset.stride() + )); + auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_, _0{})); + return g_tensor; +} + +template <> +template +CUTLASS_DEVICE auto VarSeqLenTraits::get_lse_local_tile_tensor( + const MTensor &m_tensor, const Shape &tile_shape, + int bidh, int bidb, bool padded) const { + auto g_offset = local_tile( + m_tensor(bidh, _), cute::make_shape(_1{}), + make_coord(cu_seq_len[bidb] + (padded ? kMaxTileSize * bidb : 0))); + auto g_sequence = make_tensor( + g_offset.data(), + make_layout(cute::make_shape(actual_seq_len), cute::make_shape(_1{}))); + auto g_tensor = local_tile(g_sequence, tile_shape, make_coord(_)); + return g_tensor; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/sequence-model.yaml b/sequence-model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..435cf0501a3c4b24efd35d72fbcabcaf19748589 --- /dev/null +++ b/sequence-model.yaml @@ -0,0 +1 @@ +_target_: src.tasks.seq.SequenceModel diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..12e86667c4cd3486466a36db186f812b88156131 --- /dev/null +++ b/setup.py @@ -0,0 +1,295 @@ +# Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. + +import sys +import warnings +import os +import re +import shutil +import ast +from pathlib import Path +from packaging.version import parse, Version +import platform + +from setuptools import setup, find_packages +import subprocess + +import urllib.request +import urllib.error +from wheel.bdist_wheel import bdist_wheel as _bdist_wheel + +import torch +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension, CUDA_HOME + + +# with open("../README.md", "r", encoding="utf-8") as fh: +with open("../README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + + +# ninja build does not work unless include_dirs are abs path +this_dir = os.path.dirname(os.path.abspath(__file__)) + +PACKAGE_NAME = "flashattn-hopper" + +BASE_WHEEL_URL = "https://github.com/Dao-AILab/flash-attention/releases/download/{tag_name}/{wheel_name}" + +# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels +# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation +FORCE_BUILD = os.getenv("FAHOPPER_FORCE_BUILD", "FALSE") == "TRUE" +SKIP_CUDA_BUILD = os.getenv("FAHOPPER_SKIP_CUDA_BUILD", "FALSE") == "TRUE" +# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI +FORCE_CXX11_ABI = os.getenv("FAHOPPER_FORCE_CXX11_ABI", "FALSE") == "TRUE" + + +def get_platform(): + """ + Returns the platform name as used in wheel filenames. 
+ """ + if sys.platform.startswith("linux"): + return "linux_x86_64" + elif sys.platform == "darwin": + mac_version = ".".join(platform.mac_ver()[0].split(".")[:2]) + return f"macosx_{mac_version}_x86_64" + elif sys.platform == "win32": + return "win_amd64" + else: + raise ValueError("Unsupported platform: {}".format(sys.platform)) + + +def get_cuda_bare_metal_version(cuda_dir): + raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True) + output = raw_output.split() + release_idx = output.index("release") + 1 + bare_metal_version = parse(output[release_idx].split(",")[0]) + + return raw_output, bare_metal_version + + +def check_if_cuda_home_none(global_option: str) -> None: + if CUDA_HOME is not None: + return + # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary + # in that case. + warnings.warn( + f"{global_option} was requested, but nvcc was not found. Are you sure your environment has nvcc available? " + "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, " + "only images whose names contain 'devel' will provide nvcc." + ) + + +def append_nvcc_threads(nvcc_extra_args): + return nvcc_extra_args + ["--threads", "4"] + + +cmdclass = {} +ext_modules = [] + +# We want this even if SKIP_CUDA_BUILD because when we run python setup.py sdist we want the .hpp +# files included in the source distribution, in case the user compiles from source. +subprocess.run(["git", "submodule", "update", "--init", "../csrc/cutlass"]) + +if not SKIP_CUDA_BUILD: + print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) + TORCH_MAJOR = int(torch.__version__.split(".")[0]) + TORCH_MINOR = int(torch.__version__.split(".")[1]) + + check_if_cuda_home_none("--fahopper") + cc_flag = [] + _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME) + if bare_metal_version < Version("12.3"): + raise RuntimeError("FA Hopper is only supported on CUDA 12.3 and above") + cc_flag.append("-gencode") + cc_flag.append("arch=compute_90a,code=sm_90a") + + # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as + # torch._C._GLIBCXX_USE_CXX11_ABI + # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920 + if FORCE_CXX11_ABI: + torch._C._GLIBCXX_USE_CXX11_ABI = True + repo_dir = Path(this_dir).parent + cutlass_dir = repo_dir / "csrc" / "cutlass" + sources = [ + "flash_api.cpp", + "flash_fwd_hdim64_fp16_sm90.cu", + "flash_fwd_hdim64_bf16_sm90.cu", + "flash_fwd_hdim128_fp16_sm90.cu", + "flash_fwd_hdim128_bf16_sm90.cu", + "flash_fwd_hdim256_fp16_sm90.cu", + "flash_fwd_hdim256_bf16_sm90.cu", + "flash_bwd_hdim64_fp16_sm90.cu", + "flash_bwd_hdim96_fp16_sm90.cu", + "flash_bwd_hdim128_fp16_sm90.cu", + # "flash_bwd_hdim256_fp16_sm90.cu", + "flash_bwd_hdim64_bf16_sm90.cu", + "flash_bwd_hdim96_bf16_sm90.cu", + "flash_bwd_hdim128_bf16_sm90.cu", + "flash_fwd_hdim64_e4m3_sm90.cu", + "flash_fwd_hdim128_e4m3_sm90.cu", + "flash_fwd_hdim256_e4m3_sm90.cu" + ] + nvcc_flags = [ + "-O3", + # "-O0", + "-std=c++17", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT16_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "-U__CUDA_NO_BFLOAT162_OPERATORS__", + "-U__CUDA_NO_BFLOAT162_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "--use_fast_math", + "--ptxas-options=-v", # printing out number of registers + 
"--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage", # printing out number of registers + "-lineinfo", + "-DCUTLASS_DEBUG_TRACE_LEVEL=0", # Can toggle for debugging + "-DNDEBUG", # Important, otherwise performance is severely impacted + ] + include_dirs = [ + # Path(this_dir) / "fmha-pipeline", + # repo_dir / "lib", + # repo_dir / "include", + cutlass_dir / "include", + # cutlass_dir / "examples" / "common", + # cutlass_dir / "tools" / "util" / "include", + ] + + ext_modules.append( + CUDAExtension( + name="flashattn_hopper_cuda", + sources=sources, + extra_compile_args={ + "cxx": ["-O3", "-std=c++17"], + # "cxx": ["-O0", "-std=c++17"], + "nvcc": append_nvcc_threads( + nvcc_flags + cc_flag + ), + }, + include_dirs=include_dirs, + # Without this we get and error about cuTensorMapEncodeTiled not defined + libraries=["cuda"] + ) + ) + # ext_modules.append( + # CUDAExtension( + # name="flashattn_hopper_cuda_ws", + # sources=sources, + # extra_compile_args={ + # "cxx": ["-O3", "-std=c++17"], + # "nvcc": append_nvcc_threads( + # nvcc_flags + ["-DEXECMODE=1"] + cc_flag + # ), + # }, + # include_dirs=include_dirs, + # # Without this we get and error about cuTensorMapEncodeTiled not defined + # libraries=["cuda"] + # ) + # ) + + +def get_package_version(): + with open(Path(this_dir) / "__init__.py", "r") as f: + version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE) + public_version = ast.literal_eval(version_match.group(1)) + local_version = os.environ.get("FLASHATTN_HOPPER_LOCAL_VERSION") + if local_version: + return f"{public_version}+{local_version}" + else: + return str(public_version) + + +def get_wheel_url(): + # Determine the version numbers that will be used to determine the correct wheel + # We're using the CUDA version used to build torch, not the one currently installed + # _, cuda_version_raw = get_cuda_bare_metal_version(CUDA_HOME) + torch_cuda_version = parse(torch.version.cuda) + torch_version_raw = parse(torch.__version__) + # For CUDA 11, we only compile for CUDA 11.8, and for CUDA 12 we only compile for CUDA 12.2 + # to save CI time. Minor versions should be compatible. + torch_cuda_version = parse("11.8") if torch_cuda_version.major == 11 else parse("12.2") + python_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform_name = get_platform() + package_version = get_package_version() + # cuda_version = f"{cuda_version_raw.major}{cuda_version_raw.minor}" + cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}" + torch_version = f"{torch_version_raw.major}.{torch_version_raw.minor}" + cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper() + + # Determine wheel URL based on CUDA version, torch version, python version and OS + wheel_filename = f"{PACKAGE_NAME}-{package_version}+cu{cuda_version}torch{torch_version}cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl" + wheel_url = BASE_WHEEL_URL.format(tag_name=f"v{package_version}", wheel_name=wheel_filename) + return wheel_url, wheel_filename + + +class CachedWheelsCommand(_bdist_wheel): + """ + The CachedWheelsCommand plugs into the default bdist wheel, which is ran by pip when it cannot + find an existing wheel (which is currently the case for all installs). We use + the environment parameters to detect whether there is already a pre-built version of a compatible + wheel available and short-circuits the standard full build pipeline. 
+ """ + + def run(self): + if FORCE_BUILD: + return super().run() + + wheel_url, wheel_filename = get_wheel_url() + print("Guessing wheel URL: ", wheel_url) + try: + urllib.request.urlretrieve(wheel_url, wheel_filename) + + # Make the archive + # Lifted from the root wheel processing command + # https://github.com/pypa/wheel/blob/cf71108ff9f6ffc36978069acb28824b44ae028e/src/wheel/bdist_wheel.py#LL381C9-L381C85 + if not os.path.exists(self.dist_dir): + os.makedirs(self.dist_dir) + + impl_tag, abi_tag, plat_tag = self.get_tag() + archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}" + + wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl") + print("Raw wheel path", wheel_path) + shutil.move(wheel_filename, wheel_path) + except urllib.error.HTTPError: + print("Precompiled wheel not found. Building from source...") + # If the wheel could not be downloaded, build from source + super().run() + +setup( + name=PACKAGE_NAME, + version=get_package_version(), + packages=find_packages( + exclude=( + "build", + "csrc", + "include", + "tests", + "dist", + "docs", + "benchmarks", + ) + ), + py_modules=["flash_attn_interface"], + description="FlashAttention-3", + long_description=long_description, + long_description_content_type="text/markdown", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: Apache Software License", + "Operating System :: Unix", + ], + ext_modules=ext_modules, + cmdclass={"bdist_wheel": CachedWheelsCommand, "build_ext": BuildExtension} + if ext_modules + else { + "bdist_wheel": CachedWheelsCommand, + }, + python_requires=">=3.8", + install_requires=[ + "torch", + "einops", + "packaging", + "ninja", + ], +) diff --git a/sgd.yaml b/sgd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43b834653e8735acf54ca3b21f89cc480a65ac3e --- /dev/null +++ b/sgd.yaml @@ -0,0 +1,2 @@ +# @package train.optimizer +_target_: torch.optim.SGD diff --git a/smoke.yaml b/smoke.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eac3dd286536a02635797f834e93ebb63007f009 --- /dev/null +++ b/smoke.yaml @@ -0,0 +1,22 @@ +# @package _global_ +# Smoke test: disable logging and model checkpointing + +logger: + wandb: + mode: disabled + +callbacks: + model_checkpoint: null + model_checkpoint_progress: null + +hydra: + # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/ + # sets level of only chosen command line loggers to 'DEBUG' + # verbose: [src.train, src.utils.utils] + + # sets output paths for all file logs to 'logs/debug/' + run: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: ${oc.env:RESULT_DIR,${work_dir}/logs}/debug/multirun_${now:%Y-%m-%d_%H-%M-%S} + subdir: ${hydra.job.num} diff --git a/softmax.h b/softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..79433b82e71754e05b0aa013c0c7d4f03070e51c --- /dev/null +++ b/softmax.h @@ -0,0 +1,234 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include + +#include + +#include + +#include "utils.h" + +#include "cutlass/fast_math.h" + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__device__ __forceinline__ void thread_reduce_(Tensor const &tensor, Tensor &summary, Operator &op) { + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); + static_assert(Layout1::rank == 1, "Only support 1D Tensor"); + CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); mi++) { + summary(mi) = zero_init ? tensor(mi, 0) : op(summary(mi), tensor(mi, 0)); + #pragma unroll + for (int ni = 1; ni < size<1>(tensor); ni++) { + summary(mi) = op(summary(mi), tensor(mi, ni)); + } + } +} + +template +__device__ __forceinline__ void quad_allreduce_(Tensor &dst, Tensor &src, Operator &op) { + CUTE_STATIC_ASSERT_V(size(dst) == size(src)); + #pragma unroll + for (int i = 0; i < size(dst); i++){ + dst(i) = Allreduce<4>::run(src(i), op); + } +} + +template +__device__ __forceinline__ void reduce_(Tensor const& tensor, Tensor &summary, Operator &op) { + thread_reduce_(tensor, summary, op); + quad_allreduce_(summary, summary, op); +} + +template +__device__ __forceinline__ void reduce_max(Tensor const& tensor, Tensor &max){ + MaxOp max_op; + reduce_(tensor, max, max_op); +} + +template +__device__ __forceinline__ void reduce_sum(Tensor const& tensor, Tensor &sum){ + SumOp sum_op; + thread_reduce_(tensor, sum, sum_op); + if constexpr (warp_reduce) { quad_allreduce_(sum, sum, sum_op); } +} + +__forceinline__ __device__ __half2 half_exp(__half2 x) { + uint32_t tmp_out, tmp_in; + tmp_in = reinterpret_cast(x); + asm ("ex2.approx.f16x2 %0, %1;\n" + : "=r"(tmp_out) + : "r"(tmp_in)); + __half2 out = reinterpret_cast<__half2&>(tmp_out); + return out; +} + +// Apply the exp to all the elements. +template +__forceinline__ __device__ void max_scale_exp2_sum(Tensor &tensor, Tensor &max, Tensor &sum, const float scale) { + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); static_assert(Layout1::rank == 1, "Only support 1D Tensor"); CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + MaxOp max_op; + max(mi) = zero_init ? tensor(mi, 0) : max_op(max(mi), tensor(mi, 0)); + #pragma unroll + for (int ni = 1; ni < size<1>(tensor); ni++) { + max(mi) = max_op(max(mi), tensor(mi, ni)); + } + max(mi) = Allreduce<4>::run(max(mi), max_op); + // If max is -inf, then all elements must have been -inf (possibly due to masking). + // We don't want (-inf - (-inf)) since that would give NaN. + const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale; + sum(mi) = 0; + #pragma unroll + for (int ni = 0; ni < size<1>(tensor); ++ni) { + // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + // max * log_2(e)) This allows the compiler to use the ffma + // instruction instead of fadd and fmul separately. + tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled); + sum(mi) += tensor(mi, ni); + } + } +} + +// Apply the exp to all the elements. +template +__forceinline__ __device__ void scale_apply_exp2(Tensor &tensor, Tensor const &max, const float scale) { + constexpr static float max_offset = Use_max_offset ? 
8.0f : 0.0f; + static_assert(Layout0::rank == 2, "Only support 2D Tensor"); + static_assert(Layout1::rank == 1, "Only support 1D Tensor"); + CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor)); + #pragma unroll + for (int mi = 0; mi < size<0>(tensor); ++mi) { + // If max is -inf, then all elements must have been -inf (possibly due to masking). + // We don't want (-inf - (-inf)) since that would give NaN. + // If we don't have float around M_LOG2E the multiplication is done in fp64. + const float max_scaled = Check_inf + ? (max(mi) == -INFINITY ? 0.f : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset) + : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset; + #pragma unroll + for (int ni = 0; ni < size<1>(tensor); ++ni) { + // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - + // max * log_2(e)) This allows the compiler to use the ffma + // instruction instead of fadd and fmul separately. + tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Softmax { + constexpr static bool Use_max_offset = Use_max_offset_; + // constexpr static float max_offset = Use_max_offset ? 8.0f : 0.0f; + // constexpr static float max_offset_E = max_offset * float(M_LN2); + + using TensorT = decltype(make_tensor(Shape>{})); + TensorT row_max, row_sum; + + CUTLASS_DEVICE Softmax() {}; + + template + __forceinline__ __device__ TensorT max(Tensor0 &acc_s, float softmax_scale_log2) { + // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) + Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout())); + static_assert(decltype(size<0>(scores))::value == kNRows); + TensorT scores_scale; + if constexpr (Is_first) { + flash::template reduce_max(scores, row_max); + cute::fill(scores_scale, 1.f); + } else { + Tensor scores_max_prev = make_fragment_like(row_max); + cute::copy(row_max, scores_max_prev); + flash::template reduce_max(scores, row_max); + #pragma unroll + for (int mi = 0; mi < size(row_max); ++mi) { + float scores_max_cur = !Check_inf + ? row_max(mi) + : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi)); + scores_scale(mi) = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2); + row_sum(mi) *= scores_scale(mi); + } + } + return scores_scale; + }; + + template + __forceinline__ __device__ TensorT online_softmax(Tensor0 &acc_s, float softmax_scale_log2) { + // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) + Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout())); + static_assert(decltype(size<0>(scores))::value == kNRows); + TensorT scores_scale; + if constexpr (Is_first) { + flash::template reduce_max(scores, row_max); + flash::template scale_apply_exp2(scores, row_max, softmax_scale_log2); + flash::reduce_sum(scores, row_sum); + cute::fill(scores_scale, 1.f); + // if (cute::thread0()) { print_tensor(scores); printf("\n scale = %f\n", softmax_scale_log2); print_tensor(row_sum); } + } else { + // Tensor scores_max_prev = make_fragment_like(row_max); + // cute::copy(row_max, scores_max_prev); + // flash::template reduce_max(scores, row_max); + // // if (cute::thread0()) { print_tensor(scores); printf("\n"); print_tensor(row_max); printf("\n"); } + // #pragma unroll + // for (int mi = 0; mi < size(row_max); ++mi) { + // float scores_max_cur = !Check_inf + // ? 
row_max(mi) + // : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi)); + // scores_scale(mi) = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2); + // row_sum(mi) *= scores_scale(mi); + // } + flash::template scale_apply_exp2(scores, row_max, softmax_scale_log2); + // We don't do the reduce across threads here since we don't need to use the row_sum. + // We do that reduce at the end when we need to normalize the softmax. + flash::reduce_sum(scores, row_sum); + } + return scores_scale; + }; + + template + __forceinline__ __device__ TensorT finalize(Tensor0 &acc_s, float softmax_scale_log2, float rp_dropout=1.0) { + constexpr static float max_offset_E = Use_max_offset ? 8.0f * float(M_LN2) : 0.0f; + // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) + Tensor scores = make_tensor(acc_s.data(), flash::convert_layout_acc_rowcol(acc_s.layout())); + static_assert(decltype(size<0>(scores))::value == kNRows); + SumOp sum_op; + quad_allreduce_(row_sum, row_sum, sum_op); + TensorT scores_scale; + #pragma unroll + for (int mi = 0; mi < size(row_max); ++mi) { + float sum = row_sum(mi); + float inv_sum = (sum == 0.f || sum != sum) ? 0.f : 1.f / sum; + row_sum(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : (row_max(mi) * softmax_scale_log2) * float(M_LN2) - max_offset_E + __logf(sum); + scores_scale(mi) = !Is_dropout ? inv_sum : inv_sum * rp_dropout; + } + return scores_scale; + }; + + template + __forceinline__ __device__ void rescale_o(Tensor1 &acc_o, TensorT const &scores_scale) { + // Reshape acc_o from (MMA=4, MMA_M, MMA_K) to (nrow=(2, MMA_M), ncol=(2, MMA_K)) + Tensor acc_o_rowcol = make_tensor(acc_o.data(), flash::convert_layout_acc_rowcol(acc_o.layout())); + static_assert(decltype(size<0>(acc_o_rowcol))::value == kNRows); + #pragma unroll + for (int mi = 0; mi < size(row_max); ++mi) { + #pragma unroll + for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) { acc_o_rowcol(mi, ni) *= scores_scale(mi); } + } + }; + +}; + +} // namespace flash diff --git a/static_switch.h b/static_switch.h new file mode 100644 index 0000000000000000000000000000000000000000..d9ec6222498987673280c62af3af690675bd9a06 --- /dev/null +++ b/static_switch.h @@ -0,0 +1,79 @@ +// Inspired by +// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` +// + +#define BOOL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + if (COND) { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +#define PREC_SWITCH(PRECTYPE, ...) 
\ + [&] { \ + if (PRECTYPE == 1) { \ + using kPrecType = cutlass::half_t; \ + constexpr static bool kSoftFp16 = false; \ + constexpr static bool kHybrid = false; \ + return __VA_ARGS__(); \ + } else if (PRECTYPE == 2) { \ + using kPrecType = cutlass::float_e4m3_t; \ + constexpr static bool kSoftFp16 = false; \ + constexpr static bool kHybrid = false; \ + return __VA_ARGS__(); \ + } else if (PRECTYPE == 3) { \ + using kPrecType = cutlass::float_e4m3_t; \ + constexpr static bool kSoftFp16 = false; \ + constexpr static bool kHybrid = true; \ + return __VA_ARGS__(); \ + } else if (PRECTYPE == 4) { \ + using kPrecType = cutlass::float_e4m3_t; \ + constexpr static bool kSoftFp16 = true; \ + constexpr static bool kHybrid = false; \ + return __VA_ARGS__(); \ + } \ + }() + +#define HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM == 64) { \ + constexpr static int kHeadSize = 64; \ + return __VA_ARGS__(); \ + } else if (HEADDIM == 128) { \ + constexpr static int kHeadSize = 128; \ + return __VA_ARGS__(); \ + } else if (HEADDIM == 256) { \ + constexpr static int kHeadSize = 256; \ + return __VA_ARGS__(); \ + } \ + }() + +#define SEQLEN_SWITCH(USE_VAR_SEQ_LEN, NAME, ...) \ + [&] { \ + bool useSeqLen = USE_VAR_SEQ_LEN; \ + if (useSeqLen) { \ + using NAME = flash::VarSeqLenTraits; \ + return __VA_ARGS__(); \ + } else { \ + using NAME = flash::FixedSeqLenTraits; \ + return __VA_ARGS__(); \ + } \ + }() diff --git a/step.yaml b/step.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e0d9a0ce808720175c708b3f1a7d55db655bec28 --- /dev/null +++ b/step.yaml @@ -0,0 +1,3 @@ +# @package train.scheduler +_target_: torch.optim.lr_scheduler.StepLR +step_size: ??? diff --git a/tensorboard.yaml b/tensorboard.yaml new file mode 100644 index 0000000000000000000000000000000000000000..acd1fa411d0407535066cd809dbb2a11915a4154 --- /dev/null +++ b/tensorboard.yaml @@ -0,0 +1,10 @@ +# https://www.tensorflow.org/tensorboard/ + +tensorboard: + _target_: pytorch_lightning.loggers.tensorboard.TensorBoardLogger + save_dir: "tensorboard/" + name: "default" + version: ${name} + log_graph: False + default_hp_metric: True + prefix: "" diff --git a/test_baichuan.py b/test_baichuan.py new file mode 100644 index 0000000000000000000000000000000000000000..1d2964bd5e25ab14a124afcdff3776f17ee137d8 --- /dev/null +++ b/test_baichuan.py @@ -0,0 +1,460 @@ +# Copyright (c) 2023, Tri Dao. 
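+# Tests that the flash_attn GPT port of Baichuan / Baichuan2 matches the HF reference:
+# state-dict remapping, fp16 forward parity, tensor-parallel forward, and generation
+# with and without CUDA graphs. The single-GPU tests can be run with, e.g.,
+#     pytest -q -s tests/models/test_baichuan.py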
+import os +import time +from pathlib import Path + +import torch +import pytest + +from einops import rearrange + +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM + +from flash_attn.models.gpt import ( + GPTLMHeadModel, + combine_state_dicts_tp, + shard_state_dict_tp, +) +from flash_attn.models.baichuan import ( + remap_state_dict_hf_baichuan, + baichuan_config_to_gpt2_config, +) +from flash_attn.utils.distributed import all_gather_raw +from flash_attn.utils.pretrained import state_dict_from_pretrained +from flash_attn.utils.generation import update_graph_cache + + +@pytest.mark.parametrize( + "model_name", + [ + "baichuan-inc/Baichuan-7B", + "baichuan-inc/Baichuan-13B-Base", + "baichuan-inc/Baichuan2-7B-Base", + "baichuan-inc/Baichuan2-13B-Base", + ], +) +def test_baichuan_state_dict(model_name): + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert len(state_dict.keys()) == len(pretrained_state_dict.keys()) + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + "model_name", + [ + "baichuan-inc/Baichuan-7B", + "baichuan-inc/Baichuan-13B-Base", + "baichuan-inc/Baichuan2-7B-Base", + "baichuan-inc/Baichuan2-13B-Base", + ], +) +def test_baichuan_optimized(model_name): + """Check that our implementation of Baichuan (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
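+    Concretely, the max absolute error of our fp16 output against the fp32 reference must
+    stay within 3x the max absolute error of the HF fp16 output against the same reference.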
+ """ + dtype = torch.float16 + device = "cuda" + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=dtype, + device_map={"": device}, + trust_remote_code=True, + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/models/test_baichuan.py -k "test_baichuan_parallel_forward" +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize( + "model_name", + [ + "baichuan-inc/Baichuan-7B", + "baichuan-inc/Baichuan-13B-Base", + "baichuan-inc/Baichuan2-7B-Base", + "baichuan-inc/Baichuan2-13B-Base", + ], +) +def test_baichuan_parallel_forward(model_name, world_size): + """Check that our implementation of Baichuan (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
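+    For the tensor-parallel path the tolerance is tighter: the max absolute error against
+    the fp32 reference must stay within 2x the corresponding HF fp16 error.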
+ """ + from apex.transformer import parallel_state + + dtype = torch.float16 + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... d -> b ... (n d)", b=batch_size) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize( + "model_name", ["baichuan-inc/Baichuan-7B", "baichuan-inc/Baichuan-13B-Base"] +) +def test_baichuan_generation(model_name): + dtype = torch.float16 + device = "cuda" + 
config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 2048 + max_length = 2048 + 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1].to(device=device) + del model_ref + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + model(input_ids) # Warm up + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) + + +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/models/test_baichuan.py -k "baichuan_parallel_generation" +@pytest.mark.parametrize("world_size", [2]) 
+@pytest.mark.parametrize("model_name", ["baichuan-inc/Baichuan-7B"]) +def test_baichuan_parallel_generation(model_name, world_size): + """Check that our implementation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. + """ + from apex.transformer import parallel_state + + dtype = torch.float16 + config = baichuan_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = False + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 * world_size + config.sequence_parallel = False # Need to set this to False for generation + + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + # Need this, otherwise when we capture the graph the process for GPU 1 would run on both + # GPU0 and GPU1 and things would hang + torch.cuda.set_device(device) + + pretrained_state_dict = remap_state_dict_hf_baichuan( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + print("Without CUDA graph") + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + with torch.inference_mode(): + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.inference_mode(): 
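+            # Re-score the HF-generated sequences with the fp32 reference model to get
+            # teacher-forced reference logits for the generated positions.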
+ logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + logits_hf = torch.stack(out_hf.scores, dim=1) + + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + hf_error = (logits_hf - logits_ref).abs().max().item() + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) diff --git a/test_bert.py b/test_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..4c519b37e0e37603d21d517e9fb2175791204a75 --- /dev/null +++ b/test_bert.py @@ -0,0 +1,324 @@ +import re +from collections import OrderedDict + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange +from transformers import BertConfig +from transformers.models.bert.modeling_bert import BertForPreTraining as BertForPreTrainingHF +from transformers.models.bert.modeling_bert import BertModel as BertModelHF + +from flash_attn.models.bert import ( + BertForPreTraining, + BertModel, + inv_remap_state_dict, + remap_state_dict, +) +from flash_attn.utils.pretrained import state_dict_from_pretrained + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +# @pytest.mark.parametrize('model_name', ["bert-base-uncased"]) +def test_bert_state_dict(model_name): + config = BertConfig.from_pretrained(model_name) + pretrained_state_dict = remap_state_dict(state_dict_from_pretrained(model_name), config) + model = BertForPreTraining(config) + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +def get_hf_models(model_name, config, dtype): + pretrained_state_dict = state_dict_from_pretrained(model_name) + + def key_mapping_ln_gamma_beta(key): + key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key) + key = re.sub(r"LayerNorm.beta$", "LayerNorm.bias", key) + return key + + pretrained_state_dict = OrderedDict( + (key_mapping_ln_gamma_beta(k), v) for k, v in pretrained_state_dict.items() + ) + model_hf = BertForPreTrainingHF(config) + # Missing key(s) in state_dict: "bert.embeddings.position_ids", "cls.predictions.decoder.bias" + # position_ids is a buffer, and predictions.decoder.bias is tied to predictions.bias. + model_hf.load_state_dict(pretrained_state_dict, strict=False) + model_hf.cuda().to(dtype=dtype) + return model_hf + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased"]) +def test_bert_non_optimized(model_name): + """Check that our implementation of BERT (without any optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
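+    No flash-attn or fused kernels are enabled here, so the comparison mainly exercises the
+    state-dict remapping and the module structure itself.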
+ """ + dtype = torch.float16 + config = BertConfig.from_pretrained(model_name) + + model = BertForPreTraining.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = get_hf_models(model_name, config, torch.float32) + model_hf = get_hf_models(model_name, config, dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + attention_mask = torch.arange(max_seqlen, device="cuda")[None, :] < seqlens[:, None] + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.bert(input_ids, attention_mask=attention_mask) + sequence_output, pooled_output = out.last_hidden_state, out.pooler_output + out_hf = model_hf.bert(input_ids, attention_mask=attention_mask) + sequence_output_hf, pooled_output_hf = out_hf.last_hidden_state, out_hf.pooler_output + out_ref = model_ref.bert(input_ids, attention_mask=attention_mask) + sequence_output_ref, pooled_output_ref = out_ref.last_hidden_state, out_ref.pooler_output + + print(f"Output max diff: {(sequence_output - sequence_output_ref).abs().max().item()}") + print(f"Output mean diff: {(sequence_output - sequence_output_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(sequence_output_hf - sequence_output_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(sequence_output_hf - sequence_output_ref).abs().mean().item()}") + assert (sequence_output - sequence_output_ref).abs().max().item() < 3 * ( + sequence_output_hf - sequence_output_ref + ).abs().max().item() + assert (pooled_output - pooled_output_ref).abs().max().item() < 3 * ( + pooled_output_hf - pooled_output_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +# @pytest.mark.parametrize('model_name', ["bert-base-uncased"]) +def test_bert_optimized(model_name): + """Check that our implementation of BERT (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + config = BertConfig.from_pretrained(model_name) + # Our implementation of fused_mlp assumes the activation is + # nn.GELU(approximate='tanh'). Huggingface calls it "gelu_new", "gelu_fast", or "gelu_pytorch_tanh". + # If you just want "gelu", disable fused_mlp. 
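+    # e.g., to keep HF's exact "gelu" instead: config.hidden_act = "gelu"; config.fused_mlp = False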
+ config.hidden_act = "gelu_new" + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = BertForPreTraining.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = get_hf_models(model_name, config, torch.float32) + model_hf = get_hf_models(model_name, config, dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + attention_mask = torch.arange(max_seqlen, device="cuda")[None, :] < seqlens[:, None] + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.bert(input_ids, attention_mask=attention_mask) + sequence_output, pooled_output = out.last_hidden_state, out.pooler_output + out_hf = model_hf.bert(input_ids, attention_mask=attention_mask) + sequence_output_hf, pooled_output_hf = out_hf.last_hidden_state, out_hf.pooler_output + # Need to zero out the padded tokens in the sequence before comparison. + sequence_output_hf[~attention_mask, :] = 0.0 + out_ref = model_ref.bert(input_ids, attention_mask=attention_mask) + sequence_output_ref, pooled_output_ref = out_ref.last_hidden_state, out_ref.pooler_output + sequence_output_ref[~attention_mask, :] = 0.0 + + print( + f"BertModel output max diff: {(sequence_output - sequence_output_ref).abs().max().item()}" + ) + print( + f"BertModel output mean diff: {(sequence_output - sequence_output_ref).abs().mean().item()}" + ) + print( + f"HF fp16 BertModel max diff: {(sequence_output_hf - sequence_output_ref).abs().max().item()}" + ) + print( + f"HF fp16 BertModel mean diff: {(sequence_output_hf - sequence_output_ref).abs().mean().item()}" + ) + assert (sequence_output - sequence_output_ref).abs().max().item() < 4 * ( + sequence_output_hf - sequence_output_ref + ).abs().max().item() + assert (pooled_output - pooled_output_ref).abs().max().item() < 4 * ( + pooled_output_hf - pooled_output_ref + ).abs().max().item() + + out = model(input_ids, attention_mask=attention_mask) + prediction_scores, seq_relationship_scores = out.prediction_logits, out.seq_relationship_logits + # Need to zero out the padded tokens in the sequence before comparison. 
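+    # (outputs at padded positions are not meaningful and may differ between implementations,
+    #  so they are zeroed on all sides before comparing)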
+ prediction_scores = prediction_scores.clone() + prediction_scores[~attention_mask, :] = 0.0 + out_hf = model_hf(input_ids, attention_mask=attention_mask) + prediction_scores_hf, seq_relationship_scores_hf = ( + out_hf.prediction_logits, + out_hf.seq_relationship_logits, + ) + prediction_scores_hf[~attention_mask, :] = 0.0 + out_ref = model_ref(input_ids, attention_mask=attention_mask) + prediction_scores_ref, seq_relationship_scores_ref = ( + out_ref.prediction_logits, + out_ref.seq_relationship_logits, + ) + prediction_scores_ref[~attention_mask, :] = 0.0 + + print( + f"prediction_scores max diff: {(prediction_scores - prediction_scores_ref).abs().max().item()}" + ) + print( + f"prediction_scores mean diff: {(prediction_scores - prediction_scores_ref).abs().mean().item()}" + ) + print( + f"HF fp16 prediction_scoresff: {(prediction_scores_hf - prediction_scores_ref).abs().max().item()}" + ) + print( + f"HF fp16 prediction_scoresiff: {(prediction_scores_hf - prediction_scores_ref).abs().mean().item()}" + ) + assert (prediction_scores - prediction_scores_ref).abs().max().item() < 2 * ( + prediction_scores_hf - prediction_scores_ref + ).abs().max().item() + assert (seq_relationship_scores - seq_relationship_scores_ref).abs().max().item() < 2 * ( + seq_relationship_scores_hf - seq_relationship_scores_ref + ).abs().max().item() + + +@pytest.mark.parametrize("last_layer_subset", [False, True]) +# @pytest.mark.parametrize('last_layer_subset', [True]) +@pytest.mark.parametrize("has_key_padding_mask", [True, False]) +# @pytest.mark.parametrize('has_key_padding_mask', [True]) +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +# @pytest.mark.parametrize('model_name', ["bert-base-uncased"]) +def test_bert_dense_seq_output(model_name, has_key_padding_mask, last_layer_subset): + """Check that our implementation of BERT (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + config = BertConfig.from_pretrained(model_name) + # Our implementation of fused_mlp assumes the activation is + # nn.GELU(approximate='tanh'). Huggingface calls it "gelu_new", "gelu_fast", or "gelu_pytorch_tanh". + # If you just want "gelu", disable fused_mlp. 
+ config.hidden_act = "gelu_new" + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.dense_seq_output = True + config.last_layer_subset = last_layer_subset + config.use_xentropy = True + + model = BertForPreTraining.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = get_hf_models(model_name, config, torch.float32) + model_hf = get_hf_models(model_name, config, dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + if has_key_padding_mask: + attention_mask = torch.arange(max_seqlen, device="cuda")[None, :] < seqlens[:, None] + else: + attention_mask = None + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + labels = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + if attention_mask is not None: + labels[~attention_mask] = 0 + labels[(torch.rand(batch_size, max_seqlen, device="cuda") > 0.15)] = 0 + masked_tokens_mask = labels.flatten() > 0 + next_sequence_label = torch.randint(0, 2, (batch_size,), device="cuda") + + out = model( + input_ids, + attention_mask=attention_mask, + labels=labels, + next_sentence_label=next_sequence_label, + ) + prediction_scores, seq_relationship_scores = out.prediction_logits, out.seq_relationship_logits + out_hf = model_hf( + input_ids, + attention_mask=attention_mask, + labels=labels, + next_sentence_label=next_sequence_label, + ) + prediction_scores_hf, seq_relationship_scores_hf = ( + out_hf.prediction_logits, + out_hf.seq_relationship_logits, + ) + prediction_scores_hf = rearrange(prediction_scores_hf, "b s d -> (b s) d")[masked_tokens_mask] + out_ref = model_ref( + input_ids, + attention_mask=attention_mask, + labels=labels, + next_sentence_label=next_sequence_label, + ) + prediction_scores_ref, seq_relationship_scores_ref = ( + out_ref.prediction_logits, + out_ref.seq_relationship_logits, + ) + prediction_scores_ref = rearrange(prediction_scores_ref, "b s d -> (b s) d")[masked_tokens_mask] + + print( + f"prediction_scores max diff: {(prediction_scores - prediction_scores_ref).abs().max().item()}" + ) + print( + f"prediction_scores mean diff: {(prediction_scores - prediction_scores_ref).abs().mean().item()}" + ) + print( + f"HF fp16 prediction_scoresff: {(prediction_scores_hf - prediction_scores_ref).abs().max().item()}" + ) + print( + f"HF fp16 prediction_scoresiff: {(prediction_scores_hf - prediction_scores_ref).abs().mean().item()}" + ) + assert (prediction_scores - prediction_scores_ref).abs().max().item() < 2 * ( + prediction_scores_hf - prediction_scores_ref + ).abs().max().item() + assert (seq_relationship_scores - seq_relationship_scores_ref).abs().max().item() < 2 * ( + seq_relationship_scores_hf - seq_relationship_scores_ref + ).abs().max().item() + # The loss calculation from HF is wrong: it doesn't ignore the labels that are 0. + # assert (out.loss - out_ref.loss).abs().max().item() < 2 * (out_hf.loss - out_ref.loss).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["bert-base-uncased", "bert-large-uncased"]) +def test_inv_remap_state_dict(model_name: str): + """ + Verify that we can convert a HF BERT model to flash_attn and back. 
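+    (remap_state_dict followed by inv_remap_state_dict should reproduce every tensor of the
+    original HF checkpoint.)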
+ """ + + state_dict = state_dict_from_pretrained(model_name) + config = BertConfig.from_pretrained(model_name) + + flash_state_dict = remap_state_dict(state_dict, config) + recovered_state_dict = inv_remap_state_dict(flash_state_dict, config) + + assert set(state_dict.keys()) == set(recovered_state_dict.keys()) + + for k in state_dict.keys(): + assert state_dict[k].shape == recovered_state_dict[k].shape + torch.testing.assert_close(state_dict[k], recovered_state_dict[k], rtol=1e-6, atol=1e-6) diff --git a/test_bigcode.py b/test_bigcode.py new file mode 100644 index 0000000000000000000000000000000000000000..b69038dde7d6cdbbb7498b33995b96cb3aa2f06a --- /dev/null +++ b/test_bigcode.py @@ -0,0 +1,204 @@ +import time + +import pytest +import torch +from transformers import AutoTokenizer, GPTBigCodeConfig +from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeForCausalLM + +from flash_attn.models.bigcode import bigcode_config_to_gpt2_config, inv_remap_state_dict_hf_bigcode +from flash_attn.models.gpt import GPTLMHeadModel, remap_state_dict_hf_bigcode +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_bigcode_state_dict(model_name): + config = bigcode_config_to_gpt2_config(GPTBigCodeConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_bigcode( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_bigcode_optimized(model_name): + """Check that our implementation of BigCode (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = bigcode_config_to_gpt2_config(GPTBigCodeConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTBigCodeForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state + logits_ref = model_ref(input_ids).logits + del model_ref + + model_hf = GPTBigCodeForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_bigcode_generation(model_name): + """Check that our implementation of BigCode (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = bigcode_config_to_gpt2_config(GPTBigCodeConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = GPTBigCodeForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = GPTBigCodeForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert (logits_cg - logits_ref).abs().max().item() < 2 * hf_error + + +@pytest.mark.parametrize("model_name", ["bigcode/starcoderbase-1b", "WizardLM/WizardCoder-1B-V1.0"]) +def test_inv_remap_state_dict(model_name: str): + """ + Verify that we can convert a HF BigCode model to flash_attn and back. 
+ """ + + state_dict = state_dict_from_pretrained(model_name) + config = GPTBigCodeConfig.from_pretrained(model_name) + + flash_state_dict = remap_state_dict_hf_bigcode(state_dict, config) + recovered_state_dict = inv_remap_state_dict_hf_bigcode(flash_state_dict, config) + + assert set(state_dict.keys()) == set(recovered_state_dict.keys()) + + for k in state_dict.keys(): + assert state_dict[k].shape == recovered_state_dict[k].shape + torch.testing.assert_close(state_dict[k], recovered_state_dict[k], rtol=1e-6, atol=1e-6) diff --git a/test_block_parallel.py b/test_block_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..d74cfa11fbe8b7c3fcdbf344376ac12bc09a3b9e --- /dev/null +++ b/test_block_parallel.py @@ -0,0 +1,273 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_block_parallel.py + +import math +from functools import partial + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from einops import rearrange +from flash_attn.modules.block import Block +from flash_attn.modules.mha import MHA, ParallelMHA +from flash_attn.modules.mlp import FusedMLP, ParallelFusedMLP +from flash_attn.utils.distributed import allreduce_sequence_parallel_grad + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [True]) +@pytest.mark.parametrize("dim", [1024]) +def test_block_parallel(dim, sequence_parallel, world_size, dtype): + head_dim = 64 + assert dim % head_dim == 0 + num_heads = dim // head_dim + assert num_heads % world_size == 0 + rtol, atol = (3e-3, 5e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + x_pt = torch.randn(batch_size * seqlen, dim, device=device, dtype=dtype, requires_grad=True) + residual_pt = torch.randn(batch_size * seqlen, dim, device=device, requires_grad=True) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. 
+ g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + residual = ( + tensor_parallel.scatter_to_sequence_parallel_region(residual_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + residual = residual_pt.detach().clone().requires_grad_() + + mixer_cls_pt = partial( + MHA, + num_heads=num_heads, + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + device=device, + dtype=dtype, + ) + mlp_cls_pt = partial(FusedMLP, hidden_features=4 * dim, device=device, dtype=dtype) + norm_cls = partial(nn.LayerNorm, device=device, dtype=dtype) + model_pt = Block(dim, mixer_cls_pt, mlp_cls_pt, norm_cls, fused_dropout_add_ln=True) + with torch.no_grad(): + nn.init.normal_(model_pt.norm1.weight) + nn.init.normal_(model_pt.norm1.bias) + nn.init.normal_(model_pt.norm2.weight) + nn.init.normal_(model_pt.norm2.bias) + + mixer_cls = partial( + ParallelMHA, + num_heads=num_heads, + process_group=parallel_state.get_tensor_model_parallel_group(), + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + mlp_cls = partial( + ParallelFusedMLP, + hidden_features=4 * dim, + process_group=parallel_state.get_tensor_model_parallel_group(), + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + model = Block( + dim, + mixer_cls, + mlp_cls, + norm_cls, + fused_dropout_add_ln=True, + sequence_parallel=sequence_parallel, + mark_shared_params=True, + ) + + partition_dim = dim // world_size + partition_hidden_dim = 4 * dim // world_size + with torch.no_grad(): + model.mixer.Wqkv.weight.copy_( + rearrange( + rearrange(model_pt.mixer.Wqkv.weight, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ) + ) + model.mixer.Wqkv.bias.copy_( + rearrange( + rearrange(model_pt.mixer.Wqkv.bias, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ) + ) + model.mixer.out_proj.weight.copy_( + model_pt.mixer.out_proj.weight[:, rank * partition_dim : (rank + 1) * partition_dim] + ) + if rank == 0: + model.mixer.out_proj.bias.copy_(model_pt.mixer.out_proj.bias) + model.mlp.fc1.weight.copy_( + model_pt.mlp.fc1.weight[rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim] + ) + model.mlp.fc1.bias.copy_( + model_pt.mlp.fc1.bias[rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim] + ) + model.mlp.fc2.weight.copy_( + model_pt.mlp.fc2.weight[ + :, rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim + ] + ) + if rank == 0: + model.mlp.fc2.bias.copy_(model_pt.mlp.fc2.bias) + model.norm1.weight.copy_(model_pt.norm1.weight) + model.norm1.bias.copy_(model_pt.norm1.bias) + model.norm2.weight.copy_(model_pt.norm2.weight) + model.norm2.bias.copy_(model_pt.norm2.bias) + + mixer_kwargs = {"seqlen": seqlen} + out, out_residual = model(x, residual, mixer_kwargs=mixer_kwargs) + out_pt, out_residual_pt = model_pt( + rearrange(x_pt, "(b s) d -> b s d", s=seqlen), + rearrange(residual_pt, "(b s) d -> b s d", s=seqlen), + ) + out_pt, out_residual_pt = [rearrange(x, "b s d -> (b s) d") for x in [out_pt, out_residual_pt]] + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else 
out_pt, + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + out_residual, + out_residual_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_residual_pt, + rtol=rtol, + atol=atol, + ) + + (out_pt + 2 * out_residual_pt).backward(g) + (out + 2 * out_residual).backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + allreduce_sequence_parallel_grad(model, parallel_state.get_tensor_model_parallel_group()) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol / 10, # magnitude of x.grad is quite small + ) + assert torch.allclose( + residual.grad, + residual_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else residual_pt.grad, + rtol=rtol, + atol=atol, + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.mixer.Wqkv.weight.grad, + rearrange( + rearrange(model_pt.mixer.Wqkv.weight.grad, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ), + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.mixer.Wqkv.bias.grad, + rearrange( + rearrange(model_pt.mixer.Wqkv.bias.grad, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ), + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.mixer.out_proj.weight.grad, + model_pt.mixer.out_proj.weight.grad[:, rank * partition_dim : (rank + 1) * partition_dim], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.mixer.out_proj.bias.grad, + model_pt.mixer.out_proj.bias.grad, + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.mlp.fc1.weight.grad, + model_pt.mlp.fc1.weight.grad[ + rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim + ], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.mlp.fc1.bias.grad, + model_pt.mlp.fc1.bias.grad[rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim], + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.mlp.fc2.weight.grad, + model_pt.mlp.fc2.weight.grad[ + :, rank * partition_hidden_dim : (rank + 1) * partition_hidden_dim + ], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.mlp.fc2.bias.grad, model_pt.mlp.fc2.bias.grad, rtol=rtol, atol=atol * 5 + ) + + assert torch.allclose( + model.norm1.weight.grad, model_pt.norm1.weight.grad, rtol=rtol, atol=atol * 5 + ) + assert torch.allclose(model.norm1.bias.grad, model_pt.norm1.bias.grad, rtol=rtol, atol=atol * 5) + assert torch.allclose( + model.norm2.weight.grad, model_pt.norm2.weight.grad, rtol=rtol, atol=atol * 5 + ) + assert torch.allclose(model.norm2.bias.grad, model_pt.norm2.bias.grad, rtol=rtol, atol=atol * 5) diff --git a/test_btlm.py b/test_btlm.py new file mode 100644 index 0000000000000000000000000000000000000000..eb5316bbe17ca074a0a2b7b758c1864597df3607 --- /dev/null +++ b/test_btlm.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023, Tri Dao. 
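+# Tests for the flash_attn port of BTLM (cerebras/btlm-3b-8k-base): state-dict remapping,
+# fp16 forward parity against the HF reference, generation with and without CUDA graphs,
+# and weight-initialization statistics.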
+import time + +import torch +import pytest + +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM + +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.btlm import btlm_config_to_gpt2_config, remap_state_dict_hf_btlm +from flash_attn.utils.pretrained import state_dict_from_pretrained +from flash_attn.utils.generation import update_graph_cache + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_state_dict(model_name): + config = btlm_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert len(state_dict.keys()) == len(pretrained_state_dict.keys()) + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_optimized(model_name): + """Check that our implementation of Btlm (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = btlm_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.fused_bias_fc = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=dtype, + device_map={"": device}, + trust_remote_code=True, + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - 
logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_generation(model_name): + dtype = torch.float16 + device = "cuda" + config = btlm_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.fused_bias_fc = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 2048 + max_length = 2048 + 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1].to(device=device) + del model_ref + + pretrained_state_dict = remap_state_dict_hf_btlm(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + model(input_ids) # Warm up + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + + 
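+    # Acceptance criteria for the checks below: both the full forward pass over the generated
+    # sequence (logits_parallel) and the incremental-decoding logits must stay within 2x the
+    # HF fp16 error relative to the fp32 reference, and CUDA-graph decoding must produce
+    # exactly the same logits as eager decoding.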
assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) + + +@pytest.mark.parametrize("model_name", ["cerebras/btlm-3b-8k-base"]) +def test_btlm_init(model_name): + dtype = torch.float32 + device = "cuda" + btlm_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + config = btlm_config_to_gpt2_config(btlm_config) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model_ref = AutoModelForCausalLM.from_config(btlm_config, trust_remote_code=True).to(device) + + assert model.transformer.embeddings.word_embeddings.weight.mean().abs() < 1e-4 + assert ( + model.transformer.embeddings.word_embeddings.weight.std() + - model_ref.transformer.wte.weight.std() + ).abs() < 1e-4 + assert model.lm_head.weight.mean().abs() < 1e-4 + assert (model.lm_head.weight.std() - model_ref.lm_head.weight.std()).abs() < 1e-4 + for l in range(config.n_layer): + assert model.transformer.layers[l].mixer.Wqkv.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mixer.Wqkv.weight.std() + - model_ref.transformer.h[l].attn.c_attn.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mixer.Wqkv.bias.abs().max() == 0.0 + assert model.transformer.layers[l].mixer.out_proj.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mixer.out_proj.weight.std() + - model_ref.transformer.h[l].attn.c_proj.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mixer.out_proj.bias.abs().max() == 0.0 + assert model.transformer.layers[l].mlp.fc1.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mlp.fc1.weight.std() + - model_ref.transformer.h[l].mlp.c_fc.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mlp.fc1.bias.abs().max() == 0.0 + assert model.transformer.layers[l].mlp.fc2.weight.mean().abs() < 1e-4 + assert ( + model.transformer.layers[l].mlp.fc2.weight.std() + - model_ref.transformer.h[l].mlp.c_proj.weight.std() + ).abs() < 1e-4 + assert model.transformer.layers[l].mlp.fc2.bias.abs().max() == 0.0 diff --git a/test_cross_entropy.py b/test_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..9d67f5906a17cb3511e0557be71663332519b36b --- /dev/null +++ b/test_cross_entropy.py @@ -0,0 +1,68 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange +from flash_attn.losses.cross_entropy import CrossEntropyLoss + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize( + "dtype", [torch.float16, torch.float32] + ([torch.bfloat16] if is_sm8x else []) +) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("inplace_backward", [False, True]) +# @pytest.mark.parametrize("inplace_backward", [False]) +@pytest.mark.parametrize("lse_square_scale", [0.0, 1e-2]) +@pytest.mark.parametrize("return_z_loss", [False, True]) +# @pytest.mark.parametrize("lse_square_scale", [1e-2]) +@pytest.mark.parametrize("logit_scale", [1.0, 0.7]) +# @pytest.mark.parametrize("logit_scale", [1.0]) +@pytest.mark.parametrize("smoothing", [0.0, 0.9]) +# @pytest.mark.parametrize("smoothing", [0.0]) +@pytest.mark.parametrize("vocab_size", [50257, 128 * 1024]) # test vocab larger than 64k for split +# @pytest.mark.parametrize("vocab_size", [12]) +def test_cross_entropy_loss( + vocab_size, smoothing, logit_scale, lse_square_scale, return_z_loss, inplace_backward, dtype +): + device = "cuda" + rtol, atol 
= (1e-5, 1e-6) if dtype == torch.float32 else (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 1 if dtype == torch.float32 else 4 # Otherwise OOM + seqlen = 4096 if lse_square_scale == 0.0 and logit_scale == 1.0 else 1024 # Otherwise OOM + x_pt = torch.randn( + batch_size * seqlen, vocab_size, device=device, dtype=dtype, requires_grad=True + ) + x = x_pt.detach().clone().requires_grad_() + y = torch.randint(0, vocab_size, (batch_size * seqlen,), dtype=torch.long, device=device) + if batch_size * seqlen > 10: + y[torch.randperm(batch_size * seqlen)[:10]] = -100 + model_pt = torch.nn.CrossEntropyLoss(label_smoothing=smoothing) + model = CrossEntropyLoss( + label_smoothing=smoothing, + logit_scale=logit_scale, + lse_square_scale=lse_square_scale, + return_z_loss=return_z_loss, + inplace_backward=inplace_backward, + ) + if return_z_loss: + out, out_z_loss = model(x, y) + else: + out = model(x, y) + x_pt_scaled = (x_pt.float() * logit_scale) if logit_scale != 1.0 else x_pt.float() + out_pt = model_pt(x_pt_scaled, y) + if lse_square_scale > 0.0: + lse_pt = torch.logsumexp(x_pt_scaled, dim=-1) + z_loss_pt = lse_square_scale * (lse_pt[y != -100] ** 2).mean() + if return_z_loss: + assert torch.allclose(out_z_loss, z_loss_pt, rtol=rtol, atol=atol) + out_pt += z_loss_pt + assert torch.allclose(out, out_pt, rtol=1e-5, atol=1e-6) + + g = torch.randn_like(out) + out_pt.backward(g) + out.backward(g) + assert torch.allclose(x.grad, x_pt.grad, rtol=rtol, atol=atol) diff --git a/test_cross_entropy_parallel.py b/test_cross_entropy_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..c8b97fc291d7004e106b2dbd0e115ab290c67f78 --- /dev/null +++ b/test_cross_entropy_parallel.py @@ -0,0 +1,88 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/losses/test_cross_entropy_parallel.py + +import math + +import pytest +import torch +from apex.transformer import parallel_state, tensor_parallel +from flash_attn.losses.cross_entropy import CrossEntropyLoss + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize( + "dtype", [torch.float16, torch.float32] + ([torch.bfloat16] if is_sm8x else []) +) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("inplace_backward", [False, True]) +# @pytest.mark.parametrize("inplace_backward", [False]) +@pytest.mark.parametrize("lse_square_scale", [0.0, 1e-2]) +# @pytest.mark.parametrize("lse_square_scale", [0.0]) +@pytest.mark.parametrize("logit_scale", [0.7]) +# @pytest.mark.parametrize("logit_scale", [1.0]) +@pytest.mark.parametrize("smoothing", [0.0, 0.9]) +# @pytest.mark.parametrize("smoothing", [0.0]) +@pytest.mark.parametrize("vocab_size", [50264, 256 * 1024]) # test vocab larger than 64k for split +# @pytest.mark.parametrize("vocab_size", [50264]) # test vocab larger than 64k for split +# @pytest.mark.parametrize("world_size", [1, 2]) +@pytest.mark.parametrize("world_size", [2]) +def test_cross_entropy_loss_parallel( + vocab_size, world_size, smoothing, logit_scale, lse_square_scale, inplace_backward, dtype +): + assert vocab_size % world_size == 0 + rtol, atol = ( + (1e-5, 2e-5) + if dtype == torch.float32 + else ((1e-3, 1e-4) if dtype == torch.float16 else (1e-2, 3e-3)) + ) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + partition_vocab_size = vocab_size // world_size + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= 
torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 128 + x_pt = ( + torch.randn(batch_size * seqlen, vocab_size, device=device, dtype=dtype) * 10 + ).requires_grad_() + x = ( + tensor_parallel.scatter_to_tensor_model_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + y = torch.randint(0, vocab_size, (batch_size * seqlen,), dtype=torch.long, device=device) + y[torch.randperm(batch_size * seqlen)[:10]] = -100 + model_pt = torch.nn.CrossEntropyLoss(label_smoothing=smoothing, reduction="none") + model = CrossEntropyLoss( + label_smoothing=smoothing, + logit_scale=logit_scale, + reduction="none", + lse_square_scale=lse_square_scale, + inplace_backward=inplace_backward, + process_group=parallel_state.get_tensor_model_parallel_group(), + ) + out = model(x, y) + out_pt = model_pt(x_pt.float() * logit_scale, y) + if lse_square_scale > 0.0: + lse_pt = torch.logsumexp(x_pt.float() * logit_scale, dim=-1) + out_pt += lse_square_scale * lse_pt.square() + out_pt.masked_fill_(y == -100, 0.0) + assert torch.allclose(out, out_pt, rtol=1e-5, atol=1e-6) + + g = torch.randn_like(out) + out_pt.backward(g) + out.backward(g) + assert torch.allclose( + x.grad, + x_pt.grad[:, (rank * partition_vocab_size) : (rank + 1) * partition_vocab_size], + rtol=rtol, + atol=atol, + ) + + parallel_state.destroy_model_parallel() diff --git a/test_dropout_layer_norm.py b/test_dropout_layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..ccc8c8ea0692c282301b7e55d7254d767df3900f --- /dev/null +++ b/test_dropout_layer_norm.py @@ -0,0 +1,1189 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn.ops.layer_norm import ( + DropoutAddLayerNorm, + dropout_add_layer_norm, + dropout_add_layer_norm_parallel_residual, + dropout_add_layer_norm_subset, +) +from flash_attn.ops.rms_norm import ( + DropoutAddRMSNorm, + dropout_add_rms_norm, + dropout_add_rms_norm_parallel_residual, + dropout_add_rms_norm_subset, +) + +try: + from apex.normalization import FusedRMSNorm + from apex.normalization.fused_layer_norm import fused_rms_norm_affine +except: + FusedRMSNorm, fused_rms_norm_affine = None, None + + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +@pytest.mark.parametrize("has_colscale", [True, False]) +# @pytest.mark.parametrize('has_colscale', [False]) +@pytest.mark.parametrize("has_rowscale", [True, False]) +# @pytest.mark.parametrize('has_rowscale', [True]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize('has_residual', [False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float16, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 
4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_residual, + has_rowscale, + has_colscale, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and FusedRMSNorm is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + layer_norm_cls = torch.nn.LayerNorm if not is_rms_norm else FusedRMSNorm + our_layer_norm_cls = DropoutAddLayerNorm if not is_rms_norm else DropoutAddRMSNorm + our_layer_norm_func = dropout_add_layer_norm if not is_rms_norm else dropout_add_rms_norm + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + if has_rowscale: + rowscale = torch.empty(batch_size, seqlen, device=device, dtype=input_dtype) + survival_rate = 0.87 + rowscale = rowscale.bernoulli_(survival_rate) / survival_rate + x0_scaled_pt = x0_pt * rearrange(rowscale, "... -> ... 1") + x0_scaled_ref = x0_ref * rearrange(rowscale, "... -> ... 
1") + else: + rowscale = None + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + if has_colscale: + x0_scaled_pt = x0_scaled_pt * colscale_pt + x0_scaled_ref = x0_scaled_ref * colscale_ref + model_pt = layer_norm_cls(hidden_size).to(device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + if not is_rms_norm: + torch.nn.init.normal_(model_pt.bias) + model_ref = layer_norm_cls(hidden_size).to(device=device, dtype=torch.float32) + model = our_layer_norm_cls(hidden_size, p=dropout_p, device=device, dtype=weight_dtype) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model_ref.weight.copy_(model_pt.weight) + if not is_rms_norm: + model.bias.copy_(model_pt.bias) + model_ref.bias.copy_(model_pt.bias) + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, dmask = our_layer_norm_func( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + rowscale=rowscale, + layerscale=colscale, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + assert out.dtype == input_dtype + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype) + out_ref = model_ref(residual_ref) + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + out_pt.backward(g) + out.backward(g) + out_ref.backward(g) + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 3 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 3e-5 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +@pytest.mark.parametrize("hidden_size", [768, 1024, 1280, 1536, 1600, 2048, 2560, 3072, 4096, 5120]) +def test_dropout_layer_norm_eval(hidden_size, input_dtype, residual_dtype, weight_dtype): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + dropout_p = 0.37 + # set seed + torch.random.manual_seed(0) + batch_size = 32 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = 
x0_pt.detach().clone().float().requires_grad_() + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model = DropoutAddLayerNorm(hidden_size, p=dropout_p, device=device, dtype=weight_dtype) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + model_pt.eval() + model.eval() + model_ref.eval() + out = model(x0, res) + residual_pt = (x0_pt.float() + res_pt.float()).to(dtype=residual_dtype) + residual_ref = x0_ref + res_ref + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(input_dtype) + out_ref = model_ref(residual_ref) + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +@pytest.mark.parametrize("has_colscale", [True, False]) +@pytest.mark.parametrize("has_rowscale", [True, False]) +@pytest.mark.parametrize("has_residual", [True, False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('has_colscale', [True]) +# @pytest.mark.parametrize('has_rowscale', [False]) +# @pytest.mark.parametrize('has_residual', [True]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float32, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_prenorm_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_residual, + has_rowscale, + has_colscale, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and FusedRMSNorm is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + layer_norm_cls = torch.nn.LayerNorm if not is_rms_norm else FusedRMSNorm + our_layer_norm_cls = DropoutAddLayerNorm if not is_rms_norm else DropoutAddRMSNorm + our_layer_norm_func = dropout_add_layer_norm if not is_rms_norm else dropout_add_rms_norm + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 2e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = 
colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + if has_rowscale: + rowscale = torch.empty(batch_size, seqlen, device=device, dtype=input_dtype) + survival_rate = 0.87 + rowscale = rowscale.bernoulli_(survival_rate) / survival_rate + x0_scaled_pt = x0_pt * rearrange(rowscale, "... -> ... 1") + x0_scaled_ref = x0_ref * rearrange(rowscale, "... -> ... 1") + else: + rowscale = None + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + if has_colscale: + x0_scaled_pt = x0_scaled_pt * colscale_pt + x0_scaled_ref = x0_scaled_ref * colscale_ref + model_pt = layer_norm_cls(hidden_size).to(device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + if not is_rms_norm: + torch.nn.init.normal_(model_pt.bias) + model_ref = layer_norm_cls(hidden_size).to(device=device, dtype=torch.float32) + model = our_layer_norm_cls( + hidden_size, prenorm=True, p=dropout_p, device=device, dtype=weight_dtype + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model_ref.weight.copy_(model_pt.weight) + if not is_rms_norm: + model.bias.copy_(model_pt.bias) + model_ref.bias.copy_(model_pt.bias) + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, residual, dmask = our_layer_norm_func( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + rowscale=rowscale, + layerscale=colscale, + prenorm=True, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_scaled_ref * dmask.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype) + out_ref = model_ref(residual_ref) + assert out.dtype == input_dtype + assert residual.dtype == residual_dtype + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + (out_pt * F.sigmoid(residual_pt)).backward(g) + (out * F.sigmoid(residual)).backward(g) + (out_ref * F.sigmoid(residual_ref.to(dtype=residual_dtype))).backward(g) + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 2 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 2e-4 + if not is_rms_norm: + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 2e-4 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + 
[(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +@pytest.mark.parametrize("hidden_size", [768, 1024, 1280, 1536, 1600, 2048, 2560, 3072, 4096, 5120]) +def test_dropout_layer_norm_prenorm_eval(hidden_size, input_dtype, residual_dtype, weight_dtype): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + dropout_p = 0.37 + # set seed + torch.random.manual_seed(0) + batch_size = 32 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model = DropoutAddLayerNorm( + hidden_size, prenorm=True, p=dropout_p, device=device, dtype=weight_dtype + ) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + model_pt.eval() + model.eval() + model_ref.eval() + out, residual = model(x0, res) + residual_pt = (x0_pt.float() + res_pt.float()).to(dtype=residual_dtype) + residual_ref = x0_ref + res_ref + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(input_dtype) + out_ref = model_ref(residual_ref) + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + +@pytest.mark.parametrize("has_colscale", [True, False]) +@pytest.mark.parametrize("has_residual", [True, False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('has_colscale', [True]) +# @pytest.mark.parametrize('has_residual', [True]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float32, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_subset_training( + hidden_size, input_dtype, residual_dtype, weight_dtype, dropout_p, has_residual, has_colscale +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 2e-4) + # set seed + torch.random.manual_seed(0) + 
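+    # Setup overview for the subset (drop-path) variant built below: whole batch elements are
+    # dropped with probability drop_path_rate, x0_subset / out_subset hold 1-based indices of
+    # the surviving rows (0 marks dropped rows), and rowscale_const rescales the kept rows by
+    # 1 / (1 - drop_path_rate).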
batch_size = 8 + seqlen = 512 + drop_path_rate = 0.4 + drop_path_scale = 1 / (1 - drop_path_rate) + + def generate_droppath_masks(batch_size, seqlen, drop_path_rate, device): + # Do it on CPU so we can get the numrows (with .item()) without GPU-CPU sync + mask_batch = torch.rand(batch_size) < 1 - drop_path_rate + numrows = (mask_batch).sum().item() * seqlen + mask_batch = mask_batch.to(device=device, non_blocking=True) + mask_batch_seqlen = repeat(mask_batch, "b -> (b s)", s=seqlen) + subset = torch.cumsum(mask_batch_seqlen, dim=0, dtype=torch.int32).masked_fill_( + ~mask_batch_seqlen, 0 + ) + return mask_batch, numrows, rearrange(subset, "(b s) -> b s", b=batch_size) + + x0_mask_batch, x0_numrows, x0_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + out_mask_batch, out_numrows, out_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone()[x0_mask_batch].requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0_pt, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + + if has_colscale: + x0_scaled_pt = x0_pt * colscale_pt + x0_scaled_ref = x0_ref * colscale_ref + else: + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + model = DropoutAddLayerNorm( + hidden_size, prenorm=False, p=dropout_p, device=device, dtype=weight_dtype + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, dmask = dropout_add_layer_norm_subset( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + layerscale=colscale, + x0_subset=x0_subset, + out_subset=out_subset, + rowscale_const=drop_path_scale, + out_numrows=out_numrows, + prenorm=False, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + + x0_scaled_pt = ( + x0_scaled_pt.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + x0_scaled_ref = ( + x0_scaled_ref.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + dmask_expanded = torch.zeros_like(x0_pt, dtype=torch.uint8) + dmask_expanded[x0_mask_batch] = dmask + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + 
residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype)[out_mask_batch] + out_ref = model_ref(residual_ref)[out_mask_batch] + assert out.dtype == input_dtype + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + out_pt.backward(g) + out.backward(g) + out_ref.backward(g) + assert (x0.grad - x0_ref.grad[x0_mask_batch]).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad)[ + x0_mask_batch + ].abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 2 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 2e-4 + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 2e-4 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("has_colscale", [True, False]) +@pytest.mark.parametrize("has_residual", [True, False]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('has_colscale', [True]) +# @pytest.mark.parametrize('has_residual', [True]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +# @pytest.mark.parametrize('weight_dtype', [torch.float32]) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float32, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_subset_prenorm_training( + hidden_size, input_dtype, residual_dtype, weight_dtype, dropout_p, has_residual, has_colscale +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 2e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + drop_path_rate = 0.4 + drop_path_scale = 1 / (1 - drop_path_rate) + + def generate_droppath_masks(batch_size, seqlen, drop_path_rate, device): + # Do it on CPU so we can get the numrows (with .item()) without GPU-CPU sync + mask_batch = torch.rand(batch_size) < 1 - drop_path_rate + numrows = (mask_batch).sum().item() * seqlen + mask_batch = mask_batch.to(device=device, non_blocking=True) + mask_batch_seqlen = repeat(mask_batch, "b -> (b s)", s=seqlen) + subset = torch.cumsum(mask_batch_seqlen, dim=0, dtype=torch.int32).masked_fill_( + ~mask_batch_seqlen, 0 + ) + return mask_batch, numrows, rearrange(subset, "(b s) -> b s", b=batch_size) + + x0_mask_batch, x0_numrows, x0_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + out_mask_batch, out_numrows, out_subset = generate_droppath_masks( + batch_size, seqlen, drop_path_rate, device + ) + + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, 
requires_grad=True + ) + x0 = x0_pt.detach().clone()[x0_mask_batch].requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_colscale: + colscale = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + colscale_pt = colscale.detach().clone().requires_grad_() + colscale_ref = colscale.detach().clone().float().requires_grad_() + else: + colscale = None + if has_residual: + res_pt = torch.randn_like(x0_pt, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + + if has_colscale: + x0_scaled_pt = x0_pt * colscale_pt + x0_scaled_ref = x0_ref * colscale_ref + else: + x0_scaled_pt = x0_pt + x0_scaled_ref = x0_ref + + model_pt = torch.nn.LayerNorm(hidden_size, device=device, dtype=weight_dtype) + torch.nn.init.normal_(model_pt.weight) + torch.nn.init.normal_(model_pt.bias) + model_ref = torch.nn.LayerNorm(hidden_size, device=device, dtype=torch.float32) + model = DropoutAddLayerNorm( + hidden_size, prenorm=True, p=dropout_p, device=device, dtype=weight_dtype + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + model.bias.copy_(model_pt.bias) + model_ref.weight.copy_(model_pt.weight) + model_ref.bias.copy_(model_pt.bias) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, residual, dmask = dropout_add_layer_norm_subset( + x0, + res, + model.weight, + model.bias, + model.p, + model.eps, + layerscale=colscale, + x0_subset=x0_subset, + out_subset=out_subset, + rowscale_const=drop_path_scale, + out_numrows=out_numrows, + prenorm=True, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + print(f"Actual dropout fraction: {1 - dmask.float().mean().item()}") + + x0_scaled_pt = ( + x0_scaled_pt.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + x0_scaled_ref = ( + x0_scaled_ref.masked_fill(repeat(~x0_mask_batch, "b -> b s d", s=seqlen, d=hidden_size), 0) + * drop_path_scale + ) + dmask_expanded = torch.zeros_like(x0_pt, dtype=torch.uint8) + dmask_expanded[x0_mask_batch] = dmask + if has_residual: + residual_pt = ( + (x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p) + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + res_ref + else: + residual_pt = ((x0_scaled_pt.float() * dmask_expanded.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_scaled_ref * dmask_expanded.float()) / (1 - dropout_p) + out_pt = model_pt(residual_pt.to(dtype=weight_dtype)).to(dtype=input_dtype)[out_mask_batch] + out_ref = model_ref(residual_ref)[out_mask_batch] + assert out.dtype == input_dtype + assert residual.dtype == residual_dtype + assert (out - out_ref).abs().max() <= 4 * (out_pt - out_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + g = torch.randn_like(out) / batch_size + (out_pt * F.sigmoid(residual_pt[out_mask_batch]) + residual_pt.mean(0, keepdim=True)).backward( + g + ) + (out * F.sigmoid(residual[out_mask_batch]) + residual.mean(0, keepdim=True)).backward(g) + ( + out_ref * F.sigmoid(residual_ref[out_mask_batch].to(dtype=residual_dtype)) + + residual_ref.mean(0, keepdim=True) + ).backward(g) + assert (x0.grad - x0_ref.grad[x0_mask_batch]).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad)[ + x0_mask_batch + ].abs().max() + 1e-4 + if 
has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (model.weight.grad - model_ref.weight.grad).abs().max() <= 2 * ( + model_pt.weight.grad - model_ref.weight.grad + ).abs().max() + 2e-4 + assert (model.bias.grad - model_ref.bias.grad).abs().max() <= 2 * ( + model_pt.bias.grad - model_ref.bias.grad + ).abs().max() + 2e-4 + if has_colscale: + assert (colscale.grad - colscale_ref.grad).abs().max() <= 2 * ( + colscale_pt.grad - colscale_ref.grad + ).abs().max() + 2e-4 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize('is_rms_norm', [False]) +@pytest.mark.parametrize("tied_norm", [False, True]) +# @pytest.mark.parametrize('tied_norm', [False]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize('has_residual', [False]) +@pytest.mark.parametrize("has_x1", [True, False]) +# @pytest.mark.parametrize('has_x1', [True]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +# @pytest.mark.parametrize('weight_dtype', [torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float16, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_parallel_residual_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_x1, + has_residual, + tied_norm, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and fused_rms_norm_affine is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + our_layer_norm_func = ( + dropout_add_layer_norm_parallel_residual + if not is_rms_norm + else dropout_add_rms_norm_parallel_residual + ) + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_x1: + x1_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x1 = x1_pt.detach().clone().requires_grad_() + x1_ref = x1_pt.detach().clone().float().requires_grad_() + else: + x1 = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + weight0 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias0 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight0_pt = weight0.detach().clone().requires_grad_() + weight0_ref = weight0.detach().clone().float().requires_grad_() + bias0_pt = 
bias0.detach().clone().requires_grad_() if bias0 is not None else None + bias0_ref = bias0.detach().clone().float().requires_grad_() if bias0 is not None else None + if not tied_norm: + weight1 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias1 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight1_pt = weight1.detach().clone().requires_grad_() + weight1_ref = weight1.detach().clone().float().requires_grad_() + bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None + bias1_ref = bias1.detach().clone().float().requires_grad_() if bias1 is not None else None + else: + weight1, bias1 = None, None + epsilon = 1e-5 + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + + out0, out1, dmask0, dmask1 = our_layer_norm_func( + x0, + x1, + res, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + assert out0.dtype == input_dtype + if not tied_norm: + assert out1.dtype == input_dtype + print(f"Actual dropout fraction: {1 - dmask0.float().mean().item()}") + if has_residual: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = ( + (x0_ref * dmask0.float()) / (1 - dropout_p) + + (x1_ref * dmask1.float()) / (1 - dropout_p) + ) + res_ref + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p) + res_pt.float()).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + res_ref + else: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + ).to(dtype=residual_dtype) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + ( + x1_ref * dmask1.float() + ) / (1 - dropout_p) + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + if not is_rms_norm: + out0_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), (hidden_size,), weight0_pt, bias0_pt, eps=epsilon + ).to(dtype=input_dtype) + out0_ref = F.layer_norm(residual_ref, (hidden_size,), weight0_ref, bias0_ref, eps=epsilon) + if not tied_norm: + out1_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), + (hidden_size,), + weight1_pt, + bias1_pt, + eps=epsilon, + ).to(dtype=input_dtype) + out1_ref = F.layer_norm( + residual_ref, (hidden_size,), weight1_ref, bias1_ref, eps=epsilon + ) + else: + out0_pt = fused_rms_norm_affine( + residual_pt.to(dtype=weight_dtype), weight0_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out0_ref = fused_rms_norm_affine(residual_ref, weight0_ref, (hidden_size,), eps=epsilon) + if not tied_norm: + out1_pt = fused_rms_norm_affine( + residual_pt.to(dtype=weight_dtype), weight1_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out1_ref = fused_rms_norm_affine(residual_ref, weight1_ref, (hidden_size,), eps=epsilon) + + assert (out0 - out0_ref).abs().max() <= 4 * (out0_pt - out0_ref).abs().max() + 1e-4 + if not tied_norm: + assert (out1 - out1_ref).abs().max() <= 4 * (out1_pt - out1_ref).abs().max() + 1e-4 + + g0 = torch.randn_like(out0) / batch_size + if tied_norm: + out0.backward(g0) + out0_pt.backward(g0) + out0_ref.backward(g0) + else: + g1 = 
torch.randn_like(out1) / batch_size + (out0 * g0 + out1 * g1).sum().backward() + (out0_pt * g0 + out1_pt * g1).sum().backward() + (out0_ref * g0 + out1_ref * g1).sum().backward() + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_x1: + assert (x1.grad - x1_ref.grad).abs().max() <= 4 * ( + x1_pt.grad - x1_ref.grad + ).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (weight0.grad - weight0_ref.grad).abs().max() <= 3 * ( + weight0_pt.grad - weight0_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias0.grad - bias0_ref.grad).abs().max() <= 2 * ( + bias0_pt.grad - bias0_ref.grad + ).abs().max() + 3e-5 + if not tied_norm: + assert (weight1.grad - weight1_ref.grad).abs().max() <= 3 * ( + weight1_pt.grad - weight1_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias1.grad - bias1_ref.grad).abs().max() <= 2 * ( + bias1_pt.grad - bias1_ref.grad + ).abs().max() + 3e-5 + + +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize('is_rms_norm', [False]) +@pytest.mark.parametrize("tied_norm", [False, True]) +# @pytest.mark.parametrize('tied_norm', [False]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize('has_residual', [False]) +@pytest.mark.parametrize("has_x1", [True, False]) +# @pytest.mark.parametrize('has_x1', [True]) +@pytest.mark.parametrize("dropout_p", [0.37, 0.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +@pytest.mark.parametrize("weight_dtype", [torch.float32, torch.float16]) +# @pytest.mark.parametrize('weight_dtype', [torch.float16]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize('input_dtype,residual_dtype', [(torch.float16, torch.float32)]) +@pytest.mark.parametrize( + "hidden_size", + [192, 256, 384, 768, 1024, 1280, 1536, 1600, 2048, 2560, 3000, 3072, 4096, 5120, 6144], +) +# @pytest.mark.parametrize('hidden_size', [256]) +def test_dropout_layer_norm_parallel_residual_prenorm_training( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + dropout_p, + has_x1, + has_residual, + tied_norm, + is_rms_norm, +): + if weight_dtype == torch.float16 and input_dtype == torch.bfloat16: + pytest.skip() # Not supported + if is_rms_norm and fused_rms_norm_affine is None: + pytest.skip() # We need Apex's FusedRMSNorm to test + our_layer_norm_func = ( + dropout_add_layer_norm_parallel_residual + if not is_rms_norm + else dropout_add_rms_norm_parallel_residual + ) + device = "cuda" + # rtol, atol = (1e-5, 1e-6) if input_dtype == torch.float32 else (1e-3, 1e-4) + rtol, atol = (1e-3, 1e-4) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0 = x0_pt.detach().clone().requires_grad_() + x0_ref = x0_pt.detach().clone().float().requires_grad_() + if has_x1: + x1_pt = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x1 = x1_pt.detach().clone().requires_grad_() + x1_ref = x1_pt.detach().clone().float().requires_grad_() + else: + x1 = None + if has_residual: + res_pt = torch.randn_like(x0, dtype=residual_dtype, 
requires_grad=True) + res = res_pt.detach().clone().requires_grad_() + res_ref = res_pt.detach().clone().float().requires_grad_() + else: + res = None + weight0 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias0 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight0_pt = weight0.detach().clone().requires_grad_() + weight0_ref = weight0.detach().clone().float().requires_grad_() + bias0_pt = bias0.detach().clone().requires_grad_() if bias0 is not None else None + bias0_ref = bias0.detach().clone().float().requires_grad_() if bias0 is not None else None + if not tied_norm: + weight1 = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + bias1 = ( + torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm + else None + ) + weight1_pt = weight1.detach().clone().requires_grad_() + weight1_ref = weight1.detach().clone().float().requires_grad_() + bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None + bias1_ref = bias1.detach().clone().float().requires_grad_() if bias1 is not None else None + else: + weight1, bias1 = None, None + epsilon = 1e-5 + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + + out0, out1, residual, dmask0, dmask1 = our_layer_norm_func( + x0, + x1, + res, + weight0, + bias0, + weight1, + bias1, + dropout_p, + epsilon, + prenorm=True, + residual_in_fp32=residual_in_fp32, + return_dropout_mask=True, + ) + assert out0.dtype == input_dtype + if not tied_norm: + assert out1.dtype == input_dtype + print(f"Actual dropout fraction: {1 - dmask0.float().mean().item()}") + if has_residual: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + + res_pt.float() + ).to(dtype=residual_dtype) + residual_ref = ( + (x0_ref * dmask0.float()) / (1 - dropout_p) + + (x1_ref * dmask1.float()) / (1 - dropout_p) + ) + res_ref + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p) + res_pt.float()).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + res_ref + else: + if has_x1: + residual_pt = ( + (x0_pt.float() * dmask0.float()) / (1 - dropout_p) + + (x1_pt.float() * dmask1.float()) / (1 - dropout_p) + ).to(dtype=residual_dtype) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + ( + x1_ref * dmask1.float() + ) / (1 - dropout_p) + else: + residual_pt = ((x0_pt.float() * dmask0.float()) / (1 - dropout_p)).to( + dtype=residual_dtype + ) + residual_ref = (x0_ref * dmask0.float()) / (1 - dropout_p) + if not is_rms_norm: + out0_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), (hidden_size,), weight0_pt, bias0_pt, eps=epsilon + ).to(dtype=input_dtype) + out0_ref = F.layer_norm(residual_ref, (hidden_size,), weight0_ref, bias0_ref, eps=epsilon) + if not tied_norm: + out1_pt = F.layer_norm( + residual_pt.to(dtype=weight_dtype), + (hidden_size,), + weight1_pt, + bias1_pt, + eps=epsilon, + ).to(dtype=input_dtype) + out1_ref = F.layer_norm( + residual_ref, (hidden_size,), weight1_ref, bias1_ref, eps=epsilon + ) + else: + out0_pt = fused_rms_norm_affine( + residual_pt.to(dtype=weight_dtype), weight0_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out0_ref = fused_rms_norm_affine(residual_ref, weight0_ref, (hidden_size,), eps=epsilon) + if not tied_norm: + out1_pt = fused_rms_norm_affine( + 
residual_pt.to(dtype=weight_dtype), weight1_pt, (hidden_size,), eps=epsilon + ).to(dtype=input_dtype) + out1_ref = fused_rms_norm_affine(residual_ref, weight1_ref, (hidden_size,), eps=epsilon) + + assert (out0 - out0_ref).abs().max() <= 4 * (out0_pt - out0_ref).abs().max() + 1e-4 + if not tied_norm: + assert (out1 - out1_ref).abs().max() <= 4 * (out1_pt - out1_ref).abs().max() + 1e-4 + assert (residual - residual_ref).abs().max() <= 4 * ( + residual_pt - residual_ref + ).abs().max() + 1e-4 + + g0 = torch.randn_like(out0) / batch_size + if tied_norm: + (out0 * F.sigmoid(residual)).backward(g0) + (out0_pt * F.sigmoid(residual_pt)).backward(g0) + (out0_ref * F.sigmoid(residual_ref)).backward(g0) + else: + g1 = torch.randn_like(out1) / batch_size + (out0 * F.sigmoid(residual) * g0 + out1 * g1).sum().backward() + (out0_pt * F.sigmoid(residual_pt) * g0 + out1_pt * g1).sum().backward() + (out0_ref * F.sigmoid(residual_ref) * g0 + out1_ref * g1).sum().backward() + assert (x0.grad - x0_ref.grad).abs().max() <= 4 * (x0_pt.grad - x0_ref.grad).abs().max() + 1e-4 + if has_x1: + assert (x1.grad - x1_ref.grad).abs().max() <= 4 * ( + x1_pt.grad - x1_ref.grad + ).abs().max() + 1e-4 + if has_residual: + assert (res.grad - res_ref.grad).abs().max() <= 4 * ( + res_pt.grad - res_ref.grad + ).abs().max() + 1e-4 + assert (weight0.grad - weight0_ref.grad).abs().max() <= 3 * ( + weight0_pt.grad - weight0_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias0.grad - bias0_ref.grad).abs().max() <= 2 * ( + bias0_pt.grad - bias0_ref.grad + ).abs().max() + 3e-5 + if not tied_norm: + assert (weight1.grad - weight1_ref.grad).abs().max() <= 3 * ( + weight1_pt.grad - weight1_ref.grad + ).abs().max() + 3e-5 + if not is_rms_norm: + assert (bias1.grad - bias1_ref.grad).abs().max() <= 2 * ( + bias1_pt.grad - bias1_ref.grad + ).abs().max() + 3e-5 + + +def test_dropout_layer_norm_randomness(): + hidden_size = 256 + dtype = torch.float32 + dropout_p = 0.1 + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x0 = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=dtype, requires_grad=True + ) + res = torch.randn_like(x0, dtype=dtype, requires_grad=True) + model = DropoutAddLayerNorm(hidden_size, p=dropout_p, device=device, dtype=dtype) + torch.random.manual_seed(42) + _, dmask0 = dropout_add_layer_norm( + x0, res, model.weight, model.bias, model.p, model.eps, return_dropout_mask=True + ) + # Subsequent call should have a different dropout mask + _, dmask1 = dropout_add_layer_norm( + x0, res, model.weight, model.bias, model.p, model.eps, return_dropout_mask=True + ) + torch.random.manual_seed(42) + # Resetting the seed, should get the same dropout mask + _, dmask2 = dropout_add_layer_norm( + x0, res, model.weight, model.bias, model.p, model.eps, return_dropout_mask=True + ) + assert not torch.equal(dmask0, dmask1) + assert torch.equal(dmask0, dmask2) diff --git a/test_embedding_parallel.py b/test_embedding_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..c99293620bd6633ecced4b35611b441c5ee0aacc --- /dev/null +++ b/test_embedding_parallel.py @@ -0,0 +1,106 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_embedding_parallel.py + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from apex.transformer import parallel_state +from einops import rearrange +from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings + +is_sm8x = 
torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_pos_emb", [True, False]) +# @pytest.mark.parametrize('has_pos_emb', [True]) +@pytest.mark.parametrize("dim", [1024]) +def test_embedding_parallel(dim, has_pos_emb, sequence_parallel, world_size, dtype): + vocab_size = 50264 + seqlen = 2048 + assert vocab_size % world_size == 0 + assert dim % world_size == 0 + rtol, atol = (3e-3, 5e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + input_ids_pt = torch.randint(0, vocab_size, (batch_size, seqlen), device=device) + input_ids = input_ids_pt.detach().clone() + + model_pt = GPT2Embeddings( + dim, vocab_size, seqlen if has_pos_emb else 0, device=device, dtype=dtype + ) + model = ParallelGPT2Embeddings( + dim, + vocab_size, + seqlen if has_pos_emb else 0, + parallel_state.get_tensor_model_parallel_group(), + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + partition_vocab_size = vocab_size // world_size + partition_dim = dim // world_size + with torch.no_grad(): + model.word_embeddings.weight.copy_( + model_pt.word_embeddings.weight[ + rank * partition_vocab_size : (rank + 1) * partition_vocab_size + ] + ) + if has_pos_emb: + model.position_embeddings.weight.copy_( + model_pt.position_embeddings.weight[ + :, rank * partition_dim : (rank + 1) * partition_dim + ] + ) + + out = model(input_ids, combine_batch_seqlen_dim=True) + out_pt = rearrange(model_pt(input_ids), "b s d -> (b s) d") + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + g = torch.randn_like(out_pt) + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + model.word_embeddings.weight.grad, + model_pt.word_embeddings.weight.grad[ + rank * partition_vocab_size : (rank + 1) * partition_vocab_size + ], + rtol=rtol, + atol=atol, + ) + if has_pos_emb: + assert torch.allclose( + model.position_embeddings.weight.grad, + model_pt.position_embeddings.weight.grad[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + rtol=rtol, + atol=atol, + ) diff --git a/test_falcon.py b/test_falcon.py new file mode 100644 index 0000000000000000000000000000000000000000..582f907b4b1ba5765669ab4f18abd3b061e27df8 --- /dev/null +++ b/test_falcon.py @@ -0,0 +1,408 @@ +# Copyright (c) 2023, Tri Dao. 
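+# This file exercises the Falcon ports end to end: remapping the HF state dict,
+# the optimized forward pass checked against the HF implementation (fp16 output
+# compared through an fp32 reference), tensor-parallel forward, and generation
+# with and without CUDA graphs.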
+ +import os +import time +from pathlib import Path + +current_dir = Path(__file__).parent.absolute() + +import pytest +import torch +from einops import rearrange +from flash_attn.models.falcon import falcon_config_to_gpt2_config, remap_state_dict_hf_falcon +from flash_attn.models.gpt import GPTLMHeadModel, combine_state_dicts_tp, shard_state_dict_tp +from flash_attn.utils.distributed import all_gather_raw +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + + +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-7b", "tiiuae/falcon-40b"]) +def test_falcon_state_dict(model_name): + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + pretrained_state_dict = remap_state_dict_hf_falcon( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-7b"]) +def test_falcon_optimized(model_name): + """Check that our implementation (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map={"": device}, trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: 
{(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# torchrun --no_python --nproc_per_node=4 pytest -q -s tests/models/test_falcon.py -k "falcon_parallel_forward" +# We want to run this on a machine with 4 x A100 80GB or 8 x A100 40GB so we have enough +# memory to run the model in fp32. +@pytest.mark.parametrize("world_size", [4]) +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-40b"]) +def test_falcon_parallel_forward(model_name, world_size): + from apex.transformer import parallel_state + + dtype = torch.float16 + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = False + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = False + config.residual_in_fp32 = True + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + pretrained_state_dict = remap_state_dict_hf_falcon( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... d -> b ... 
(n d)", b=batch_size) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-7b"]) +def test_falcon_generation(model_name): + """Check that our implementation (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device}, trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map={"": device}, trust_remote_code=True + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert torch.equal(logits_cg, logits) + + +# torchrun --no_python --nproc_per_node=4 pytest -q -s tests/models/test_falcon.py -k "falcon_parallel_generation" +# We want to run this on a machine with 4 x A100 80GB or 8 x A100 40GB so we have enough +# memory to run the model in fp32. 
+@pytest.mark.parametrize("world_size", [4]) +@pytest.mark.parametrize("model_name", ["tiiuae/falcon-40b"]) +def test_falcon_parallel_generation(model_name, world_size): + """Check that our implementation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. + """ + from apex.transformer import parallel_state + + dtype = torch.float16 + config = falcon_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + config.use_flash_attn = False + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused MLP for "gelu" activation + config.fused_dropout_add_ln = False + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 * world_size + config.sequence_parallel = False # Need to set this to False for generation + + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + # Need this, otherwise when we capture the graph the process for GPU 1 would run on both + # GPU0 and GPU1 and things would hang + torch.cuda.set_device(device) + + pretrained_state_dict = remap_state_dict_hf_falcon( + state_dict_from_pretrained(model_name), config + ) + + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + print("Without CUDA graph") + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map="auto", trust_remote_code=True + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + with torch.inference_mode(): + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", trust_remote_code=True + ) + model_ref.eval() + with torch.inference_mode(): + logits_ref = 
model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + logits_hf = torch.stack(out_hf.scores, dim=1) + + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + hf_error = (logits_hf - logits_ref).abs().max().item() + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert torch.equal(logits_cg, logits) diff --git a/test_flash_attn.py b/test_flash_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..72d55134e58cc2f90f2922244ac99a0c19fa32b6 --- /dev/null +++ b/test_flash_attn.py @@ -0,0 +1,2525 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn import ( + flash_attn_func, + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, + flash_attn_with_kvcache, +) +from flash_attn.bert_padding import pad_input, unpad_input +from flash_attn.flash_attn_interface import _get_block_size_n +from flash_attn.layers.rotary import apply_rotary_emb + +MAX_HEADDIM_SM8x = 192 + + +is_sm75 = torch.cuda.get_device_capability("cuda") == (7, 5) +is_sm8x = torch.cuda.get_device_capability("cuda")[0] == 8 +is_sm80 = torch.cuda.get_device_capability("cuda") == (8, 0) +is_sm90 = torch.cuda.get_device_capability("cuda") == (9, 0) + + +def attn_bias_from_alibi_slopes( + slopes, seqlen_q, seqlen_k, query_padding_mask=None, key_padding_mask=None, causal=False, key_leftpad=None +): + batch, nheads = slopes.shape + device = slopes.device + slopes = rearrange(slopes, "b h -> b h 1 1") + if causal: + return torch.arange(-seqlen_k + 1, 1, device=device, dtype=torch.float32) * slopes + else: + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + relative_pos = torch.abs(row_idx + sk - sq - col_idx) + return -slopes * relative_pos.to(dtype=slopes.dtype) + + +def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): + assert mode in ["full", "random", "third"] + if mode == "full": + lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) + elif mode == "random": + lengths = torch.randint( + max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device + ) + elif mode == "third": + lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) + padding_mask = ( + repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths + ) + return padding_mask + + +def generate_qkv( + q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, d) + k: (batch_size, seqlen_k, nheads_k, d) + v: (batch_size, seqlen_k, 
nheads_k, d) + query_padding_mask: (batch_size, seqlen), bool + key_padding_mask: (batch_size, seqlen), bool + """ + assert not (kvpacked and qkvpacked) + batch_size, seqlen_q, nheads, d = q.shape + _, seqlen_k, nheads_k, _ = k.shape + assert k.shape == (batch_size, seqlen_k, nheads_k, d) + assert v.shape == (batch_size, seqlen_k, nheads_k, d) + + if query_padding_mask is not None: + q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) + output_pad_fn = lambda output_unpad: pad_input( + output_unpad, indices_q, batch_size, seqlen_q + ) + else: + q_unpad = rearrange(q, "b s h d -> (b s) h d") + cu_seqlens_q = torch.arange( + 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device + ) + max_seqlen_q = seqlen_q + output_pad_fn = lambda output_unpad: rearrange( + output_unpad, "(b s) h d -> b s h d", b=batch_size + ) + + if key_padding_mask is not None: + k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) + v_unpad, _, _, _ = unpad_input(v, key_padding_mask) + else: + k_unpad = rearrange(k, "b s h d -> (b s) h d") + v_unpad = rearrange(v, "b s h d -> (b s) h d") + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device + ) + max_seqlen_k = seqlen_k + + if qkvpacked: + assert (query_padding_mask == key_padding_mask).all() + assert nheads == nheads_k + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + qkv = torch.stack([q, k, v], dim=2) + if query_padding_mask is not None: + dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) + else: + dqkv_pad_fn = lambda dqkv_unpad: rearrange( + dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + qkv_unpad.detach().requires_grad_(), + cu_seqlens_q, + max_seqlen_q, + qkv.detach().requires_grad_(), + output_pad_fn, + dqkv_pad_fn, + ) + elif kvpacked: + kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) + kv = torch.stack([k, v], dim=2) + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) + else: + dkv_pad_fn = lambda dkv_unpad: rearrange( + dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + q_unpad.detach().requires_grad_(), + kv_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + kv.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) + else: + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k) + else: + dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size) + return ( + q_unpad.detach().requires_grad_(), + k_unpad.detach().requires_grad_(), + v_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + k.detach().requires_grad_(), + v.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) + + +def construct_local_mask( + seqlen_q, + seqlen_k, + window_size=(-1, -1), # -1 means infinite window size + query_padding_mask=None, + key_padding_mask=None, + device=None, + key_leftpad=None, +): + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 
1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + if window_size[0] < 0: + return col_idx > row_idx + sk - sq + window_size[1] + else: + sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk + return torch.logical_or( + col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk), + col_idx < row_idx + sk - sq - window_size[0], + ) + + +def attention_ref( + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k: (batch_size, seqlen_k, nheads_k, head_dim) + v: (batch_size, seqlen_k, nheads_k, head_dim) + query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) + causal: whether to apply causal masking + window_size: (int, int), left and right window size + upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast + output back to fp16/bf16. + reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) + without changing the math. This is to estimate the numerical error from operation + reordering. 
+ Output: + output: (batch_size, seqlen_q, nheads, head_dim) + attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout + """ + if causal: + window_size = (window_size[0], 0) + dtype_og = q.dtype + if upcast: + q, k, v = q.float(), k.float(), v.float() + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) + v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) + d = q.shape[-1] + if not reorder_ops: + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + else: + scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d)) + if softcap > 0: + scores = scores / softcap + scores = scores.tanh() + scores = scores * softcap + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + key_leftpad=key_leftpad, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias + attention = torch.softmax(scores, dim=-1).to(v.dtype) + # Some rows might be completely masked out so we fill them with zero instead of NaN + if window_size[0] >= 0 or window_size[1] >= 0: + attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0) + # We want to mask here so that the attention matrix doesn't have any NaNs + # Otherwise we'll get NaN in dV + if query_padding_mask is not None: + attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + dropout_scaling = 1.0 / (1 - dropout_p) + # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling + # output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + if dropout_mask is not None: + attention_drop = attention.masked_fill(~dropout_mask, 0.0) + else: + attention_drop = attention + output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling) + if query_padding_mask is not None: + output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) + return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) + + +def attention_kvpacked_ref( + q, + kv, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + return attention_ref( + q, + kv[:, :, 0], + kv[:, :, 1], + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + key_leftpad=key_leftpad, + ) + + +def attention_qkvpacked_ref( + qkv, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, +): + return attention_ref( + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + key_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + upcast=upcast, + causal=causal, + window_size=window_size, + softcap=softcap, + reorder_ops=reorder_ops, + ) + + +def generate_sparsity_mask(seqlen, sparsity=0.3): + repeats = seqlen // 16 // 2 + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([0, 1] * 
repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda'), + # torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 1] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + # mask = torch.stack([torch.tensor([1, 0] * repeats, dtype=torch.bool, device='cuda')], dim=-1) + nrow, ncol = seqlen // 16, seqlen // 256 + mask = torch.rand(nrow, ncol, device="cuda") < sparsity + return mask + + +def attention_blocksparse_ref(qkv, blockmask, attn_mask, dropout_p, dropout_mask): + """ + Arguments: + qkv: (batch_size, seqlen, 3, nheads, head_dim) + blockmask: (seqlen / 16, seqlen / 256) + attn_mask: (batch_size, seqlen) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen, seqlen) + Output: + output: (batch_size, seqlen, nheads, head_dim) + attention: softmax after dropout + """ + q, k, v = qkv.float().unbind(dim=2) + d = qkv.shape[-1] + seqlen = qkv.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + scores.masked_fill_(rearrange(~attn_mask, "b s -> b 1 1 s"), float("-inf")) + blockmask = repeat(blockmask, "s_16 s_256 -> (s_16 16) (s_256 256)") + blockmask = blockmask[:seqlen, :seqlen] + scores.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), float("-inf")) + attention = torch.softmax(scores, dim=-1) + attention = attention.masked_fill(rearrange(~attn_mask, "b s -> b 1 s 1"), 0.0) + attention = attention.masked_fill_(rearrange(~blockmask, "t s -> 1 1 t s"), 0.0) + attention_drop = attention.masked_fill(~dropout_mask, 0.0) / (1 - dropout_p) + output = torch.einsum("bhts,bshd->bthd", attention_drop, v) + output.masked_fill_(rearrange(~attn_mask, "b s -> b s 1 1"), 0) + return output.to(dtype=qkv.dtype), attention.to(dtype=qkv.dtype) + + +def convert_flash_attn_S_to_softmax( + S, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + head_dim, + is_dropout, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """FlashAttention stores the S matrix in a different way. + Arguments: + S: (batch_size, nheads, seqlen_q_rounded, seqlen_k_rounded) + query_padding_mask: (batch_size, seqlen_q_rounded) + key_padding_mask: (batch_size, seqlen_k_rounded) + """ + if causal: + window_size = (window_size[0], 0) + seqlen_q_rounded, seqlen_k_rounded = S.shape[-2:] + S_converted = S + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + S.device, + ) + local_mask = F.pad( + local_mask, + (0, seqlen_k_rounded - seqlen_k, 0, seqlen_q_rounded - seqlen_q), + value=True, + ) + S_converted = S_converted.masked_fill(local_mask, 0.0) + + # Need to zero out things not in attention_mask in case S was initialized with random values + # and some of those values aren't overwritten. 
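+    # (S arrives with rounded shapes, seqlen_q_rounded x seqlen_k_rounded; the padding
+    # masks are padded up to those shapes below, and the result is cropped back to
+    # [:seqlen_q, :seqlen_k] at the end of this function.)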
+ seqlen_q_og = ( + query_padding_mask.shape[-1] if query_padding_mask is not None else seqlen_q_rounded + ) + if query_padding_mask is not None: + query_padding_mask = F.pad(query_padding_mask, (0, seqlen_q_rounded - seqlen_q_og)) + S_converted = S_converted.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + seqlen_k_og = key_padding_mask.shape[-1] if key_padding_mask is not None else seqlen_k + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, seqlen_k_rounded - seqlen_k_og)) + S_converted = S_converted.masked_fill(rearrange(~key_padding_mask, "b s -> b 1 1 s"), 0.0) + S_converted = F.pad(S_converted, (0, 0, 0, seqlen_q_og - seqlen_q_rounded)) + S_converted = F.pad(S_converted, (0, seqlen_k_og - seqlen_k_rounded)) + return S_converted[:, :, :seqlen_q, :seqlen_k] + + +def normalize_flash_attn_S( + attn_unnorm, + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + is_dropout=False, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k, v: (batch_size, seqlen_k, nheads, head_dim) + key_padding_mask: (batch_size, seqlen_q) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + Output: + softmax_lse: (batch_size, nheads, seqlen_q) + softmax_max: (batch_size, nheads, seqlen_q) + """ + if causal: + window_size = (window_size[0], 0) + q, k, v = q.float(), k.float(), v.float() + _, seqlen_q, _, head_dim = q.shape + seqlen_k = k.shape[1] + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(head_dim), k) + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias.to(dtype=scores.dtype) + block_size_n = _get_block_size_n(scores.device, head_dim, is_dropout, causal) + scores_block = scores.split(block_size_n, dim=-1) + lse_block = torch.stack([torch.logsumexp(s, dim=-1) for s in scores_block], dim=-1) + lse = torch.logsumexp(lse_block, dim=-1) + # lse could be -inf (i.e. all values in scores are -inf), and we want to set those to inf + # so that when we do torch.exp(m - lse), we get 0.0 instead of NaN. + lse[lse == float("-inf")] = float("inf") + scores_max_block = torch.stack([torch.amax(s, dim=-1) for s in scores_block], dim=-1) + cummax_block = torch.cummax(scores_max_block.flip(-1), dim=-1).values.flip(-1).unbind(dim=-1) + attn_unnorm_block = attn_unnorm.split(block_size_n, dim=-1) + attn_norm = torch.cat( + [ + a * rearrange(torch.exp(m - lse), "b h s -> b h s 1") + for a, m in zip(attn_unnorm_block, cummax_block) + ], + dim=-1, + ) + if query_padding_mask is not None: + attn_norm.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + return attn_norm.to(dtype=attn_unnorm.dtype) + + +def get_dropout_fraction( + dropout_mask, + query_padding_mask=None, + key_padding_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size +): + """ + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k), bool. True means keep, False means drop. 
+ query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + """ + if causal: + window_size = (window_size[0], 0) + batch_size, nheads, seqlen_q, seqlen_k = dropout_mask.shape + dropped = ~dropout_mask + valid = torch.ones_like(dropout_mask) + if query_padding_mask is not None: + dropped.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + valid.masked_fill_(rearrange(~query_padding_mask, "b s -> b 1 s 1"), False) + if key_padding_mask is not None: + dropped.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + valid.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), False) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + dropout_mask.device, + ) + dropped.masked_fill_(local_mask, False) + valid.masked_fill_(local_mask, False) + dropped_total = dropped.sum() + return dropped.sum() / valid.sum() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [False]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128]) +# @pytest.mark.parametrize("d", [64]) +# @pytest.mark.parametrize('seqlen', [128, 256, 384, 512, 768, 1024, 2048]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 384, 768, 1024, 1025, 2048]) +# @pytest.mark.parametrize("seqlen", [512]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): + if seqlen >= 2048 and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30: + pytest.skip() # Reference implementation OOM + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, S_dmask = flash_attn_qkvpacked_func( + qkv, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + attn = normalize_flash_attn_S( + attn_unnorm, + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + None, + None, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = 
get_dropout_fraction( + dropout_mask, None, None, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, None, attn_bias, dropout_p, dropout_mask, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + # v = qkv[:, :, 2].float() + # qk = torch.einsum('bshd,bthd->bhst', qkv[:, :, 0], qkv[:, :, 1]).float() + # if causal: + # causal_mask = torch.triu(torch.ones(seqlen, seqlen, dtype=torch.bool, device=qkv.device), 1) + # qk.masked_fill_(causal_mask, float('-inf')) + # m = qk.amax(-1, keepdim=True) + # s_tmp = torch.exp((qk - m) / math.sqrt(d)) + # p_tmp = torch.softmax(qk / math.sqrt(d), -1) + # p_dropped = p_tmp if dropout_mask is None else p_tmp.masked_fill(~dropout_mask, 0) + # lse_ref = torch.logsumexp(qk / math.sqrt(d), -1) + # qk_max1 = torch.max(qk[:, :, 128:, 192:], -1, keepdim=True).values + # qk_max2 = torch.max(qk[:, :, 128:, 128:], -1, keepdim=True).values + # qk_max3 = torch.max(qk[:, :, 128:, 64:], -1, keepdim=True).values + # qk_max4 = torch.max(qk[:, :, 128:, :], -1, keepdim=True).values + # o1 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, 192:] - qk_max1) / math.sqrt(d)), v[:, 192:]) + # o2 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, 128:] - qk_max2) / math.sqrt(d)), v[:, 128:]) + # o3 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, 64:] - qk_max3) / math.sqrt(d)), v[:, 64:]) + # o4 = torch.einsum('bhst,bthd->bshd', torch.exp((qk[:, :, 128:, :] - qk_max4) / math.sqrt(d)), v[:, :]) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + # do_o = (g.float() * out.float()).sum(-1) + # dv_tmp = torch.einsum('bhts,bthd->bshd', attn_pt[:, :, :64], g[:, :64]) + # dv_tmp1 = torch.einsum('bhts,bthd->bshd', attn_pt[:, :, 64:], g[:, 64:]) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + (dqkv,) = torch.autograd.grad(out, qkv, g) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
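+    # out_pt is the same reference math run in the low-precision dtype (upcast=False,
+    # reorder_ops=True), so (out_pt - out_ref) measures the noise inherent to fp16/bf16
+    # attention; bounding FlashAttention's error by a small multiple of that noise avoids
+    # hard-coded absolute tolerances.
+    # A minimal sketch of this relative-error check as a reusable helper (hypothetical
+    # name, not part of the test suite):
+    # def assert_error_within_factor(out, out_pt, out_ref, factor=2.0):
+    #     err = (out - out_ref).abs().max().item()
+    #     baseline = (out_pt - out_ref).abs().max().item()
+    #     assert err <= factor * baseline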
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dqkv - dqkv_ref).abs().max().item() <= 2 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [64]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 257, 384, 512, 768, 1025, 2048]) +# @pytest.mark.parametrize('seqlen', [128]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +def test_flash_attn_varlen_qkvpacked( + seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype +): + if seqlen >= 2048 and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30: + pytest.skip() # Reference implementation OOM + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 5 + nheads = 6 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + + key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen, seqlen, key_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn = generate_qkv( + *qkv.unbind(dim=2), key_padding_mask, key_padding_mask, qkvpacked=True + ) + + out_unpad, sm_lse, S_dmask = flash_attn_varlen_qkvpacked_func( + qkv_unpad, + cu_seqlens, + max_seqlen, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + key_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + attn = normalize_flash_attn_S( + attn_unnorm, + qkv[:, :, 0], + qkv[:, :, 1], + qkv[:, :, 2], + key_padding_mask, + key_padding_mask, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, 
key_padding_mask, key_padding_mask, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + (dqkv_unpad,) = torch.autograd.grad(out, qkv_unpad, g) + dqkv = dqkv_pad_fn(dqkv_unpad) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dqkv - dqkv_ref).abs().max().item() <= 2 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +# @pytest.mark.parametrize("kvpacked", [False]) +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# @pytest.mark.parametrize("mha_type", ["mha"]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +@pytest.mark.parametrize("softcap", [0.0, 50.0]) +def test_flash_attn_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked, softcap +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if softcap > 0.0 and dropout_p > 0.0: + pytest.skip("Softcap and dropout not supported together") + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 6 if softcap == 0.0 else 4 # softcap reference impl takes more memory + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if softcap > 0: + # Ensure the values of qk are at least within softcap range. 
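+        # The reference applies scores = softcap * tanh(scores / softcap), so scaling q by
+        # softcap pushes typical |q.k| up to the same order as softcap and the capping
+        # nonlinearity is actually exercised instead of staying in tanh's near-linear region.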
+ q = q * softcap + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + out, lse, S_dmask = flash_attn_kvpacked_func( + q, + kv, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + out, lse, S_dmask = flash_attn_func( + q, + k, + v, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + attn = normalize_flash_attn_S( + attn_unnorm, + q, + k_rep, + v_rep, + None, + None, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, None, None, causal=causal, window_size=window_size + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + if kvpacked: + ( + dq, + dkv, + ) = torch.autograd.grad(out, (q, kv), g) + dk, dv = dkv.unbind(2) + ( + dq_ref, + dkv_ref, + ) = 
torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.025) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +# @pytest.mark.parametrize('kvpacked', [False]) +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# @pytest.mark.parametrize('mha_type', ["mqa"]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [True]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [64]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 147), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(128, 128)]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +@pytest.mark.parametrize("softcap", [0.0, 50.0]) +# @pytest.mark.parametrize('dropout_p', [0.0]) +def test_flash_attn_varlen_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, 
deterministic, mha_type, dtype, kvpacked, softcap +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if softcap > 0.0 and dropout_p > 0.0: + pytest.skip("Softcap and dropout not supported together") + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 6 if softcap == 0.0 else 4 # softcap reference impl takes more memory + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 2) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if softcap > 0: + # Ensure the values of qk are at least within softcap range. + q = q * softcap + + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen_q, seqlen_k, query_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + ( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + kv, + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) = generate_qkv(q, *kv.unbind(dim=2), query_padding_mask, key_padding_mask, kvpacked=True) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_kvpacked_func( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + softcap=softcap, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + attn_unnorm = S_dmask_converted.abs() + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + attn = 
normalize_flash_attn_S( + attn_unnorm, + q, + k_rep, + v_rep, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_fraction = get_dropout_fraction( + dropout_mask, + query_padding_mask, + key_padding_mask, + causal=causal, + window_size=window_size, + ).item() + print(f"Actual dropout fraction: {dropout_fraction}") + else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + softcap=softcap, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + if dropout_p > 0.0: + print(f"Attention max diff: {(attn - attn_ref).abs().max().item()}") + print(f"Attention Pytorch max diff: {(attn_pt - attn_ref).abs().max().item()}") + + g = torch.randn_like(out) + if ((d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90)): + if kvpacked: + ( + dq_unpad, + dkv_unpad, + ) = torch.autograd.grad(out, (q_unpad, kv_unpad), g) + dk, dv = dkv_pad_fn(dkv_unpad).unbind(2) + ( + dq_ref, + dkv_ref, + ) = torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq_unpad, + dk_unpad, + dv_unpad, + ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g) + dk = dk_pad_fn(dk_unpad) + dv = dk_pad_fn(dv_unpad) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + dq = dq_pad_fn(dq_unpad) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
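+ # out_ref is the reference attention computed with upcast to fp32, while out_pt reruns the + # same reference in the original dtype (upcast=False, reorder_ops=True), so the error of + # out_pt against out_ref serves as the precision baseline for the asserts below.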
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + if dropout_p > 0.0: + assert (attn - attn_ref).abs().max().item() <= 2 * (attn_pt - attn_ref).abs().max().item() + # With alibi, many of the prob values are 0.0 & -0.0 so dropout_fraction isn't accurate + if not alibi: + assert abs(dropout_fraction - dropout_p) <= (0.01 if not local else 0.04) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + assert (dq - dq_ref).abs().max().item() <= 3 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 3 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 3 * (dv_pt - dv_ref).abs().max().item() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64, 128]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [True]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_causal(seqlen_q, seqlen_k, swap_sq_sk, d, local, dtype): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + causal = True + # set seed + torch.random.manual_seed(0) + batch_size = 8 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + out = flash_attn_func(q, k, v, 0.0, causal=causal, window_size=window_size) + out_ref, attn_ref = attention_ref( + q, k, v, None, None, None, 0.0, None, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + None, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - 
dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item() + 1e-5 + assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item() + 1e-5 + assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item() + 1e-5 + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [True]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# TODO: add smaller page sizes when https://github.com/Dao-AILab/flash-attention/pull/824 is merged +@pytest.mark.parametrize("paged_kv_block_size", [None, 256, 512]) +# @pytest.mark.parametrize("seqlen_q,seqlen_k", [(256, 128)]) +def test_flash_attn_varlen_causal( + seqlen_q, seqlen_k, swap_sq_sk, d, local, paged_kv_block_size, dtype +): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + causal = True + # set seed + torch.random.manual_seed(0) + batch_size = 8 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + + if paged_kv_block_size is None: + k = torch.randn( + batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + block_table = None + else: + k, v, block_table, k_cache_paged, v_cache_paged, num_blocks = _generate_block_kvcache( + seqlen_k, paged_kv_block_size, batch_size, nheads, d, device, dtype + ) + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + key_padding_mask = 
generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out_unpad = flash_attn_varlen_func( + q_unpad, + k_unpad if paged_kv_block_size is None else k_cache_paged, + v_unpad if paged_kv_block_size is None else v_cache_paged, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 0.0, + causal=causal, + window_size=window_size, + block_table=block_table, + ) + out = output_pad_fn(out_unpad) + out_ref, attn_ref = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + None, + 0.0, + None, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + None, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + test_backward = block_table is None + if test_backward: + ( + dq_unpad, + dk_unpad, + dv_unpad, + ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g) + dq = dq_pad_fn(dq_unpad) + dk = dk_pad_fn(dk_unpad) + dv = dk_pad_fn(dv_unpad) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
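+ # The 1e-5 absolute term below keeps the tolerance non-zero when out_pt happens to + # match out_ref exactly.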
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + if test_backward: + assert (dq - dq_ref).abs().max().item() <= 2 * (dq_pt - dq_ref).abs().max().item() + 1e-5 + assert (dk - dk_ref).abs().max().item() <= 2 * (dk_pt - dk_ref).abs().max().item() + 1e-5 + assert (dv - dv_ref).abs().max().item() <= 2 * (dv_pt - dv_ref).abs().max().item() + 1e-5 + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("deterministic", [False, True]) +# @pytest.mark.parametrize("deterministic", [True]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [True]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [False]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (3, 1024), + (1, 339), + (64, 800), + (3, 799), + (64, 2048), + (16, 20000), + (16, 100000), + (128, 128), + (256, 256), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_splitkv( + seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, alibi, deterministic, dtype +): + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 1 + nheads = 12 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, _ = flash_attn_func( + q, + k, + v, + 0.0, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out_ref, attn_ref = attention_ref( + q, k, v, None, None, attn_bias, 0.0, None, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + g = torch.randn_like(out) + do_o = (g.float() * out.float()).sum(-1) + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( 
+ dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + 1e-5 + + mult = 2 if not alibi else 8 + assert (dq - dq_ref).abs().max().item() <= mult * (dq_pt - dq_ref).abs().max().item() + 2e-4 + assert (dk - dk_ref).abs().max().item() <= mult * (dk_pt - dk_ref).abs().max().item() + 2e-4 + assert (dv - dv_ref).abs().max().item() <= mult * (dv_pt - dv_ref).abs().max().item() + 2e-4 + + +# @pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("num_splits", [1, 0]) +# @pytest.mark.parametrize("num_splits", [1]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +# @pytest.mark.parametrize("mha_type", ["mha"]) +@pytest.mark.parametrize("new_kv", [False, True]) +# @pytest.mark.parametrize("new_kv", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +# @pytest.mark.parametrize("alibi", [False]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [False]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [False]) +@pytest.mark.parametrize("seqlen_new_eq_seqlen_q", [True, False]) +# @pytest.mark.parametrize("seqlen_new_eq_seqlen_q", [True]) +@pytest.mark.parametrize("rotary_interleaved", [False, True]) +# @pytest.mark.parametrize("rotary_interleaved", [False]) +@pytest.mark.parametrize("rotary_fraction", [0.0, 0.5, 1.0]) +# @pytest.mark.parametrize("rotary_fraction", [0.0]) +@pytest.mark.parametrize("paged_kv_block_size", [None, 256]) +# @pytest.mark.parametrize("paged_kv_block_size", [256, 512]) +# @pytest.mark.parametrize("paged_kv_block_size", [None]) +@pytest.mark.parametrize("has_leftpad", [False, True]) +# @pytest.mark.parametrize("has_leftpad", [True]) +# @pytest.mark.parametrize("has_batch_idx", [False, True]) +@pytest.mark.parametrize("has_batch_idx", [False]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 128, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [128]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 128), + (1, 339), + (3, 1024), + (64, 800), + (64, 256), + (3, 799), + (64, 2048), + (16, 20000), + (1, 128 * 1024), + (16, 128 * 1024), + (128, 128), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_kvcache( + seqlen_q, + seqlen_k, + d, 
+ has_batch_idx, + has_leftpad, + paged_kv_block_size, + rotary_fraction, + rotary_interleaved, + seqlen_new_eq_seqlen_q, + causal, + local, + alibi, + new_kv, + mha_type, + num_splits, + dtype, +): + if seqlen_q > seqlen_k and new_kv: + pytest.skip() + if not new_kv and rotary_fraction > 0.0: + pytest.skip() + if has_batch_idx and paged_kv_block_size is not None: + pytest.skip() + if has_leftpad and paged_kv_block_size is not None: + pytest.skip() + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 2 + batch_size_cache = batch_size if not has_batch_idx else batch_size * 2 + nheads = 6 + # rotary_dim must be a multiple of 16, and must be <= d + rotary_dim = math.floor(int(rotary_fraction * d) / 16) * 16 + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype) + seqlen_new = seqlen_q if seqlen_new_eq_seqlen_q else torch.randint(1, seqlen_q + 1, (1,)).item() + if new_kv: + k = torch.randn(batch_size, seqlen_new, nheads_k, d, device=device, dtype=dtype) + v = torch.randn(batch_size, seqlen_new, nheads_k, d, device=device, dtype=dtype) + else: + k, v = None, None + if paged_kv_block_size is None: + k_cache = torch.randn(batch_size_cache, seqlen_k, nheads_k, d, device=device, dtype=dtype) + v_cache = torch.randn(batch_size_cache, seqlen_k, nheads_k, d, device=device, dtype=dtype) + block_table = None + else: + ( + k_cache, + v_cache, + block_table, + k_cache_paged, + v_cache_paged, + num_blocks, + ) = _generate_block_kvcache( + seqlen_k, paged_kv_block_size, batch_size, nheads_k, d, device, dtype + ) + cache_seqlens = torch.randint( + 0 if new_kv else 1, + # If we don't use seqlen_q in the case of causal and rotary, cos/sin won't be long enough + ( + (seqlen_k - (seqlen_q if (causal or local) and rotary_dim > 1 else seqlen_new) + 1) + if new_kv + else (seqlen_k + 1) + ), + (batch_size,), + dtype=torch.int32, + device=device, + ) + if has_leftpad: + cache_leftpad = torch.cat([torch.randint(0, cache_seqlens[i].item(), (1,), dtype=torch.int32, device=device) + if cache_seqlens[i].item() > 0 else torch.zeros(1, dtype=torch.int32, device=device) + for i in range(batch_size)]) + else: + cache_leftpad = None + arange = rearrange(torch.arange(seqlen_k, device=device), "s -> 1 s") + cache_seqlens_expanded = rearrange(cache_seqlens, "b -> b 1") + key_padding_mask = arange < cache_seqlens_expanded + (seqlen_new if new_kv else 0) + if has_leftpad: + key_padding_mask = torch.logical_and( + key_padding_mask, arange >= cache_leftpad.unsqueeze(-1).expand(-1, seqlen_k) + ) + if has_batch_idx: + cache_batch_idx = torch.randperm(batch_size_cache, dtype=torch.int32, device=device)[ + :batch_size + ] + else: + cache_batch_idx = None + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen_q, seqlen_k, None, key_padding_mask, causal=causal, key_leftpad=cache_leftpad + ) + else: + alibi_slopes, attn_bias = None, None + # cache_seqlens = torch.tensor([64], dtype=torch.int32, device=device) + if rotary_dim > 0: + angle = ( + torch.rand( + seqlen_k if paged_kv_block_size is None else num_blocks * paged_kv_block_size, + rotary_dim // 2, + device=device, + ) + * 2 + * math.pi + ) + cos = torch.cos(angle).to(dtype=dtype) + sin = torch.sin(angle).to(dtype=dtype) + if causal or local: + 
q_ro = apply_rotary_emb( + q, cos, sin, seqlen_offsets=cache_seqlens, interleaved=rotary_interleaved + ) + else: + q_ro = rearrange( + apply_rotary_emb( + rearrange(q, "b s h d -> b 1 (s h) d"), + cos, + sin, + seqlen_offsets=cache_seqlens, + interleaved=rotary_interleaved, + ), + "b 1 (s h) d -> b s h d", + s=seqlen_q, + ) + # q_ro = q + k_ro = apply_rotary_emb( + k, cos, sin, seqlen_offsets=cache_seqlens, interleaved=rotary_interleaved + ) + else: + cos, sin = None, None + q_ro, k_ro = q, k + # k_cache[:, 64:] = -1 + k_cache_ref = ( + k_cache if not has_batch_idx else k_cache[cache_batch_idx.to(dtype=torch.long)] + ).clone() + v_cache_ref = ( + v_cache if not has_batch_idx else v_cache[cache_batch_idx.to(dtype=torch.long)] + ).clone() + if new_kv: + update_mask = torch.logical_and( + cache_seqlens_expanded <= arange, arange < cache_seqlens_expanded + seqlen_new + ) + k_cache_ref[update_mask] = rearrange(k_ro, "b s ... -> (b s) ...") + v_cache_ref[update_mask] = rearrange(v, "b s ... -> (b s) ...") + k_cache_rep = repeat(k_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_cache_rep = repeat(v_cache_ref, "b s h d -> b s (h g) d", g=nheads // nheads_k) + out = flash_attn_with_kvcache( + q, + k_cache if paged_kv_block_size is None else k_cache_paged, + v_cache if paged_kv_block_size is None else v_cache_paged, + k, + v, + rotary_cos=cos, + rotary_sin=sin, + cache_seqlens=cache_seqlens, + cache_batch_idx=cache_batch_idx, + cache_leftpad=cache_leftpad, + block_table=block_table, + causal=causal, + window_size=window_size, + rotary_interleaved=rotary_interleaved, + alibi_slopes=alibi_slopes, + num_splits=num_splits, + ) + # out = flash_attn_with_kvcache( + # q, k_cache, v_cache, cache_seqlens=cache_seqlens, causal=causal, window_size=window_size + # ) + # out = flash_attn_with_kvcache(q, k_cache, v_cache, causal=causal, window_size=window_size) + # qk = torch.einsum("bqhd,bkhd->bhqk", q, k_cache_ref) + # m = qk.amax(-1, keepdim=True) + # s_tmp = torch.exp((qk - m) / math.sqrt(d)) + # o1 = torch.einsum('bhst,bthd->bshd', s_tmp, v_cache_ref) + # lse_ref = torch.logsumexp(qk / math.sqrt(d), -1) + # probs = torch.softmax(qk, dim=-1) + out_ref, _ = attention_ref( + q_ro, + k_cache_rep, + v_cache_rep, + None, + key_padding_mask, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + key_leftpad=cache_leftpad, + ) + out_pt, _ = attention_ref( + q_ro, + k_cache_rep, + v_cache_rep, + None, + key_padding_mask, + attn_bias, + 0.0, + None, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + key_leftpad=cache_leftpad, + ) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + if new_kv: + if paged_kv_block_size is None: + k_cache_select = ( + k_cache if not has_batch_idx else k_cache[cache_batch_idx.to(dtype=torch.long)] + ) + v_cache_select = ( + v_cache if not has_batch_idx else v_cache[cache_batch_idx.to(dtype=torch.long)] + ) + else: + k_cache_select = rearrange( + k_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... 
-> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + v_cache_select = rearrange( + v_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... -> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + assert torch.allclose(k_cache_select, k_cache_ref, rtol=1e-3, atol=1e-3) + assert torch.equal(v_cache_select, v_cache_ref) + mult = 3 if not alibi else 5 + assert (out - out_ref).abs().max().item() <= mult * (out_pt - out_ref).abs().max().item() + 1e-5 + + +def _generate_block_kvcache(seqlen_k, paged_kv_block_size, batch_size, nheads_k, d, device, dtype): + num_blocks = math.ceil(seqlen_k / paged_kv_block_size) * batch_size * 3 + k_cache_paged = torch.randn( + num_blocks, paged_kv_block_size, nheads_k, d, device=device, dtype=dtype + ) + v_cache_paged = torch.randn( + num_blocks, paged_kv_block_size, nheads_k, d, device=device, dtype=dtype + ) + block_table = rearrange( + torch.randperm(num_blocks, dtype=torch.int32, device=device), + "(b nblocks) -> b nblocks", + b=batch_size, + ) + k_cache = rearrange( + # pytorch 1.12 doesn't have indexing with int32 + k_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... -> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + v_cache = rearrange( + v_cache_paged[block_table.to(dtype=torch.long).flatten()], + "(b nblocks) block_size ... -> b (nblocks block_size) ...", + b=batch_size, + )[:, :seqlen_k] + return k_cache, v_cache, block_table, k_cache_paged, v_cache_paged, num_blocks + + +# @pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 56, 64, 80, 96, 128]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [128]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (239, 1), + (3, 799), + (799, 3), + (1024, 128), + (97, 97), + (128, 128), + (200, 200), + (256, 256), + (257, 257), + (384, 384), + (512, 512), + (768, 768), + (1024, 1024), + ], +) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +def test_flash_attn_race_condition(seqlen_q, seqlen_k, d, dropout_p, causal, dtype): + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 60 # Sometimes we need large batch size for the race conditions to trigger + nheads = 4 + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + torch.random.manual_seed(42) + out0, lse0, _ = flash_attn_func(q, k, v, dropout_p, causal=causal, return_attn_probs=True) + g = torch.randn_like(out0) + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + ( + dq0, + dk0, + dv0, + ) = torch.autograd.grad(out0, (q, k, v), g) + # Numerical error if we just do any arithmetic on dq + dq_atol = 2 * ((dq0 + 0.3 - 0.3) - dq0).abs().max().item() + + for i in range(250): + torch.random.manual_seed(42) + out, lse, _ = flash_attn_func(q, k, v, dropout_p, causal=causal, return_attn_probs=True) + assert torch.equal(out, out0) + assert 
torch.equal(lse, lse0) + + if (d <= MAX_HEADDIM_SM8x or dropout_p == 0) or (is_sm80 or is_sm90): + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + dq_equal = torch.allclose(dq, dq0, atol=dq_atol) + if not dq_equal: + print(f"Iter {i}, {dq_atol = }, dQ max diff: {(dq - dq0).abs().max().item()}") + assert torch.equal(dv, dv0) + assert torch.equal(dk, dk0) + assert dq_equal + + +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [16, 32, 64]) +# @pytest.mark.parametrize('d', [16]) +@pytest.mark.parametrize("seqlen", [1, 2, 5, 17, 128]) +# @pytest.mark.parametrize('seqlen', [2]) +def test_flash_attn_bwd_overflow(seqlen, d, causal, dtype): + """We previously had a bug where not masking elements beyond seqlen_k caused NaN in dQ, + in the case where seqlen % 128 != 0. + """ + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 2 + nheads = 5 + q = torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 5 + k, v = [ + torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda") * 3 + for _ in range(2) + ] + q.requires_grad_(True) + k.requires_grad_(True) + v.requires_grad_(True) + out = flash_attn_func(q, k, v, causal=causal) + g = torch.randn_like(out) + out.backward(g) + q_pt = q.detach().clone().requires_grad_(True) + k_pt = k.detach().clone().requires_grad_(True) + v_pt = v.detach().clone().requires_grad_(True) + out_pt, _ = attention_ref(q_pt, k_pt, v_pt, causal=causal, upcast=False, reorder_ops=True) + out_pt.backward(g) + q_ref = q.detach().clone().requires_grad_(True) + k_ref = k.detach().clone().requires_grad_(True) + v_ref = v.detach().clone().requires_grad_(True) + out_ref, attn_ref = attention_ref(q_ref, k_ref, v_ref, causal=causal) + out_ref.backward(g) + print(f"dQ max diff: {(q.grad - q_ref.grad).abs().max().item()}") + print(f"dK max diff: {(k.grad - k_ref.grad).abs().max().item()}") + print(f"dV max diff: {(v.grad - v_ref.grad).abs().max().item()}") + print(f"dQ Pytorch max diff: {(q_pt.grad - q_ref.grad).abs().max().item()}") + print(f"dK Pytorch max diff: {(k_pt.grad - k_ref.grad).abs().max().item()}") + print(f"dV Pytorch max diff: {(v_pt.grad - v_ref.grad).abs().max().item()}") + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + assert (q.grad - q_ref.grad).abs().max().item() <= 5 * ( + q_pt.grad - q_ref.grad + ).abs().max().item() + 1e-3 + assert (k.grad - k_ref.grad).abs().max().item() <= 5 * ( + k_pt.grad - k_ref.grad + ).abs().max().item() + 1e-3 + assert (v.grad - v_ref.grad).abs().max().item() <= 5 * ( + v_pt.grad - v_ref.grad + ).abs().max().item() + 1e-3 + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [64, 128]) +# @pytest.mark.parametrize('d', [64]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 256]) +# @pytest.mark.parametrize('seqlen', [128]) +def test_flash_attn_bwd_transpose(seqlen, d, causal, dtype): + """We previously had a bug where we were using the wrong strides of dout, which shows up + when dout is not contiguous. 
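+ The output is rearranged to (seqlen, batch, ...) and the gradient g is taken from a + stride-2 slice, so the dout reaching the backward pass is non-contiguous.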
+ """ + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 5 + nheads = 2 + q, k, v = [ + torch.randn([batch_size, seqlen, nheads, d], dtype=dtype, device="cuda", requires_grad=True) + for _ in range(3) + ] + out = rearrange(flash_attn_func(q, k, v, causal=causal), "b s ... -> s b ...") + # So g is not contiguous + g = torch.randn(seqlen, 2 * batch_size, nheads, d, dtype=dtype, device="cuda")[:, ::2] + out.backward(g) + q_pt = q.detach().clone().requires_grad_(True) + k_pt = k.detach().clone().requires_grad_(True) + v_pt = v.detach().clone().requires_grad_(True) + out_pt, attn_pt = attention_ref(q_pt, k_pt, v_pt, causal=causal, upcast=False, reorder_ops=True) + out_pt = rearrange(out_pt, "b s ... -> s b ...") + out_pt.backward(g) + q_ref = q.detach().clone().requires_grad_(True) + k_ref = k.detach().clone().requires_grad_(True) + v_ref = v.detach().clone().requires_grad_(True) + out_ref, attn_ref = attention_ref(q_ref, k_ref, v_ref, causal=causal) + out_ref = rearrange(out_ref, "b s ... -> s b ...") + out_ref.backward(g) + print(f"dQ max diff: {(q.grad - q_ref.grad).abs().max().item()}") + print(f"dK max diff: {(k.grad - k_ref.grad).abs().max().item()}") + print(f"dV max diff: {(v.grad - v_ref.grad).abs().max().item()}") + print(f"dQ Pytorch max diff: {(q_pt.grad - q_ref.grad).abs().max().item()}") + print(f"dK Pytorch max diff: {(k_pt.grad - k_ref.grad).abs().max().item()}") + print(f"dV Pytorch max diff: {(v_pt.grad - v_ref.grad).abs().max().item()}") + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + assert (q.grad - q_ref.grad).abs().max().item() <= 2 * ( + q_pt.grad - q_ref.grad + ).abs().max().item() + assert (k.grad - k_ref.grad).abs().max().item() <= 2 * ( + k_pt.grad - k_ref.grad + ).abs().max().item() + assert (v.grad - v_ref.grad).abs().max().item() <= 2 * ( + v_pt.grad - v_ref.grad + ).abs().max().item() + + +@pytest.mark.parametrize("dtype", [torch.float16]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize('causal', [False]) +@pytest.mark.parametrize("d", [16, 32, 64]) +# @pytest.mark.parametrize('d', [16]) +def test_flash_attn_bwd_varlen_overflow(d, causal, dtype): + """We previously had a bug where not masking elements beyond seqlen_k caused NaN in dQ, + in the case where seqlen % 128 != 0 or varlen. 
+ """ + device = "cuda" + # set seed + torch.random.manual_seed(0) + nheads = 5 + q_cuseqlen = torch.tensor([0, 76, 110, 256], device=device, dtype=torch.int32) + k_cuseqlen = torch.tensor([0, 1, 2, 3], device=device, dtype=torch.int32) + Mq = 256 + Mk = 3 + + q = torch.randn([Mq, nheads, d], dtype=dtype, device=device) * 3 + k, v = [torch.randn([Mk, nheads, d], dtype=dtype, device=device) * 3 for _ in range(2)] + q.requires_grad_(True) + k.requires_grad_(True) + v.requires_grad_(True) + + out = flash_attn_varlen_func(q, k, v, q_cuseqlen, k_cuseqlen, Mq, Mk, causal=causal) + g = torch.randn_like(out) + out.backward(g) + + assert not q.grad.isnan().any() + assert not k.grad.isnan().any() + assert not v.grad.isnan().any() + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [False]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# @pytest.mark.parametrize('seqlen_q,seqlen_k', [(256, 128)]) +def test_flash_attn_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + out = flash_attn_func(q, k, v, 0.0, causal=causal, window_size=window_size, deterministic=True) + + g = torch.randn_like(out) + dq0, dk0, dv0 = torch.autograd.grad(out, (q, k, v), g, retain_graph=True) + for _ in range(50): + dq, dk, dv = torch.autograd.grad(out, (q, k, v), g, retain_graph=True) + assert torch.equal(dv, dv0) + assert torch.equal(dk, dk0) + assert torch.equal(dq, dq0) + + +@pytest.mark.parametrize("dtype", ([torch.float16] if is_sm75 else [torch.float16, torch.bfloat16])) +# @pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("local", [False, True]) +# @pytest.mark.parametrize("local", [True]) +@pytest.mark.parametrize("causal", [False, True]) +# @pytest.mark.parametrize("causal", [True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256]) +# @pytest.mark.parametrize('d', [32, 40, 64, 
80, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [32, 64, 96, 128, 160, 192]) +# @pytest.mark.parametrize('d', [56, 80]) +# @pytest.mark.parametrize("d", [64]) +@pytest.mark.parametrize("swap_sq_sk", [False, True]) +# @pytest.mark.parametrize("swap_sq_sk", [True]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 239), + (3, 799), + (127, 512), + (127, 513), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (1023, 1024), + ], +) +# @pytest.mark.parametrize("seqlen_q,seqlen_k", [(256, 128)]) +def test_flash_attn_varlen_deterministic(seqlen_q, seqlen_k, swap_sq_sk, d, causal, local, dtype): + if ( + max(seqlen_q, seqlen_k) >= 2048 + and torch.cuda.get_device_properties("cuda").total_memory <= 16 * 2**30 + ): + pytest.skip() # Reference implementation OOM + if swap_sq_sk: + seqlen_q, seqlen_k = seqlen_k, seqlen_q + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 2 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + k = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + v = torch.randn(batch_size, seqlen_k, nheads, d, device=device, dtype=dtype, requires_grad=True) + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out = flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 0.0, + causal=causal, + window_size=window_size, + deterministic=True, + ) + + g = torch.randn_like(out) + dq0, dk0, dv0 = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g, retain_graph=True) + for _ in range(50): + dq, dk, dv = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g, retain_graph=True) + assert torch.equal(dv, dv0) + assert torch.equal(dk, dk0) + assert torch.equal(dq, dq0) diff --git a/test_flash_attn_ck.py b/test_flash_attn_ck.py new file mode 100644 index 0000000000000000000000000000000000000000..fbcb51cefee1055ed4b7a1d3991489d12de4a6e5 --- /dev/null +++ b/test_flash_attn_ck.py @@ -0,0 +1,754 @@ +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from flash_attn import ( + flash_attn_func, + flash_attn_kvpacked_func, + flash_attn_qkvpacked_func, + flash_attn_varlen_func, + flash_attn_varlen_kvpacked_func, + flash_attn_varlen_qkvpacked_func, +) + +from test_flash_attn import ( + attn_bias_from_alibi_slopes, + convert_flash_attn_S_to_softmax, + generate_qkv, + generate_random_padding_mask, + attention_ref, + attention_kvpacked_ref, + attention_qkvpacked_ref, +) + +def is_bwd_hdim_supported(d): + return d <= 128 and d % 2 == 0 + + +def ck_randval_to_dropout_mask(randval, p): + # If p = 0.3, randval in 255 * (0.7, 1.0] will be dropout + # randval in 255 * [0, 0.7] will be kept + # If return dropout_mask >=0, value will be kept + return torch.floor(255.0 * (1 - p) - randval) + + +def pad_rearrange_dropout_mask_hts_to_bhss(S_dmask, cu_seqlens_q, seqlen_q_rounded, seqlen_k_rounded): + """ pad + rearrange [nheads, total_q, max_seqlen_k] into [b, nheads, 
seqlen_q_rounded, seqlen_k_rounded] + Arguments: + S_dmask: (nheads, total_q, max_seqlen_k) + cu_seqlens_q: (b + 1) + Output: + S_dmask: (b, nheads, seqlen_q_rounded, seqlen_k_rounded) + """ + batch_size = cu_seqlens_q.numel() - 1 + seqlens_q = torch.roll(cu_seqlens_q, shifts = -1) - cu_seqlens_q + seqlens_q = seqlens_q[0:batch_size].tolist() + S_dmask = torch.split(S_dmask, seqlens_q, dim=1) + # [(nheads, seqlen_q0, max_seqlen_k), (nheads, seqlen_q1, max_seqlen_k), ..., (nheads, seqlen_qb, max_seqlen_k)] + masks = () + for mask in S_dmask: + # (nheads, seqlen_qi, max_seqlen_k) -> (nheads, seqlen_q_rounded, seqlen_k_rounded) + mask = F.pad(mask, (0, seqlen_k_rounded - mask.shape[2], 0, seqlen_q_rounded - mask.shape[1], 0, 0)).unsqueeze(1) + masks = masks + (mask, ) + S_dmask = torch.cat(masks, dim=1) + + S_dmask = S_dmask.transpose(0, 1) + return S_dmask + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 384, 768, 1024, 1025, 2048]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): + if d > 256: + pytest.skip() + + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen, seqlen, causal=causal) + else: + alibi_slopes, attn_bias = None, None + out, lse, S_dmask = flash_attn_qkvpacked_func( + qkv, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + # CK does not return P. Hence, we don't test the attn here. + else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, None, attn_bias, dropout_p, dropout_mask, causal=causal, window_size=window_size + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + (dqkv,) = torch.autograd.grad(out, qkv, g) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dqkv - dqkv_ref).abs().max().item() <= 10 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize("seqlen", [97, 128, 200, 257, 384, 512, 768, 1025, 2048]) +@pytest.mark.parametrize("dropout_p", [0, 0.17]) +def test_flash_attn_varlen_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, deterministic, dtype): + if d > 256: + pytest.skip() + + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 5 + nheads = 6 + window_size = (-1, -1) if not local else torch.randint(0, seqlen, (2,)) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, d, device=device, dtype=dtype, requires_grad=True + ) + + key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen, seqlen, key_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + qkv_unpad, cu_seqlens, max_seqlen, qkv, output_pad_fn, dqkv_pad_fn = generate_qkv( + *qkv.unbind(dim=2), key_padding_mask, key_padding_mask, qkvpacked=True + ) + + out_unpad, sm_lse, S_dmask = flash_attn_varlen_qkvpacked_func( + qkv_unpad, + cu_seqlens, + max_seqlen, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask = pad_rearrange_dropout_mask_hts_to_bhss(S_dmask, cu_seqlens, seqlen, seqlen) + + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen, + seqlen, + key_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + + dropout_mask = S_dmask_converted >= 0 + # CK does not return P. Hence, we don't test the attn here. 
+ else: + dropout_mask = None + + out_ref, attn_ref = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_qkvpacked_ref( + qkv, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + (dqkv_unpad,) = torch.autograd.grad(out, qkv_unpad, g) + dqkv = dqkv_pad_fn(dqkv_unpad) + (dqkv_ref,) = torch.autograd.grad(out_ref, qkv, g) + (dqkv_pt,) = torch.autograd.grad(out_pt, qkv, g) + print(f"dQ max diff: {(dqkv[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK max diff: {(dqkv[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV max diff: {(dqkv[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV mean diff: {(dqkv - dqkv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dqkv_pt[:, :, 0] - dqkv_ref[:, :, 0]).abs().max().item()}") + print(f"dK Pytorch max diff: {(dqkv_pt[:, :, 1] - dqkv_ref[:, :, 1]).abs().max().item()}") + print(f"dV Pytorch max diff: {(dqkv_pt[:, :, 2] - dqkv_ref[:, :, 2]).abs().max().item()}") + print(f"dQKV Pytorch mean diff: {(dqkv_pt - dqkv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dqkv - dqkv_ref).abs().max().item() <= 10 * (dqkv_pt - dqkv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +@pytest.mark.parametrize("deterministic", [False]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 40, 59, 64, 96, 111, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +def test_flash_attn_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked +): + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, 
requires_grad=True + ) + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes(alibi_slopes, seqlen_q, seqlen_k, causal=causal) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + out, lse, S_dmask = flash_attn_kvpacked_func( + q, + kv, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + out, lse, S_dmask = flash_attn_func( + q, + k, + v, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + None, + None, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + # CK does not return P. Hence, we don't test the attn here. + else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + None, + None, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most twice the numerical error + # of a Pytorch implementation. 
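+    # (out_pt above was computed with upcast=False and reorder_ops=True, i.e. in the same
+    # low precision as the kernel, so it sets the scale of acceptable rounding error.)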
+ assert (out - out_ref).abs().max().item() <= 2 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + if kvpacked: + ( + dq, + dkv, + ) = torch.autograd.grad(out, (q, kv), g) + dk, dv = dkv.unbind(2) + ( + dq_ref, + dkv_ref, + ) = torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq, + dk, + dv, + ) = torch.autograd.grad(out, (q, k, v), g) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dq - dq_ref).abs().max().item() <= 10 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 10 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 10 * (dv_pt - dv_ref).abs().max().item() + + +@pytest.mark.parametrize("kvpacked", [True, False]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("mha_type", ["mha", "mqa", "gqa"]) +@pytest.mark.parametrize("deterministic", [False, True]) +@pytest.mark.parametrize("alibi", [False, True]) +@pytest.mark.parametrize("local", [False, True]) +@pytest.mark.parametrize("causal", [False, True]) +@pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256]) +@pytest.mark.parametrize( + "seqlen_q,seqlen_k", + [ + (1, 147), + (113, 203), + (128, 217), + (113, 211), + (108, 256), + (256, 512), + (512, 256), + (1024, 1024), + (1023, 1024), + (1024, 1023), + (2048, 2048), + ], +) +@pytest.mark.parametrize("dropout_p", [0.0, 0.17]) +def test_flash_attn_varlen_output( + seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked +): + device = "cuda" + # set seed + torch.random.manual_seed(0) + batch_size = 4 + nheads = 9 + nheads_k = nheads if mha_type == "mha" else (1 if mha_type == "mqa" else 3) + assert nheads % nheads_k == 0 + window_size = (-1, -1) if not local else torch.randint(0, seqlen_k, (2,)) + q = torch.randn(batch_size, seqlen_q, nheads, d, device=device, dtype=dtype, requires_grad=True) + if kvpacked: + kv = torch.randn( + batch_size, seqlen_k, 2, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + else: + k = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + v = torch.randn( + batch_size, seqlen_k, nheads_k, d, device=device, dtype=dtype, requires_grad=True + ) + + query_padding_mask = generate_random_padding_mask(seqlen_q, batch_size, device, mode="random") + 
key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode="random") + # key_padding_mask = generate_random_padding_mask(seqlen_k, batch_size, device, mode='full') + if alibi: + alibi_slopes = torch.rand(batch_size, nheads, device=device, dtype=torch.float32) * 0.3 + attn_bias = attn_bias_from_alibi_slopes( + alibi_slopes, seqlen_q, seqlen_k, query_padding_mask, key_padding_mask, causal=causal + ) + else: + alibi_slopes, attn_bias = None, None + + if kvpacked: + ( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + kv, + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) = generate_qkv(q, *kv.unbind(dim=2), query_padding_mask, key_padding_mask, kvpacked=True) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_kvpacked_func( + q_unpad, + kv_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + else: + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + out_unpad, sm_lse, S_dmask = flash_attn_varlen_func( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + dropout_p, + causal=causal, + window_size=window_size, + alibi_slopes=alibi_slopes, + deterministic=deterministic, + return_attn_probs=True, + ) + out = output_pad_fn(out_unpad) + if dropout_p > 0.0: + # TODO - move to c++ mha_varlen_fwd() + S_dmask = ck_randval_to_dropout_mask(S_dmask, dropout_p) + S_dmask = pad_rearrange_dropout_mask_hts_to_bhss(S_dmask, cu_seqlens_q, seqlen_q, seqlen_k) + S_dmask_converted = convert_flash_attn_S_to_softmax( + S_dmask, + seqlen_q, + seqlen_k, + query_padding_mask, + key_padding_mask, + d, + dropout_p > 0.0, + causal=causal, + window_size=window_size, + ) + dropout_mask = S_dmask_converted >= 0 + if kvpacked: + kv_rep = repeat(kv, "b s two h d -> b s two (h g) d", g=nheads // nheads_k) + k_rep, v_rep = kv_rep.unbind(dim=2) + else: + k_rep = repeat(k, "b s h d -> b s (h g) d", g=nheads // nheads_k) + v_rep = repeat(v, "b s h d -> b s (h g) d", g=nheads // nheads_k) + # CK does not return P. Hence, we don't test the attn here. 
+ else: + dropout_mask = None + + if kvpacked: + out_ref, attn_ref = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_kvpacked_ref( + q, + kv, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + else: + out_ref, attn_ref = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + ) + out_pt, attn_pt = attention_ref( + q, + k, + v, + query_padding_mask, + key_padding_mask, + attn_bias, + dropout_p, + dropout_mask, + causal=causal, + window_size=window_size, + upcast=False, + reorder_ops=True, + ) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"Pytorch max diff: {(out_pt - out_ref).abs().max().item()}") + print(f"Pytorch mean diff: {(out_pt - out_ref).abs().mean().item()}") + + # Check that FlashAttention's numerical error is at most 4 times the numerical error + # of a Pytorch implementation. + assert (out - out_ref).abs().max().item() <= 4 * (out_pt - out_ref).abs().max().item() + + g = torch.randn_like(out) + if is_bwd_hdim_supported(d): + if kvpacked: + ( + dq_unpad, + dkv_unpad, + ) = torch.autograd.grad(out, (q_unpad, kv_unpad), g) + dk, dv = dkv_pad_fn(dkv_unpad).unbind(2) + ( + dq_ref, + dkv_ref, + ) = torch.autograd.grad(out_ref, (q, kv), g) + dk_ref, dv_ref = dkv_ref.unbind(2) + ( + dq_pt, + dkv_pt, + ) = torch.autograd.grad(out_pt, (q, kv), g) + dk_pt, dv_pt = dkv_pt.unbind(2) + else: + ( + dq_unpad, + dk_unpad, + dv_unpad, + ) = torch.autograd.grad(out, (q_unpad, k_unpad, v_unpad), g) + dk = dk_pad_fn(dk_unpad) + dv = dk_pad_fn(dv_unpad) + ( + dq_ref, + dk_ref, + dv_ref, + ) = torch.autograd.grad(out_ref, (q, k, v), g) + ( + dq_pt, + dk_pt, + dv_pt, + ) = torch.autograd.grad(out_pt, (q, k, v), g) + dq = dq_pad_fn(dq_unpad) + print(f"dQ max diff: {(dq - dq_ref).abs().max().item()}") + print(f"dK max diff: {(dk - dk_ref).abs().max().item()}") + print(f"dV max diff: {(dv - dv_ref).abs().max().item()}") + print(f"dQ mean diff: {(dq - dq_ref).abs().mean().item()}") + print(f"dK mean diff: {(dk - dk_ref).abs().mean().item()}") + print(f"dV mean diff: {(dv - dv_ref).abs().mean().item()}") + print(f"dQ Pytorch max diff: {(dq_pt - dq_ref).abs().max().item()}") + print(f"dK Pytorch max diff: {(dk_pt - dk_ref).abs().max().item()}") + print(f"dV Pytorch max diff: {(dv_pt - dv_ref).abs().max().item()}") + print(f"dQ Pytorch mean diff: {(dq_pt - dq_ref).abs().mean().item()}") + print(f"dK Pytorch mean diff: {(dk_pt - dk_ref).abs().mean().item()}") + print(f"dV Pytorch mean diff: {(dv_pt - dv_ref).abs().mean().item()}") + + # TODO - use 10 times to check, wait for ck to change dq type to f32 + assert (dq - dq_ref).abs().max().item() <= 10 * (dq_pt - dq_ref).abs().max().item() + assert (dk - dk_ref).abs().max().item() <= 10 * (dk_pt - dk_ref).abs().max().item() + assert (dv - dv_ref).abs().max().item() <= 10 * (dv_pt - dv_ref).abs().max().item() diff --git a/test_fused_dense.py b/test_fused_dense.py new file mode 100644 index 0000000000000000000000000000000000000000..084dd5f4b8ee7c0c688f60409c644022e6c00a81 --- /dev/null +++ b/test_fused_dense.py @@ -0,0 +1,172 @@ +import math +from functools import partial + +import pytest +import torch 
+import torch.nn.functional as F +from einops import rearrange +from flash_attn.ops.fused_dense import FusedDense, FusedMLP + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("return_residual", [False, True]) +@pytest.mark.parametrize("has_bias", [True, False]) +@pytest.mark.parametrize("out_features", [1024, 4096]) +@pytest.mark.parametrize("in_features", [1024, 4096]) +def test_fused_linear_bias(in_features, out_features, has_bias, return_residual, dtype): + device = "cuda" + rtol, atol = (3e-3, 1e-2) if dtype == torch.bfloat16 else (3e-3, 1e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x_pt = torch.randn( + batch_size, seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + x = x_pt.detach().clone().requires_grad_() + model_pt = torch.nn.Linear(in_features, out_features, bias=has_bias, device=device, dtype=dtype) + model = FusedDense( + in_features, + out_features, + bias=has_bias, + return_residual=return_residual, + device=device, + dtype=dtype, + ) + with torch.no_grad(): + model.weight.copy_(model_pt.weight) + if has_bias: + model.bias.copy_(model_pt.bias) + out_pt = model_pt(x_pt) + if not return_residual: + out = model(x) + else: + out, x_copy = model(x) + x_copy = ( + x_copy[..., :out_features] + if out_features < in_features + else F.pad(x_copy, (0, out_features - in_features)) + ) + x_pt_copy = ( + x_pt[..., :out_features] + if out_features < in_features + else F.pad(x_pt, (0, out_features - in_features)) + ) + # Just add some random function of the residual + out_pt = out_pt + F.gelu(x_pt_copy) + out = out + F.gelu(x_copy) + + # with torch.no_grad(): + # out_fl = F.linear(x_pt.float(), model.weight.float(), model.bias.float()).half() + assert torch.allclose(out, out_pt, rtol=rtol, atol=atol) + + # If we don't divide by batch_size, the gradient gets a bit too large. 
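+    # (Scaling the upstream gradient keeps the fp16/bf16 weight- and bias-gradient
+    # comparisons below within their widened tolerances.)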
+ g = torch.randn_like(out) / 32 + out_pt.backward(g) + out.backward(g) + assert torch.allclose(x.grad, x_pt.grad, rtol=rtol, atol=atol) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose(model.weight.grad, model_pt.weight.grad, rtol=rtol, atol=atol * 10) + if has_bias: + assert torch.allclose(model.bias.grad, model_pt.bias.grad, rtol=rtol, atol=atol * 5) + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("heuristic", ["auto", -1]) +# @pytest.mark.parametrize('heuristic', ['auto']) +@pytest.mark.parametrize("checkpoint_lvl", [0, 1, 2]) +# @pytest.mark.parametrize('checkpoint_lvl', [1]) +@pytest.mark.parametrize("return_residual", [False, True]) +# @pytest.mark.parametrize('return_residual', [False]) +@pytest.mark.parametrize("has_bias2", [True, False]) +@pytest.mark.parametrize("has_bias1", [True, False]) +# @pytest.mark.parametrize('has_bias2', [True]) +# @pytest.mark.parametrize('has_bias1', [True]) +@pytest.mark.parametrize("activation", ["gelu_approx", "relu"]) +# @pytest.mark.parametrize('activation', ['relu']) +@pytest.mark.parametrize("out_features", [1024, 4096]) +@pytest.mark.parametrize("in_features", [1024, 4096]) +# @pytest.mark.parametrize('out_features', [4096]) +# @pytest.mark.parametrize('in_features', [1024]) +def test_fused_mlp( + in_features, + out_features, + activation, + has_bias1, + has_bias2, + return_residual, + checkpoint_lvl, + heuristic, + dtype, +): + device = "cuda" + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 1e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + x_pt = torch.randn( + batch_size, seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + x = x_pt.detach().clone().requires_grad_() + model_pt_fc1 = torch.nn.Linear( + in_features, out_features, bias=has_bias1, device=device, dtype=dtype + ) + model_pt_fc2 = torch.nn.Linear( + out_features, in_features, bias=has_bias2, device=device, dtype=dtype + ) + model = FusedMLP( + in_features, + out_features, + in_features, + activation=activation, + bias1=has_bias1, + bias2=has_bias2, + return_residual=return_residual, + checkpoint_lvl=checkpoint_lvl, + heuristic=heuristic, + device=device, + dtype=dtype, + ) + with torch.no_grad(): + model.fc1.weight.copy_(model_pt_fc1.weight) + if has_bias1: + model.fc1.bias.copy_(model_pt_fc1.bias) + model.fc2.weight.copy_(model_pt_fc2.weight) + if has_bias2: + model.fc2.bias.copy_(model_pt_fc2.bias) + activation_fn = ( + partial(F.gelu, approximate="tanh") + if activation == "gelu_approx" + else partial(F.relu, inplace=True) + ) + out_pt = model_pt_fc2(activation_fn(model_pt_fc1(x_pt))) + if not return_residual: + out = model(x) + else: + out, x_copy = model(x) + # Just add some random function of the residual + out_pt = out_pt + F.gelu(x_pt) + out = out + F.gelu(x_copy) + assert torch.allclose(out, out_pt, rtol=rtol, atol=atol) + + # If we don't divide by batch_size, the gradient gets a bit too large. 
+ g = torch.randn_like(out) / 32 + out_pt.backward(g) + out.backward(g) + # The error for relu is higher still + if activation == "relu": + atol = 1e-1 if dtype == torch.bfloat16 else 5e-2 + assert torch.allclose(x.grad, x_pt.grad, rtol=rtol, atol=atol) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.fc1.weight.grad, model_pt_fc1.weight.grad, rtol=rtol, atol=atol * 10 + ) + if has_bias1: + assert torch.allclose(model.fc1.bias.grad, model_pt_fc1.bias.grad, rtol=rtol, atol=atol * 5) + assert torch.allclose( + model.fc2.weight.grad, model_pt_fc2.weight.grad, rtol=rtol, atol=atol * 10 + ) + if has_bias2: + assert torch.allclose(model.fc2.bias.grad, model_pt_fc2.bias.grad, rtol=rtol, atol=atol * 5) diff --git a/test_fused_dense_parallel.py b/test_fused_dense_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..df0e70bcae9d41fbd4af60f0a187709ef8032a31 --- /dev/null +++ b/test_fused_dense_parallel.py @@ -0,0 +1,237 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/ops/test_fused_dense_parallel.py + +import math + +import pytest +import torch +import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from flash_attn.ops.fused_dense import ColumnParallelLinear, FusedDense, FusedMLP, ParallelFusedMLP + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_bias", [True, False]) +# @pytest.mark.parametrize('has_bias', [False]) +@pytest.mark.parametrize("out_features", [1024]) +@pytest.mark.parametrize("in_features", [4096]) +def test_fused_linear_bias( + in_features, out_features, has_bias, sequence_parallel, world_size, dtype +): + assert out_features % world_size == 0 + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 512 + assert batch_size * seqlen % world_size == 0 + x_pt = torch.randn( + batch_size * seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt = torch.nn.Linear(in_features, out_features, bias=has_bias, device=device, dtype=dtype) + partition_out_features = out_features // world_size + model = ColumnParallelLinear( + in_features, + out_features, + parallel_state.get_tensor_model_parallel_group(), + bias=has_bias, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + with torch.no_grad(): + model.weight.copy_( + model_pt.weight[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + if has_bias: + model.bias.copy_( + 
model_pt.bias[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + + out = model(x) + out_pt = model_pt(x_pt) + assert torch.allclose( + out, + out_pt[:, rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol, + ) + + # If we don't divide by batch_size, the gradient gets a bit too large. + g = torch.randn_like(out_pt) / 32 + out_pt.backward(g) + out.backward(g[:, rank * partition_out_features : (rank + 1) * partition_out_features]) + parallel_state.destroy_model_parallel() + + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol, + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.weight.grad, + model_pt.weight.grad[rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol * 10, + ) + if has_bias: + assert torch.allclose( + model.bias.grad, + model_pt.bias.grad[rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol * 5, + ) + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_bias2", [True, False]) +# @pytest.mark.parametrize('has_bias2', [True]) +@pytest.mark.parametrize("out_features", [4096]) +@pytest.mark.parametrize("in_features", [1024]) +def test_fused_mlp(in_features, out_features, has_bias2, sequence_parallel, world_size, dtype): + assert out_features % world_size == 0 + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 512 + assert batch_size * seqlen % world_size == 0 + x_pt = torch.randn( + batch_size * seqlen, in_features, device=device, dtype=dtype, requires_grad=True + ) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. 
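+    # (g matches x_pt's shape because ParallelFusedMLP projects back to in_features; the
+    # full-size gradient is sliced per rank below when sequence_parallel is enabled.)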
+ g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt_fc1 = torch.nn.Linear(in_features, out_features, device=device, dtype=dtype) + model_pt_fc2 = torch.nn.Linear( + out_features, in_features, bias=has_bias2, device=device, dtype=dtype + ) + partition_out_features = out_features // world_size + partition_in_features = in_features // world_size + model = ParallelFusedMLP( + in_features, + out_features, + in_features, + process_group=parallel_state.get_tensor_model_parallel_group(), + bias2=has_bias2 and rank == 0, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + + with torch.no_grad(): + model.fc1.weight.copy_( + model_pt_fc1.weight[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + model.fc1.bias.copy_( + model_pt_fc1.bias[rank * partition_out_features : (rank + 1) * partition_out_features] + ) + model.fc2.weight.copy_( + model_pt_fc2.weight[ + :, rank * partition_out_features : (rank + 1) * partition_out_features + ] + ) + if has_bias2 and rank == 0: + model.fc2.bias.copy_(model_pt_fc2.bias) + + out = model(x) + out_pt = model_pt_fc2(F.gelu(model_pt_fc1(x_pt), approximate="tanh")) + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol, + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.fc1.weight.grad, + model_pt_fc1.weight.grad[ + rank * partition_out_features : (rank + 1) * partition_out_features + ], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.fc1.bias.grad, + model_pt_fc1.bias.grad[rank * partition_out_features : (rank + 1) * partition_out_features], + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.fc2.weight.grad, + model_pt_fc2.weight.grad[ + :, rank * partition_out_features : (rank + 1) * partition_out_features + ], + rtol=rtol, + atol=atol * 10, + ) + if has_bias2 and rank == 0: + assert torch.allclose(model.fc2.bias.grad, model_pt_fc2.bias.grad, rtol=rtol, atol=atol * 5) diff --git a/test_gpt.py b/test_gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..9822030051b19d7e2e688f31880a705f6046a198 --- /dev/null +++ b/test_gpt.py @@ -0,0 +1,478 @@ +import re + +import pytest +import torch +from einops import rearrange +from flash_attn.models.gpt import ( + GPTLMHeadModel, + remap_state_dict_hf_gpt2, + shard_state_dict_tp, + combine_state_dicts_tp, +) +from flash_attn.utils.generation import InferenceParams +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import GPT2Config, GPT2Tokenizer +from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel as GPT2LMHeadModelHF + + +@pytest.mark.parametrize("model_name", ["gpt2", "gpt2-medium"]) +# @pytest.mark.parametrize('model_name', ["gpt2"]) +def test_gpt2_state_dict(model_name): + config = GPT2Config.from_pretrained(model_name) + 
pretrained_state_dict = remap_state_dict_hf_gpt2(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config) + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["gpt2", "gpt2-medium"]) +# @pytest.mark.parametrize('model_name', ["gpt2"]) +def test_gpt2_non_optimized(model_name): + """Check that our implementation of GPT2 (without any optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + config = GPT2Config.from_pretrained(model_name) + + model = GPTLMHeadModel.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).cuda() + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name).cuda().to(dtype=dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.transformer(input_ids) + out_hf = model_hf.transformer(input_ids).last_hidden_state + out_ref = model_ref.transformer(input_ids).last_hidden_state + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + logits = model(input_ids).logits + logits_hf = model_hf(input_ids).logits + logits_ref = model_ref(input_ids).logits + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["gpt2", "gpt2-medium"]) +# @pytest.mark.parametrize('model_name', ["gpt2"]) +def test_gpt2_optimized(model_name): + """Check that our implementation of GPT2 (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + config = GPT2Config.from_pretrained(model_name) + vocab_size_og = config.vocab_size + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 + + model = GPTLMHeadModel.from_pretrained(model_name, config) + model = model.cuda().to(dtype=dtype) + + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).cuda() + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name).cuda().to(dtype=dtype) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 4 + max_seqlen = 512 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + input_ids = torch.randint( + 0, vocab_size_og, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + out = model.transformer(input_ids) + out_hf = model_hf.transformer(input_ids).last_hidden_state + out_ref = model_ref.transformer(input_ids).last_hidden_state + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + logits = model(input_ids).logits[..., :vocab_size_og] + logits_hf = model_hf(input_ids).logits + logits_ref = model_ref(input_ids).logits + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize('optimized', [True]) +@pytest.mark.parametrize("rotary", [False, True]) +# @pytest.mark.parametrize('rotary', [False]) +@pytest.mark.parametrize("model_name", ["gpt2"]) +def test_gpt2_generation(model_name, rotary, optimized): + """Check that our implementation of GPT2 generation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. + """ + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + if rotary: + config.n_positions = 0 + config.rotary_emb_fraction = 0.5 + config.rotary_emb_base = 24000 + config.residual_in_fp32 = True + if optimized: + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + # if not rotary, we load the weight from HF but ignore the position embeddings. + # The model would be nonsense but it doesn't matter for the test. 
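+    # (Concretely: strict=not rotary below means that with rotary enabled the HF
+    # position-embedding weights are simply dropped; without rotary everything loads strictly.)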
+ model = GPTLMHeadModel.from_pretrained( + model_name, config, strict=not rotary, device=device, dtype=dtype + ) + model.eval() + + if not rotary: + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).to(device=device) + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name, torch_dtype=dtype).to( + device=device + ) + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 25 + # input_ids = torch.randint(0, 100, (2, 10), dtype=torch.long, device='cuda') + # max_length = input_ids.shape[1] + 40 + + # Slow generation for reference + sequences = [] + scores = [] + cur_input_ids = input_ids + with torch.inference_mode(): + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + for _ in range(input_ids.shape[1] + 1, max_length): + cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1) + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1) + scores = tuple(scores) + + out = model.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out.sequences) + print(tokenizer.batch_decode(out.sequences.tolist())) + if getattr(config, "use_flash_attn", False): + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out_cg.sequences) + assert torch.equal(torch.stack(out.scores, dim=1), torch.stack(out_cg.scores, dim=1)) + + if not rotary: + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + out_ref = model_ref.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + + print( + f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print( + f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print(tokenizer.batch_decode(out_ref.sequences.tolist())) + + assert torch.all(out.sequences == sequences) + assert torch.allclose( + torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol + ) + if not rotary: + assert torch.all(out.sequences == out_ref.sequences) + assert torch.all(out.sequences == out_hf.sequences) + + assert ( + torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() < 3 * ( + torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() + + +def get_logits(model, input_ids, max_length, teacher_outputs=None, **kwargs): + out = model.generate( + input_ids=input_ids, + max_length=max_length, + teacher_outputs=teacher_outputs, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + **kwargs, + ) + return torch.stack(out.scores, dim=1) + + +@pytest.mark.parametrize("seqlen,maxlen", [(10, 20), (30, 150), (3000, 3400), (14000, 
15000)]) +# @pytest.mark.parametrize('seqlen,maxlen', [(10, 20)]) +@pytest.mark.parametrize("rotary", [None, "interleaved", "contiguous"]) +# @pytest.mark.parametrize('rotary', [None]) +@pytest.mark.parametrize("model_name", ["gpt2"]) +def test_gpt2_generation_cg(model_name, rotary, seqlen, maxlen): + """Check that decoding with CUDA graph is the same as decoding without CUDA graph.""" + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + config.n_positions = 16 * 1024 + assert seqlen <= maxlen <= config.n_positions + if rotary is not None: + config.n_positions = 0 + config.rotary_emb_dim = 32 + config.rotary_emb_interleaved = rotary == "interleaved" + config.residual_in_fp32 = True + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 1 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + teacher_outputs = torch.randint( + 0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device + ) + + logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs) + logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True) + assert torch.equal(logits, logits_cg) + + # Try increasing batch size and seqlen, then decrease them to see if it's still correct + batch_size = 3 + maxlen += 30 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + teacher_outputs = torch.randint( + 0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device + ) + logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs) + logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True) + assert torch.equal(logits, logits_cg) + + batch_size = 2 + maxlen -= 35 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + teacher_outputs = torch.randint( + 0, config.vocab_size, (batch_size, maxlen), dtype=torch.long, device=device + ) + logits = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs) + logits_cg = get_logits(model, input_ids, maxlen, teacher_outputs=teacher_outputs, cg=True) + assert torch.equal(logits, logits_cg) + + +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize("optimized", [False]) +@pytest.mark.parametrize("model_name", ["gpt2"]) +def test_gpt2_multiple_token_generation(model_name, optimized): + """Generation when we pass in multiple tokens at a time, not just one.""" + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + config.residual_in_fp32 = True + if optimized: + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + input_ids = torch.randint(0, config.vocab_size, (1, 20), dtype=torch.long, device=device) + # Reference logits + logits_ref = model(input_ids).logits + + # Run 10 tokens, then pass in another 4, then another 6, to see if we get the same logits + inference_params = InferenceParams(max_seqlen=20, max_batch_size=1) + logits_10 = 
model(input_ids[:, :10], inference_params=inference_params).logits + inference_params.seqlen_offset += 10 + position_ids = torch.arange(10, 14, dtype=torch.long, device=device) + logits_1014 = model( + input_ids[:, 10:14], position_ids=position_ids, inference_params=inference_params + ).logits + inference_params.seqlen_offset += 4 + position_ids = torch.arange(14, 20, dtype=torch.long, device=device) + logits_1420 = model( + input_ids[:, 14:20], position_ids=position_ids, inference_params=inference_params + ).logits + logits = torch.cat([logits_10, logits_1014, logits_1420], dim=1) + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + assert torch.allclose(logits, logits_ref, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("cg", [False, True]) +# @pytest.mark.parametrize("cg", [True]) +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize("optimized", [True]) +# @pytest.mark.parametrize("model_name", ["gpt2-medium"]) +@pytest.mark.parametrize("model_name", ["gpt2-xl"]) +def test_gpt2_speculative_decoding(model_name, optimized, cg): + if cg and not optimized: + pytest.skip() # CG requires use_flash_attn + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = GPT2Config.from_pretrained(model_name) + config.residual_in_fp32 = True + if optimized: + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config_draft = GPT2Config.from_pretrained("gpt2") + config_draft.residual_in_fp32 = True + if optimized: + config_draft.use_flash_attn = True + config_draft.fused_bias_fc = True + config_draft.fused_mlp = True + config_draft.fused_dropout_add_ln = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + model_draft = GPTLMHeadModel.from_pretrained("gpt2", config_draft, device=device, dtype=dtype) + model_draft.eval() + + torch.manual_seed(0) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 100 + + from flash_attn.utils.generation import decode_speculative + + torch.manual_seed(42) + print(f"Speculative decoding, {optimized = }") + out = decode_speculative( + input_ids, + model, + model_draft, + max_length=max_length, + top_k=5, + cg=cg, + speculative_lookahead=4, + enable_timing=True, + # debug=True, + ) + print(tokenizer.batch_decode(out.sequences)) + print(f"Without speculative decoding, {cg = }") + out_og = model.generate( + input_ids, + max_length=max_length, + top_k=5, + cg=cg, + enable_timing=True, + return_dict_in_generate=True, + ) + print(tokenizer.batch_decode(out_og.sequences)) + + +@pytest.mark.parametrize( + "n_heads_q_kv", + [ + (8, 8), # Regular attention + (8, 4), # GQA + (8, 2), # MQA + ], +) +def test_gpt2_shard_unshard(n_heads_q_kv): + world_size = 2 + + config = GPT2Config.from_pretrained("gpt2") + config.vocab_size = 1024 + config.n_head, config.n_head_kv = n_heads_q_kv + model = GPTLMHeadModel(config, device="cuda", dtype=torch.float16) + state_dict = model.state_dict() + shards = [ + # NOTE: Shallow copy as `state_dict` is modified in-place + shard_state_dict_tp(dict(state_dict), config, world_size, rank) + for rank in range(world_size) + ] + state_dict2 = combine_state_dicts_tp(shards, config) + assert state_dict2.keys() == state_dict.keys() + for k in state_dict.keys(): + ref 
= state_dict[k]
+        new = state_dict2[k]
+        assert torch.allclose(ref, new, atol=0.0, rtol=0.0)
diff --git a/test_gpt_generation_parallel.py b/test_gpt_generation_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcf2bf513b39f75911344fd331977afa20d61bc1
--- /dev/null
+++ b/test_gpt_generation_parallel.py
@@ -0,0 +1,172 @@
+# Run test with:
+# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/models/test_gpt_generation_parallel.py -k "parallel"
+import os
+import re
+
+import pytest
+import torch
+from einops import rearrange
+from flash_attn.models.gpt import GPTLMHeadModel, remap_state_dict_hf_gpt2
+from flash_attn.utils.distributed import all_gather_raw
+from flash_attn.utils.pretrained import state_dict_from_pretrained
+from transformers import GPT2Config, GPT2Tokenizer
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel as GPT2LMHeadModelHF
+
+
+# @pytest.mark.parametrize('world_size', [1, 2, 4, 8])
+@pytest.mark.parametrize("world_size", [2])
+@pytest.mark.parametrize("rotary", [False, True])
+# @pytest.mark.parametrize("rotary", [False])
+@pytest.mark.parametrize("model_name", ["gpt2"])
+def test_tensor_parallel(model_name, rotary, world_size):
+    """Check that our implementation of GPT2 generation matches the HF implementation:
+    the scores in fp16 should be around the same as the HF scores in fp16, when compared to
+    the HF scores in fp32.
+    """
+    dtype = torch.float16
+    rtol, atol = 3e-3, 3e-1
+    config = GPT2Config.from_pretrained(model_name)
+    if rotary:
+        config.n_positions = 0
+        config.rotary_emb_dim = 64
+    config.residual_in_fp32 = True
+    config.use_flash_attn = True
+    config.fused_bias_fc = True
+    config.fused_mlp = True
+    config.fused_dropout_add_ln = True
+    config.pad_vocab_size_multiple = 8 * world_size
+    config.sequence_parallel = False  # Need to set this to False for generation
+
+    os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0"
+    if not torch.distributed.is_initialized():
+        torch.distributed.init_process_group(backend="nccl", init_method="env://")
+    device = f"cuda:{torch.distributed.get_rank()}"
+    assert world_size <= torch.distributed.get_world_size()
+    # Need this, otherwise when we capture the graph the process for GPU 1 would run on both
+    # GPU 0 and GPU 1 and things would hang.
+    torch.cuda.set_device(device)
+
+    from apex.transformer import parallel_state
+
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size)
+    rank = parallel_state.get_tensor_model_parallel_rank()
+    process_group = parallel_state.get_tensor_model_parallel_group()
+
+    # If rotary, we still load the HF weights but ignore the position embeddings.
+    # The model would be nonsense but it doesn't matter for the test.
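+    # (world_size/rank below tell from_pretrained to load only this rank's shard of the
+    # pretrained weights for the tensor-parallel model.)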
+ model = GPTLMHeadModel.from_pretrained( + model_name, + config, + strict=not rotary, + device=device, + dtype=dtype, + process_group=process_group, + world_size=world_size, + rank=rank, + ) + model.eval() + + if not rotary: + model_ref = GPT2LMHeadModelHF.from_pretrained(model_name).to(device=device) + model_hf = GPT2LMHeadModelHF.from_pretrained(model_name).to(device=device, dtype=dtype) + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + input_ids = tokenizer("Hello, my dog is cute and ", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 30 + # input_ids = torch.randint(0, 100, (1, 10), dtype=torch.long, device='cuda') + # max_length = input_ids.shape[1] + 40 + + # Slow generation for reference + sequences = [] + scores = [] + cur_input_ids = input_ids + with torch.inference_mode(): + logits, _ = all_gather_raw(model(cur_input_ids).logits[:, -1], process_group) + logits = rearrange(logits, "(n b) d -> b (n d)", b=input_ids.shape[0])[ + ..., : config.vocab_size + ] + scores.append(logits) + sequences.append(scores[-1].argmax(dim=-1)) + for _ in range(input_ids.shape[1] + 1, max_length): + cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1) + logits, _ = all_gather_raw(model(cur_input_ids).logits[:, -1], process_group) + logits = rearrange(logits, "(n b) d -> b (n d)", b=input_ids.shape[0])[ + ..., : config.vocab_size + ] + scores.append(logits) + sequences.append(scores[-1].argmax(dim=-1)) + sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1) + scores = tuple(scores) + print(sequences) + + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out.sequences) + if getattr(config, "use_flash_attn", False): + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + print(out_cg.sequences) + + parallel_state.destroy_model_parallel() + + if not rotary: + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + out_ref = model_ref.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + + print( + f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print( + f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + + assert torch.all(out.sequences == sequences) + assert torch.allclose( + torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol + ) + assert torch.equal(torch.stack(out.scores, dim=1), torch.stack(out_cg.scores, dim=1)) + if not rotary: + assert torch.all(out.sequences == out_ref.sequences) + assert torch.all(out.sequences == out_hf.sequences) + + assert ( + torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() < 3 * ( + torch.stack(out_hf.scores, 1) - 
torch.stack(out_ref.scores, 1) + ).abs().max().item() diff --git a/test_gpt_neox.py b/test_gpt_neox.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae8aa9a24da90070aabd41b85d6bbfc6ee1aa9c --- /dev/null +++ b/test_gpt_neox.py @@ -0,0 +1,104 @@ +# Copyright (c) 2023, Tri Dao. + +import time + +import pytest +import torch +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.gpt_neox import gpt_neox_config_to_gpt2_config, remap_state_dict_hf_gpt_neox +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoTokenizer, GPTNeoXConfig +from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-neox-20b"]) +def test_gptj_state_dict(model_name): + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_gpt_neox( + state_dict_from_pretrained(model_name), config + ) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + "model_name", + [ + "EleutherAI/pythia-1b", + "EleutherAI/pythia-2.8b", + "EleutherAI/gpt-neox-20b", + "togethercomputer/RedPajama-INCITE-7B-Base", + ], +) +def test_gpt_neox_optimized(model_name): + """Check that our implementation of GPT-NeoX (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(model_name)) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = config.activation_function in [ + "gelu_fast", + "gelu_new", + "gelu_approx", + "gelu_pytorch_tanh", + ] + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Need at least 2 GPUs, otherwise we'll OOM for the 20B model + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTNeoXForCausalLM.from_pretrained(model_name, device_map="auto") + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.gpt_neox(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = GPTNeoXForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.gpt_neox(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean 
diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + assert (out - out_ref).abs().mean().item() < 2 * (out_hf - out_ref).abs().mean().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + assert (logits - logits_ref).abs().mean().item() < 2 * ( + logits_hf - logits_ref + ).abs().mean().item() diff --git a/test_gpt_parallel.py b/test_gpt_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..1674d2a60fca9a16dcf8f0e919c7adae46e7475a --- /dev/null +++ b/test_gpt_parallel.py @@ -0,0 +1,236 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/models/test_gpt_parallel.py + +import math + +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +from apex.transformer import parallel_state +from einops import rearrange +from flash_attn.losses.cross_entropy import CrossEntropyLoss +from flash_attn.models.gpt import GPTLMHeadModel, shard_state_dict_tp +from flash_attn.utils.distributed import allreduce_sequence_parallel_grad +from transformers import GPT2Config + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.bfloat16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("has_pos_emb", [True, False]) +# @pytest.mark.parametrize('has_pos_emb', [True]) +@pytest.mark.parametrize("dim", [1024]) +def test_gpt_parallel(dim, has_pos_emb, sequence_parallel, world_size, dtype): + head_dim = 64 + assert dim % head_dim == 0 + num_heads = dim // head_dim + assert num_heads % world_size == 0 + vocab_size = 50264 + assert vocab_size % world_size == 0 + num_layers = 2 + rtol, atol = (3e-3, 1e-1) if dtype == torch.bfloat16 else (3e-3, 1e-2) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + input_ids = torch.randint(0, vocab_size, (batch_size, seqlen + 1), device=device) + + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. 
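+    # (g has one entry per token: the losses below use reduction="none", so loss.backward(g)
+    # weights each of the batch_size * seqlen per-token losses.)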
+ g = torch.randn(batch_size * seqlen, device=device) + + config = GPT2Config( + n_embd=dim, + n_head=num_heads, + n_layer=num_layers, + n_positions=seqlen if has_pos_emb else 0, + vocab_size=50257, + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + scale_attn_by_inverse_layer_idx=True, + use_flash_attn=True, + fused_mlp=True, + fused_bias_fc=True, + fused_dropout_add_ln=True, + residual_in_fp32=True, + rotary_emb_fraction=0.0 if has_pos_emb else 0.5, + pad_vocab_size_multiple=8 * world_size, + sequence_parallel=sequence_parallel, + ) + config.vocab_size = math.ceil(config.vocab_size / (8 * world_size)) * (8 * world_size) + model_pt = GPTLMHeadModel(config, device=device) + + def init_layer_norm(module): + if isinstance(module, nn.LayerNorm): + nn.init.normal_(module.weight) + nn.init.normal_(module.bias) + + model_pt.apply(init_layer_norm) + + model = GPTLMHeadModel(config, process_group=process_group, device=device) + total_nparams = sum(p.numel() for p in model_pt.parameters()) + sharded_nparams = sum(p.numel() for p in model.parameters()) + sharded_nparams_all = torch.empty(world_size, dtype=torch.long, device=device) + torch.distributed.all_gather_into_tensor( + sharded_nparams_all, torch.tensor([sharded_nparams], device=device), group=process_group + ) + shared_nparams = sum( + p.numel() for p in model.parameters() if getattr(p, "_shared_params", False) + ) + shared_nparams_all = torch.empty(world_size, dtype=torch.long, device=device) + torch.distributed.all_gather_into_tensor( + shared_nparams_all, torch.tensor([shared_nparams], device=device), group=process_group + ) + assert torch.all(shared_nparams_all == shared_nparams) + assert total_nparams == ( + (sharded_nparams_all - shared_nparams_all).sum().item() + shared_nparams + ) + + # vocab_size has been rounded up here + partition_vocab_size = config.vocab_size // world_size + partition_dim = dim // world_size + partition_hidden_dim = 4 * dim // world_size + with torch.no_grad(): + model.load_state_dict(shard_state_dict_tp(model_pt.state_dict(), config, world_size, rank)) + model.tie_weights() + + with torch.autocast(device_type="cuda", dtype=dtype): + out = model(input_ids[:, :-1]).logits + if not sequence_parallel: + out = rearrange(out, "b s d -> (b s) d") + out_pt = rearrange(model_pt(input_ids[:, :-1]).logits, "b s d -> (b s) d") + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[:, rank * partition_vocab_size : (rank + 1) * partition_vocab_size], + rtol=rtol, + atol=atol, + ) + loss_fn = CrossEntropyLoss(inplace_backward=True, reduction="none", process_group=process_group) + loss_fn_pt = CrossEntropyLoss(inplace_backward=True, reduction="none") + loss = loss_fn(out, input_ids[:, 1:].flatten()) + loss_pt = loss_fn_pt(out_pt, input_ids[:, 1:].flatten()) + assert torch.allclose(loss, loss_pt, rtol=rtol, atol=atol) + + loss_pt.backward(g) + loss.backward(g) + allreduce_sequence_parallel_grad(model, process_group) + parallel_state.destroy_model_parallel() + + grad_dict = shard_state_dict_tp( + {k: v.grad for k, v in model_pt.named_parameters()}, config, world_size, rank + ) + + assert torch.allclose( + model.transformer.embeddings.word_embeddings.weight.grad, + grad_dict["transformer.embeddings.word_embeddings.weight"], + rtol=rtol, + atol=atol * 5, + ) + if has_pos_emb: + assert torch.allclose( + model.transformer.embeddings.position_embeddings.weight.grad, + grad_dict["transformer.embeddings.position_embeddings.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( 
+ model.transformer.ln_f.weight.grad, + grad_dict["transformer.ln_f.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.ln_f.bias.grad, grad_dict["transformer.ln_f.bias"], rtol=rtol, atol=atol + ) + for i in range(num_layers): + assert torch.allclose( + model.transformer.layers[i].mixer.Wqkv.weight.grad, + grad_dict[f"transformer.layers.{i}.mixer.Wqkv.weight"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mixer.Wqkv.bias.grad, + grad_dict[f"transformer.layers.{i}.mixer.Wqkv.bias"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mixer.out_proj.weight.grad, + grad_dict[f"transformer.layers.{i}.mixer.out_proj.weight"], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.transformer.layers[i].mixer.out_proj.bias.grad, + grad_dict[f"transformer.layers.{i}.mixer.out_proj.bias"], + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.transformer.layers[i].mlp.fc1.weight.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc1.weight"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mlp.fc1.bias.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc1.bias"], + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.transformer.layers[i].mlp.fc2.weight.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc2.weight"], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.transformer.layers[i].mlp.fc2.bias.grad, + grad_dict[f"transformer.layers.{i}.mlp.fc2.bias"], + rtol=rtol, + atol=atol * 5, + ) + + assert torch.allclose( + model.transformer.layers[i].norm1.weight.grad, + grad_dict[f"transformer.layers.{i}.norm1.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.layers[i].norm1.bias.grad, + grad_dict[f"transformer.layers.{i}.norm1.bias"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.layers[i].norm2.weight.grad, + grad_dict[f"transformer.layers.{i}.norm2.weight"], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.transformer.layers[i].norm2.bias.grad, + grad_dict[f"transformer.layers.{i}.norm2.bias"], + rtol=rtol, + atol=atol, + ) diff --git a/test_gptj.py b/test_gptj.py new file mode 100644 index 0000000000000000000000000000000000000000..496d8322523ea37b35c8ade0a5ff855968daca64 --- /dev/null +++ b/test_gptj.py @@ -0,0 +1,184 @@ +# Copyright (c) 2023, Tri Dao. 
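+
+# The tests below exercise the typical loading path for GPT-J in this repo: convert the HF config
+# with gptj_config_to_gpt2_config, let GPTLMHeadModel.from_pretrained remap the HF checkpoint
+# (remap_state_dict_hf_gptj), and turn on the fused kernels via config flags. A minimal sketch of
+# that pattern, kept as a comment so it does not run at import time; the model name, device, and
+# dtype here are only examples:
+#
+#     import torch
+#     from transformers import GPTJConfig
+#     from flash_attn.models.gpt import GPTLMHeadModel
+#     from flash_attn.models.gptj import gptj_config_to_gpt2_config
+#
+#     config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B"))
+#     config.use_flash_attn = True       # FlashAttention-2 supports headdim 256
+#     config.fused_bias_fc = True
+#     config.fused_mlp = True
+#     config.fused_dropout_add_ln = True
+#     config.residual_in_fp32 = True
+#     model = GPTLMHeadModel.from_pretrained(
+#         "EleutherAI/gpt-j-6B", config, device="cuda", dtype=torch.float16
+#     )
+#     model.eval()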
+ +import time + +import pytest +import torch +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.gptj import gptj_config_to_gpt2_config, remap_state_dict_hf_gptj +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoTokenizer, GPTJConfig +from transformers.models.gptj.modeling_gptj import GPTJForCausalLM + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-j-6B"]) +def test_gptj_state_dict(model_name): + config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_gptj(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-j-6B", "togethercomputer/GPT-JT-6B-v1"]) +def test_gptj_optimized(model_name): + """Check that our implementation of GPT-J (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = GPTJForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.transformer(input_ids).last_hidden_state + logits_ref = model_ref(input_ids).logits + del model_ref + + model_hf = GPTJForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + out_hf = model_hf.transformer(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize("model_name", ["EleutherAI/gpt-j-6B"]) +def test_gptj_generation(model_name): + 
"""Check that our implementation of GPT-J (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + dtype = torch.float16 + device = "cuda" + config = gptj_config_to_gpt2_config(GPTJConfig.from_pretrained(model_name)) + config.use_flash_attn = True # FlashAttention-2 supports headdim 256 + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = True + + tokenizer = AutoTokenizer.from_pretrained(model_name) + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = GPTJForCausalLM.from_pretrained( + model_name, torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = GPTJForCausalLM.from_pretrained(model_name, device_map={"": device}) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item() }") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item() }") + assert torch.equal(logits_cg, logits) diff --git a/test_layer_norm.py b/test_layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..3d92b6b32961e17acfabc69fe44c2ae84ef1b8b6 --- /dev/null +++ b/test_layer_norm.py @@ -0,0 
+1,368 @@ +# Copyright (c) 2024, Tri Dao. + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat + +from flash_attn.ops.triton.layer_norm import ( + layer_norm_fn, + layer_norm_ref, + rms_norm_ref, + layer_norm_linear_fn, +) + + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("has_weight1", [False, True]) +# @pytest.mark.parametrize("has_weight1", [True]) +@pytest.mark.parametrize("has_x1", [False, True]) +# @pytest.mark.parametrize("has_x1", [False]) +@pytest.mark.parametrize("has_rowscale", [False, True]) +# @pytest.mark.parametrize("has_rowscale", [False]) +@pytest.mark.parametrize("dropout_p", [0.0, 0.27]) +# @pytest.mark.parametrize("dropout_p", [0.0]) +@pytest.mark.parametrize("prenorm", [True, False]) +# @pytest.mark.parametrize("prenorm", [False]) +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize("is_rms_norm", [True]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize("has_residual", [False]) +@pytest.mark.parametrize( + "weight_dtype", [torch.float32, torch.float16] + ([torch.bfloat16] if is_sm8x else []) +) +# @pytest.mark.parametrize("weight_dtype", [torch.float32]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.float16, torch.float16)]) +@pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000, 4096]) +# @pytest.mark.parametrize("hidden_size", [256]) +def test_layer_norm( + hidden_size, + input_dtype, + residual_dtype, + weight_dtype, + has_residual, + is_rms_norm, + prenorm, + dropout_p, + has_rowscale, + has_x1, + has_weight1, +): + if has_rowscale and has_x1: + pytest.skip("Not supported") + device = "cuda" + if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 5e-2 + elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 1e-2 + else: + atol = 1e-4 + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen = 512 + layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref + allclose = ( + # Sometimes x0_pt.grad is NaN + lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max() + <= 2 * (x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() + atol + or ( + # Sometimes x_pt and x_ref are the same (e.g. 
bfloat16) so we want to perturb is a bit + # by multiply and divide by 0.3 + (x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() == 0.0 + and (x - x_ref).abs().max() + <= 2 * (x_pt[~x_pt.isnan()] * 0.3 / 0.3 - x_ref[~x_pt.isnan()]).abs().max() + atol + ) + ) + x0 = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0_pt = x0.detach().clone().requires_grad_() + x0_ref = x0.detach().clone().requires_grad_() + if has_residual: + res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res_pt = res.detach().clone().requires_grad_() + res_ref = res.detach().clone().requires_grad_() + else: + res, res_pt, res_ref = None, None, None + weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm: + bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + else: + bias = None + weight_pt = weight.detach().clone().requires_grad_() + weight_ref = weight.detach().clone().requires_grad_() + bias_pt = bias.detach().clone().requires_grad_() if bias is not None else None + bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None + if has_x1: + x1 = torch.randn_like(x0, dtype=input_dtype, requires_grad=True) + x1_pt = x1.detach().clone().requires_grad_() + x1_ref = x1.detach().clone().requires_grad_() + else: + x1, x1_pt, x1_ref = None, None, None + if has_weight1: + weight1 = torch.randn( + hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + weight1_pt = weight1.detach().clone().requires_grad_() + weight1_ref = weight1.detach().clone().requires_grad_() + if not is_rms_norm: + bias1 = torch.randn( + hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + else: + bias1 = None + bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None + bias1_ref = bias1.detach().clone().requires_grad_() if bias1 is not None else None + else: + weight1, weight1_pt, weight1_ref = None, None, None + bias1, bias1_pt, bias1_ref = None, None, None + + rowscale = ( + torch.randn(batch_size, seqlen, dtype=input_dtype, device=device) + if has_rowscale + else None + ) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + out, *rest = layer_norm_fn( + x0, + weight, + bias, + residual=res, + x1=x1, + weight1=weight1, + bias1=bias1, + eps=1e-6, + dropout_p=dropout_p, + rowscale=rowscale, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=is_rms_norm, + return_dropout_mask=True, + ) + dropout_mask = rest[-2] if dropout_p > 0.0 else None + dropout_mask1 = rest[-1] if dropout_p > 0.0 and x1 is not None else None + out_pt = layer_norm_ref_fn( + x0_pt, + weight_pt, + bias_pt, + residual=res_pt, + x1=x1_pt, + weight1=weight1_pt, + bias1=bias1_pt, + eps=1e-6, + dropout_p=dropout_p, + rowscale=rowscale, + prenorm=prenorm, + dropout_mask=dropout_mask, + dropout_mask1=dropout_mask1, + ) + out_ref = layer_norm_ref_fn( + x0_ref, + weight_ref, + bias_ref, + residual=res_ref, + x1=x1_ref, + weight1=weight1_ref, + bias1=bias1_ref, + eps=1e-6, + dropout_p=dropout_p, + rowscale=rowscale, + prenorm=prenorm, + dropout_mask=dropout_mask, + dropout_mask1=dropout_mask1, + upcast=True, + ) + if not has_weight1: + if prenorm: + residual = rest[0] + out_pt, residual_pt = out_pt + out_ref, residual_ref = out_ref + out1, out1_pt, out1_ref = None, None, None + else: + out1 = rest.pop(0) + if prenorm: + residual = rest[0] + out_pt, out1_pt, residual_pt = out_pt + out_ref, out1_ref, 
residual_ref = out_ref + else: + out_pt, out1_pt = out_pt + out_ref, out1_ref = out_ref + assert out.dtype == input_dtype + if prenorm: + assert residual.dtype == residual_dtype + assert allclose(residual, residual_pt, residual_ref) + assert allclose(out, out_pt, out_ref) + if out1 is not None: + assert out1.dtype == input_dtype + assert allclose(out1, out1_pt, out1_ref) + if dropout_mask is not None: + dropout_fraction = 1.0 - dropout_mask.float().mean() + assert abs(dropout_fraction - dropout_p) < 0.01 + if dropout_mask1 is not None: + dropout_fraction = 1.0 - dropout_mask1.float().mean() + assert abs(dropout_fraction - dropout_p) < 0.01 + assert not torch.equal(dropout_mask, dropout_mask1) + + g = torch.randn_like(out) / batch_size + if has_weight1: + out = out * F.gelu(out1) + out_pt = out_pt * F.gelu(out1_pt) + out_ref = out_ref * F.gelu(out1_ref) + if not prenorm: + out.backward(g) + out_pt.backward(g) + out_ref.backward(g) + else: + (out * F.sigmoid(residual)).backward(g) + (out_pt * F.sigmoid(residual_pt)).backward(g) + (out_ref * F.sigmoid(residual_ref.to(dtype=residual_dtype))).backward(g) + assert allclose(x0.grad, x0_pt.grad, x0_ref.grad) + if has_residual: + assert allclose(res.grad, res_pt.grad, res_ref.grad) + if has_x1: + assert allclose(x1.grad, x1_pt.grad, x1_ref.grad) + assert allclose(weight.grad, weight_pt.grad, weight_ref.grad) + if bias is not None: + assert allclose(bias.grad, bias_pt.grad, bias_ref.grad) + if has_weight1: + assert allclose(weight1.grad, weight1_pt.grad, weight1_ref.grad) + if bias1 is not None: + assert allclose(bias1.grad, bias1_pt.grad, bias1_ref.grad) + + +@pytest.mark.parametrize("prenorm", [True, False]) +# @pytest.mark.parametrize("prenorm", [True]) +@pytest.mark.parametrize("is_rms_norm", [False, True]) +# @pytest.mark.parametrize("is_rms_norm", [True]) +@pytest.mark.parametrize("has_residual", [True, False]) +# @pytest.mark.parametrize("has_residual", [False]) +@pytest.mark.parametrize("weight_dtype", [torch.float32]) +@pytest.mark.parametrize( + "input_dtype,residual_dtype", + [(torch.float16, torch.float16), (torch.float16, torch.float32)] + + ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []), +) +# @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.bfloat16, torch.float32)]) +@pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000]) +# @pytest.mark.parametrize("hidden_size", [256]) +def test_layer_norm_linear( + hidden_size, input_dtype, residual_dtype, weight_dtype, has_residual, is_rms_norm, prenorm +): + device = "cuda" + if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 5e-2 + elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]): + atol = 1e-2 + else: + atol = 1e-4 + # set seed + torch.random.manual_seed(0) + batch_size = 4 + seqlen = 512 + # batch_size = 1 + # seqlen = 1 + layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref + allclose = ( + lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max() + <= 2 * (x_pt - x_ref).abs().max() + atol + ) + x0 = torch.randn( + batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True + ) + x0_pt = x0.detach().clone().requires_grad_() + x0_ref = x0.detach().clone().requires_grad_() + if has_residual: + res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True) + res_pt = res.detach().clone().requires_grad_() + res_ref = res.detach().clone().requires_grad_() + else: + res, res_pt, res_ref = None, None, None + 
norm_weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + if not is_rms_norm: + norm_bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True) + else: + norm_bias = None + norm_weight_pt = norm_weight.detach().clone().requires_grad_() + norm_weight_ref = norm_weight.detach().clone().requires_grad_() + norm_bias_pt = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None + norm_bias_ref = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None + linear_weight = torch.empty( + 2 * hidden_size, hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + torch.nn.init.xavier_uniform_(linear_weight) + if not is_rms_norm: + linear_bias = torch.randn( + 2 * hidden_size, device=device, dtype=weight_dtype, requires_grad=True + ) + else: + linear_bias = None + linear_weight_pt = linear_weight.detach().clone().requires_grad_() + linear_weight_ref = linear_weight.detach().clone().requires_grad_() + linear_bias_pt = ( + linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None + ) + linear_bias_ref = ( + linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None + ) + + residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32 + with torch.autocast(device_type="cuda", dtype=input_dtype): + out, *rest = layer_norm_linear_fn( + x0, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=res, + eps=1e-6, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=is_rms_norm, + ) + out_pt, *rest_pt = layer_norm_ref_fn( + x0_pt, norm_weight_pt, norm_bias_pt, residual=res_pt, eps=1e-6, prenorm=prenorm + ) + with torch.autocast(device_type="cuda", dtype=input_dtype): + out_pt = F.linear(out_pt, linear_weight_pt, linear_bias_pt) + out_ref, *rest_ref = layer_norm_ref_fn( + x0_ref, + norm_weight_ref, + norm_bias_ref, + residual=res_ref, + eps=1e-6, + prenorm=prenorm, + upcast=True, + ) + out_ref = F.linear(out_ref.to(linear_weight_ref.dtype), linear_weight_ref, linear_bias_ref) + if prenorm: + residual = rest[0] + residual_pt = rest_pt[0] + residual_ref = rest_ref[0] + assert out.dtype == input_dtype + if prenorm: + assert residual.dtype == residual_dtype + assert allclose(residual, residual_pt, residual_ref) + assert allclose(out, out_pt, out_ref) + + g = torch.randn_like(out) / batch_size + out.backward(g) + out_pt.backward(g) + out_ref.backward(g) + assert allclose(x0.grad, x0_pt.grad, x0_ref.grad) + if has_residual: + assert allclose(res.grad, res_pt.grad, res_ref.grad) + assert allclose(norm_weight.grad, norm_weight_pt.grad, norm_weight_ref.grad) + if norm_bias is not None: + assert allclose(norm_bias.grad, norm_bias_pt.grad, norm_bias_ref.grad) + assert allclose(linear_weight.grad, linear_weight_pt.grad, linear_weight_ref.grad) + if linear_bias is not None: + assert allclose(linear_bias.grad, linear_bias_pt.grad, linear_bias_ref.grad) diff --git a/test_llama.py b/test_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..32e9cd2114ad85756bd842937ff49ec219bdb38f --- /dev/null +++ b/test_llama.py @@ -0,0 +1,633 @@ +# Copyright (c) 2023, Tri Dao. 
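+
+# The tests below load LLaMa weights in one of two ways: from a HF checkpoint via
+# remap_state_dict_hf_llama, or from the original Meta checkpoint via state_dicts_from_checkpoint,
+# remap_state_dict_meta_llama, and combine_state_dicts_tp (see
+# _pretrained_state_dict_from_checkpoint below). A minimal sketch of the HF path, kept as a
+# comment so it does not run at import time; the model name, device, and dtype are only examples:
+#
+#     import torch
+#     from transformers import AutoConfig
+#     from flash_attn.models.gpt import GPTLMHeadModel
+#     from flash_attn.models.llama import llama_config_to_gpt2_config, remap_state_dict_hf_llama
+#     from flash_attn.utils.pretrained import state_dict_from_pretrained
+#
+#     config = llama_config_to_gpt2_config(
+#         AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf", trust_remote_code=True)
+#     )
+#     config.use_flash_attn = True
+#     config.fused_bias_fc = True
+#     config.fused_mlp = False  # We don't have fused GatedMLP yet
+#     config.fused_dropout_add_ln = True
+#     config.residual_in_fp32 = True
+#     model = GPTLMHeadModel(config, device="cuda", dtype=torch.float16)
+#     model.load_state_dict(
+#         remap_state_dict_hf_llama(state_dict_from_pretrained("meta-llama/Llama-2-7b-hf"), config)
+#     )
+#     model.eval()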
+ +# To run the huggingface implementation of LLaMa (1), we first need to convert the weights: +# https://github.com/huggingface/transformers/pull/21955 +# python -m transformers.models.llama.convert_llama_weights_to_hf --input_dir $CHECKPOINT_DIR/llama --model_size 7B --output_dir $CHECKPOINT_DIR/llama/7B-hf +# and repeat for 13B, 30B, 65B + +import os +import time +from pathlib import Path + +current_dir = Path(__file__).parent.absolute() + +import shutil + +import pytest +import torch +from einops import rearrange +from flash_attn.models.gpt import GPTLMHeadModel, combine_state_dicts_tp, shard_state_dict_tp +from flash_attn.models.llama import ( + config_from_checkpoint, + inv_remap_state_dict_hf_llama, + llama_config_to_gpt2_config, + remap_state_dict_hf_llama, + remap_state_dict_meta_llama, + state_dicts_from_checkpoint, +) +from flash_attn.utils.distributed import all_gather_raw +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import LlamaConfig, LlamaTokenizer +from transformers.models.llama.modeling_llama import LlamaForCausalLM +from transformers import AutoConfig + + +def _pretrained_state_dict_from_checkpoint(checkpoint_path, model_name, config, checkpoint_format): + if checkpoint_format == "meta": + ckpt_state_dicts = state_dicts_from_checkpoint(checkpoint_path, model_name) + pretrained_state_dicts = [remap_state_dict_meta_llama(s, config) for s in ckpt_state_dicts] + pretrained_state_dict = combine_state_dicts_tp(pretrained_state_dicts, config) + else: + pretrained_state_dict = state_dict_from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf" + ) + pretrained_state_dict = remap_state_dict_hf_llama(pretrained_state_dict, config) + return pretrained_state_dict + + +@pytest.mark.parametrize("model_name", ["7B"]) +def test_llama_state_dict(model_name): + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + config = llama_config_to_gpt2_config(config_from_checkpoint(checkpoint_path, model_name)) + ckpt_state_dicts = state_dicts_from_checkpoint(checkpoint_path, model_name) + pretrained_state_dict = remap_state_dict_meta_llama(ckpt_state_dicts[0], config) + model = GPTLMHeadModel(config, device="meta") # Without device='meta' init is very slow + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +# TinyLlama-1.1B is to test MQA +@pytest.mark.parametrize( + "model_name", ["meta-llama/Llama-2-7b-hf", "PY007/TinyLlama-1.1B-step-50K-105b"] +) +def test_inv_remap_state_dict_hf_llama(model_name): + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + state_dict = state_dict_from_pretrained(model_name) + # inv_remap_state_dict_hf_llama should be the inverse of remap_state_dict_hf_llama + state_dict = {key: val for key, val in state_dict.items() if "rotary_emb.inv_freq" not in key} + pretrained_state_dict = remap_state_dict_hf_llama(state_dict, config) + state_dict_recover = inv_remap_state_dict_hf_llama(pretrained_state_dict, config) + assert set(state_dict_recover.keys()) == set(state_dict.keys()) + for key in state_dict_recover.keys(): + torch.testing.assert_close(state_dict_recover[key], state_dict[key]) + + +# TinyLlama-1.1B is to test MQA +@pytest.mark.parametrize( + "model_name", + [ + "7B", # Llama 1 + "13B", # Llama 1 + 
"meta-llama/Llama-2-13b-hf", + "codellama/CodeLlama-7b-hf", + "codellama/CodeLlama-13b-hf", + "codellama/CodeLlama-34b-hf", + "PY007/TinyLlama-1.1B-step-50K-105b", + ], +) +def test_llama_optimized(model_name): + """Check that our implementation of LLaMa (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + device = "cuda" + if "/" in model_name: # Download from HF + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + else: + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format="meta") + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + if "/" in model_name: # Download from HF + pretrained_state_dict = remap_state_dict_hf_llama( + state_dict_from_pretrained(model_name), config + ) + else: + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="meta" + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + logits = model(input_ids).logits + del model + + # Without device_map, the model is loaded on the CPU, which is very slow + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + device_map="auto", + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + torch_dtype=dtype, + device_map={"": device}, + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state + logits_hf = model_hf(input_ids).logits + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# torchrun 
--no_python --nproc_per_node=2 pytest -q -s tests/models/test_llama.py -k "parallel" +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize( + "model_name", ["13B", "meta-llama/Llama-2-13b-hf", "codellama/CodeLlama-34b-hf"] +) +def test_llama_parallel(model_name, world_size): + """Check that our implementation of LLaMa (with all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. + """ + from apex.transformer import parallel_state + + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + if "/" in model_name: # Download from HF + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + else: + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format="meta") + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + if "/" in model_name: # Download from HF + pretrained_state_dict = remap_state_dict_hf_llama( + state_dict_from_pretrained(model_name), config + ) + else: + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="meta" + ) + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + with torch.no_grad(): + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... d -> b ... 
(n d)", b=batch_size) + del model + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_ref = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + device_map="auto", + ) + model_ref.eval() + with torch.no_grad(): + out_ref = model_ref.model(input_ids).last_hidden_state.to(device=device) + logits_ref = model_ref(input_ids).logits.to(device=device) + del model_ref + + model_hf = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + torch_dtype=dtype, + device_map="auto", + ) + model_hf.eval() + with torch.no_grad(): + out_hf = model_hf.model(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + +# @pytest.mark.parametrize('model_name', ["7B", "13B"]) +@pytest.mark.parametrize("model_name", ["7B"]) +@pytest.mark.parametrize("checkpoint_format", ["meta", "hf"]) +def test_llama_generation(model_name, checkpoint_format): + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + device = "cuda" + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format) + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + tokenizer = LlamaTokenizer.from_pretrained(Path(checkpoint_path) / f"{model_name}-hf") + eos_token_id = tokenizer.eos_token_id + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + model_hf = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + # Need auto here since the 13B fp32 model doesn't fit in memory on a A100 40GB + model_ref = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", device_map="auto" + ) + model_ref.eval() + with torch.no_grad(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1].to(device=device) + del model_ref + + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + 
checkpoint_path, model_name, config, checkpoint_format + ) + model = GPTLMHeadModel(config, device=device, dtype=dtype) + model.load_state_dict(pretrained_state_dict) + model.eval() + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + teacher_outputs=out_hf.sequences, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + + with torch.no_grad(): + logits_parallel = model(out_hf.sequences).logits[:, (seqlen - 1) : -1] + logits_hf = torch.stack(out_hf.scores, dim=1) + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + del model + + hf_error = (logits_hf - logits_ref).abs().max().item() + + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item()}") + + assert (logits_parallel - logits_ref).abs().max().item() < 2 * hf_error + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + assert torch.equal(logits_cg, logits) + + +# torchrun --no_python --nproc_per_node=2 pytest -q -s tests/models/test_llama.py -k "llama_parallel_generation" +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize( + "model_name", ["13B", "meta-llama/Llama-2-13b-hf", "codellama/CodeLlama-34b-hf"] +) +def test_llama_parallel_generation(model_name, world_size): + """Check that our implementation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. 
+ """ + from apex.transformer import parallel_state + + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + + dtype = torch.float16 + if "/" in model_name: # Download from HF + config = llama_config_to_gpt2_config( + AutoConfig.from_pretrained(model_name, trust_remote_code=True) + ) + else: + config = config_from_checkpoint(checkpoint_path, model_name, checkpoint_format="meta") + config = llama_config_to_gpt2_config(config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + config.pad_vocab_size_multiple = 8 * world_size + config.sequence_parallel = False # Need to set this to False for generation + + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + torch.manual_seed(0) + batch_size = 1 + seqlen = 100 + max_length = 150 + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, seqlen), dtype=torch.long, device=device + ) + + # Need this, otherwise when we capture the graph the process for GPU 1 would run on both + # GPU0 and GPU1 and things would hang + torch.cuda.set_device(device) + + if "/" in model_name: # Download from HF + pretrained_state_dict = remap_state_dict_hf_llama( + state_dict_from_pretrained(model_name), config + ) + else: + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="meta" + ) + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + print("Without CUDA graph") + out = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + tensor_parallel=world_size, + vocab_size=config.vocab_size, + cg=True, + # teacher_outputs=out_hf.sequences, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + del model + parallel_state.destroy_model_parallel() + + if rank == 0: + # Without device_map, the model is loaded on the CPU, which is very slow + model_hf = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + torch_dtype=dtype, + device_map="auto", + ) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + with torch.inference_mode(): + out_hf = model_hf.generate( + input_ids=input_ids, + max_length=max_length, + return_dict_in_generate=True, + output_scores=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: 
{(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = LlamaForCausalLM.from_pretrained( + model_name if "/" in model_name else Path(checkpoint_path) / f"{model_name}-hf", + device_map="auto", + ) + model_ref.eval() + with torch.inference_mode(): + logits_ref = model_ref(out_hf.sequences).logits[:, (seqlen - 1) : -1] + del model_ref + logits_hf = torch.stack(out_hf.scores, dim=1) + + logits = torch.stack(out.scores, dim=1) + logits_cg = torch.stack(out_cg.scores, dim=1) + + hf_error = (logits_hf - logits_ref).abs().max().item() + print(f"HF fp16 logits max diff: {hf_error}") + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * hf_error + print(f"Logits CG max diff: {(logits_cg - logits_ref).abs().max().item()}") + assert torch.equal(logits_cg, logits) + + +@torch.no_grad() +@pytest.mark.parametrize("world_size", [2]) +def test_llama_parallel_uneven_num_heads(world_size): + from apex.transformer import parallel_state + + checkpoint_path = ( + Path(os.environ.get("CHECKPOINT_DIR", current_dir.parent.parent / "checkpoints")) / "llama" + ) + num_attention_heads = world_size + 1 + model_name = f"teeny-{num_attention_heads}-heads" + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + process_group = parallel_state.get_tensor_model_parallel_group() + + dtype = torch.float16 + llama_config = LlamaConfig( + hidden_size=256 + * num_attention_heads, # ParallelGatedMlp hidden_features must be divisible by 256 + intermediate_size=256 * num_attention_heads * 4, + num_hidden_layers=4, + num_attention_heads=num_attention_heads, + initializer_range=0.5, # Set crazy init range so we don't have near zero weights implying a vacuous test. + ) + config = llama_config_to_gpt2_config(llama_config) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = False # We don't have fused GatedMLP yet + config.fused_dropout_add_ln = True + config.residual_in_fp32 = True + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device=device) + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device=device + ) + + # Create a shared test model. + if rank == 0: + LlamaForCausalLM(config=llama_config).save_pretrained(checkpoint_path / f"{model_name}-hf") + torch.distributed.barrier() + + # Run the standard forward pass test. + pretrained_state_dict = _pretrained_state_dict_from_checkpoint( + checkpoint_path, model_name, config, checkpoint_format="hf" + ) + model = GPTLMHeadModel(config, process_group=process_group, device=device, dtype=dtype) + model.load_state_dict(shard_state_dict_tp(pretrained_state_dict, config, world_size, rank)) + model.eval() + + # TODO: Avoid duplicate code. Modularize the comparison of two forward pass diffs. + out = model.transformer(input_ids) + out, _ = all_gather_raw(out, process_group=process_group) + out = rearrange(out, "(b s) d -> b s d", b=batch_size) + logits = model(input_ids).logits + logits = rearrange(logits, "(b s) d -> b s d", b=batch_size) + logits, _ = all_gather_raw(logits, process_group) + logits = rearrange(logits, "(n b) ... 
d -> b ... (n d)", b=batch_size) + + if rank == 0: + model_ref = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", device_map={"": device} + ) + model_ref = model_ref.to(device=device) + model_ref.eval() + out_ref = model_ref.model(input_ids).last_hidden_state + logits_ref = model_ref(input_ids).logits + del model_ref + + model_hf = LlamaForCausalLM.from_pretrained( + Path(checkpoint_path) / f"{model_name}-hf", torch_dtype=dtype, device_map={"": device} + ) + model_hf.eval() + out_hf = model_hf.model(input_ids).last_hidden_state.to(device=device) + logits_hf = model_hf(input_ids).logits.to(device=device) + del model_hf + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 2 * (out_hf - out_ref).abs().max().item() + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 2 * ( + logits_hf - logits_ref + ).abs().max().item() + + if os.path.exists(checkpoint_path / f"{model_name}-hf"): + shutil.rmtree(checkpoint_path / f"{model_name}-hf") diff --git a/test_mha_parallel.py b/test_mha_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..880cce2f679725da21bdebdbfa38cba972b26070 --- /dev/null +++ b/test_mha_parallel.py @@ -0,0 +1,160 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_mha_parallel.py + +import math + +import pytest +import torch +import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from einops import rearrange +from flash_attn.modules.mha import MHA, ParallelMHA + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("head_dim", [64, 128]) +# @pytest.mark.parametrize('head_dim', [64]) +@pytest.mark.parametrize("embed_dim", [1024, 4096]) +# @pytest.mark.parametrize('embed_dim', [1024]) +def test_mha_parallel(embed_dim, head_dim, sequence_parallel, world_size, dtype): + assert embed_dim % head_dim == 0 + num_heads = embed_dim // head_dim + assert num_heads % world_size == 0 + rtol, atol = (3e-3, 1e-2) if dtype == torch.bfloat16 else (3e-3, 1e-3) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + x_pt = torch.randn( + batch_size * seqlen, embed_dim, 
device=device, dtype=dtype, requires_grad=True + ) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. + g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt = MHA( + embed_dim, + num_heads, + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + device=device, + dtype=dtype, + ) + partition_dim = embed_dim // world_size + model = ParallelMHA( + embed_dim, + num_heads, + parallel_state.get_tensor_model_parallel_group(), + rotary_emb_dim=int(head_dim // 2), + use_flash_attn=True, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + + with torch.no_grad(): + model.Wqkv.weight.copy_( + rearrange( + rearrange(model_pt.Wqkv.weight, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ) + ) + model.Wqkv.bias.copy_( + rearrange( + rearrange(model_pt.Wqkv.bias, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ) + ) + model.out_proj.weight.copy_( + model_pt.out_proj.weight[:, rank * partition_dim : (rank + 1) * partition_dim] + ) + if rank == 0: + model.out_proj.bias.copy_(model_pt.out_proj.bias) + + out = model(x, seqlen=seqlen) + out_pt = rearrange(model_pt(rearrange(x_pt, "(b s) d -> b s d", s=seqlen)), "b s d -> (b s) d") + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol / 100, # magnitude of x.grad is quite small + ) + # The error for d_weight and d_bias is quite a bit higher + assert torch.allclose( + model.Wqkv.weight.grad, + rearrange( + rearrange(model_pt.Wqkv.weight.grad, "(three o) i -> three o i", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o i -> (three o) i", + ), + rtol=rtol, + atol=atol * 10, + ) + assert torch.allclose( + model.Wqkv.bias.grad, + rearrange( + rearrange(model_pt.Wqkv.bias.grad, "(three o) -> three o", three=3)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "three o -> (three o)", + ), + rtol=rtol, + atol=atol * 5, + ) + assert torch.allclose( + model.out_proj.weight.grad, + model_pt.out_proj.weight.grad[:, rank * partition_dim : (rank + 1) * partition_dim], + rtol=rtol, + atol=atol * 10, + ) + if rank == 0: + assert torch.allclose( + model.out_proj.bias.grad, model_pt.out_proj.bias.grad, rtol=rtol, atol=atol * 5 + ) diff --git a/test_mlp_parallel.py b/test_mlp_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..49051bc2be19fa8cb63ddc5887be87ac3406dce1 --- /dev/null +++ b/test_mlp_parallel.py @@ -0,0 +1,143 @@ +# Run test with: +# torchrun --no_python --nproc_per_node=8 pytest -q -s tests/modules/test_mlp_parallel.py + +import pytest +import torch 
+import torch.nn.functional as F +from apex.transformer import parallel_state, tensor_parallel +from einops import rearrange +from flash_attn.modules.mlp import GatedMlp, ParallelGatedMlp + +is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8 + + +@pytest.mark.parametrize("dtype", [torch.float16] + ([torch.bfloat16] if is_sm8x else [])) +# @pytest.mark.parametrize('dtype', [torch.float16]) +@pytest.mark.parametrize("world_size", [1, 2, 4, 8]) +# @pytest.mark.parametrize('world_size', [2]) +@pytest.mark.parametrize("sequence_parallel", [True, False]) +# @pytest.mark.parametrize('sequence_parallel', [False]) +@pytest.mark.parametrize("activation", [F.silu, F.sigmoid]) +# @pytest.mark.parametrize('activation', [F.silu]) +@pytest.mark.parametrize("dim", [1024, 4096]) +# @pytest.mark.parametrize('dim', [1024]) +def test_mlp_parallel(dim, activation, sequence_parallel, world_size, dtype): + rtol, atol = (3e-3, 3e-2) if dtype == torch.bfloat16 else (3e-3, 3e-3) + + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group(backend="nccl", init_method="env://") + device = f"cuda:{torch.distributed.get_rank()}" + assert world_size <= torch.distributed.get_world_size() + parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size) + rank = parallel_state.get_tensor_model_parallel_rank() + # set seed + torch.random.manual_seed(0) + batch_size = 2 + seqlen = 1024 + assert (batch_size * seqlen) % world_size == 0 + x_pt = torch.randn(batch_size * seqlen, dim, device=device, dtype=dtype, requires_grad=True) + # We need to generate g here so that all processes get the same gradient, + # as rank 0 will have an extra bias that changes the RNG. + # If we don't divide by batch_size, the gradient gets a bit too large. + g = torch.randn_like(x_pt) / 32 + if sequence_parallel: + x = ( + tensor_parallel.scatter_to_sequence_parallel_region(x_pt) + .detach() + .clone() + .requires_grad_() + ) + else: + x = x_pt.detach().clone().requires_grad_() + + model_pt = GatedMlp(dim, activation=activation, device=device, dtype=dtype) + partition_dim = model_pt.fc1.weight.shape[0] // 2 // world_size + model = ParallelGatedMlp( + dim, + parallel_state.get_tensor_model_parallel_group(), + activation=activation, + sequence_parallel=sequence_parallel, + device=device, + dtype=dtype, + ) + + with torch.no_grad(): + model.fc1.weight.copy_( + rearrange( + rearrange(model_pt.fc1.weight, "(two o) i -> two o i", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o i -> (two o) i", + ) + ) + model.fc1.bias.copy_( + rearrange( + rearrange(model_pt.fc1.bias, "(two o) -> two o", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o -> (two o)", + ) + ) + model.fc2.weight.copy_( + model_pt.fc2.weight[:, rank * partition_dim : (rank + 1) * partition_dim] + ) + if rank == 0: + model.fc2.bias.copy_(model_pt.fc2.bias) + + out = model(x) + out_pt = model_pt(x_pt) + partition_batch_dim = batch_size * seqlen // world_size + assert torch.allclose( + out, + out_pt[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else out_pt, + rtol=rtol, + atol=atol, + ) + + out_pt.backward(g) + out.backward( + g[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] if sequence_parallel else g + ) + parallel_state.destroy_model_parallel() + + assert torch.allclose( + x.grad, + x_pt.grad[rank * partition_batch_dim : (rank + 1) * partition_batch_dim] + if sequence_parallel + else x_pt.grad, + rtol=rtol, + atol=atol, + ) + + 
assert torch.allclose( + model.fc1.weight.grad, + rearrange( + rearrange(model_pt.fc1.weight.grad, "(two o) i -> two o i", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o i -> (two o) i", + ), + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.fc1.bias.grad, + rearrange( + rearrange(model_pt.fc1.bias.grad, "(two o) -> two o", two=2)[ + :, rank * partition_dim : (rank + 1) * partition_dim + ], + "two o -> (two o)", + ), + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + model.fc2.weight.grad, + model_pt.fc2.weight.grad[:, rank * partition_dim : (rank + 1) * partition_dim], + rtol=rtol, + atol=atol, + ) + if rank == 0: + assert torch.allclose(model.fc2.bias.grad, model_pt.fc2.bias.grad, rtol=rtol, atol=atol) diff --git a/test_opt.py b/test_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..8188e36a30c0e0e869b0ea828e3ea0839db9973c --- /dev/null +++ b/test_opt.py @@ -0,0 +1,237 @@ +import re +import time + +import pytest +import torch +from einops import rearrange +from flash_attn.models.gpt import GPTLMHeadModel +from flash_attn.models.opt import opt_config_to_gpt2_config, remap_state_dict_hf_opt +from flash_attn.utils.generation import update_graph_cache +from flash_attn.utils.pretrained import state_dict_from_pretrained +from transformers import AutoTokenizer, OPTConfig +from transformers.models.opt.modeling_opt import OPTForCausalLM + + +@pytest.mark.parametrize( + "model_name", ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"] +) +# @pytest.mark.parametrize('model_name', ["facebook/opt-350m"]) +def test_opt_state_dict(model_name): + config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name)) + pretrained_state_dict = remap_state_dict_hf_opt(state_dict_from_pretrained(model_name), config) + model = GPTLMHeadModel(config) + state_dict = model.state_dict() + assert state_dict.keys() == pretrained_state_dict.keys() + for k in state_dict.keys(): + assert state_dict[k].shape == pretrained_state_dict[k].shape + + +@pytest.mark.parametrize( + "model_name", ["facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"] +) +# @pytest.mark.parametrize('model_name', ["facebook/opt-350m"]) +def test_opt_optimized(model_name): + """Check that our implementation of OPT (without all optimizations enabled) matches the + HF implementation: the output of our forward pass in fp16 should be around the same as the HF + forward pass in fp16, when compared to the HF forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name)) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = getattr(config, "prenorm", True) + config.pad_vocab_size_multiple = 8 + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + + model_ref = OPTForCausalLM.from_pretrained(model_name).to(device=device) + model_hf = OPTForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device=device) + + model.eval() + model_ref.eval() + model_hf.eval() + + torch.manual_seed(0) + batch_size = 2 + max_seqlen = 256 + seqlens = torch.randint(max_seqlen // 2, max_seqlen + 1, (batch_size,), device="cuda") + input_ids = torch.randint( + 0, config.vocab_size, (batch_size, max_seqlen), dtype=torch.long, device="cuda" + ) + if model_name != "facebook/opt-350m": # The OPT-350m projects the embeddings to dimension 512 + out = model.transformer(input_ids) + out_hf = model_hf.model(input_ids).last_hidden_state + out_ref = model_ref.model(input_ids).last_hidden_state + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(out_hf - out_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(out_hf - out_ref).abs().mean().item()}") + assert (out - out_ref).abs().max().item() < 3 * (out_hf - out_ref).abs().max().item() + + logits = model(input_ids).logits + logits_hf = model_hf(input_ids).logits + logits_ref = model_ref(input_ids).logits + + print(f"Logits max diff: {(logits - logits_ref).abs().max().item()}") + print(f"Logits mean diff: {(logits - logits_ref).abs().mean().item()}") + print(f"HF fp16 max diff: {(logits_hf - logits_ref).abs().max().item()}") + print(f"HF fp16 mean diff: {(logits_hf - logits_ref).abs().mean().item()}") + assert (logits - logits_ref).abs().max().item() < 3 * ( + logits_hf - logits_ref + ).abs().max().item() + + +@pytest.mark.parametrize( + "model_name", + [ + "facebook/opt-125m", + "facebook/opt-350m", + "facebook/opt-1.3b", + "facebook/opt-2.7b", + "facebook/opt-6.7b", + ], +) +# @pytest.mark.parametrize('model_name', ["facebook/opt-125m"]) +def test_opt_generation(model_name): + """Check that our implementation of OPT generation matches the HF implementation: + the scores in fp16 should be around the same as the HF scores in fp16, when compared to + the HF scores in fp32. 
+ """ + print(f"\nMODEL: {model_name}") + verbose = False + dtype = torch.float16 + device = "cuda" + rtol, atol = 3e-3, 3e-1 + config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name)) + # Only prenorm supports residual_in_fp32 + config.residual_in_fp32 = getattr(config, "prenorm", True) + config.use_flash_attn = True + config.fused_bias_fc = True + config.fused_mlp = True + config.fused_dropout_add_ln = True + + model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype) + model.eval() + + torch.manual_seed(0) + # OPT tokenizer requires use_fast=False + # https://huggingface.co/docs/transformers/model_doc/opt + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) + eos_token_id = tokenizer.eos_token_id + + input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to( + device=device + ) + max_length = 25 + # input_ids = torch.randint(0, 100, (2, 10), dtype=torch.long, device='cuda') + # max_length = input_ids.shape[1] + 40 + + # Slow generation for reference + sequences = [] + scores = [] + cur_input_ids = input_ids + with torch.inference_mode(): + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + for _ in range(input_ids.shape[1] + 1, max_length): + cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1) + scores.append(model(cur_input_ids).logits[:, -1]) + sequences.append(scores[-1].argmax(dim=-1)) + if eos_token_id is not None and (sequences[-1] == eos_token_id).all(): + break + sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1) + scores = tuple(scores) + + print("Without CUDA graph") + torch.cuda.synchronize() + start = time.time() + out = model.generate( + input_ids=input_ids, + max_length=max_length, + eos_token_id=eos_token_id, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + if verbose: + print(out.sequences) + print(tokenizer.batch_decode(out.sequences.tolist())) + if getattr(config, "use_flash_attn", False): + # Capture graph outside the timing loop + batch_size, seqlen_og = input_ids.shape + model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length) + print("With CUDA graph") + torch.cuda.synchronize() + start = time.time() + out_cg = model.generate( + input_ids=input_ids, + max_length=max_length, + cg=True, + return_dict_in_generate=True, + output_scores=True, + enable_timing=True, + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + if verbose: + print(out_cg.sequences) + print(tokenizer.batch_decode(out_cg.sequences.tolist())) + + del model + + model_hf = OPTForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device=device) + model_hf.eval() + print("HF fp16") + torch.cuda.synchronize() + start = time.time() + out_hf = model_hf.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_hf + + model_ref = OPTForCausalLM.from_pretrained(model_name).to(device=device) + model_ref.eval() + print("HF fp32") + torch.cuda.synchronize() + start = time.time() + out_ref = model_ref.generate( + input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, 
output_scores=True + ) + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms") + del model_ref + print(tokenizer.batch_decode(out_ref.sequences.tolist())) + + if verbose: + print( + f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + print( + f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}" + ) + print( + f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}" + ) + + assert torch.all(out.sequences == sequences) + assert torch.allclose( + torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol + ) + assert torch.all(out.sequences == out_ref.sequences) + assert torch.all(out.sequences == out_hf.sequences) + + assert (torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item() < 3 * ( + torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1) + ).abs().max().item() diff --git a/test_rotary.py b/test_rotary.py new file mode 100644 index 0000000000000000000000000000000000000000..6b73ff90dcebf8b8f4743391f0b26de9356bdaae --- /dev/null +++ b/test_rotary.py @@ -0,0 +1,134 @@ +# Copyright (c) 2023, Tri Dao. + +import math + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange +from flash_attn.layers.rotary import RotaryEmbedding, apply_rotary_emb_func, apply_rotary_emb_qkv_ +from transformers.models.gpt_neox.modeling_gpt_neox import RotaryEmbedding as RotaryEmbeddingNeoX +from transformers.models.gpt_neox.modeling_gpt_neox import ( + apply_rotary_pos_emb as apply_rotary_pos_emb_neox, +) +from transformers.models.gptj.modeling_gptj import apply_rotary_pos_emb as apply_rotary_pos_emb_gptj +from transformers.models.gptj.modeling_gptj import fixed_pos_embedding + + +# NeoX-style rotary embedding +@pytest.mark.parametrize("seqlen_offset", [0, 711]) +@pytest.mark.parametrize("rotary_emb_fraction", [0.5, 1.0]) +def test_rotary(rotary_emb_fraction, seqlen_offset): + device = "cuda" + dtype = torch.float16 + rtol, atol = (1e-3, 5e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen_total = 2048 + seqlen = seqlen_total - seqlen_offset + nheads = 16 + headdim = 128 + rotary_dim = int(headdim * rotary_emb_fraction) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, requires_grad=True + ) + qkv_og = qkv.clone().detach() # Our implementation modifies qkv inplace + rotary = RotaryEmbedding(rotary_dim, device=device) + rotary_neox = RotaryEmbeddingNeoX(rotary_dim, seqlen_total, device=device) + # Doesn't matter what tensor we pass in, rotary_neox only uses the device of the tensor + cos_neox, sin_neox = rotary_neox(qkv, seq_len=seqlen_total) + cos_neox, sin_neox = cos_neox.to(dtype=dtype), sin_neox.to(dtype=dtype) + q_pt = ( + rearrange(qkv[:, :, 0, :, :rotary_dim], "b s h d -> b h s d") + .detach() + .clone() + .requires_grad_(True) + ) + k_pt = ( + rearrange(qkv[:, :, 1, :, :rotary_dim], "b s h d -> b h s d") + .detach() + .clone() + .requires_grad_(True) + ) + q_neox, k_neox = apply_rotary_pos_emb_neox(q_pt, k_pt, cos_neox, sin_neox, offset=seqlen_offset) + out = rotary(qkv, seqlen_offset=seqlen_offset) + assert torch.allclose( + rotary._cos_cached, cos_neox[..., : rotary_dim // 2].to(dtype=dtype), rtol=rtol, atol=atol + ) + 
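# A short sketch of the convention being checked here (standard non-interleaved /
# NeoX-style rotary, stated for illustration): for each position and each pair
# (x_i, x_{i + rotary_dim/2}) with i < rotary_dim/2, the embedding applies
#
#     x_i'                  = x_i * cos_i - x_{i + rotary_dim/2} * sin_i
#     x_{i + rotary_dim/2}' = x_{i + rotary_dim/2} * cos_i + x_i * sin_i
#
# so only rotary_dim/2 distinct cos/sin values are needed per position. The NeoX
# reference duplicates them along the last dimension, which is why the comparisons
# against our cached cos/sin use the [..., : rotary_dim // 2] slice.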
assert torch.allclose( + rotary._sin_cached, sin_neox[..., : rotary_dim // 2].to(dtype=dtype), rtol=rtol, atol=atol + ) + assert torch.allclose( + rearrange(q_neox, "b h s d -> b s h d"), out[:, :, 0, :, :rotary_dim], rtol=rtol, atol=atol + ) + assert torch.allclose( + rearrange(k_neox, "b h s d -> b s h d"), out[:, :, 1, :, :rotary_dim], rtol=rtol, atol=atol + ) + assert torch.equal(out[:, :, 0:2, :, rotary_dim:], qkv_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(out[:, :, 2], qkv_og[:, :, 2]) + + g = torch.randn_like(out) + g_og = g.clone().detach() # Our implementation modifies g inplace + out.backward(g) + q_neox.backward(rearrange(g_og[:, :, 0, :, :rotary_dim], "b s h d -> b h s d")) + k_neox.backward(rearrange(g_og[:, :, 1, :, :rotary_dim], "b s h d -> b h s d")) + assert torch.allclose( + rearrange(q_pt.grad, "b h s d -> b s h d"), + qkv.grad[:, :, 0, :, :rotary_dim], + rtol=rtol, + atol=atol, + ) + assert torch.allclose( + rearrange(k_pt.grad, "b h s d -> b s h d"), + qkv.grad[:, :, 1, :, :rotary_dim], + rtol=rtol, + atol=atol, + ) + assert torch.equal(qkv.grad[:, :, 0:2, :, rotary_dim:], g_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(qkv.grad[:, :, 2], g_og[:, :, 2]) + + +# GPT-J-style rotary embedding +@pytest.mark.parametrize("seqlen_offset", [0, 711]) +@pytest.mark.parametrize("rotary_emb_fraction", [0.5, 1.0]) +def test_rotary_interleaved(rotary_emb_fraction, seqlen_offset): + device = "cuda" + dtype = torch.float16 + rtol, atol = (1e-3, 5e-3) + # set seed + torch.random.manual_seed(0) + batch_size = 8 + seqlen_total = 2048 + seqlen = seqlen_total - seqlen_offset + nheads = 16 + headdim = 128 + rotary_dim = int(headdim * rotary_emb_fraction) + qkv = torch.randn( + batch_size, seqlen, 3, nheads, headdim, device=device, dtype=dtype, requires_grad=True + ) + qkv_og = qkv.clone().detach() # Our implementation modifies qkv inplace + rotary = RotaryEmbedding(rotary_dim, interleaved=True, device=device) + sincos_gptj = fixed_pos_embedding(qkv[..., :rotary_dim], seq_dim=1, seq_len=seqlen_total) + sincos_gptj = tuple(x.to(dtype=dtype) for x in sincos_gptj) + q_pt = qkv[:, :, 0, :, :rotary_dim].detach().clone().requires_grad_(True) + k_pt = qkv[:, :, 1, :, :rotary_dim].detach().clone().requires_grad_(True) + q_gptj = apply_rotary_pos_emb_gptj(q_pt, sincos_gptj, offset=seqlen_offset) + k_gptj = apply_rotary_pos_emb_gptj(k_pt, sincos_gptj, offset=seqlen_offset) + out = rotary(qkv, seqlen_offset=seqlen_offset) + assert torch.allclose(rotary._cos_cached, sincos_gptj[1], rtol=rtol, atol=atol) + assert torch.allclose(rotary._sin_cached, sincos_gptj[0], rtol=rtol, atol=atol) + assert torch.allclose(q_gptj, out[:, :, 0, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.allclose(k_gptj, out[:, :, 1, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.equal(out[:, :, 0:2, :, rotary_dim:], qkv_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(out[:, :, 2], qkv_og[:, :, 2]) + + g = torch.randn_like(out) + g_og = g.clone().detach() # Our implementation modifies g inplace + out.backward(g) + q_gptj.backward(g_og[:, :, 0, :, :rotary_dim]) + k_gptj.backward(g_og[:, :, 1, :, :rotary_dim]) + assert torch.allclose(q_pt.grad, qkv.grad[:, :, 0, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.allclose(k_pt.grad, qkv.grad[:, :, 1, :, :rotary_dim], rtol=rtol, atol=atol) + assert torch.equal(qkv.grad[:, :, 0:2, :, rotary_dim:], g_og[:, :, 0:2, :, rotary_dim:]) + assert torch.equal(qkv.grad[:, :, 2], g_og[:, :, 2]) diff --git a/test_util.py b/test_util.py new file mode 100644 index 
0000000000000000000000000000000000000000..513a9b8e8fc134ce86f6a423c068e7efecfadeff --- /dev/null +++ b/test_util.py @@ -0,0 +1,254 @@ +import math + +import torch +from einops import rearrange, repeat +from flash_attn.bert_padding import pad_input, unpad_input + + +def generate_random_padding_mask(max_seqlen, batch_size, device, mode="random"): + assert mode in ["full", "random", "third"] + if mode == "full": + lengths = torch.full((batch_size, 1), max_seqlen, device=device, dtype=torch.int32) + elif mode == "random": + lengths = torch.randint( + max(1, max_seqlen - 20), max_seqlen + 1, (batch_size, 1), device=device + ) + elif mode == "third": + lengths = torch.randint(max_seqlen // 3, max_seqlen + 1, (batch_size, 1), device=device) + padding_mask = ( + repeat(torch.arange(max_seqlen, device=device), "s -> b s", b=batch_size) < lengths + ) + return padding_mask + + +def generate_qkv( + q, k, v, query_padding_mask=None, key_padding_mask=None, kvpacked=False, qkvpacked=False +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, d) + k: (batch_size, seqlen_k, nheads_k, d) + v: (batch_size, seqlen_k, nheads_k, d) + query_padding_mask: (batch_size, seqlen), bool + key_padding_mask: (batch_size, seqlen), bool + """ + assert not (kvpacked and qkvpacked) + batch_size, seqlen_q, nheads, d = q.shape + _, seqlen_k, nheads_k, _ = k.shape + assert k.shape == (batch_size, seqlen_k, nheads_k, d) + assert v.shape == (batch_size, seqlen_k, nheads_k, d) + + if query_padding_mask is not None: + q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, query_padding_mask) + output_pad_fn = lambda output_unpad: pad_input( + output_unpad, indices_q, batch_size, seqlen_q + ) + else: + q_unpad = rearrange(q, "b s h d -> (b s) h d") + cu_seqlens_q = torch.arange( + 0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q_unpad.device + ) + max_seqlen_q = seqlen_q + output_pad_fn = lambda output_unpad: rearrange( + output_unpad, "(b s) h d -> b s h d", b=batch_size + ) + + if key_padding_mask is not None: + k_unpad, indices_k, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask) + v_unpad, _, _, _ = unpad_input(v, key_padding_mask) + else: + k_unpad = rearrange(k, "b s h d -> (b s) h d") + v_unpad = rearrange(v, "b s h d -> (b s) h d") + cu_seqlens_k = torch.arange( + 0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=k_unpad.device + ) + max_seqlen_k = seqlen_k + + if qkvpacked: + assert (query_padding_mask == key_padding_mask).all() + assert nheads == nheads_k + qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1) + qkv = torch.stack([q, k, v], dim=2) + if query_padding_mask is not None: + dqkv_pad_fn = lambda dqkv_unpad: pad_input(dqkv_unpad, indices_q, batch_size, seqlen_q) + else: + dqkv_pad_fn = lambda dqkv_unpad: rearrange( + dqkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + qkv_unpad.detach().requires_grad_(), + cu_seqlens_q, + max_seqlen_q, + qkv.detach().requires_grad_(), + output_pad_fn, + dqkv_pad_fn, + ) + elif kvpacked: + kv_unpad = torch.stack([k_unpad, v_unpad], dim=1) + kv = torch.stack([k, v], dim=2) + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dkv_pad_fn = lambda dkv_unpad: pad_input(dkv_unpad, indices_k, batch_size, seqlen_k) + else: + dkv_pad_fn = lambda dkv_unpad: rearrange( + dkv_unpad, "(b s) t h d -> b s t h d", b=batch_size + ) + return ( + q_unpad.detach().requires_grad_(), + kv_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + 
q.detach().requires_grad_(), + kv.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dkv_pad_fn, + ) + else: + dq_pad_fn = output_pad_fn + if key_padding_mask is not None: + dk_pad_fn = lambda dk_unpad: pad_input(dk_unpad, indices_k, batch_size, seqlen_k) + else: + dk_pad_fn = lambda dk_unpad: rearrange(dk_unpad, "(b s) h d -> b s h d", b=batch_size) + return ( + q_unpad.detach().requires_grad_(), + k_unpad.detach().requires_grad_(), + v_unpad.detach().requires_grad_(), + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q.detach().requires_grad_(), + k.detach().requires_grad_(), + v.detach().requires_grad_(), + output_pad_fn, + dq_pad_fn, + dk_pad_fn, + ) + + +def construct_local_mask( + seqlen_q, + seqlen_k, + window_size=(-1, -1), # -1 means infinite window size + query_padding_mask=None, + key_padding_mask=None, + device=None, + key_leftpad=None, +): + row_idx = rearrange(torch.arange(seqlen_q, device=device, dtype=torch.long), "s -> s 1") + col_idx = torch.arange(seqlen_k, device=device, dtype=torch.long) + if key_leftpad is not None: + key_leftpad = rearrange(key_leftpad, "b -> b 1 1 1") + col_idx = repeat(col_idx, "s -> b 1 1 s", b=key_leftpad.shape[0]) + col_idx = torch.where(col_idx >= key_leftpad, col_idx - key_leftpad, 2**32) + sk = ( + seqlen_k + if key_padding_mask is None + else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1") + ) + sq = ( + seqlen_q + if query_padding_mask is None + else rearrange(query_padding_mask.sum(-1), "b -> b 1 1 1") + ) + if window_size[0] < 0: + return col_idx > row_idx + sk - sq + window_size[1] + else: + sk = torch.full_like(col_idx, seqlen_k) if key_padding_mask is None else sk + return torch.logical_or( + col_idx > torch.minimum(row_idx + sk - sq + window_size[1], sk), + col_idx < row_idx + sk - sq - window_size[0], + ) + + +def attention_ref( + q, + k, + v, + query_padding_mask=None, + key_padding_mask=None, + attn_bias=None, + dropout_p=0.0, + dropout_mask=None, + causal=False, + window_size=(-1, -1), # -1 means infinite window size + softcap=0.0, + upcast=True, + reorder_ops=False, + key_leftpad=None, +): + """ + Arguments: + q: (batch_size, seqlen_q, nheads, head_dim) + k: (batch_size, seqlen_k, nheads_k, head_dim) + v: (batch_size, seqlen_k, nheads_k, head_dim) + query_padding_mask: (batch_size, seqlen_q) + key_padding_mask: (batch_size, seqlen_k) + attn_bias: broadcastable to (batch_size, nheads, seqlen_q, seqlen_k) + dropout_p: float + dropout_mask: (batch_size, nheads, seqlen_q, seqlen_k) + causal: whether to apply causal masking + window_size: (int, int), left and right window size + upcast: whether to cast all inputs to fp32, do all computation in fp32, then cast + output back to fp16/bf16. + reorder_ops: whether to change the order of operations (scaling k instead of scaling q, etc.) + without changing the math. This is to estimate the numerical error from operation + reordering. 
+ Output: + output: (batch_size, seqlen_q, nheads, head_dim) + attention: (batch_size, nheads, seqlen_q, seqlen_k), softmax after dropout + """ + if causal: + window_size = (window_size[0], 0) + dtype_og = q.dtype + if upcast: + q, k, v = q.float(), k.float(), v.float() + seqlen_q, seqlen_k = q.shape[1], k.shape[1] + k = repeat(k, "b s h d -> b s (h g) d", g=q.shape[2] // k.shape[2]) + v = repeat(v, "b s h d -> b s (h g) d", g=q.shape[2] // v.shape[2]) + d = q.shape[-1] + if not reorder_ops: + scores = torch.einsum("bthd,bshd->bhts", q / math.sqrt(d), k) + else: + scores = torch.einsum("bthd,bshd->bhts", q, k / math.sqrt(d)) + if softcap > 0: + scores /= softcap + scores = scores.tanh() + scores *= softcap + if key_padding_mask is not None: + scores.masked_fill_(rearrange(~key_padding_mask, "b s -> b 1 1 s"), float("-inf")) + if window_size[0] >= 0 or window_size[1] >= 0: + local_mask = construct_local_mask( + seqlen_q, + seqlen_k, + window_size, + query_padding_mask, + key_padding_mask, + q.device, + key_leftpad=key_leftpad, + ) + scores.masked_fill_(local_mask, float("-inf")) + if attn_bias is not None: + scores = scores + attn_bias + attention = torch.softmax(scores, dim=-1).to(v.dtype) + # Some rows might be completely masked out so we fill them with zero instead of NaN + if window_size[0] >= 0 or window_size[1] >= 0: + attention = attention.masked_fill(torch.all(local_mask, dim=-1, keepdim=True), 0.0) + # We want to mask here so that the attention matrix doesn't have any NaNs + # Otherwise we'll get NaN in dV + if query_padding_mask is not None: + attention = attention.masked_fill(rearrange(~query_padding_mask, "b s -> b 1 s 1"), 0.0) + dropout_scaling = 1.0 / (1 - dropout_p) + # attention_drop = attention.masked_fill(~dropout_mask, 0.0) * dropout_scaling + # output = torch.einsum('bhts,bshd->bthd', attention_drop , v) + if dropout_mask is not None: + attention_drop = attention.masked_fill(~dropout_mask, 0.0) + else: + attention_drop = attention + output = torch.einsum("bhts,bshd->bthd", attention_drop, v * dropout_scaling) + if query_padding_mask is not None: + output.masked_fill_(rearrange(~query_padding_mask, "b s -> b s 1 1"), 0.0) + return output.to(dtype=dtype_og), attention.to(dtype=dtype_og) diff --git a/test_vit.py b/test_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..129c05ff53f5c7b7dcc79b949d7d97da0c0f5d16 --- /dev/null +++ b/test_vit.py @@ -0,0 +1,48 @@ +import re + +import pytest +import torch +from flash_attn.models.vit import vit_base_patch16_224 as flash_vit_base_patch16_224 +from timm.models.vision_transformer import vit_base_patch16_224 + + +@pytest.mark.parametrize("fused_mlp", [False, True]) +# @pytest.mark.parametrize('fused_mlp', [False]) +@pytest.mark.parametrize("optimized", [False, True]) +# @pytest.mark.parametrize('optimized', [True]) +def test_vit(optimized, fused_mlp): + """Check that our implementation of ViT matches the timm's implementation: + the output of our forward pass in fp16 should be around the same as + timm' forward pass in fp16, when compared to timm's forward pass in fp32. 
+ """ + dtype = torch.float16 + device = "cuda" + + kwargs = {} + if optimized: + kwargs = dict(use_flash_attn=True, fused_bias_fc=True, fused_dropout_add_ln=True) + kwargs["fused_mlp"] = fused_mlp + model = flash_vit_base_patch16_224(**kwargs).to(device=device, dtype=dtype) + + model_ref = vit_base_patch16_224(pretrained=True).to(device=device) + model_timm = vit_base_patch16_224(pretrained=True).to(device=device, dtype=dtype) + + model.load_state_dict(model_ref.state_dict()) + + model.eval() + model_ref.eval() + model_timm.eval() + + torch.manual_seed(0) + batch_size = 2 + x = torch.randn(batch_size, 3, 224, 224, device=device, dtype=dtype) + out = model(x) + out_timm = model_timm(x) + out_ref = model_ref(x.float()) + + print(f"Output max diff: {(out - out_ref).abs().max().item()}") + print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") + print(f"timm fp16 max diff: {(out_timm - out_ref).abs().max().item()}") + print(f"timm fp16 mean diff: {(out_timm - out_ref).abs().mean().item()}") + rtol = 2 if not fused_mlp else 8 + assert (out - out_ref).abs().max().item() < rtol * (out_timm - out_ref).abs().max().item() diff --git a/thepile.yaml b/thepile.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0f93535ce3fd213737f9466c24b8cf3d6f7dd1f --- /dev/null +++ b/thepile.yaml @@ -0,0 +1,14 @@ +_target_: src.datamodules.language_modeling_hf.LMDataModule +dataset_name: the_pile +dataset_config_name: null +tokenizer_name: gpt2 +cache_dir: ${oc.env:DATA_DIR,${data_dir}}/the_pile/cache +max_length: 2048 +add_eos: True +batch_size: 4 # per GPU +batch_size_eval: ${eval:${.batch_size} * 2} +num_workers: 64 # For preprocessing only +use_shmem: False +shuffle: True +pin_memory: True +__train_len: ${div_up:374337375694, ${.max_length}} diff --git a/tile_scheduler.hpp b/tile_scheduler.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ac93ca94b4ce0eb694b923865418ab2e805da483 --- /dev/null +++ b/tile_scheduler.hpp @@ -0,0 +1,273 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include "cutlass/fast_math.h" +#include "cutlass/arch/barrier.h" + +#include "named_barrier.hpp" + +namespace flash { + +/////////////////////////////////////////////////////////////////////////////// + +struct SingleTileScheduler { + +public: + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore = nullptr; + }; + + // Device side kernel params + struct Params {}; + + static Params + to_underlying_arguments(Arguments const& args) { + return {}; + } + + static dim3 + get_grid_dim(Arguments const& args, int num_sm) { + return {uint32_t(args.num_blocks_m), uint32_t(args.num_head), uint32_t(args.num_batch)}; + } + + struct WorkTileInfo { + int M_idx = 0; + int H_idx = 0; + int B_idx = 0; + bool is_valid_tile = false; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return is_valid_tile; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + return {M_idx, H_idx, B_idx}; + } + + }; + + CUTLASS_DEVICE + SingleTileScheduler(int* tile_count_smem_) { } + + CUTLASS_DEVICE + WorkTileInfo + get_initial_work() const { + return {int(blockIdx.x), int(blockIdx.y), int(blockIdx.z), true}; + } + + CUTLASS_DEVICE + void + init_consumer() const {} + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {} + + CUTLASS_DEVICE + void + broadcast_next_work(WorkTileInfo& current_work) const {} + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + return {-1, -1, -1, false}; + } + +}; + +/////////////////////////////////////////////////////////////////////////////// + +class StaticPersistentTileScheduler { + +public: + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore = nullptr; + }; + + // Device side kernel params + struct Params { + int total_blocks; + cutlass::FastDivmod m_block_divmod, head_divmod; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + return {args.num_blocks_m * args.num_head * args.num_batch, + cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head)}; + } + + static dim3 + get_grid_dim(Arguments const& args, int num_sm) { + return {uint32_t(num_sm)}; + } + + struct WorkTileInfo { + int tile_idx; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return tile_idx < params.total_blocks; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + int m_block, bidh, bidb; + bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(m_block, tile_idx)); + return {m_block, bidh, bidb}; + } + + }; + + CUTLASS_DEVICE + StaticPersistentTileScheduler(int* tile_count_smem_) {}; + + CUTLASS_DEVICE + WorkTileInfo + get_initial_work() const { + return {int(blockIdx.x)}; + } + + CUTLASS_DEVICE + void + init_consumer() const {} + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {} + + CUTLASS_DEVICE + void + broadcast_next_work(WorkTileInfo& current_work) const {} + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + return {current_work.tile_idx + int(gridDim.x)}; + } + +}; + +template +class DynamicPersistentTileScheduler { + +protected: + int* const 
tile_count_smem; + +public: + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore; + }; + + // Device side kernel params + struct Params { + int const total_blocks; + cutlass::FastDivmod const m_block_divmod, head_divmod; + int* const tile_count_semaphore; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + return {args.num_blocks_m * args.num_head * args.num_batch, + cutlass::FastDivmod(args.num_blocks_m), cutlass::FastDivmod(args.num_head), + args.tile_count_semaphore}; + } + + static dim3 + get_grid_dim(Arguments const& args, int num_sm) { + return {uint32_t(num_sm)}; + } + + struct WorkTileInfo { + int tile_idx; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return tile_idx < params.total_blocks; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + int m_block, bidh, bidb; + bidb = params.head_divmod.divmod(bidh, params.m_block_divmod.divmod(m_block, tile_idx)); + return {m_block, bidh, bidb}; + } + + }; + + CUTLASS_DEVICE + DynamicPersistentTileScheduler(int* tile_count_smem_) : tile_count_smem(tile_count_smem_) {}; + + CUTLASS_DEVICE + WorkTileInfo + get_initial_work() const { + return {int(blockIdx.x)}; + } + + CUTLASS_DEVICE + void + init_consumer() const { + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemEmpty) /*id*/); + } + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const { + if (threadIdx.x % NumProducerThreads == 0) { + current_work.tile_idx = atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x); + } + } + + CUTLASS_DEVICE + void + broadcast_next_work(WorkTileInfo& current_work) const { + cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemEmpty) /*id*/); + if (threadIdx.x % NumProducerThreads == 0) { + *tile_count_smem = current_work.tile_idx; + } + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemFull) /*id*/); + } + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + if constexpr (IsProducer && NumProducerThreads == cutlass::NumThreadsPerWarp) { + // thread 0 already has the right tile_idx, just need to broadcast to the rest of the producer threads (warp 0) + return {__shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/)}; + } else if constexpr (IsProducer && NumProducerThreads == cutlass::NumThreadsPerWarpGroup) { + // TODO: investigate optimal synchronize + int tile_idx = *tile_count_smem; + return {tile_idx}; + } else { + cutlass::arch::NamedBarrier::sync(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemFull) /*id*/); + int tile_idx = *tile_count_smem; + cutlass::arch::NamedBarrier::arrive(NumMmaThreads + NumProducerThreads, static_cast(FwdNamedBarriers::TileCountSmemEmpty) /*id*/); + return {tile_idx}; + } + } + +}; + +} // flash \ No newline at end of file diff --git a/tile_scheduler_bwd.hpp b/tile_scheduler_bwd.hpp new file mode 100644 index 0000000000000000000000000000000000000000..3c3c5813a707ab3becd5e742c1eab31edba9f0dc --- /dev/null +++ b/tile_scheduler_bwd.hpp @@ -0,0 +1,92 @@ +/****************************************************************************** + * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, 
Tri Dao. + ******************************************************************************/ + +#pragma once + +#include "cutlass/fast_math.h" +#include "cutlass/arch/barrier.h" + +#include "named_barrier.hpp" + +namespace flash { + +/////////////////////////////////////////////////////////////////////////////// + +class SingleTileSchedulerBwd { + +public: + + using SharedStorage = int; + + // Host side kernel arguments + struct Arguments { + int const num_blocks_m, num_head, num_batch; + int* const tile_count_semaphore = nullptr; + int* const cu_seqlens = nullptr; + }; + + // Device side kernel params + struct Params { + int const num_blocks_m, num_head, num_batch; + }; + + static Params + to_underlying_arguments(Arguments const& args) { + return {args.num_blocks_m, args.num_head, args.num_batch}; + } + + static dim3 + get_grid_shape(Params const& params, int num_sm) { + return {uint32_t(params.num_blocks_m), uint32_t(params.num_head), uint32_t(params.num_batch)}; + } + + struct WorkTileInfo { + int M_idx = 0; + int H_idx = 0; + int B_idx = 0; + bool is_valid_tile = false; + + CUTLASS_DEVICE + bool + is_valid(Params const& params) const { + return is_valid_tile; + } + + CUTLASS_DEVICE + cute::tuple + get_block_coord(Params const& params) const { + return {M_idx, H_idx, B_idx}; + } + + }; + + CUTLASS_DEVICE + SingleTileSchedulerBwd(SharedStorage* const smem_scheduler) { } + + template + CUTLASS_DEVICE + WorkTileInfo + get_initial_work(Params const& params) const { + return {int(blockIdx.x), int(blockIdx.y), int(blockIdx.z), true}; + } + + CUTLASS_DEVICE + void + init_consumer() const {} + + CUTLASS_DEVICE + void + prefetch_next_work(Params const& params, WorkTileInfo& current_work) const {} + + template + CUTLASS_DEVICE + WorkTileInfo + get_next_work(Params const& params, WorkTileInfo const& current_work) const { + return {-1, -1, -1, false}; + } + +}; + + +} // flash diff --git a/type_shim.h b/type_shim.h new file mode 100644 index 0000000000000000000000000000000000000000..815ec7ec88967f3b258cf666d43b5fe995f0f2b5 --- /dev/null +++ b/type_shim.h @@ -0,0 +1,20 @@ +#include + +#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ +switch(TYPE) \ +{ \ +case at::ScalarType::Half: \ + { \ +using scalar_t = at::Half; \ +__VA_ARGS__; \ +break; \ + } \ +case at::ScalarType::BFloat16: \ + { \ +using scalar_t = at::BFloat16; \ +__VA_ARGS__; \ +break; \ + } \ +default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ +} diff --git a/usage.md b/usage.md new file mode 100644 index 0000000000000000000000000000000000000000..133bfbdb6b2a8860682fdf82baa0792b07733e8c --- /dev/null +++ b/usage.md @@ -0,0 +1,127 @@ +# FlashAttention adoption + +We've been very happy to see FlashAttention being adopted by many organizations +and research labs to speed up their training / inference (within 6 months after +FlashAttention's release, at the time of writing). +This page contains a partial list of places where FlashAttention is being used. +If you'd like to add links to your organization / product / codebase, please open a +PR or email us. We'd very much like to hear from you! + +## Integrated into machine learning frameworks + +- Pytorch: [integrated](https://github.com/pytorch/pytorch/pull/81434) into core Pytorch in nn.Transformer. + +- Huggingface's [transformers](https://github.com/huggingface/transformers) library. + [On-going](https://github.com/huggingface/transformers/pull/18439), blogpost + coming soon. 
+ +- Microsoft's [DeepSpeed](https://github.com/microsoft/DeepSpeed): + FlashAttention is [integrated](https://github.com/microsoft/DeepSpeed/blob/ec13da6ba7cabc44bb4745a64a208b8580792954/deepspeed/ops/transformer/inference/triton_ops.py) into DeepSpeed's inference engine. + +- Nvidia's [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/pull/267). This + library is a popular framework for training large transformer language models at scale. + +- MosaicML [Composer](https://github.com/mosaicml/composer) + [library](https://www.mosaicml.com/blog/gpt-3-quality-for-500k). Composer is a + library for efficient neural network training. + +- EleutherAI's [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/pull/725). This is a research library for training large transformer language models at scale based on NVIDIA's Megatron-LM and Microsoft's DeepSpeed. + +- PaddlePaddle: integrated into the framework with [API](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/nn/functional/flash_attention.py) `paddle.nn.functional.flash_attention`. + +## MLPerf benchmarks + +[MLPerf](https://mlcommons.org/en/) is a competitive machine learning performance benchmark. FlashAttention +yields the fastest BERT training on cloud instances in MLPerf training 2.0 (June +2022) and MLPerf training 2.1 (November 2022). + +- MLPerf 2.0: [IEEE Spectrum](https://spectrum.ieee.org/mlperf-rankings-2022) and [Forbes](https://www.forbes.com/sites/moorinsights/2022/07/12/google-dethrones-nvidia-in-latest-artificial-intelligence-benchmarking-tests/) articles about our submission to the MLPerf 2.0 benchmark using FlashAttention. + +- MLPerf 2.1 - + collaboration + between [Azure and Hazy Research](https://techcommunity.microsoft.com/t5/azure-high-performance-computing/azure-collaborates-with-hazy-research-and-nvidia-to-achieve/ba-p/3667511): for the first time, we can train MLPerf BERT + in under 2 minutes on 16 nodes. + +- MLPerf 2.1 - + [Nvidia](https://developer.nvidia.com/blog/leading-mlperf-training-2-1-with-full-stack-optimizations-for-ai/): + Nvidia uses techniques from FlashAttention to make their (already extremely optimized) BERT + implementation go even faster. + +- MLPerf 2.1 - [MosaicML](https://www.mosaicml.com/blog/mlperf-nlp-nov2022): FlashAttention + helps train BERT 2.7x faster in the open division. + +## Language model training & inference + +- [PubMedGPT 2.7B](https://crfm.stanford.edu/2022/12/15/pubmedgpt.html), a + domain-specific LLM for biomedicine, by Stanford CRFM, trained on + [MosaicML](https://www.mosaicml.com/blog/introducing-pubmed-gpt) Cloud. Just + using FlashAttention nearly halves the total training time. + +- Meta's + [AITemplate](https://ai.facebook.com/blog/gpu-inference-engine-nvidia-amd-open-source/) + uses FlashAttention as part of their approach to speed up Transformer + inference (up to 5.3x on BERT). + +- Nvidia's [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) is a + state-of-the-art Transformer inference library. As of version + [5.2](https://github.com/NVIDIA/FasterTransformer/commit/b672f49e256ba7a2d4fc9691d270b60b7fc1a2ff), + FlashAttention is used as a component of FasterTransformer to speed up GPT inference. + +- [Kernl](https://github.com/ELS-RD/kernl) is a library for fast Transformer + inference. They use FlashAttention as part of their + [approach](https://twitter.com/pommedeterre33/status/1585284221014245377) to + speed up Transformers by up to 12x. 
+ +## Diffusion model training and inference + +- Huggingface's [diffusers](https://github.com/huggingface/diffusers) library + for diffusion models. FlashAttention is integrated into [diffusers + v0.7.0](https://github.com/huggingface/diffusers/releases/tag/v0.7.0). + Up to 2x faster inference and lower memory usage. + +- Colossal-AI's + [implementation](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion) + of Stable Diffusion: with FlashAttention as one of its components, it speeds up + pretraining by up to 6.5x, and reduces the hardware cost of fine-tuning by 7x. + +- Meta's + [AITemplate](https://ai.facebook.com/blog/gpu-inference-engine-nvidia-amd-open-source/) + with FlashAttention one of the components, is currently the [fastest](https://twitter.com/bing_xu_/status/1590447334055632897) Stable + Diffusion inference engine that we know of. + +- Stable Diffusion inference from + [Labml.ai](https://twitter.com/labmlai/status/1573634095732490240): 50% speedup. + +- Our own Stable Diffusion [fork](https://twitter.com/realDanFu/status/1580641495991754752) uses FlashAttention to get 3-4x speedup compared + to the original version. + +## Other models + +- [Uni-Fold](https://github.com/dptech-corp/Uni-Fold): Uni-Fold is an + open-source platform for developing protein models beyond AlphaFold. With + FlashAttention, Uni-Fold is 2.6x + [faster](https://twitter.com/guolin_ke/status/1580532071901995008) than AlphaFold. + +- [OpenFold](https://github.com/aqlaboratory/openfold): a trainable, + memory-efficient, and GPU-friendly PyTorch reproduction of AlphaFold 2. With + FlashAttention as one of its + [components](https://twitter.com/gahdritz/status/1595420944880779266), it is + up to 3x faster than AlphaFold2 to run inference on short sequences, and can + predict 2x longer structures. + +## Different implementations + +- [Triton](https://github.com/openai/triton): an [implementation](https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py) of + FlashAttention in Triton by Phil Tillet from OpenAI. Triton is a Python-based + language and compiler for parallel programming. + +- [xformers](https://github.com/facebookresearch/xformers): The xformers team + has implemented [memory-efficient + attention](https://twitter.com/fvsmassa/status/1580229170629849089) in a + similar spirit to FlashAttention. + xformers dynamically dispatches to whichever implementation is available / faster. + +- [Jax](https://github.com/google/jax): an [implementation](https://github.com/lucidrains/flash-attention-jax) + in Jax by [lucidrains](https://github.com/lucidrains/). + +- [Metal](https://developer.apple.com/metal): an [implementation](https://github.com/philipturner/metal-flash-attention) in Metal by Philip Turner. This ports FlashAttention to mobile GPU architectures such as Apple silicon. diff --git a/utils.h b/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..aaf0712ad9bf54141607cb6135fd259aaf045a4a --- /dev/null +++ b/utils.h @@ -0,0 +1,353 @@ +/****************************************************************************** + * Copyright (c) 2024, Tri Dao. 
+ ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#include +#endif + +#include +#include // For cute::elect_one_sync() + +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status_ = call; \ + if (status_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error (%s:%d): %s\n", __FILE__, __LINE__, cudaGetErrorString(status_)); \ + exit(1); \ + } \ + } while(0) + +#define CHECK_CUDA_KERNEL_LAUNCH() CHECK_CUDA(cudaGetLastError()) + + +namespace flash { + +using namespace cute; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct MaxOp { +__device__ __forceinline__ T operator()(T const & x, T const & y) { return x > y ? x : y; } +}; + +template <> +struct MaxOp { +// This is slightly faster +__device__ __forceinline__ float operator()(float const &x, float const &y) { return max(x, y); } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct SumOp { +__device__ __forceinline__ T operator()(T const & x, T const & y) { return x + y; } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct Allreduce { + static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4); + template + static __device__ __forceinline__ T run(T x, Operator &op) { + constexpr int OFFSET = THREADS / 2; + x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET)); + return Allreduce::run(x, op); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template<> +struct Allreduce<2> { +template +static __device__ __forceinline__ T run(T x, Operator &op) { + x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1)); + return x; +} +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, MMA_N)) +// For SM90, convert acc_layout from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_rowcol(Layout acc_layout) { + if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90 + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + static_assert(decltype(rank(acc_layout))::value == 3); + auto l = acc_layout; + return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l))); + } else { // SM80 + static_assert(decltype(size<0>(acc_layout))::value == 4); + static_assert(decltype(rank(acc_layout))::value == 3); + auto l = logical_divide(acc_layout, Shape<_2>{}); // ((2, 2), MMA_M, MMA_N) + return make_layout(make_layout(get<0, 1>(l), get<1>(l)), make_layout(get<0, 0>(l), get<2>(l))); + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// For SM90, convert acc_layout from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M), ncol=(2, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_transposed_rowcol(Layout acc_layout) { + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + 
static_assert(decltype(rank(acc_layout))::value == 3); + auto l = acc_layout; + return make_layout(make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)), make_layout(get<0, 1>(l), get<1>(l))); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2) +// if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8. +// For SM90, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((2, 2, 2), MMA_M, (N / 16, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_Aregs(Layout acc_layout) { + using X = Underscore; + if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90 + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + static_assert(decltype(rank(acc_layout))::value == 3); + static_assert(decltype(rank(get<0>(acc_layout)))::value == 3); + auto l = logical_divide(get<0>(acc_layout), Shape{}); // (2, 2, (2, N / 16))) + return make_layout(make_layout(get<0>(l), get<1>(l), get<2, 0>(l)), get<1>(acc_layout), make_layout(get<2, 1>(l), get<2>(acc_layout))); + } else { // SM80 + static_assert(decltype(size<0>(acc_layout))::value == 4); + static_assert(decltype(rank(acc_layout))::value == 3); + constexpr int mma_shape_K = get<2>(typename MMA_traits::Shape_MNK{}); + static_assert(mma_shape_K == 8 || mma_shape_K == 16); + if constexpr (mma_shape_K == 8) { + return acc_layout; + } else { + auto l = logical_divide(acc_layout, Shape{}); // (4, MMA_M, (2, MMA_N / 2))) + return make_layout(make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l)); + } + } +}; + +// Convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to ((4, 2, 2), MMA_M, (N / 32, MMA_N)) +template +__forceinline__ __device__ auto convert_layout_acc_Aregs_fp8(Layout acc_layout) { + using X = Underscore; + static_assert(decltype(size<0, 0>(acc_layout))::value == 2); + static_assert(decltype(size<0, 1>(acc_layout))::value == 2); + static_assert(decltype(rank(acc_layout))::value == 3); + static_assert(decltype(rank(get<0>(acc_layout)))::value == 3); + auto l = logical_divide(get<0>(acc_layout), Shape{}); // (2, 2, (2, N / 32))) + return make_layout(make_layout(Shape<_4, _2, _2>{}), + get<1>(acc_layout), + make_layout(get<2, 1>(l), get<2>(acc_layout))); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// Byte permute for fp8 kernel +template +CUTLASS_DEVICE void permute_regs_A_to_C(Fragment &accum) { + + auto data = accum.data(); + + #pragma unroll + for (int n = 0; n < size(accum); n += 8) { + uint32_t *data_32bit = reinterpret_cast(&data[n]); + auto upper = data_32bit[0]; + auto lower = data_32bit[1]; + data_32bit[0] = __byte_perm(upper, lower, 0x5410); + data_32bit[1] = __byte_perm(upper, lower, 0x7632); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ auto convert_type(Tensor const &tensor) { + using From_type = typename Engine::value_type; + constexpr int numel = decltype(size(tensor))::value; + cutlass::NumericArrayConverter convert_op; + // HACK: this requires tensor to be "contiguous" + auto frag = convert_op(*reinterpret_cast *>(tensor.data())); + return make_tensor(make_rmem_ptr(&frag), tensor.layout()); + // Tensor out = make_tensor_like(tensor); + // cute::copy(make_tensor(make_rmem_ptr(&frag), tensor.layout()), out); + // 
return out; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void gemm(TiledMma &tiled_mma, Tensor0 const &tCrA, Tensor1 const &tCrB, Tensor2 &tCrC) { + constexpr bool Is_RS = !cute::is_base_of::value; + // Need to cast away const on tCrA since warpgroup_fence_operand doesn't take const + if constexpr (Is_RS) { warpgroup_fence_operand(const_cast(tCrA)); } + warpgroup_fence_operand(tCrC); + if constexpr (arrive) { + warpgroup_arrive(); + } + if constexpr (zero_init) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + } else { + // cute::gemm(tiled_mma, tCrA, tCrB, tCrC); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + cute::gemm(tiled_mma, tCrA(_,_,k_block), tCrB(_,_,k_block), tCrC); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + } + if constexpr (commit) { + warpgroup_commit_batch(); + } + if constexpr (wg_wait >= 0) { warpgroup_wait(); } + warpgroup_fence_operand(tCrC); + if constexpr (Is_RS) { warpgroup_fence_operand(const_cast(tCrA)); } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void copy(TiledCopy tiled_copy, Tensor const &S, + Tensor &D, Tensor const &identity_MN, + Tensor const &predicate_K, const int max_MN=0) { + CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{}); + CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{}); + CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA + CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M + CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K + // There's no case where !Clear_OOB_K && Clear_OOB_MN + static_assert(!(Clear_OOB_MN && !Clear_OOB_K)); + #pragma unroll + for (int m = 0; m < size<1>(S); ++m) { + if (Is_even_MN || get<0>(identity_MN(0, m, 0)) < max_MN) { + #pragma unroll + for (int k = 0; k < size<2>(S); ++k) { + if (Is_even_K || predicate_K(k)) { + cute::copy(tiled_copy, S(_, m, k), D(_, m, k)); + } else if (Clear_OOB_K) { + cute::clear(D(_, m, k)); + } + } + } else if (Clear_OOB_MN) { + cute::clear(D(_, m, _)); + } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +template +__forceinline__ __device__ void write_tma( + ElemO* O, const TMACopyO& tma_store_O, + const LayoutO& layout_O, const TileShapeO& tile_shape_O, + const SMemO& sO, int m_block, int bidh, int bidb, + const SeqLenTraits& seqlen_traits_o, int write_warp_idx) { + Tensor mO = tma_store_O.get_tma_tensor(layout_O.shape()); + Tensor gO = seqlen_traits_o.get_local_tile_tensor( + mO, tile_shape_O, bidh, bidb + )(_, _, m_block); // (M, K) + auto block_tma_O = tma_store_O.get_slice(_0{}); + Tensor tOgO = block_tma_O.partition_D(gO); // (TMA, TMA_M, TMA_K) + Tensor tOsO = block_tma_O.partition_S(sO); // (TMA, TMA_M, TMA_K) + + int const lane_predicate = cute::elect_one_sync(); + int const warp_idx = cutlass::canonical_warp_idx_sync(); + if (warp_idx == write_warp_idx && lane_predicate) { + cute::copy(tma_store_O, tOsO, tOgO); + tma_store_arrive(); + } + // Note: no wait here. 
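  // (The commented-out tma_store_wait below would block at this point; instead the
  // matching wait is presumably left to the caller, which should wait before the O
  // tile in shared memory is reused, so the TMA store can overlap subsequent work.)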
+ // tma_store_wait<0>(); +} + +template +__forceinline__ __device__ void write_tiled( + ElemO* O, const TiledCopyO& tiled_copy_O, + const LayoutO& layout_O, const TileShapeO& tile_shape_O, + const SMemO& sO, int m_block, int bidh, int bidb, + const SeqLenTraits& seqlen_traits_o) { + Tensor mO = make_tensor(make_gmem_ptr(O), layout_O); + Tensor gO = seqlen_traits_o.get_local_tile_tensor( + mO, tile_shape_O, bidh, bidb + )(_, _, m_block); // (M, K) + + ThrCopy thr_copy_O = tiled_copy_O.get_slice(threadIdx.x - NumCopyThreads); + Tensor tOgO = thr_copy_O.partition_D(gO); // (CPY,CPY_M,CPY_K,k) + Tensor tOsO = thr_copy_O.partition_S(sO); // (CPY,CPY_M,CPY_K) + + // Prepare for TiledCopy. + // Grouping is needed because cute::copy_if() does group_modes<1, R> for src and dst. + // After grouping, the first dim is number of elements to read together. + Tensor tOsOFlatten = cute::flatten(tOsO); + Tensor tOsOGroup = cute::group_modes<1, rank(tOsOFlatten)>(tOsOFlatten); + Tensor tOgOFlatten = cute::flatten(tOgO); + Tensor tOgOGroup = cute::group_modes<1, rank(tOgOFlatten)>(tOgOFlatten); + + // Get thread coords to global index mapping. + Tensor gOCounting = cute::make_identity_tensor(gO.shape()); + Tensor tSgOCounting = thr_copy_O.partition_D(gOCounting); + Tensor tSgOCountingFlatten = cute::flatten(tSgOCounting); + Tensor tSgOCountingGrouped = + cute::group_modes<1, rank(tSgOCountingFlatten)>(tSgOCountingFlatten); + + // Write out to GMEM. + const int kNumMsPerTile = get<0>(tile_shape_O); + int cta_m = std::min( + seqlen_traits_o.actual_seq_len - m_block * kNumMsPerTile, kNumMsPerTile + ); + if (cta_m == kNumMsPerTile) { + copy(tiled_copy_O, tOsOGroup, tOgOGroup); + } else { + auto predicate_fn = [&](auto coords) { + auto s_coords = tSgOCountingGrouped(_0{}, coords); + return elem_less(get<0>(s_coords), cta_m); + }; + copy_if(tiled_copy_O, predicate_fn, tOsOGroup, tOgOGroup); + } +} + +template +__forceinline__ __device__ void write_O( + ElemO* O, const TMACopyO& tma_copy_O, const TiledCopyO& tiled_copy_O, + const LayoutO& layout_O, const TileShapeO& tile_shape_O, + const SMemO& sO, int m_block, int bidh, int bidb, + const SeqLenTraits& seqlen_traits_o, int write_warp_idx) { + if constexpr (IsTMACopy) { + write_tma(O, tma_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o, write_warp_idx); + } else { + write_tiled(O, tiled_copy_O, layout_O, tile_shape_O, sO, m_block, bidh, bidb, seqlen_traits_o); + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace flash diff --git a/vit.py b/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..4602fd7414d251e40f9d42250c23cc974d596661 --- /dev/null +++ b/vit.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022, Tri Dao. 
+# Inspired by / adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +import math +import re +from collections import OrderedDict +from copy import deepcopy +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from timm.models.helpers import named_apply +from torch.nn.init import trunc_normal_ +from torchvision.ops import StochasticDepth + +from flash_attn.layers.patch_embed import PatchEmbed +from flash_attn.modules.block import Block +from flash_attn.modules.mha import MHA +from flash_attn.modules.mlp import FusedMLP, Mlp + +try: + from flash_attn.ops.triton.layer_norm import layer_norm_fn +except ImportError: + layer_norm_fn = None + + +def create_mixer_cls( + num_heads, qkv_bias, attn_drop, use_flash_attn, fused_bias_fc, cross_attn=False +): + mixer_cls = partial( + MHA, + num_heads=num_heads, + cross_attn=cross_attn, + qkv_proj_bias=qkv_bias, + dropout=attn_drop, + fused_bias_fc=fused_bias_fc, + use_flash_attn=use_flash_attn, + ) + return mixer_cls + + +def create_mlp_cls(embed_dim, mlp_ratio, act_layer, fused_mlp): + inner_dim = int(embed_dim * mlp_ratio) + if not fused_mlp: + mlp_cls = partial(Mlp, hidden_features=inner_dim, activation=act_layer()) + else: + mlp_cls = partial(FusedMLP, hidden_features=inner_dim) + return mlp_cls + + +def create_block( + embed_dim, + num_heads, + mlp_ratio, + qkv_bias, + drop_rate, + attn_drop_rate, + drop_path1, + drop_path2, + norm_layer, + act_layer, + use_flash_attn, + fused_bias_fc, + fused_mlp, + fused_dropout_add_ln, + layer_idx=None, + n_layer=None, + last_layer_subset=False, +): + mixer_cls = create_mixer_cls( + num_heads, + qkv_bias, + attn_drop_rate, + use_flash_attn, + fused_bias_fc, + cross_attn=(last_layer_subset and layer_idx == n_layer - 1), + ) + mlp_cls = create_mlp_cls(embed_dim, mlp_ratio, act_layer, fused_mlp) + # TD [2022-10-15]: Force residual in fp32 in case of DeepSpeed + block = Block( + embed_dim, + mixer_cls, + mlp_cls, + norm_cls=norm_layer, + prenorm=True, + resid_dropout1=drop_rate, + resid_dropout2=drop_rate, + drop_path1=drop_path1, + drop_path2=drop_path2, + fused_dropout_add_ln=fused_dropout_add_ln, + residual_in_fp32=True, + ) + return block + + +class VisionTransformer(nn.Module): + """Vision Transformer + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool="token", + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + init_values=None, + class_token=True, + no_embed_class=False, + pre_norm=False, + fc_norm=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + weight_init="", + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + use_flash_attn=False, + fused_bias_fc=False, + fused_mlp=False, + fused_dropout_add_ln=False, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + global_pool (str): type of global pooling for final sequence (default: 'token') + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + 
init_values (float): layer-scale init values
+            class_token (bool): use class token
+            fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None)
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            weight_init (str): weight init scheme
+            embed_layer (nn.Module): patch embedding layer
+            norm_layer (nn.Module): normalization layer
+            act_layer (nn.Module): MLP activation layer
+        """
+        super().__init__()
+        assert global_pool == "token", "Only support pooling with CLS token"
+        assert class_token
+        assert init_values is None, "LayerScale is not supported yet"
+        assert weight_init == ""
+        assert fc_norm is None
+        # pre_norm seems redundant, as there's a LayerNorm right at the start of each block
+        assert not pre_norm
+        use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+        act_layer = act_layer or nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = (
+            self.embed_dim
+        ) = embed_dim  # num_features for consistency with other models
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.no_embed_class = no_embed_class
+
+        patch_embed_extra_kwargs = (
+            {"fused_bias_fc": fused_bias_fc} if embed_layer is PatchEmbed else {}
+        )
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+            **patch_embed_extra_kwargs,
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
+        embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
+
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, depth)
+        ]  # stochastic depth decay rule
+
+        # We change the order of dropout, residual and layer norm:
+        # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
+        # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
+        # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
+        # nn.Dropout probabilities is changed.
+        # This is for performance reasons: we can fuse dropout + add + layer_norm.
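+        # Concretely (a sketch of the remapping, not extra functionality): a standard
+        # pre-norm block i computes
+        #     x = x + drop_path_i(dropout(mixer(LN1(x))))
+        #     x = x + drop_path_i(dropout(mlp(LN2(x))))
+        # whereas Block below applies dropout/add/LN at the *start* of the layer, so the
+        # drop-path rate that used to close block i-1 is handed to block i as drop_path1
+        # (hence dpr[i - 1] below), while block i's own rate goes to drop_path2 (dpr[i]).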
+ self.blocks = nn.ModuleList( + [ + create_block( + embed_dim, + num_heads, + mlp_ratio, + qkv_bias, + drop_rate, + attn_drop_rate, + drop_path1=dpr[i - 1] if i > 0 else 0.0, + drop_path2=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + use_flash_attn=use_flash_attn, + fused_bias_fc=fused_bias_fc, + fused_mlp=fused_mlp, + fused_dropout_add_ln=fused_dropout_add_ln, + layer_idx=i, + n_layer=depth, + last_layer_subset=(global_pool == "token"), + ) + for i in range(depth) + ] + ) + + self.dropout = nn.Dropout(p=drop_rate) + self.drop_path = StochasticDepth(p=dpr[-1], mode="row") + self.norm = norm_layer(embed_dim) + + self.fused_dropout_add_ln = fused_dropout_add_ln + if self.fused_dropout_add_ln and layer_norm_fn is None: + raise ImportError("Triton is not installed") + + # Classifier Head + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + self.init_weights(weight_init) + + def init_weights(self, mode=""): + assert mode == "" + trunc_normal_(self.pos_embed, std=0.02) + if self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def _init_weights(self, m): + # this fn left here for compat with downstream users + init_weights_vit_timm(m) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed", "cls_token"} + + def _pos_embed(self, x): + if self.no_embed_class: + # deit-3, updated JAX (big vision) + # position embedding does not overlap with class token, add then concat + x = x + self.pos_embed + if self.cls_token is not None: + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + else: + # original timm, JAX, and deit vit impl + # pos_embed has entry for class token, concat then add + if self.cls_token is not None: + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.pos_embed + return x + + def forward_features(self, x, all_tokens=True): + """ + If all_tokens==False and self.global_pool == 'token', we only return the features for the + cls token. + """ + x = self.patch_embed(x) + hidden_states = self._pos_embed(x) + residual = None + if self.global_pool != "token" or all_tokens: + # if True: + for block in self.blocks: + hidden_states, residual = block(hidden_states, residual) + else: + for block in self.blocks[:-1]: + hidden_states, residual = block(hidden_states, residual) + # For the last layer, we only want the 1st token of the output. So we do cross-attention + # where the query is the 1st token and the key/value is the whole sequence. 
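+            # (Note: with mixer_subset=slice(0, 1) the last block attends from the CLS
+            # token only, so its attention cost is linear rather than quadratic in the
+            # number of patches; the classifier head only consumes that token anyway.)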
+            hidden_states, residual = self.blocks[-1](
+                hidden_states, residual, mixer_subset=slice(0, 1)
+            )
+        if not self.fused_dropout_add_ln:
+            residual = self.drop_path(self.dropout(hidden_states)) + residual
+            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
+        else:
+            if self.drop_path.p == 0 or not self.training:
+                rowscale = None
+            else:
+                rowscale = self.drop_path(
+                    torch.ones(
+                        hidden_states.shape[:-1],
+                        device=hidden_states.device,
+                        dtype=hidden_states.dtype,
+                    )
+                )
+            # Set prenorm=False here since we don't need the residual
+            hidden_states = layer_norm_fn(
+                hidden_states,
+                self.norm.weight,
+                self.norm.bias,
+                residual=residual,
+                eps=self.norm.eps,
+                dropout_p=self.dropout.p if self.training else 0.0,
+                rowscale=rowscale,
+                prenorm=False,
+            )
+        return hidden_states
+
+    def forward_head(self, x, pre_logits: bool = False):
+        if self.global_pool:
+            x = x[:, self.num_prefix_tokens :].mean(dim=1) if self.global_pool == "avg" else x[:, 0]
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x):
+        x = self.forward_features(x, all_tokens=False)
+        x = self.forward_head(x)
+        return x
+
+    def load_state_dict(self, state_dict, strict=True):
+        patch_embed_weight = state_dict["patch_embed.proj.weight"]
+        if patch_embed_weight.dim() == 4:
+            # convert from Conv2d to Linear
+            state_dict["patch_embed.proj.weight"] = rearrange(
+                patch_embed_weight, "o c h w -> o (c h w)"
+            )
+
+        def key_mapping_attn(key):
+            key = re.sub(r"^blocks.(\d+).attn.qkv.", r"blocks.\1.mixer.Wqkv.", key)
+            key = re.sub(r"^blocks.(\d+).attn.proj.", r"blocks.\1.mixer.out_proj.", key)
+            return key
+
+        state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
+        n_layer = len(self.blocks)
+        # Convert from Wqkv to Wq and Wkv for cross attention (last layer)
+        if (
+            self.blocks[-1].mixer.cross_attn
+            and f"blocks.{n_layer - 1}.mixer.Wqkv.weight" in state_dict
+        ):
+            Wqkv = state_dict.pop(f"blocks.{n_layer - 1}.mixer.Wqkv.weight")
+            bqkv = state_dict.pop(f"blocks.{n_layer - 1}.mixer.Wqkv.bias")
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wq.weight"] = Wqkv[: self.embed_dim]
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wkv.weight"] = Wqkv[self.embed_dim :]
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wq.bias"] = bqkv[: self.embed_dim]
+            state_dict[f"blocks.{n_layer - 1}.mixer.Wkv.bias"] = bqkv[self.embed_dim :]
+        return super().load_state_dict(state_dict, strict=strict)
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, "init_weights"):
+        module.init_weights()
+
+
+def vit_base_patch16_224(pretrained=False, **kwargs):
+    """ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+ """ + assert not pretrained + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = VisionTransformer(**model_kwargs) + return model diff --git a/wandb.yaml b/wandb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6ae21d3aec18aa281a6d1ef5ea41199a4c04295 --- /dev/null +++ b/wandb.yaml @@ -0,0 +1,26 @@ +defaults: + - default.yaml + +watch_model: + _target_: src.callbacks.wandb_callbacks.WatchModel + log: "all" + log_freq: 100 + +upload_code_as_artifact: + _target_: src.callbacks.wandb_callbacks.UploadCodeAsArtifact + code_dir: ${work_dir}/src + +upload_ckpts_as_artifact: + _target_: src.callbacks.wandb_callbacks.UploadCheckpointsAsArtifact + ckpt_dir: "checkpoints/" + upload_best_only: True + +log_f1_precision_recall_heatmap: + _target_: src.callbacks.wandb_callbacks.LogF1PrecRecHeatmap + +log_confusion_matrix: + _target_: src.callbacks.wandb_callbacks.LogConfusionMatrix + +log_image_predictions: + _target_: src.callbacks.wandb_callbacks.LogImagePredictions + num_samples: 8 diff --git a/xentropy_kernel.cu b/xentropy_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8d8836e6e45cf13bc6cc0e7ef5917cb7b78212d1 --- /dev/null +++ b/xentropy_kernel.cu @@ -0,0 +1,760 @@ +// Adapted from https://github.com/NVIDIA/apex/blob/master/apex/contrib/csrc/xentropy/xentropy_kernel.cu +// TD [2022-09-17]: We make it work for bfloat16, and add an option to do the backward inplace (to save memory). +/** + * From PyTorch: + * + * Copyright (c) 2016- Facebook, Inc (Adam Paszke) + * Copyright (c) 2014- Facebook, Inc (Soumith Chintala) + * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) + * Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) + * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) + * Copyright (c) 2011-2013 NYU (Clement Farabet) + * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) + * Copyright (c) 2006 Idiap Research Institute (Samy Bengio) + * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + * + * From Caffe2: + * + * Copyright (c) 2016-present, Facebook Inc. All rights reserved. + * + * All contributions by Facebook: + * Copyright (c) 2016 Facebook Inc. + * + * All contributions by Google: + * Copyright (c) 2015 Google Inc. + * All rights reserved. + * + * All contributions by Yangqing Jia: + * Copyright (c) 2015 Yangqing Jia + * All rights reserved. + * + * All contributions from Caffe: + * Copyright(c) 2013, 2014, 2015, the respective contributors + * All rights reserved. + * + * All other contributions: + * Copyright(c) 2015, 2016 the respective contributors + * All rights reserved. + * + * Caffe2 uses a copyright model similar to Caffe: each contributor holds + * copyright over their contributions to Caffe2. The project versioning records + * all such contribution and copyright details. If a contributor wants to further + * mark their specific copyright on a particular contribution, they should + * indicate their copyright solely in the commit message of the change when it is + * committed. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America + * and IDIAP Research Institute nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +#include +#include + +// https://github.com/NVIDIA/apex/blob/master/csrc/type_shim.h +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, LEVEL, NAME, ...) \ + switch(TYPE) \ + { \ + case at::ScalarType::Float: \ + { \ + using scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: \ + { \ + using scalar_t_##LEVEL = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: \ + { \ + using scalar_t_##LEVEL = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: \ + AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } +// #else +// #define DISPATCH_FLOAT_AND_HALF_AND_BF16(TYPE, LEVEL, NAME, ...) 
\ +// switch(TYPE) \ +// { \ +// case at::ScalarType::Float: \ +// { \ +// using scalar_t_##LEVEL = float; \ +// __VA_ARGS__; \ +// break; \ +// } \ +// case at::ScalarType::Half: \ +// { \ +// using scalar_t_##LEVEL = at::Half; \ +// __VA_ARGS__; \ +// break; \ +// } \ +// default: \ +// AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ +// } +// #endif + +#define ALIGN_BYTES 16 + +using Tensor = at::Tensor; +using TensorList = at::TensorList; +using ScalarType = at::ScalarType; +using at::acc_type; + +template +struct LogSoftMaxForwardEpilogue { + __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum) + : logsum(max_input + std::log(sum)) {} + + __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_log_sum_exp) + : logsum(max_log_sum_exp) {} + + __device__ __forceinline__ OutT operator()(T input) const { + return static_cast(input - logsum); + } + + const AccumT logsum; +}; + +template +struct LogSoftMaxBackwardEpilogue { + __device__ __forceinline__ LogSoftMaxBackwardEpilogue(AccumT sum) + : sum(sum) {} + + __device__ __forceinline__ T operator()(OutT gradOutput, OutT output) const { + return static_cast(gradOutput - std::exp(static_cast(output)) * sum); + } + + const AccumT sum; +}; + + + +const int max_threads = 1024; + +inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = std::min(dim_size / ILP, static_cast(max_threads)); + while (block_size < (max_block_size/2)) block_size *= 2; + // Launch at least a single warp - the kernel assumes that. + block_size = std::max(block_size, static_cast(32)); + return dim3(block_size); +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? 
b : a; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +// Regular kernel (fast when dim_size is large; requires inner_size == 1) +//////////////////////////////////////////////////////////////////////////////// + + +template +struct MaxFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { + return ::max(max, (AccumT)v); + } +}; + +template +struct AddFloat +{ + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + v; + } +}; + +template +struct SumExpFloat +{ + __device__ __forceinline__ SumExpFloat(AccumT v) + : max_k(v) {} + + __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { + return sum + std::exp(v - max_k); + } + + const AccumT max_k; +}; + +template class Reduction, typename AccumT> +__device__ __forceinline__ AccumT +blockReduce(AccumT* smem, AccumT val, + const Reduction& r, + AccumT defaultVal) +{ + // To avoid RaW races from chaining blockReduce calls together, we need a sync here + __syncthreads(); + + smem[threadIdx.x] = val; + + __syncthreads(); + + AccumT warpVal = defaultVal; + + // First warp will perform per-warp reductions for the remaining warps + uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1; + if (threadIdx.x < 32) { + int lane = threadIdx.x % 32; + if (lane < blockDim.x / 32) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + warpVal = r(warpVal, smem[lane * 32 + i]); + } + __syncwarp(mask); + smem[lane] = warpVal; + } + } + + __syncthreads(); + + // First thread will perform a reduction of the above per-warp reductions + AccumT blockVal = defaultVal; + + if (threadIdx.x == 0) { + for (int i = 0; i < blockDim.x / 32; ++i) { + blockVal = r(blockVal, smem[i]); + } + smem[0] = blockVal; + } + + // Sync and broadcast + __syncthreads(); + return smem[0]; +} + +template class Reduction1, template class Reduction2, typename AccumT> +__device__ __forceinline__ void +blockReduce(AccumT* smem, + AccumT* reducVal1, + AccumT val1, + const Reduction1& r1, + AccumT defaultVal1, + AccumT* reducVal2, + AccumT val2, + const Reduction2& r2, + AccumT defaultVal2) +{ + // To avoid RaW races from chaining blockReduce calls together, we need a sync here + __syncthreads(); + + smem[threadIdx.x] = val1; + smem[blockDim.x + threadIdx.x] = val2; + + __syncthreads(); + + AccumT warpVal1 = defaultVal1; + AccumT warpVal2 = defaultVal2; + + // First warp will perform per-warp reductions for the remaining warps + uint32_t mask = (((uint64_t)1) << (blockDim.x / 32)) - 1; + if (threadIdx.x < 32) { + int lane = threadIdx.x % 32; + if (lane < blockDim.x / 32) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + warpVal1 = r1(warpVal1, smem[lane * 32 + i]); + warpVal2 = r2(warpVal2, smem[lane * 32 + i + blockDim.x]); + } + __syncwarp(mask); + smem[lane] = warpVal1; + smem[lane + blockDim.x] = warpVal2; + } + } + + __syncthreads(); + + // First thread will perform a reduction of the above per-warp reductions + AccumT blockVal1 = defaultVal1; + AccumT blockVal2 = defaultVal2; + + if (threadIdx.x == 0) { + for (int i = 0; i < blockDim.x / 32; ++i) { + blockVal1 = r1(blockVal1, smem[i]); + blockVal2 = r2(blockVal2, smem[i + blockDim.x]); + } + smem[0] = blockVal1; + smem[blockDim.x] = blockVal2; + } + + // Sync and broadcast + __syncthreads(); + *reducVal1 = smem[0]; + *reducVal2 = smem[blockDim.x]; + __syncthreads(); +} + +template class Reduction, int ILP, typename T, typename AccumT> +__device__ __forceinline__ AccumT +ilpReduce(int shift, + T* data, + int size, + 
const Reduction& r, + AccumT defaultVal) +{ + typedef typename std::aligned_storage::type LoadT; + AccumT threadVal = defaultVal; + int offset = threadIdx.x; + + // shift and do 1 + if(shift > 0){ + data -= shift; + size += shift; + if(threadIdx.x >= shift){ + threadVal = r(threadVal, data[offset]); + } + size -= blockDim.x; + data += blockDim.x; + } + int last = size % (ILP * blockDim.x); + + T v[ILP]; + LoadT* value = reinterpret_cast(&v); + + for (; offset * ILP < (size - last); offset += blockDim.x) { + *value = reinterpret_cast(data)[offset]; + + for (int j = 0; j < ILP; ++j) { + threadVal = r(threadVal, v[j]); + } + } + + offset = size - last + threadIdx.x; + // Epilogue + for (; offset < size; offset += blockDim.x) + threadVal = r(threadVal, data[offset]); + + return threadVal; +} + +template class Reduction1, template class Reduction2, int ILP, typename T, typename AccumT> +__device__ __forceinline__ void +ilpReduce(int shift, + T* data, + int size, + AccumT* reducVal1, + const Reduction1& r1, + AccumT defaultVal1, + AccumT* reducVal2, + const Reduction2& r2, + AccumT defaultVal2) +{ + typedef typename std::aligned_storage::type LoadT; + + AccumT threadVal1 = defaultVal1; + AccumT threadVal2 = defaultVal2; + int offset = threadIdx.x; + + // shift and do 1 + if(shift > 0){ + data -= shift; + size += shift; + if(threadIdx.x >= shift){ + threadVal1 = r1(threadVal1, data[offset]); + threadVal2 = r2(threadVal2, data[offset]); + } + size -= blockDim.x; + data += blockDim.x; + } + int last = size % (ILP * blockDim.x); + + T v[ILP]; + LoadT* value = reinterpret_cast(&v); + + for (; offset * ILP < (size - last); offset += blockDim.x) { + *value = reinterpret_cast(data)[offset]; + + for (int j = 0; j < ILP; ++j) { + threadVal1 = r1(threadVal1, v[j]); + threadVal2 = r2(threadVal2, v[j]); + } + } + + offset = size - last + threadIdx.x; + // Epilogue + for (; offset < size; offset += blockDim.x) { + threadVal1 = r1(threadVal1, data[offset]); + threadVal2 = r2(threadVal2, data[offset]); + } + + *reducVal1 = threadVal1; + *reducVal2 = threadVal2; +} + +template class Epilogue> +__global__ void +cunn_SoftMaxXEntropyForward( + accscalar_t *losses, + outscalar_t *max_log_sum_exp, + scalar_t *input, + int64_t *labels, + int64_t classes, + const float smoothing, + const int total_classes) +{ + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + // forward pointers to batch[blockIdx.x] + // each block handles a sample in the mini-batch + input += blockIdx.x * classes; + //output += blockIdx.x * classes; + const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t); + + int64_t label = labels[blockIdx.x]; + + // find the max and sum + accscalar_t threadMax, threadSum, max_k, sum_k; + ilpReduce( + shift, input, classes, + &threadMax, MaxFloat(), + -at::numeric_limits::max(), + &threadSum, AddFloat(), + static_cast(0)); + + blockReduce( + sdata, + &max_k, threadMax, Max(), + -at::numeric_limits::max(), + &sum_k, threadSum, Add(), + static_cast(0)); + + accscalar_t threadExp = ilpReduce(shift, input, classes, SumExpFloat(max_k), static_cast(0)); + accscalar_t sumAll = blockReduce( + sdata, threadExp, Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + + // calculate per element loss with label smoothing + // reserve max + log_sum_exp for bprop + if (threadIdx.x == 0) { + accscalar_t lse = max_k + std::log(sumAll); + accscalar_t log_prob = (label >= 0 && label < classes) ? 
epilogue(static_cast(input[label])) : 0.f; + losses[blockIdx.x] = (lse - sum_k / total_classes) * smoothing - log_prob * (1 - smoothing); + max_log_sum_exp[blockIdx.x] = lse; + } +} + +template +__device__ __forceinline__ void +apply(scalar_t *gradInput, + scalar_t *logits, + outscalar_t *max_log_sum_exp, + outscalar_t *gradOutput, + int64_t *labels, + const float smoothing, + int classes, + const int total_classes) +{ + accscalar_t smooth_positives = 1.0 - smoothing; + accscalar_t smooth_negatives = smoothing / total_classes; + accscalar_t tmpGradOutput = gradOutput[blockIdx.x]; + int64_t label = labels[blockIdx.x]; + accscalar_t coeff = max_log_sum_exp[blockIdx.x]; + + int offset = threadIdx.x; + int last = classes % (ILP * blockDim.x); + + for (; offset < classes - last; offset += blockDim.x * ILP) { + accscalar_t tmpLogits[ILP]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) { + tmpLogits[j] = static_cast(logits[offset + j * blockDim.x]); + } + +#pragma unroll + for (int j = 0; j < ILP; ++j) + gradInput[offset + j * blockDim.x] = tmpGradOutput * ( + std::exp(tmpLogits[j] - coeff) - static_cast( + (offset + j * blockDim.x == label) ? 1 : 0) * + smooth_positives - smooth_negatives); + } + + for (; offset < classes; offset += blockDim.x) + gradInput[offset] = tmpGradOutput * (std::exp( + static_cast(logits[offset]) - coeff) - + static_cast((offset == label) ? 1 : 0) * + smooth_positives - smooth_negatives); +} + + +template +__device__ __forceinline__ void +aligned_apply(int shift, + scalar_t *gradInput, + scalar_t *logits, + outscalar_t *max_log_sum_exp, + outscalar_t *gradOutput, + int64_t *labels, + const float smoothing, + int classes, + const int total_classes) +{ + accscalar_t smooth_positives = 1.0 - smoothing; + accscalar_t smooth_negatives = smoothing / total_classes; + accscalar_t tmpGradOutput = gradOutput[blockIdx.x]; + int64_t label = labels[blockIdx.x]; + accscalar_t coeff = max_log_sum_exp[blockIdx.x]; + + int offset = threadIdx.x; + + // shift and do 1 + if(shift > 0){ + logits -= shift; + gradInput -= shift; + classes += shift; + if(threadIdx.x >= shift){ + gradInput[offset] = tmpGradOutput * (std::exp( + static_cast(logits[offset]) - coeff) - + static_cast(((offset - shift) == label) ? 1 : 0) * + smooth_positives - smooth_negatives); + } + classes -= blockDim.x; + gradInput += blockDim.x; + logits += blockDim.x; + shift -= blockDim.x; + } + + int last = classes % (ILP * blockDim.x); + + typedef typename std::aligned_storage::type LoadT; + // input + scalar_t v[ILP]; + LoadT* value = reinterpret_cast(&v); + // output + scalar_t r[ILP]; + LoadT* result = reinterpret_cast(&r); + + for (; offset * ILP < (classes - last); offset += blockDim.x) { + *value = reinterpret_cast(logits)[offset]; + +#pragma unroll + for (int j = 0; j < ILP; ++j) { + r[j] = tmpGradOutput * (std::exp( + static_cast(v[j]) - coeff) - + static_cast(((ILP * offset + j - shift) == label) ? 1 : 0) * + smooth_positives - smooth_negatives); + } + reinterpret_cast(gradInput)[offset] = *result; + } + + offset = classes - last + threadIdx.x; + for (; offset < classes; offset += blockDim.x) + gradInput[offset] = tmpGradOutput * (std::exp( + static_cast(logits[offset]) - coeff) - + static_cast(((offset - shift) == label) ? 
1 : 0) * + smooth_positives - smooth_negatives); + +} + +template class Epilogue> +__global__ void +cunn_SoftMaxXEntropyBackward( + scalar_t *gradInput, + scalar_t *logits, + outscalar_t *max_log_sum_exp, + outscalar_t *gradOutput, + int64_t *labels, + const float smoothing, + int classes, + const int total_classes) +{ + gradInput += blockIdx.x * classes; + logits += blockIdx.x * classes; + + // Do vectorized load/store when input/output have same alignment + const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t); + const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t); + if (shift == shift_){ + aligned_apply(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes, total_classes <= 0 ? classes : total_classes); + } + else { + apply(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes, total_classes <= 0 ? classes : total_classes); + } + +} + +template class Epilogue> +std::vector host_softmax_xentropy( + const Tensor & input_, + const Tensor & labels_, + const float smoothing, + const int total_classes) { + // For tensor parallel cross entropy with smoothing, we want to pass in the total number + // of classes so that smoothing can be applied correctly. If total_classes=-1, use the + // last dimension of the input tensor. + AT_ASSERTM(labels_.scalar_type() == ScalarType::Long,"Label type should be CUDA Long"); + + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)input_.get_device()}; + + auto input = input_.contiguous(); + Tensor max_log_sum_exp = at::empty_like(labels_, input.options().dtype(ScalarType::Float)); + Tensor losses = at::empty_like(labels_, input_.options().dtype(ScalarType::Float)); + + static_assert(std::is_same, float>::value || + std::is_same, double>::value, + "accscalar_t for half should be float or double"); + AT_ASSERTM(input.dim() == 2, "Currently only 2 dim input supported"); + AT_ASSERTM(labels_.dim() == 1, "Labels should be 1 dimensional"); + AT_ASSERTM(input.size(0) == labels_.size(0), "Input and label should have same number of examples"); + AT_ASSERTM(input.numel() > 0, "Number of classes in input should not be 0"); + + const int64_t dim = 1; + int64_t outer_size = 1; + int64_t dim_size = input.size(dim); + int64_t inner_size = 1; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + for (int64_t i = 0; i < dim; ++i) + outer_size *= input.size(i); + for (int64_t i = dim + 1; i < input.dim(); ++i) + inner_size *= input.size(i); + // This kernel spawns a block per each element in the batch. + // XXX: it assumes that inner_size == 1 + TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported"); + + dim3 grid(outer_size); + + using namespace at; + DISPATCH_FLOAT_AND_HALF_AND_BF16(input.scalar_type(), 0, "host_softmax_xentropy", + using accscalar_t = at::acc_type; + const int ILP = sizeof(float4)/sizeof(scalar_t_0); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxXEntropyForward + <<>>( + losses.data_ptr(), max_log_sum_exp.data_ptr(), + input.data_ptr(), labels_.data_ptr(), + dim_size, smoothing, total_classes <= 0 ? 
dim_size : total_classes + ); + ); + + C10_CUDA_CHECK(cudaGetLastError()); + + std::vector ret = {losses, max_log_sum_exp}; + return ret; +} + +template class Epilogue> +Tensor host_softmax_xentropy_backward( + const at::Tensor &grad_loss, + at::Tensor &logits_, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + bool inplace, + const int total_classes) { + // Otherwise the kernel will be launched from cuda:0 device + // Cast to char to avoid compiler warning about narrowing + at::cuda::CUDAGuard device_guard{(char)grad_loss.get_device()}; + + const int64_t dim = 1; + Tensor gI = inplace ? logits_ : at::empty_like(logits_); + if (grad_loss.numel() == 0) { + return gI; + } + + auto grad = grad_loss.contiguous(); + auto logits = logits_.contiguous(); + + static_assert(std::is_same, float>::value || + std::is_same, double>::value, + "accscalar_t for half should be float or double"); + if (grad.dim() == 0) grad = grad.view(1); + + AT_ASSERTM(logits_.dim() == 2, "Currently only 2 dim input supported"); + AT_ASSERTM(labels.dim() == 1, "Labels should be 1 dimensional"); + AT_ASSERTM(logits_.numel() > 0, "Number of classes in input should not be 0"); + AT_ASSERTM(logits_.size(0) == labels.size(0), "Input and label should have same number of examples"); + AT_ASSERTM(labels.size(0) == grad.size(0), "Label and loss should have same number of examples"); + + int64_t outer_size = 1; + int64_t dim_size = logits.size(dim); + int64_t inner_size = 1; + for (int64_t i = 0; i < dim; ++i) + outer_size *= logits.size(i); + for (int64_t i = dim + 1; i < logits.dim(); ++i) + inner_size *= logits.size(i); + // See descriptions of kernels above. + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported"); + + dim3 grid(outer_size); + + DISPATCH_FLOAT_AND_HALF_AND_BF16(gI.scalar_type(), 0, "host_softmax_xentropy_backward", + using accscalar_t = acc_type; + const int ILP = sizeof(float4)/sizeof(scalar_t_0); + dim3 block = SoftMax_getBlockSize(ILP, dim_size); + cunn_SoftMaxXEntropyBackward + <<>>( + gI.data_ptr(), logits.data_ptr(), + max_log_sum_exp.data_ptr(), + grad.data_ptr(), labels.data_ptr(), + smoothing, dim_size, total_classes + ); + ); + + C10_CUDA_CHECK(cudaGetLastError()); + return gI; +} + +std::vector softmax_xentropy_cuda(const Tensor &input, const Tensor &labels, const float smoothing, const int total_classes){ + return host_softmax_xentropy(input, labels, smoothing, total_classes); +} + +at::Tensor softmax_xentropy_backward_cuda( + const at::Tensor &grad_loss, + at::Tensor &logits, + const at::Tensor &max_log_sum_exp, + const at::Tensor &labels, + const float smoothing, + const bool inplace, + const int total_classes) { + AT_ASSERTM((grad_loss.scalar_type() == ScalarType::Float), "expected grad types to be at::Float"); + return host_softmax_xentropy_backward(grad_loss, logits, max_log_sum_exp, labels, smoothing, inplace, total_classes); +}
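For reference, the per-example loss produced by cunn_SoftMaxXEntropyForward can be reproduced in a few lines of PyTorch when total_classes equals the local class count (i.e. no tensor parallelism). This is an illustrative sketch for sanity-checking the CUDA path, not part of the extension; the function name is ours.

    import torch

    def smoothed_xentropy_reference(logits, labels, smoothing=0.1):
        # Matches the kernel's formula: (1 - smoothing) * (lse - x[label]) + smoothing * (lse - mean(x)),
        # where lse = max + log(sum(exp(x - max))) is also returned for the backward pass.
        # (The kernel additionally treats out-of-range labels as log_prob = 0.)
        lse = torch.logsumexp(logits.float(), dim=-1)
        nll = lse - logits.float().gather(-1, labels[:, None]).squeeze(-1)
        smooth = lse - logits.float().mean(dim=-1)
        return (1 - smoothing) * nll + smoothing * smooth, lse

torch.nn.functional.cross_entropy(logits, labels, label_smoothing=smoothing, reduction="none") should yield the same per-example losses, which makes it a convenient cross-check against softmax_xentropy_cuda.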