# Copyright (c) 2024 EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Transformer."""

import math
from contextlib import nullcontext

import torch
import torch.nn.functional as F
import torch.nn as nn
from pkg_resources import packaging
from importlib.metadata import version

from .norms import get_norm
from megatron import mpu
from megatron.model import megablocks_utils
from megatron.model.fused_softmax import FusedScaleMaskSoftmax
from megatron.model.activations import get_activation
from megatron.model.utils import exists, get_fusion_type
from megatron.model.positional_embeddings import (
    RotaryEmbedding,
    apply_rotary_pos_emb_torch,
    apply_rotary_pos_emb,
    AliBi,
)
from megatron.model.fused_rope import (
    FusedRoPEFunc,
    fused_apply_rotary_pos_emb_cached,
)
from megatron.model.fused_bias_dropout import (
    get_bias_dropout_add,
    bias_dropout_add_fused_train,
    bias_dropout_add_fused_inference,
)
from megatron.model.utils import configure_sparse_attention
from deepspeed.moe.layer import MoE

try:
    from flash_attn.ops.activations import swiglu
except ImportError:
    swiglu = None

# flags required to enable jit fusion kernels
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)

""" We use the following notation throughout this file:
     h: hidden size
     n: number of attention heads
     kv: number of key or value heads
     p: number of model parallel partitions
     np: n/p
     kvp: kv/p
     hp: h/p
     hn: h/n
     b: batch size
     s: sequence length
     l: number of layers
    Transformer takes input of size [s, b, h] and returns a
    tensor of the same size. We use the following arguments:
        hyperparameters: transformer hyperparameters
        attention_mask_func: a function that takes `unmasked-attention-scores`
            with size [b, np, s, s] and an `attention-mask` and will apply
            the masking. The function should return a masked score of the
            same size [b, np, s, s].
               masked-attention-scores = attention_mask_func(
                                     unmasked-attention-scores, attention-mask)
"""


class ParallelMLP(nn.Module):
    """MLP.

    MLP will take the input with h hidden state, project it to the
    intermediate dimension (4*h by default, or as set by intermediate_size /
    expansion_factor), apply the nonlinear transformation, and project the
    state back into h hidden dimension. Dropout is not applied here; it is
    handled by the surrounding transformer layer.
    """

    def __init__(
        self,
        neox_args,
        init_method,
        output_layer_init_method,
        parallel_output=False,
        multiple_of=256,
        MOE=False,
        MoE_mp_size=1,
    ):
        super().__init__()
        assert (
            neox_args.intermediate_size is None or neox_args.expansion_factor is None
        ), "Must pass either the absolute intermediate size or the relative expansion factor for the MLP projections"

        self.activation_func, self.is_gated = get_activation(neox_args)
        self.activation_type = neox_args.activation
        self.bias_gelu_fusion = neox_args.bias_gelu_fusion
        self.multiple_of = multiple_of

        if neox_args.intermediate_size:
            ffn_dim = neox_args.intermediate_size
        elif neox_args.expansion_factor:
            ffn_dim = int(neox_args.expansion_factor * neox_args.hidden_size)
        else:
            # 4h is default for ffn_dim
            ffn_dim = 4 * neox_args.hidden_size
        ffn_dim_in = ffn_dim
        if self.is_gated:
            # set activation function to be gated implementation
            self.activation_func = Gated_Activation(
                self.activation_func,
                (swiglu is not None)
                and (neox_args.activation == "swiglu")
                and neox_args.use_flashattn_swiglu,
            )
            # auto-scale so the gated activation has roughly the same parameter count
            ffn_dim = int(ffn_dim * 2 / 3)
            ffn_dim_in = ffn_dim // 2
        # set multiple
        ffn_dim = int(
            (2 * self.multiple_of)
            * ((ffn_dim + (2 * multiple_of) - 1) // (2 * multiple_of))
        )
        ffn_dim_in = int(
            self.multiple_of * ((ffn_dim_in + multiple_of - 1) // multiple_of)
        )
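        # Worked example (a sketch, assuming hidden_size=4096, activation="swiglu",
        # multiple_of=256): ffn_dim starts at 4*4096 = 16384, the gated rescale gives
        # int(16384 * 2 / 3) = 10922 with ffn_dim_in = 5461, and rounding to multiples
        # yields ffn_dim = 11264 (two gated halves of 5632) and ffn_dim_in = 5632.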

        self.linear1 = mpu.ColumnParallelLinear(
            neox_args=neox_args,
            input_size=neox_args.hidden_size,
            output_size=ffn_dim,
            gather_output=False,
            init_method=init_method,
            skip_bias_add=True,
            MOE=MOE,
            MoE_mp_size=MoE_mp_size,
            bias=neox_args.use_bias_in_mlp,
        )
        # Project back to h.
        self.linear2 = mpu.RowParallelLinear(
            neox_args=neox_args,
            input_size=ffn_dim_in,
            output_size=neox_args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method,
            parallel_output=parallel_output,
            skip_bias_add=True,
            MOE=MOE,
            MoE_mp_size=MoE_mp_size,
            bias=neox_args.use_bias_in_mlp,
        )
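
        # Together, linear1 (column-parallel) and linear2 (row-parallel) form the
        # standard Megatron MLP pattern: the intermediate activation stays sharded
        # across tensor-parallel ranks, and only linear2's output needs a reduction.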

    def forward(self, hidden_states):
        # [s, b, intermediate_size]
        intermediate_parallel, bias_parallel = self.linear1(hidden_states)

        if self.is_gated or (self.activation_type == "gelu" and self.bias_gelu_fusion):
            intermediate_parallel = self.activation_func(
                intermediate_parallel, bias_parallel
            )
        else:
            intermediate_parallel = self.activation_func(
                intermediate_parallel + bias_parallel
            )

        # [s, b, h]
        output, output_bias = self.linear2(intermediate_parallel)
        return output, output_bias


class Gated_Activation(torch.nn.Module):
    def __init__(self, activation_func, use_swiglu=False):
        super().__init__()
        self.activation_func = activation_func
        self.use_swiglu = use_swiglu

    def forward(self, x, bias=None):
        x, gate = x.chunk(2, dim=-1)
        if bias is not None:
            bias_1, bias_2 = bias.chunk(2, dim=-1)
            x = x + bias_1
            gate = gate + bias_2
        if not self.use_swiglu:
            intermediate_parallel = self.activation_func(gate)
            return intermediate_parallel * x
        else:
            return swiglu(gate, x)
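
    # Note: when use_swiglu is set, the fused flash-attn swiglu op is expected to
    # produce the same result as the unfused branch above with a SiLU activation
    # (i.e. roughly act(gate) * x); this is an assumption about the fused kernel,
    # not something enforced here.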


class ParallelLinear(nn.Module):
    """
    A Parallel Linear Layer transforming the transformer outputs from hidden_size -> vocab_size
    """

    def __init__(
        self,
        neox_args,
        parallel_output=True,
        init_method=nn.init.xavier_normal_,
        is_last_layer=False,
    ):
        super().__init__()
        self.is_rm = neox_args.train_impl == "rm"
        parallelism = neox_args.output_layer_parallelism if not self.is_rm else "row"
        if parallelism == "column":
            self.final_linear = mpu.ColumnParallelLinear(
                neox_args=neox_args,
                input_size=neox_args.hidden_size,
                output_size=neox_args.padded_vocab_size,
                bias=False,
                init_method=init_method,
                gather_output=not parallel_output,
                skip_bias_add=False,
                mup_rescale_parameters=is_last_layer,  # rescale params only called if neox_args.use_mup = True, despite it not being included here
                seq_dim=1,  # important: must mark that this layer receives shape [b, s, h] not [s, b, h] and so Seq. Parallel comms must gather along dim=1 rather than dim=0
            )
        else:
            if not self.is_rm:
                print(
                    'ERROR: Output layer parallelism over the hidden dim is currently broken (https://github.com/EleutherAI/gpt-neox/issues/905). Please run with output_layer_parallelism = "column" until this issue is fixed.'
                )
                exit()
                # self.final_linear = mpu.RowParallelLinear(
                #     neox_args=neox_args,
                #     input_size=neox_args.hidden_size,
                #     output_size=neox_args.padded_vocab_size,
                #     bias=False,
                #     input_is_parallel=False,
                #     init_method=init_method,
                #     parallel_output=parallel_output,
                #     skip_bias_add=False,
                #     mup_rescale_parameters=is_last_layer,  # only called if neox_args.use_mup = True, despite it not being included here
                # )
            else:  # Not using cross entropy loss for RMs
                self.rm_linear = mpu.RowParallelLinear(
                    neox_args=neox_args,
                    input_size=neox_args.hidden_size,
                    output_size=1,
                    bias=False,
                    input_is_parallel=False,
                    init_method=init_method,
                    parallel_output=False,
                    skip_bias_add=False,
                    mup_rescale_parameters=is_last_layer,  # only called if neox_args.use_mup = True, despite it not being included here
                )

    def forward(self, hidden_states):
        if not self.is_rm:
            return self.final_linear(hidden_states)
        else:
            return self.rm_linear(hidden_states)


class _MegablocksAdapter(nn.Module):
    def __init__(
        self, neox_args, layer_cls, init_method, output_layer_init_method, ep_group
    ):
        super().__init__()
        megablocks_utils.assert_megablocks_is_available()
        args = megablocks_utils.as_megablocks_args(neox_args)
        args.device = torch.cuda.current_device()
        args.init_method = init_method
        args.output_layer_init_method = output_layer_init_method

        # NOTE: Shard the MoE layers over the data parallel group. Expert
        # parallel sharding and data parallel sharding could be decoupled
        # by extending the optimizer to handle data parallel reductions for
        # MoE and non-MoE parameters separately.
        if args.moe_expert_model_parallelism:
            args.expert_parallel_group = ep_group

        self.moe = layer_cls(args)

    def forward(self, x):
        return self.moe.forward(x)


class MbMoE(_MegablocksAdapter):
    def __init__(self, neox_args, init_method, output_layer_init_method, ep_group):
        super().__init__(
            neox_args,
            megablocks_utils.moe.MoE,
            init_method,
            output_layer_init_method,
            ep_group,
        )


class dMoE(_MegablocksAdapter):
    def __init__(self, neox_args, init_method, output_layer_init_method, ep_group):
        super().__init__(
            neox_args,
            megablocks_utils.dmoe.dMoE,
            init_method,
            output_layer_init_method,
            ep_group,
        )


class ParallelSelfAttention(nn.Module):
    """Parallel self-attention layer abstract class.

    Self-attention layer takes input with size [b, s, h]
    and returns output of the same size.
    """

    def __init__(
        self,
        neox_args,
        attention_mask_func,
        init_method,
        output_layer_init_method,
        layer_number,
        rpe=None,
        rotary=False,
        use_cache=False,
        parallel_output=False,
    ):
        super().__init__()

        self.fp16 = neox_args.precision == "fp16"
        self.bf16 = neox_args.precision == "bfloat16"
        self.attention_mask_func = attention_mask_func
        self.apply_query_key_layer_scaling = neox_args.apply_query_key_layer_scaling
        self.use_cache = use_cache
        self.attention_softmax_in_fp32 = neox_args.attention_softmax_in_fp32
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = layer_number
        # Per attention head and per partition values.
        world_size = mpu.get_model_parallel_world_size()
        self.hidden_size_per_partition = mpu.divide(neox_args.hidden_size, world_size)
        self.hidden_size_per_attention_head = mpu.divide(
            neox_args.hidden_size, neox_args.num_attention_heads
        )
        self.num_attention_heads_per_partition = mpu.divide(
            neox_args.num_attention_heads, world_size
        )
        self.pos_emb = neox_args.pos_emb

        self.use_qk_layernorm = neox_args.use_qk_layernorm
        if self.use_qk_layernorm:
            norm, eps = get_norm(neox_args)
            self.qk_layernorm = norm(
                [
                    self.num_attention_heads_per_partition,
                    self.hidden_size_per_attention_head,
                ],
                eps=eps,
            )

        self.sliding_window_width = neox_args.sliding_window_width

        if (
            not neox_args.num_kv_heads
            or neox_args.num_kv_heads == neox_args.num_attention_heads
        ):
            self.gqa = False
        else:
            self.gqa = True
        if self.gqa:
            self.num_kv_heads_per_partition = mpu.divide(
                neox_args.num_kv_heads, world_size
            )  # we do not yet clone KV heads in MQA across TP ranks...
            self.kv_hidden_size = (
                neox_args.num_kv_heads * self.hidden_size_per_attention_head
            )  # how large the total hidden dim for each of K and V is
        else:
            self.num_kv_heads_per_partition = self.num_attention_heads_per_partition
            self.kv_hidden_size = neox_args.hidden_size

        if not self.gqa:
            # Strided linear layer.
            self.query_key_value = mpu.ColumnParallelLinear(
                neox_args=neox_args,
                input_size=neox_args.hidden_size,
                output_size=3 * neox_args.hidden_size,
                gather_output=False,
                init_method=init_method,
                bias=neox_args.use_bias_in_attn_linear,
            )
        else:
            # QKV proj is smaller if we are using GQA / MQA
            self.query_key_value = mpu.ColumnParallelLinear(
                neox_args=neox_args,
                input_size=neox_args.hidden_size,
                output_size=neox_args.hidden_size + 2 * self.kv_hidden_size,
                gather_output=False,
                init_method=init_method,
                bias=neox_args.use_bias_in_attn_linear,
            )

        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            coeff = max(1, self.layer_number)
            self.norm_factor *= coeff

        if neox_args.use_mup:
            self.norm_factor = self.hidden_size_per_attention_head
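        # e.g. with a head dim of 64, attention scores are divided by sqrt(64) = 8
        # in the standard case, or by 64 when use_mup is set (muP-style 1/d scaling).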

        self.rpe = rpe

        if self.pos_emb == "alibi":
            self.alibi_embed = AliBi(
                neox_args.num_attention_heads,
                neox_args.model_parallel_size,
                mpu.get_model_parallel_rank(),
            )

        # TODO: this arg shouldn't need to be passed in - get from neox_args
        if rotary:
            if neox_args.rotary_pct == 1:
                self.rotary_ndims = None
            else:
                assert neox_args.rotary_pct < 1
                self.rotary_ndims = int(
                    self.hidden_size_per_attention_head * neox_args.rotary_pct
                )
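                # e.g. with a head dim of 128 and rotary_pct=0.25, rotary_ndims=32:
                # only the first 32 dims of each head get rotary embeddings, the
                # rest pass through unchanged (see the query_pass/key_pass split
                # in forward()).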
            dim = (
                self.rotary_ndims
                if self.rotary_ndims is not None
                else self.hidden_size_per_attention_head
            )
            self.rotary_emb = RotaryEmbedding(
                dim,
                base=neox_args.rotary_emb_base,
                max_seq_len=neox_args.seq_length,
                precision=neox_args.params_dtype,
                save_inv_freqs=neox_args.rotary_save_freqs_buffer,
            )
        else:
            self.rotary_emb = None

        self.rope_fusion = neox_args.rope_fusion
        self.attention_type = neox_args.attention_config[layer_number]
        self.use_flash_attention = self.attention_type == "flash"
        self.use_triton = (
            self.use_flash_attention
            and self.pos_emb == "alibi"
            and (
                not packaging.version.Version(version("flash-attn"))
                >= packaging.version.Version("2.4.0.post1")
            )
        )
        self.sparse = self.attention_type not in ("global", "flash")

        if self.gqa:
            assert not self.sparse

        if self.sparse:
            self.sparse_attn = configure_sparse_attention(
                neox_args,
                self.attention_type,
                self.num_attention_heads_per_partition,
                mpu=mpu,
            )
        else:
            if self.use_flash_attention:
                # we now use Flash Attention 2's provided interface.
                # TODO: we no longer need to use flash_triton_fn since flash cuda supports alibi.
                # consider adding OpenAI's more recent Flash-2 Triton kernel in future
                # from https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py
                from flash_attn.flash_attn_interface import (
                    flash_attn_func,
                    flash_attn_varlen_func,
                )
                from flash_attn.flash_attn_triton import (
                    flash_attn_func as flash_attn_unpadded_unpacked_func_triton,
                )

                self.flash_triton_fn = flash_attn_unpadded_unpacked_func_triton
                self.flash_qkv_fn = flash_attn_func
                self.flash_varlen_qkv_fn = flash_attn_varlen_func
            else:
                self.scale_mask_softmax = FusedScaleMaskSoftmax(
                    input_in_fp16=self.fp16,
                    input_in_bf16=self.bf16,
                    fusion_type=get_fusion_type(neox_args),
                    mask_func=self.attention_mask_func,
                    softmax_in_fp32=self.attention_softmax_in_fp32,
                    scale=coeff,
                )

            # Dropout. Note that for a single iteration, this layer will generate
            # different outputs on different numbers of parallel partitions, but
            # on average it should not be partition dependent.
            self.dropout_p = neox_args.attention_dropout
            self.attention_dropout = nn.Dropout(self.dropout_p)

        # Output.
        self.dense = mpu.RowParallelLinear(
            neox_args=neox_args,
            input_size=neox_args.hidden_size,
            output_size=neox_args.hidden_size,
            input_is_parallel=True,
            init_method=output_layer_init_method,
            skip_bias_add=True,
            parallel_output=parallel_output,
            bias=neox_args.use_bias_in_attn_linear,
        )

    def attention(
        self, query_layer, key_layer, value_layer, layer_past, attention_mask
    ):
        # ===================================
        # Raw attention scores. [b, np, s, s]
        # ===================================

        # [b, np, sq, sk]
        output_size = (
            query_layer.size(1),
            query_layer.size(2),
            query_layer.size(0),
            key_layer.size(0),
        )
        # [sq, b, np, hn] -> [sq, b * np, hn]
        query_layer = query_layer.view(
            output_size[2], output_size[0] * output_size[1], -1
        )
        key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
        # preallocating result tensor: [b * np, sq, sk]
        matmul_result = torch.empty(
            output_size[0] * output_size[1],
            output_size[2],
            output_size[3],
            dtype=query_layer.dtype,
            device=torch.cuda.current_device(),
        )

        # Raw attention scores. [b * np, sq, sk]
        matmul_result = torch.baddbmm(
            matmul_result,
            query_layer.transpose(0, 1),  # [b * np, sq, hn]
            key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
            beta=0.0,
            alpha=(1.0 / self.norm_factor),
        )

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)
        # ==================================================
        # Update attention mask for inference. [b, np, sq, sk]
        # ==================================================

        if self.use_cache:
            with torch.no_grad():
                attention_mask = attention_mask[
                    ..., : attention_scores.size(3), : attention_scores.size(3)
                ]

        # ===========================
        # Attention probs and dropout
        # ===========================

        if exists(self.rpe):
            rpe = self.rpe(query_layer.size(0), key_layer.size(0))
            attention_scores += rpe  # [1, np, sq, sk]

        if self.pos_emb == "alibi":
            attention_scores = self.alibi_embed(attention_scores)

        # attention scores and attention mask [b, np, sq, sk]
        attention_probs = self.scale_mask_softmax(attention_scores, attention_mask)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        with mpu.get_cuda_rng_tracker().fork():
            attention_probs = self.attention_dropout(attention_probs)

        # =========================
        # Context layer. [sq, b, hp]
        # =========================

        # value_layer -> context layer.
        # [sk, b, np, hn] --> [b, np, sq, hn]

        # context layer shape: [b, np, sq, hn]
        output_size = (
            value_layer.size(1),
            value_layer.size(2),
            query_layer.size(0),
            value_layer.size(3),
        )

        # change view [sk, b * np, hn]
        value_layer = value_layer.view(
            value_layer.size(0), output_size[0] * output_size[1], -1
        )

        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(
            output_size[0] * output_size[1], output_size[2], -1
        )

        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))

        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*output_size)
        return context_layer

    def flash_attention(self, query_layer, key_layer, value_layer):
        # [b, np, sq, sk]
        output_size = (
            query_layer.size(1),
            query_layer.size(2),
            query_layer.size(0),
            key_layer.size(0),
        )

        if self.use_flash_attention and not self.use_triton:

            # [sk, b, (np or kvp), hn] -> [b, sk, kvp, hn]
            key_layer = key_layer.transpose(0, 1).reshape(
                output_size[0], output_size[3], self.num_kv_heads_per_partition, -1
            )
            value_layer = value_layer.transpose(0, 1).reshape(
                output_size[0], output_size[3], self.num_kv_heads_per_partition, -1
            )

            # [sq, b, np, hn] -> [b, sq, np, hn]
            query_layer = query_layer.transpose(0, 1).reshape(
                output_size[0], output_size[2], output_size[1], -1
            )

            # only pass in window_size or alibi_slopes kwarg
            # if we use Sliding Window Attention / AliBi.
            # Flash attn defaults to (-1,-1), or
            # does not have this kwarg prior to v2.3.0
            extra_kwargs = (
                {"window_size": (self.sliding_window_width, -1)}
                if self.sliding_window_width is not None
                else {}
            )
            if self.pos_emb == "alibi":
                extra_kwargs["alibi_slopes"] = self.alibi_embed.slopes.to(
                    query_layer.device
                ).to(torch.float32)

            if not self.training:
                batch_size = output_size[0]
                max_seqlen_q = output_size[2]
                max_seqlen_k = output_size[3]

                cu_seqlens_q = torch.arange(
                    0,
                    (batch_size + 1) * max_seqlen_q,
                    step=max_seqlen_q,
                    dtype=torch.int32,
                    device=query_layer.device,
                )

                cu_seqlens_k = torch.arange(
                    0,
                    (batch_size + 1) * max_seqlen_k,
                    step=max_seqlen_k,
                    dtype=torch.int32,
                    device=key_layer.device,
                )
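                # e.g. batch_size=2 and max_seqlen_k=1024 gives cu_seqlens_k = [0, 1024, 2048]:
                # cumulative sequence boundaries for the varlen interface over the
                # flattened (batch * seqlen) dimension.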

                q_shape = query_layer.shape
                k_shape = key_layer.shape
                v_shape = value_layer.shape
                is_causal = max_seqlen_q == max_seqlen_k
                output = self.flash_varlen_qkv_fn(
                    query_layer.reshape(
                        (q_shape[0] * q_shape[1], q_shape[2], q_shape[3])
                    ),
                    key_layer.reshape(
                        (k_shape[0] * k_shape[1], k_shape[2], k_shape[3])
                    ),
                    value_layer.reshape(
                        (v_shape[0] * v_shape[1], v_shape[2], v_shape[3])
                    ),
                    cu_seqlens_q,
                    cu_seqlens_k,
                    max_seqlen_q,
                    max_seqlen_k,
                    softmax_scale=None,
                    causal=is_causal,
                    **extra_kwargs,
                )
                output = output.reshape(q_shape)
            else:
                output = self.flash_qkv_fn(
                    query_layer,
                    key_layer,
                    value_layer,
                    self.dropout_p if self.training else 0.0,
                    softmax_scale=None,
                    causal=True,
                    **extra_kwargs,
                )

            matmul_result = output
            # [b, sq, np, hn] -> [b, np, sq, hn]
            matmul_result = matmul_result.transpose(1, 2)

        else:
            # we still use Triton if using AliBi with flash-attn<2.4.0.post1.

            # [sq, b, np, hn] -> [b, sq, np, hn]
            sq = query_layer.size(0)
            b = query_layer.size(1)
            sk = key_layer.size(0)

            query_layer = query_layer.transpose(0, 1)
            key_layer = key_layer.transpose(0, 1)
            value_layer = value_layer.transpose(0, 1)

            bias = self.alibi_embed.bias(sq, sk, query_layer.device, query_layer.dtype)
            bias = bias.unsqueeze(0).tile((b, 1, 1, 1))

            matmul_result = self.flash_triton_fn(
                query_layer, key_layer, value_layer, bias=bias, causal=True
            )
            matmul_result = matmul_result.transpose(1, 2)

        return matmul_result

    def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask):
        # TODO: sparse attn dropout?
        # TODO: pad to block size
        # shape of q/k/v is [sq, b, np, hn] and needs to be transposed to [b, np, sq, hn]
        query_layer, key_layer, value_layer = map(
            lambda t: t.permute(1, 2, 0, 3).contiguous(),
            (query_layer, key_layer, value_layer),
        )
        # output shape [b, np(heads), sq, hn]
        attn_mask = attention_mask.to(query_layer.dtype) * -10000
        if exists(self.rpe):
            rpe = self.rpe(query_layer.size(0), key_layer.size(0))
        else:
            rpe = None
        return self.sparse_attn(
            query_layer, key_layer, value_layer, attn_mask=attn_mask, rpe=rpe
        )

    def gqa_project(self, hidden_states, attention_mask, layer_past=None):
        # QKV projection and separation into separate Q/K/V layers for GQA,
        # where KV projections may be smaller than Q projection.
        # the logic for this is explained in comments of this function
        # detailing the intermediate sizes of tensors at each reshape.
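        # Worked example (a sketch): with np=8 query heads, kvp=2 KV heads per
        # partition and hn=128, the projection output's last dim is (8 + 2*2) * 128
        # = 1536. It is first viewed as [sq, b, 8, 192] (192 = 128 * (1 + 2*2/8)),
        # then split along the last dim into sizes (128, 32, 32), and finally K and
        # V are reshaped from [sq, b, 8, 32] back to [sq, b, 2, 128].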

        # pass through projection: [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)]
        mixed_x_layer, _ = self.query_key_value(hidden_states)

        # First: reshape so that seqlen, batch, and the number of query heads are
        # separate dims. The final dim is not exactly the head dim: its first
        # (head dim) entries belong to the query head, and the remaining entries
        # (2 * head dim * kv-to-q head ratio) hold the K/V heads (treated for now
        # as if they were the same number of heads with a smaller head dim).

        # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))]
        new_qkv_shape = (
            mixed_x_layer.shape[0],
            mixed_x_layer.shape[1],
            self.num_attention_heads_per_partition,
            int(
                self.hidden_size_per_attention_head
                * (
                    1
                    + 2
                    * (
                        self.num_kv_heads_per_partition
                        / self.num_attention_heads_per_partition
                    )
                )
            ),
        )
        mixed_x_layer = mixed_x_layer.reshape(*new_qkv_shape)

        # Next: split the combined last dim so that the first (head dim) entries go
        # to Q, and the two remaining chunks of (head dim * kv-to-q head ratio) go
        # to K and V respectively.
        split_sizes = (
            self.hidden_size_per_attention_head,
            int(
                (
                    self.num_kv_heads_per_partition
                    / self.num_attention_heads_per_partition
                )
                * self.hidden_size_per_attention_head
            ),
            int(
                (
                    self.num_kv_heads_per_partition
                    / self.num_attention_heads_per_partition
                )
                * self.hidden_size_per_attention_head
            ),
        )

        # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))]
        (query_layer, key_layer, value_layer) = [
            x.contiguous()
            for x in torch.split(
                mixed_x_layer,
                split_sizes,
                dim=mixed_x_layer.dim() - 1,
            )
        ]

        # reshape K/V to proper output shape (last dim = correct full "real" head size again)
        # 2 x [sq, b, np, (hn * (kvp / np))] --> 2 x [sq, b, kvp, hn]
        new_kv_shape = (
            key_layer.size(0),
            key_layer.size(1),
            self.num_kv_heads_per_partition,
            self.hidden_size_per_attention_head,
        )

        key_layer = key_layer.view(*new_kv_shape)

        value_layer = value_layer.view(*new_kv_shape)

        # if not using Flash attention, we repeat K/V heads to match Q head counts
        if not self.use_flash_attention:
            key_layer = torch.repeat_interleave(
                key_layer,
                repeats=int(
                    self.num_attention_heads_per_partition
                    // self.num_kv_heads_per_partition
                ),
                dim=2,
            )
            value_layer = torch.repeat_interleave(
                value_layer,
                repeats=int(
                    self.num_attention_heads_per_partition
                    // self.num_kv_heads_per_partition
                ),
                dim=2,
            )

        return query_layer, key_layer, value_layer

    def forward(self, hidden_states, attention_mask, layer_past=None):

        # hidden_states: [sq, b, h]

        # =====================
        # Query, Key, and Value
        # =====================

        if not self.gqa:
            # QKV projection for MHA.

            # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
            mixed_x_layer, _ = self.query_key_value(hidden_states)

            # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
            new_tensor_shape = mixed_x_layer.size()[:-1] + (
                self.num_attention_heads_per_partition,
                3 * self.hidden_size_per_attention_head,
            )
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
            (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim(
                mixed_x_layer, 3
            )
        else:
            # Grouped Query Attention (GQA) - specific logic for performing QKV proj
            # and separating out Q, K, and V outputs.

            # output shapes: 1 x [sq, b, np, hn], 2 x [sq, b, kvp, hn] if using flash
            query_layer, key_layer, value_layer = self.gqa_project(
                hidden_states, attention_mask, layer_past=layer_past
            )

        # QK Normalization https://arxiv.org/abs/2302.05442
        if self.use_qk_layernorm:
            query_layer = self.qk_layernorm(query_layer)
            key_layer = self.qk_layernorm(key_layer)

        if exists(self.rotary_emb):
            if exists(self.rotary_ndims):
                # partial rotary
                query_rot, query_pass = (
                    query_layer[..., : self.rotary_ndims],
                    query_layer[..., self.rotary_ndims :],
                )
                key_rot, key_pass = (
                    key_layer[..., : self.rotary_ndims],
                    key_layer[..., self.rotary_ndims :],
                )
            else:
                # full rotary
                query_rot, key_rot = query_layer, key_layer

            seq_len = key_layer.shape[0]
            offset = 0
            if exists(layer_past) and layer_past.numel() > 0:
                offset = layer_past[0].shape[0]
                seq_len += offset
            cos, sin = self.rotary_emb(value_layer, seq_len=seq_len)
            if self.rope_fusion:
                query_layer, key_layer = (
                    fused_apply_rotary_pos_emb_cached(rot, cos, sin)
                    for rot in [query_rot, key_rot]
                )
            else:
                if self.bf16:
                    apply_rotary_fn = apply_rotary_pos_emb_torch
                else:
                    apply_rotary_fn = apply_rotary_pos_emb
                query_layer, key_layer = apply_rotary_fn(
                    query_rot, key_rot, cos, sin, offset=offset
                )

            if exists(self.rotary_ndims):
                query_layer = torch.cat((query_layer, query_pass), dim=-1)
                key_layer = torch.cat((key_layer, key_pass), dim=-1)

        # ==================================
        # Cache key and value for inference
        # ==================================

        if exists(layer_past) and layer_past.numel() > 0:
            past_key, past_value = layer_past
            key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=0)
            value_layer = torch.cat(
                (past_value.type_as(value_layer), value_layer), dim=0
            )

        if self.use_cache:
            present = torch.stack((key_layer, value_layer))

        if self.use_flash_attention:
            context_layer = self.flash_attention(query_layer, key_layer, value_layer)
        elif not self.sparse:
            context_layer = self.attention(
                query_layer, key_layer, value_layer, layer_past, attention_mask
            )
        else:
            context_layer = self.sparse_attention(
                query_layer, key_layer, value_layer, attention_mask
            )

        # [b, np, sq, hn] --> [sq, b, np, hn]
        context_layer = context_layer.permute(2, 0, 1, 3).contiguous()

        # [sq, b, np, hn] --> [sq, b, hp]
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.hidden_size_per_partition,
        )
        context_layer = context_layer.view(*new_context_layer_shape)

        # =================
        # Output. [sq, b, h]
        # =================

        output, bias = self.dense(context_layer)

        if self.use_cache:
            output = [output, present]

        return output, bias


class ParallelTransformerLayer(nn.Module):
    """A single transformer layer.

    Transformer layer takes input with size [b, s, h] and returns an
    output of the same size.
    """

    def __init__(
        self,
        neox_args,
        attention_mask_func,
        init_method,
        output_layer_init_method,
        layer_number,
        rpe=None,
        rotary=False,
        use_cache=False,
    ):

        super().__init__()
        self.layer_number = layer_number
        self.neox_args = neox_args

        norm, eps = get_norm(neox_args)

        # Layernorm on the input data.
        self.input_layernorm = norm(neox_args.hidden_size, eps=eps)
        self.use_cache = use_cache

        self.hidden_dropout = neox_args.hidden_dropout
        self.bias_dropout_fusion = neox_args.bias_dropout_fusion
        self.gpt_j_residual = neox_args.gpt_j_residual
        self.gpt_j_tied = neox_args.gpt_j_tied
        self.moe_type = neox_args.moe_type
        self.activation = neox_args.activation

        if self.gpt_j_residual:
            # GPT-J style layers allow us to defer the reduction of results across TP ranks until the end of the two sublayers.
            # the reduction we use is a simple allreduce for pure Tensor Parallel,
            # but needs to be a reduce-scatter when using Megatron-style Sequence Parallel (LN sharding.)
            self.reduce = (
                mpu.mappings.reduce_from_model_parallel_region
                if not neox_args.sequence_parallel
                else mpu.mappings.reduce_scatter_to_sequence_parallel_region
            )

        # Self attention.
        self.attention = ParallelSelfAttention(
            neox_args=neox_args,
            attention_mask_func=attention_mask_func,
            init_method=init_method,
            output_layer_init_method=output_layer_init_method,
            layer_number=layer_number,
            rpe=rpe,
            use_cache=self.use_cache,
            rotary=rotary,
            parallel_output=self.gpt_j_residual,
        )

        # Layernorm on the output of the attention layer.
        # If GPT-J residuals are used, this is superfluous, but leaving it in
        # leads to cleaner code
        self.post_attention_layernorm = norm(neox_args.hidden_size, eps=eps)

        # MLP
        def get_mlp(**kw):
            return ParallelMLP(
                neox_args=neox_args,
                init_method=init_method,
                output_layer_init_method=output_layer_init_method,
                parallel_output=self.gpt_j_residual,
                multiple_of=neox_args.mlp_multiple_of,
                **kw,
            )

        self.num_experts = (
            neox_args.moe_num_experts
            if layer_number % neox_args.expert_interval == 0
            else 1
        )
        args = neox_args
        if self.num_experts <= 1:
            self.mlp = get_mlp()
        else:
            from torch import distributed as dist

            if self.num_experts > dist.get_world_size():
                moe_mp_size = 1
            else:
                moe_mp_size = dist.get_world_size() // self.num_experts

            if neox_args.moe_type == "deepspeed":
                self.mlp = MoE(
                    args.hidden_size,
                    get_mlp(
                        "regular",
                        MOE=True,
                        MoE_mp_size=moe_mp_size,
                    ),
                    num_experts=self.num_experts,
                    ep_size=args.moe_expert_parallel_size,
                    k=args.moe_top_k,
                    use_residual=args.moe_use_residual,
                    capacity_factor=args.moe_train_capacity_factor,
                    eval_capacity_factor=args.moe_eval_capacity_factor,
                    min_capacity=args.moe_min_capacity,
                    drop_tokens=args.moe_token_dropping,
                    use_tutel=args.use_tutel,
                    enable_expert_tensor_parallelism=args.enable_expert_tensor_parallelism,
                )
            elif neox_args.moe_type == "megablocks":

                def integrate_megablocks_with_ds_expert_parallelism():
                    # We make megablocks work with DS parallelism.
                    #
                    # We fool DS into accepting these MoE parameters as its own DS MoE params,
                    # which makes things work with the underlying expert parallelism,
                    # including TED parallelism.
                    #
                    # Effectively, we want to:
                    #
                    # - Make DS's data parallel gradient all-reduction skip these params.
                    # - But make these params participate in the expert parallel all-reduction!
                    #
                    # Further background:
                    #
                    # Normally, with the original megablocks demo codebase, it
                    # only supports 1 copy of any expert throughout
                    # the network, since it uses EP group = DP group.
                    #
                    # First, we trigger DS initialization of the MoE expert parallel groups and internal state.
                    throwaway = MoE(
                        args.hidden_size,
                        get_mlp(
                            "regular",
                            MOE=True,
                            MoE_mp_size=moe_mp_size,
                        ),
                        num_experts=self.num_experts,
                        ep_size=args.moe_expert_parallel_size,
                        k=args.moe_top_k,
                        use_residual=args.moe_use_residual,
                        capacity_factor=args.moe_train_capacity_factor,
                        eval_capacity_factor=args.moe_eval_capacity_factor,
                        min_capacity=args.moe_min_capacity,
                        drop_tokens=args.moe_token_dropping,
                        use_tutel=args.use_tutel,
                        enable_expert_tensor_parallelism=args.enable_expert_tensor_parallelism,
                    )
                    throwaway.set_deepspeed_parallelism()

                    ep_group = throwaway.deepspeed_moe.ep_group
                    if args.moe_token_dropping:
                        self.mlp = MbMoE(
                            neox_args, init_method, output_layer_init_method, ep_group
                        )
                    else:
                        self.mlp = dMoE(
                            neox_args, init_method, output_layer_init_method, ep_group
                        )

                    # Next, we trick DS into seeing these as its own MoE params.
                    for param in self.mlp.parameters():
                        if getattr(param, "expert_model_parallel", None) is not None:
                            # is_moe_param looks for this attr.
                            param.allreduce = False
                            param.group_name = throwaway.expert_group_name

                integrate_megablocks_with_ds_expert_parallelism()

            else:
                raise KeyError(neox_args.moe_type)

        self.layer_past = None  # used to cache k/v pairs in inference

    def _get_bias_dropout(self):
        if self.bias_dropout_fusion:
            fn = (
                bias_dropout_add_fused_train
                if self.training
                else bias_dropout_add_fused_inference
            )
        else:
            fn = get_bias_dropout_add(self.training)
        return fn
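        # Each returned callable shares (roughly) the signature
        # fn(x, bias=..., residual=..., prob=...) and computes
        # dropout(x + bias) (+ residual), with the fused variants differing only
        # in whether dropout runs in training or inference mode.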

    def forward(self, x, attention_mask, layer_past=None):
        layer_past = layer_past if layer_past is not None else self.layer_past
        bias_dropout_fn = self._get_bias_dropout()
        moe_loss = torch.tensor(0.0, device=x.device, dtype=x.dtype)
        # x: [b, s, h]
        if self.gpt_j_residual:
            # pseudocode:
            # x = x + attn(ln(x)) + mlp(ln(x))
            # this means we can avoid doing the allreduce in the attn / mlp outputs
            # to save communication time (we can do a single allreduce after we add mlp / attn outputs).
            # due to a bug, the two layernorms are not tied in GPT-NeoX-20B. This is undesirable, but
            # we preserve the functionality for backwards compatibility

            residual = x
            # applies the correct normalization depending on if the norms are tied
            if self.gpt_j_tied:
                x = self.input_layernorm(x)
                x1, x2 = x, x
            else:
                x1, x2 = self.input_layernorm(x), self.post_attention_layernorm(x)

            # attention operator
            attention_output, attention_bias = self.attention(
                x1, attention_mask, layer_past=layer_past
            )
            if self.use_cache:
                attention_output, presents = attention_output
                self.layer_past = presents

            if attention_bias is not None:
                with torch.enable_grad() if self.training else nullcontext():
                    attention_output = bias_dropout_fn(
                        attention_output,
                        bias=attention_bias.expand_as(attention_output),
                        residual=None,
                        prob=self.hidden_dropout,
                    )

            # mlp operator
            mlp_output, mlp_bias = self.mlp(x2)
            if mlp_bias is not None:
                with torch.enable_grad() if self.training else nullcontext():
                    output = bias_dropout_fn(
                        mlp_output,
                        bias=mlp_bias.expand_as(mlp_output),
                        residual=attention_output,
                        prob=self.hidden_dropout,
                    )
            else:
                output = mlp_output

            # output = x + attn(ln(x)) + mlp(ln(x))
            output = residual + self.reduce(output)
        else:
            # pseudocode:
            # x = x + attn(ln1(x))
            # x = x + mlp(ln2(x))

            residual = x

            # x = x + attn(ln1(x))
            attention_output, attention_bias = self.attention(
                self.input_layernorm(x), attention_mask, layer_past=layer_past
            )
            if self.use_cache:
                attention_output, presents = attention_output
                self.layer_past = presents
            with torch.enable_grad() if self.training else nullcontext():
                if attention_bias is not None:
                    # Use special bias_dropout_fn if we have a bias term from the above attention layer
                    attention_output = bias_dropout_fn(
                        attention_output,
                        bias=attention_bias.expand_as(residual),
                        residual=residual,
                        prob=self.hidden_dropout,
                    )
                else:
                    # Otherwise just apply dropout + residual
                    attention_output = (
                        torch.nn.functional.dropout(
                            attention_output,
                            p=self.hidden_dropout,
                            training=self.training,
                        )
                        + residual
                    )

            # output = x + mlp(ln2(x))
            layernorm_output = self.post_attention_layernorm(attention_output)
            mlp_bias = torch.tensor(
                0.0, device=layernorm_output.device, dtype=layernorm_output.dtype
            )

            if self.num_experts == 1:
                mlp_output, mlp_bias = self.mlp(layernorm_output)
            else:
                if self.moe_type == "deepspeed":
                    mlp_output, moe_loss, _ = self.mlp(layernorm_output)
                    mlp_bias = (
                        None  # deepspeed.moe.layer.MoE.forward ignores the bias term
                    )
                elif self.moe_type == "megablocks":
                    mlp_output, mlp_bias = self.mlp(layernorm_output)
                else:
                    raise KeyError(self.moe_type)

            with torch.enable_grad() if self.training else nullcontext():
                if self.activation == "swiglu" or (
                    self.num_experts > 1 and self.moe_type == "deepspeed"
                ):
                    # These paths return no bias (and apply no dropout), so just
                    # add the residual directly.
                    assert mlp_bias is None
                    output = mlp_output + attention_output
                else:
                    output = bias_dropout_fn(
                        mlp_output,
                        bias=mlp_bias.expand_as(attention_output),
                        residual=attention_output,
                        prob=self.hidden_dropout,
                    )

        return output, moe_loss


class ParallelTransformerLayerPipe(ParallelTransformerLayer):
    """Extends ParallelTransformerLayer to forward attention_mask through the pipeline."""

    def forward(self, args):
        assert (
            len(args) == 2
        ), "ParallelTransformerLayerPipe expects 2 arguments - hidden_states and attention_mask"
        hidden_states, attention_mask = args
        # we are returning just [hidden_states, mask]
        output, moe_loss = super().forward(hidden_states, attention_mask)
        # auxiliary output
        self.last_moe_loss = moe_loss
        return output, attention_mask


class ParallelLinearPipe(ParallelLinear):
    """Another helper class to pass presents through to the output when doing inference with a Pipe Parallel model"""

    def forward(self, args):
        assert isinstance(
            args, torch.Tensor
        ), "ParallelLinearPipe expects a single argument - hidden_states"
        hidden_state = args
        logits, bias = super().forward(hidden_state)
        return logits


class NormPipe(nn.Module):
    """Just a helper class to pass presents through to the output when doing inference with a Pipe Parallel model"""

    def __init__(self, norm_class, hidden_size, eps):
        super().__init__()
        self.norm = norm_class(hidden_size, eps=eps)

    def forward(self, args):
        assert not isinstance(
            args, tuple
        ), "NormPipe should only receive a single tensor as input"
        return self.norm(args)


def parallel_lm_logits(
    input_,
    word_embeddings_weight,
    parallel_output,
    seq_parallel=False,
    seq_dim=1,
    bias=None,
):
    """LM logits using word embedding weights."""
    # Parallel logits.
    if seq_parallel:
        # if using Sequence Parallelism, our logits are sharded along the sequence dimension.
        # gather them here. (backward pass: reduce-scatter)
        input_parallel = mpu.gather_from_sequence_parallel_region(
            input_, seq_dim=seq_dim
        )
    else:
        # Set up backprop all-reduce.
        input_parallel = mpu.copy_to_model_parallel_region(input_)

    # Matrix multiply.
    if bias is None:
        logits_parallel = F.linear(input_parallel, word_embeddings_weight)
    else:
        logits_parallel = F.linear(input_parallel, word_embeddings_weight, bias)

    # Gather if needed.
    if parallel_output:
        return logits_parallel

    return mpu.gather_from_model_parallel_region(logits_parallel)
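

# A minimal usage sketch for parallel_lm_logits (hypothetical tensors, shown only to
# illustrate the expected call; the real call sites live elsewhere in the model code):
#
#     # lm_output: hidden states from the final pipeline stage
#     # word_embeddings_weight: the (possibly vocab-sharded) embedding weight
#     logits = parallel_lm_logits(
#         lm_output,
#         word_embeddings_weight,
#         parallel_output=True,  # keep logits sharded across model-parallel ranks
#     )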