Upload 2 files

Browse files

Files changed (2) hide show

configuration_rotary_indictrans.py +2 -175
modeling_rotary_indictrans.py +92 -232

configuration_rotary_indictrans.py CHANGED Viewed

@@ -1,28 +1,4 @@
-# coding=utf-8
-# Copyright 2023 The IndicTrans2 Authors and AI4Bharat team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch IndicTrans config."""
-import json
-from collections import OrderedDict
-from typing import Any, Mapping, Optional
-from transformers import PreTrainedTokenizer
 from transformers.configuration_utils import PretrainedConfig
-from transformers.onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
-from transformers.onnx.utils import compute_effective_axis_dimension
-from transformers.utils import TensorType, is_torch_available
 # Copied from transformers.models.m2m_100.configuration_m2m_100.M2M100Config->IndicTrans
@@ -79,6 +55,7 @@ class RotaryIndicTransConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
     ```"""
     model_type = "RotaryIndicTrans"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {
@@ -146,7 +123,7 @@ class RotaryIndicTransConfig(PretrainedConfig):
         self.scale_embedding = scale_embedding
         self.share_decoder_input_output_embed = share_decoder_input_output_embed
         self.attn_implementation = attn_implementation
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
@@ -155,153 +132,3 @@ class RotaryIndicTransConfig(PretrainedConfig):
             decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )
-class RotaryIndicTransOnnxConfig(OnnxSeq2SeqConfigWithPast):
-    @property
-    def inputs(self) -> Mapping[str, Mapping[int, str]]:
-        common_inputs = OrderedDict(
-            [
-                ("input_ids", {0: "batch", 1: "encoder_sequence"}),
-                ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
-            ]
-        )
-        if self.use_past:
-            common_inputs["decoder_input_ids"] = {0: "batch"}
-            common_inputs["decoder_attention_mask"] = {
-                0: "batch",
-                1: "past_decoder_sequence + sequence",
-            }
-        else:
-            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
-            common_inputs["decoder_attention_mask"] = {
-                0: "batch",
-                1: "decoder_sequence",
-            }
-        if self.use_past:
-            self.fill_with_past_key_values_(common_inputs, direction="inputs")
-        return common_inputs
-    # Copied from BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering
-    # A better name would be _generate_dummy_inputs_for_encoder_and_decoder because sequence classification and question
-    # answering are not supported for IT2, but this name is preserved to be able to check that the copy matches what
-    # was done for BART so that it can be updated if need be.
-    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        batch_size: int = -1,
-        seq_length: int = -1,
-        is_pair: bool = False,
-        framework: Optional[TensorType] = None,
-    ) -> Mapping[str, Any]:
-        # Copied from OnnxConfig.generate_dummy_inputs
-        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
-        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
-        batch_size = compute_effective_axis_dimension(
-            batch_size,
-            fixed_dimension=OnnxConfig.default_fixed_batch,
-            num_token_to_add=0,
-        )
-        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
-        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
-        seq_length = compute_effective_axis_dimension(
-            seq_length,
-            fixed_dimension=OnnxConfig.default_fixed_sequence,
-            num_token_to_add=token_to_add,
-        )
-        # Generate dummy inputs according to compute batch and sequence
-        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
-        common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
-        return common_inputs
-    # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_default_and_seq2seq_lm
-    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        batch_size: int = -1,
-        seq_length: int = -1,
-        is_pair: bool = False,
-        framework: Optional[TensorType] = None,
-    ) -> Mapping[str, Any]:
-        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
-            tokenizer, batch_size, seq_length, is_pair, framework
-        )
-        # Generate decoder inputs
-        decoder_seq_length = seq_length if not self.use_past else 1
-        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
-            tokenizer, batch_size, decoder_seq_length, is_pair, framework
-        )
-        decoder_inputs = {
-            f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()
-        }
-        common_inputs = dict(**encoder_inputs, **decoder_inputs)
-        if self.use_past:
-            if not is_torch_available():
-                raise ValueError(
-                    "Cannot generate dummy past_keys inputs without PyTorch installed."
-                )
-            else:
-                import torch
-            batch, encoder_seq_length = common_inputs["input_ids"].shape
-            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
-            (
-                num_encoder_attention_heads,
-                num_decoder_attention_heads,
-            ) = self.num_attention_heads
-            encoder_shape = (
-                batch,
-                num_encoder_attention_heads,
-                encoder_seq_length,
-                self._config.hidden_size // num_encoder_attention_heads,
-            )
-            decoder_past_length = decoder_seq_length + 3
-            decoder_shape = (
-                batch,
-                num_decoder_attention_heads,
-                decoder_past_length,
-                self._config.hidden_size // num_decoder_attention_heads,
-            )
-            common_inputs["decoder_attention_mask"] = torch.cat(
-                [
-                    common_inputs["decoder_attention_mask"],
-                    torch.ones(batch, decoder_past_length),
-                ],
-                dim=1,
-            )
-            common_inputs["past_key_values"] = []
-            # If the number of encoder and decoder layers are present in the model configuration, both are considered
-            num_encoder_layers, num_decoder_layers = self.num_layers
-            min_num_layers = min(num_encoder_layers, num_decoder_layers)
-            max_num_layers = (
-                max(num_encoder_layers, num_decoder_layers) - min_num_layers
-            )
-            remaining_side_name = (
-                "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
-            )
-            for _ in range(min_num_layers):
-                common_inputs["past_key_values"].append(
-                    (
-                        torch.zeros(decoder_shape),
-                        torch.zeros(decoder_shape),
-                        torch.zeros(encoder_shape),
-                        torch.zeros(encoder_shape),
-                    )
-                )
-            # TODO: test this.
-            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
-            for _ in range(min_num_layers, max_num_layers):
-                common_inputs["past_key_values"].append(
-                    (torch.zeros(shape), torch.zeros(shape))
-                )
-        return common_inputs
-    generate_dummy_inputs = _generate_dummy_inputs_for_default_and_seq2seq_lm

 from transformers.configuration_utils import PretrainedConfig
 # Copied from transformers.models.m2m_100.configuration_m2m_100.M2M100Config->IndicTrans
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
     ```"""
     model_type = "RotaryIndicTrans"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {
         self.scale_embedding = scale_embedding
         self.share_decoder_input_output_embed = share_decoder_input_output_embed
         self.attn_implementation = attn_implementation
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             decoder_start_token_id=decoder_start_token_id,
             **kwargs,
         )

modeling_rotary_indictrans.py CHANGED Viewed

@@ -1,20 +1,3 @@
-# coding=utf-8
-# Copyright 2023 The RotaryIndicTrans2 Authors and AI4Bharat team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch RotaryIndicTrans model."""
 import math
 from typing import List, Optional, Tuple, Union
@@ -38,36 +21,24 @@ from transformers.modeling_outputs import (
     Seq2SeqModelOutput,
 )
-from transformers.utils import (
-    logging,
-    is_flash_attn_2_available,
-    is_flash_attn_greater_or_equal_2_10,
-)
-from einops import rearrange
 from transformers.modeling_utils import PreTrainedModel
 from .configuration_rotary_indictrans import RotaryIndicTransConfig
-try:
-    from rotary_embedding_torch import RotaryEmbedding
-except ImportError:
-    raise ImportError("Please install the rotary-embedding-torch>=0.6.4")
 logger = logging.get_logger(__name__)
-ROTARY_INDICTRANS_PRETRAINED_MODEL_ARCHIVE_LIST = [""]
-try:
-    if is_flash_attn_2_available():
-        from flash_attn import flash_attn_func, flash_attn_varlen_func
-        from flash_attn.bert_padding import (
-            index_first_axis,
-            pad_input,
-            unpad_input,
-        )  # noqa
-except:
-    pass
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -87,29 +58,20 @@ def _get_unpad_data(attention_mask):
 def shift_tokens_right(
     input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int
 ):
-    """
-    Shift input ids one token to the right.
-    """
     shifted_input_ids = input_ids.new_zeros(input_ids.shape)
     shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
     shifted_input_ids[:, 0] = decoder_start_token_id
     if pad_token_id is None:
         raise ValueError("self.model.config.pad_token_id has to be defined.")
-    # replace possible -100 values in labels by `pad_token_id`
-    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
     return shifted_input_ids
 def create_position_ids_from_input_ids(
     input_ids, padding_idx, past_key_values_length=0
 ):
-    """
-    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-    are ignored. This is modified from fairseq's `utils.make_positions`.
-    """
-    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
     mask = input_ids.ne(padding_idx).int()
     incremental_indices = (
         torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length
@@ -117,10 +79,64 @@ def create_position_ids_from_input_ids(
     return incremental_indices.long() + padding_idx
 # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->RotaryIndicTrans
 class RotaryIndicTransAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
     def __init__(
         self,
         embed_dim: int,
@@ -133,12 +149,11 @@ class RotaryIndicTransAttention(nn.Module):
         config: Optional[RotaryIndicTransConfig] = None,
     ):
         super().__init__()
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
-        self.config = config
-        self.rope_args = config.rope_args
         if (self.head_dim * num_heads) != self.embed_dim:
             raise ValueError(
@@ -149,15 +164,12 @@ class RotaryIndicTransAttention(nn.Module):
         self.is_decoder = is_decoder
         self.is_causal = is_causal
-        self.xpos = self.rope_args.get("use_xpos", False)
         # partial rotation in RoPE
         self.rotary_pos_embed = (
             RotaryEmbedding(
                 dim=self.head_dim // 2,
-                use_xpos=self.xpos,
-                theta=self.rope_args.get("theta", 10000),
-                xpos_scale_base=self.rope_args.get("xpos_scale_base", 512),
             )
             if not is_cross_attention
             else None
@@ -179,14 +191,10 @@ class RotaryIndicTransAttention(nn.Module):
         q = rearrange(q, "(b h) t d -> b h t d", h=self.num_heads)
         k = rearrange(k, "(b h) t d -> b h t d", h=self.num_heads)
-        if is_inference:
-            q, k = self.rotary_pos_embed.rotate_queries_with_cached_keys(q, k)
-        else:
-            if not self.xpos:
-                q = self.rotary_pos_embed.rotate_queries_or_keys(q)
-                k = self.rotary_pos_embed.rotate_queries_or_keys(k)
-            else:
-                q, k = self.rotary_pos_embed.rotate_queries_and_keys(q, k)
         q = rearrange(q, "b h t d -> (b h) t d")
         k = rearrange(k, "b h t d -> (b h) t d")
@@ -203,49 +211,32 @@ class RotaryIndicTransAttention(nn.Module):
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
         is_cross_attention = key_value_states is not None
         bsz, tgt_len, _ = hidden_states.size()
-        # get query proj
         query_states = self.q_proj(hidden_states) * self.scaling
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
-            # reuse k,v, cross_attentions
             key_states = past_key_value[0]
             value_states = past_key_value[1]
         elif is_cross_attention:
-            # cross_attentions
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
         elif past_key_value is not None:
-            # reuse k, v, self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
             key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         else:
-            # self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
         if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
             past_key_value = (key_states, value_states)
         proj_shape = (bsz * self.num_heads, -1, self.head_dim)
@@ -293,10 +284,6 @@ class RotaryIndicTransAttention(nn.Module):
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following
             attn_weights_reshaped = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len
             )
@@ -316,34 +303,19 @@ class RotaryIndicTransAttention(nn.Module):
                 f" {attn_output.size()}"
             )
-        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
-        attn_output = attn_output.transpose(1, 2)
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
         attn_output = self.out_proj(attn_output)
         return attn_output, attn_weights_reshaped, past_key_value
 class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
-    """
-    RotaryIndicTrans flash attention module. This module inherits from `RotaryIndicTransAttention` as the weights of the module stay
-    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-    flash attention and deal with padding tokens in case the input contains any of them.
-    """
     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
-        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
-        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
     def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
@@ -362,32 +334,23 @@ class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
                 "RotaryIndicTransFlashAttention2 attention does not support output_attentions"
             )
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
         is_cross_attention = key_value_states is not None
         bsz, q_len, _ = hidden_states.size()
-        # get query proj
         query_states = self._reshape(self.q_proj(hidden_states), -1, bsz)
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
-            # reuse k,v, cross_attentions
             key_states = past_key_value[0].transpose(1, 2)
             value_states = past_key_value[1].transpose(1, 2)
         elif is_cross_attention:
-            # cross_attentions
             key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
         elif past_key_value is not None:
-            # reuse k, v, self_attention
             key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
             key_states = torch.cat(
@@ -397,30 +360,16 @@ class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
                 [past_key_value[1].transpose(1, 2), value_states], dim=1
             )
         else:
-            # self_attention
             key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
         if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
             past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
-        # In PEFT, we usually cast the layer norms to float32 for training stability reasons,
-        # therefore the input hidden states get silently cast to float32. Hence, we need to
-        # cast them back to the correct dtype just to be sure everything works as expected.
-        # This might slow down training & inference, so it is recommended not to cast the LayerNorms
-        # to fp32. (LlamaRMSNorm handles it correctly)
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
@@ -493,12 +442,6 @@ class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
-        if not self._flash_attn_uses_top_left_mask:
-            causal = self.is_causal
-        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
-            causal = self.is_causal and query_length != 1
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -526,7 +469,7 @@ class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=causal,
             )
             attn_output = pad_input(
@@ -539,7 +482,7 @@ class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
                 value_states,
                 dropout,
                 softmax_scale=softmax_scale,
-                causal=causal,
             )
         return attn_output
@@ -571,11 +514,10 @@ class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
             max_seqlen_in_batch_q = 1
             cu_seqlens_q = torch.arange(
                 batch_size + 1, dtype=torch.int32, device=query_layer.device
-            )  # There is a memcpy here, that is very bad.
             indices_q = cu_seqlens_q[:-1]
             query_layer = query_layer.squeeze(1)
         else:
-            # The -q_len: slice assumes left padding.
             attention_mask = attention_mask[:, -query_length:]
             query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                 query_layer, attention_mask
@@ -603,7 +545,6 @@ class RotaryIndicTransSdpaAttention(RotaryIndicTransAttention):
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         if output_attentions or layer_head_mask is not None:
-            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
             logger.warning_once(
                 "RotaryIndicTransModel is using RotaryIndicTransSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
                 ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
@@ -617,49 +558,32 @@ class RotaryIndicTransSdpaAttention(RotaryIndicTransAttention):
                 output_attentions=output_attentions,
             )
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
         is_cross_attention = key_value_states is not None
         bsz, tgt_len, _ = hidden_states.size()
-        # get query proj
         query_states = self.q_proj(hidden_states)
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
-            # reuse k,v, cross_attentions
             key_states = past_key_value[0]
             value_states = past_key_value[1]
         elif is_cross_attention:
-            # cross_attentions
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
         elif past_key_value is not None:
-            # reuse k, v, self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
             key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         else:
-            # self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
         if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
             past_key_value = (key_states, value_states)
         query_states = self._shape(query_states, tgt_len, bsz)
@@ -669,15 +593,12 @@ class RotaryIndicTransSdpaAttention(RotaryIndicTransAttention):
                 query_states, key_states, is_inference=past_key_value is not None
             )
-        # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
-        # but we are fine here as `_shape` does call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
         attn_output = F.scaled_dot_product_attention(
             query_states,
             key_states,
             value_states,
             attn_mask=attention_mask,
             dropout_p=self.dropout if self.training else 0.0,
-            # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
             is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
         )
@@ -687,14 +608,10 @@ class RotaryIndicTransSdpaAttention(RotaryIndicTransAttention):
                 f" {attn_output.size()}"
             )
-        attn_output = attn_output.transpose(1, 2)
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
         attn_output = self.out_proj(attn_output)
         return attn_output, None, past_key_value
@@ -859,12 +776,10 @@ class RotaryIndicTransDecoderLayer(nn.Module):
         if self.normalize_before:
             hidden_states = self.self_attn_layer_norm(hidden_states)
-        # Self Attention
-        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
         self_attn_past_key_value = (
             past_key_value[:2] if past_key_value is not None else None
         )
-        # add present self-attn cache to positions 1,2 of present_key_value tuple
         hidden_states, self_attn_weights, present_key_value = self.self_attn(
             hidden_states=hidden_states,
             past_key_value=self_attn_past_key_value,
@@ -877,7 +792,6 @@ class RotaryIndicTransDecoderLayer(nn.Module):
         if not self.normalize_before:
             hidden_states = self.self_attn_layer_norm(hidden_states)
-        # Cross-Attention Block
         cross_attn_present_key_value = None
         cross_attn_weights = None
         if encoder_hidden_states is not None:
@@ -885,7 +799,6 @@ class RotaryIndicTransDecoderLayer(nn.Module):
             if self.normalize_before:
                 hidden_states = self.encoder_attn_layer_norm(hidden_states)
-            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
             cross_attn_past_key_value = (
                 past_key_value[-2:] if past_key_value is not None else None
             )
@@ -908,10 +821,8 @@ class RotaryIndicTransDecoderLayer(nn.Module):
             if not self.normalize_before:
                 hidden_states = self.encoder_attn_layer_norm(hidden_states)
-            # add cross-attn to positions 3,4 of present_key_value tuple
             present_key_value = present_key_value + cross_attn_present_key_value
-        # Fully Connected
         residual = hidden_states
         if self.normalize_before:
             hidden_states = self.final_layer_norm(hidden_states)
@@ -961,15 +872,6 @@ class RotaryIndicTransPreTrainedModel(PreTrainedModel):
 # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100EncoderLayer->RotaryIndicTrans
 class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
-    """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`RotaryIndicTransEncoderLayer`].
-    Args:
-        config: RotaryIndicTransConfig
-        embed_tokens (nn.Embedding): output embedding
-    """
     def __init__(
         self,
         config: RotaryIndicTransConfig,
@@ -1005,7 +907,6 @@ class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
         self._use_sdpa = config._attn_implementation == "sdpa"
         self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
         self.post_init()
     def forward(
@@ -1068,7 +969,6 @@ class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
             return_dict if return_dict is not None else self.config.use_return_dict
         )
-        # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time"
@@ -1095,14 +995,10 @@ class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
             if self._use_flash_attention_2:
                 attention_mask = attention_mask if 0 in attention_mask else None
             elif self._use_sdpa and head_mask is None and not output_attentions:
-                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
-                # the manual implementation that requires a 4D causal mask in all cases.
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 attention_mask = _prepare_4d_attention_mask_for_sdpa(
                     attention_mask, inputs_embeds.dtype
                 )
             else:
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 attention_mask = _prepare_4d_attention_mask(
                     attention_mask, inputs_embeds.dtype
                 )
@@ -1110,7 +1006,6 @@ class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
-        # check if head_mask has a correct number of layers specified if desired
         if head_mask is not None:
             if head_mask.size()[0] != len(self.layers):
                 raise ValueError(
@@ -1123,7 +1018,6 @@ class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             dropout_probability = torch.rand([])
             skip_the_layer = (
@@ -1132,10 +1026,8 @@ class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
                 else False
             )
             if not skip_the_layer or deepspeed_zero3_is_enabled:
-                # under deepspeed zero3 all gpus must run in sync
                 if self.gradient_checkpointing and self.training:
-                    # create gradient checkpointing function
                     def create_custom_forward(module):
                         def custom_forward(*inputs):
                             return module(*inputs, output_attentions)
@@ -1187,14 +1079,6 @@ class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
 # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100DecoderLayer->RotaryIndicTrans
 class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`RotaryIndicTransDecoderLayer`]
-    Args:
-        config: RotaryIndicTransConfig
-        embed_tokens (nn.Embedding): output embedding
-    """
     def __init__(
         self,
         config: RotaryIndicTransConfig,
@@ -1229,7 +1113,6 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
         self._use_sdpa = config._attn_implementation == "sdpa"
         self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
         self.post_init()
     def forward(
@@ -1327,7 +1210,6 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
             return_dict if return_dict is not None else self.config.use_return_dict
         )
-        # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError(
                 "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
@@ -1342,7 +1224,6 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
                 "You have to specify either decoder_input_ids or decoder_inputs_embeds"
             )
-        # past_key_values_length
         past_key_values_length = (
             past_key_values[0][0].shape[2] if past_key_values is not None else 0
         )
@@ -1351,15 +1232,12 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
         if self._use_flash_attention_2:
-            # 2d mask is passed through the layers
             attention_mask = (
                 attention_mask
                 if (attention_mask is not None and 0 in attention_mask)
                 else None
             )
         elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
-            # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
             attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                 attention_mask,
                 input_shape,
@@ -1367,12 +1245,10 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
                 past_key_values_length,
             )
         else:
-            # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask, input_shape, inputs_embeds, past_key_values_length
             )
-        # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
             if self._use_flash_attention_2:
                 encoder_attention_mask = (
@@ -1383,16 +1259,12 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
                 and cross_attn_head_mask is None
                 and not output_attentions
             ):
-                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
-                # the manual implementation that requires a 4D causal mask in all cases.
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                     encoder_attention_mask,
                     inputs_embeds.dtype,
                     tgt_len=input_shape[-1],
                 )
             else:
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 encoder_attention_mask = _prepare_4d_attention_mask(
                     encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                 )
@@ -1412,13 +1284,11 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
                 )
                 use_cache = False
-        # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
         all_cross_attentions = () if output_attentions else None
         next_decoder_cache = () if use_cache else None
-        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
         for attn_mask, mask_name in zip(
             [head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]
         ):
@@ -1434,7 +1304,6 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             dropout_probability = torch.rand([])
             skip_the_layer = (
@@ -1443,8 +1312,6 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
                 else False
             )
             if not skip_the_layer or deepspeed_zero3_is_enabled:
-                # under deepspeed zero3 all gpus must run in sync
                 past_key_value = (
                     past_key_values[idx] if past_key_values is not None else None
                 )
@@ -1506,7 +1373,6 @@ class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
         if self.layer_norm is not None:
             hidden_states = self.layer_norm(hidden_states)
-        # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -1541,8 +1407,6 @@ class RotaryIndicTransModel(RotaryIndicTransPreTrainedModel):
         self.encoder = RotaryIndicTransEncoder(config)
         self.decoder = RotaryIndicTransDecoder(config)
-        # Initialize weights and apply final processing
         self.post_init()
     def get_encoder(self):
@@ -1594,7 +1458,6 @@ class RotaryIndicTransModel(RotaryIndicTransPreTrainedModel):
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
             )
-        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
         elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
             encoder_outputs = BaseModelOutput(
                 last_hidden_state=encoder_outputs[0],
@@ -1602,7 +1465,6 @@ class RotaryIndicTransModel(RotaryIndicTransPreTrainedModel):
                 attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
             )
-        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
             attention_mask=decoder_attention_mask,
@@ -1727,7 +1589,6 @@ class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel):
         masked_lm_loss = None
         if labels is not None:
-            # move labels to the correct device to enable PP
             labels = labels.to(lm_logits.device)
             masked_lm_loss = F.cross_entropy(
                 input=lm_logits.view(-1, self.config.decoder_vocab_size),
@@ -1766,12 +1627,11 @@ class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel):
         encoder_outputs=None,
         **kwargs,
     ):
-        # cut decoder_input_ids if past is used
         if past_key_values is not None:
             decoder_input_ids = decoder_input_ids[:, -1:]
         return {
-            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
             "encoder_outputs": encoder_outputs,
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
@@ -1779,7 +1639,7 @@ class RotaryIndicTransForConditionalGeneration(RotaryIndicTransPreTrainedModel):
             "head_mask": head_mask,
             "decoder_head_mask": decoder_head_mask,
             "cross_attn_head_mask": cross_attn_head_mask,
-            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
     @staticmethod

 import math
 from typing import List, Optional, Tuple, Union
     Seq2SeqModelOutput,
 )
+from transformers.utils import logging
+from einops import rearrange, repeat
+from torch.amp import autocast
+from torch import einsum
 from transformers.modeling_utils import PreTrainedModel
 from .configuration_rotary_indictrans import RotaryIndicTransConfig
+from flash_attn import flash_attn_func, flash_attn_varlen_func
+from flash_attn.bert_padding import (
+    index_first_axis,
+    pad_input,
+    unpad_input,
+)
 logger = logging.get_logger(__name__)
+device = "cuda" if torch.cuda.is_available() else "cpu"
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def shift_tokens_right(
     input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int
 ):
     """Build decoder inputs by shifting `input_ids` one position to the right.

     Column 0 of the result is `decoder_start_token_id`; the remaining columns
     are `input_ids` without its last column. Every `-100` left in the result
     is replaced by `pad_token_id`. `input_ids` itself is not modified.

     Raises:
         ValueError: if `pad_token_id` is None.
     """
     decoder_inputs = torch.zeros_like(input_ids)
     decoder_inputs[:, 0] = decoder_start_token_id
     decoder_inputs[:, 1:] = input_ids[:, :-1]

     if pad_token_id is None:
         raise ValueError("self.model.config.pad_token_id has to be defined.")

     # -100 is swapped for the padding id so the result holds no -100 entries.
     ignored_positions = decoder_inputs.eq(-100)
     return decoder_inputs.masked_fill(ignored_positions, pad_token_id)
 def create_position_ids_from_input_ids(
     input_ids, padding_idx, past_key_values_length=0
 ):
     mask = input_ids.ne(padding_idx).int()
     incremental_indices = (
         torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length
     return incremental_indices.long() + padding_idx
def rotate_half(x):
    """Rotate adjacent feature pairs of the last dimension by 90 degrees.

    The last dimension is viewed as consecutive pairs `(a, b)`; every pair is
    mapped to `(-b, a)` and the pairs are flattened back, so the output has
    the same shape as `x`.
    """
    pairs = rearrange(x, "... (d r) -> ... d r", r=2)
    first, second = pairs.unbind(dim=-1)
    rotated_pairs = torch.stack((-second, first), dim=-1)
    return rearrange(rotated_pairs, "... d r -> ... (d r)")
@autocast("cuda", enabled=False)
def apply_rotary_emb(cos, sin, t):
    """Apply a rotary embedding to the leading features of `t`.

    The first `cos.shape[-1]` features of `t` are rotated as
    `t * cos + rotate_half(t) * sin`; any remaining features are passed
    through unchanged. The result is cast back to the dtype of `t`.
    CUDA autocast is disabled for the duration of the call.
    """
    width = cos.shape[-1]
    assert width <= t.shape[-1] and cos.shape == sin.shape

    rotary_part = t[..., :width]
    passthrough = t[..., width:]

    rotated = rotary_part * cos + rotate_half(rotary_part) * sin
    combined = torch.cat((rotated, passthrough), dim=-1)
    return combined.type(t.dtype)
class RotaryEmbedding(torch.nn.Module):
    """Rotary position embedding with a precomputed cos/sin cache.

    Args:
        dim: number of features to rotate; one inverse frequency is created
            for every second index in `range(0, dim, 2)`.
        theta: base of the inverse-frequency progression.
        interpolate_factor: positions are divided by this factor before the
            angles are computed.
        cache_max_seq_len: number of positions whose cos/sin values are
            computed up front.
    """

    def __init__(
        self, dim, theta=10000, interpolate_factor=1.0, cache_max_seq_len=8192
    ):
        super().__init__()
        freqs_ = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
        self.cache_max_seq_len = cache_max_seq_len
        self.interpolate_factor = interpolate_factor
        # NOTE(review): `.to(device)` on a Parameter may hand back a plain
        # tensor when a copy is needed, in which case `freqs` would not be
        # registered on the module. Left as-is to keep the module's
        # state_dict layout unchanged -- confirm whether a buffer is wanted.
        self.freqs = torch.nn.Parameter(freqs_, requires_grad=False).to(device)
        # Plain function reference: an instance attribute is never bound, so
        # no staticmethod wrapper is needed (and staticmethod objects are not
        # callable before Python 3.10).
        self.apply_rotary_emb = apply_rotary_emb
        self.precompute_freqs(cache_max_seq_len)

    def precompute_freqs(self, max_seq_len):
        """(Re)build the cos/sin caches for positions `0 .. max_seq_len - 1`."""
        # Build the positions on the device that holds `freqs` so the einsum
        # in `forward` never mixes devices.
        thetas = self.forward(max_seq_len, device=self.freqs.device)
        self.register_buffer("cached_cos", thetas.cos(), persistent=False)
        self.register_buffer("cached_sin", thetas.sin(), persistent=False)

    def rotate_queries_or_keys(self, t, seq_dim=-2, offset=0):
        """Rotate `t` using cached angles for positions `offset .. offset + seq_len - 1`.

        Args:
            t: tensor to rotate; its length along `seq_dim` selects how many
                positions are used.
            seq_dim: axis of `t` that holds the sequence.
            offset: position index of the first element of `t`.

        NOTE(review): the cache is sliced along its first axis and broadcast
        against `t`, which lines up with the default `seq_dim=-2`; other
        values look unsupported -- confirm against callers.
        """
        seq_len = t.shape[seq_dim]
        # The last position read is `offset + seq_len - 1`, so the cache must
        # cover `offset + seq_len` entries, not just `seq_len`; otherwise the
        # slices below come back short (or empty) during incremental decoding.
        required_len = offset + seq_len
        if required_len > self.cache_max_seq_len:
            self.cache_max_seq_len = required_len * 2
            self.precompute_freqs(self.cache_max_seq_len)
        cos, sin = (
            self.cached_cos[offset : (offset + seq_len)],
            self.cached_sin[offset : (offset + seq_len)],
        )
        return apply_rotary_emb(cos, sin, t)

    @autocast("cuda", enabled=False)
    def forward(self, seq_len, device):
        """Return the rotation angles for positions `0 .. seq_len - 1`.

        Each (interpolated) position is multiplied with every inverse
        frequency, and each resulting angle is then repeated twice in a row
        along the last dimension.
        """
        seq = torch.arange(seq_len, device=device) / self.interpolate_factor
        thetas = einsum("..., f -> ... f", seq, self.freqs)
        thetas = repeat(thetas, "... n -> ... (n r)", r=2)
        return thetas
 # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->RotaryIndicTrans
 class RotaryIndicTransAttention(nn.Module):
     def __init__(
         self,
         embed_dim: int,
         config: Optional[RotaryIndicTransConfig] = None,
     ):
         super().__init__()
+        self.config = config
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.dropout = dropout
         self.head_dim = embed_dim // num_heads
         if (self.head_dim * num_heads) != self.embed_dim:
             raise ValueError(
         self.is_decoder = is_decoder
         self.is_causal = is_causal
         # partial rotation in RoPE
         self.rotary_pos_embed = (
             RotaryEmbedding(
                 dim=self.head_dim // 2,
+                theta=config.rope_args.get("theta", 10000),
+                interpolate_factor=config.rope_args.get("interpolate_factor", 1.0),
             )
             if not is_cross_attention
             else None
         q = rearrange(q, "(b h) t d -> b h t d", h=self.num_heads)
         k = rearrange(k, "(b h) t d -> b h t d", h=self.num_heads)
+        offset = (k.shape[-2] - 1) if is_inference else 0
+        q = self.rotary_pos_embed.rotate_queries_or_keys(q, offset=offset)
+        k = self.rotary_pos_embed.rotate_queries_or_keys(k)
         q = rearrange(q, "b h t d -> (b h) t d")
         k = rearrange(k, "b h t d -> (b h) t d")
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         is_cross_attention = key_value_states is not None
         bsz, tgt_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states) * self.scaling
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             key_states = past_key_value[0]
             value_states = past_key_value[1]
         elif is_cross_attention:
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
         elif past_key_value is not None:
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
             key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         else:
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
         if self.is_decoder:
             past_key_value = (key_states, value_states)
         proj_shape = (bsz * self.num_heads, -1, self.head_dim)
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         if output_attentions:
             attn_weights_reshaped = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len
             )
                 f" {attn_output.size()}"
             )
+        attn_output = rearrange(
+            attn_output, "(b h) t d -> b t (h d)", h=self.num_heads, d=self.head_dim
+        )
         attn_output = self.out_proj(attn_output)
         return attn_output, attn_weights_reshaped, past_key_value
 class RotaryIndicTransFlashAttention2(RotaryIndicTransAttention):
     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
     def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
                 "RotaryIndicTransFlashAttention2 attention does not support output_attentions"
             )
         is_cross_attention = key_value_states is not None
         bsz, q_len, _ = hidden_states.size()
         query_states = self._reshape(self.q_proj(hidden_states), -1, bsz)
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             key_states = past_key_value[0].transpose(1, 2)
             value_states = past_key_value[1].transpose(1, 2)
         elif is_cross_attention:
             key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
         elif past_key_value is not None:
             key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
             key_states = torch.cat(
                 [past_key_value[1].transpose(1, 2), value_states], dim=1
             )
         else:
             key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
         if self.is_decoder:
             past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
         input_dtype = query_states.dtype
         if input_dtype == torch.float32:
             if torch.is_autocast_enabled():
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
+                causal=self.is_causal,
             )
             attn_output = pad_input(
                 value_states,
                 dropout,
                 softmax_scale=softmax_scale,
+                causal=self.is_causal,
             )
         return attn_output
             max_seqlen_in_batch_q = 1
             cu_seqlens_q = torch.arange(
                 batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )
             indices_q = cu_seqlens_q[:-1]
             query_layer = query_layer.squeeze(1)
         else:
             attention_mask = attention_mask[:, -query_length:]
             query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
                 query_layer, attention_mask
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         if output_attentions or layer_head_mask is not None:
             logger.warning_once(
                 "RotaryIndicTransModel is using RotaryIndicTransSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
                 ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                 output_attentions=output_attentions,
             )
         is_cross_attention = key_value_states is not None
         bsz, tgt_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states)
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             key_states = past_key_value[0]
             value_states = past_key_value[1]
         elif is_cross_attention:
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
         elif past_key_value is not None:
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
             key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         else:
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
         if self.is_decoder:
             past_key_value = (key_states, value_states)
         query_states = self._shape(query_states, tgt_len, bsz)
                 query_states, key_states, is_inference=past_key_value is not None
             )
         attn_output = F.scaled_dot_product_attention(
             query_states,
             key_states,
             value_states,
             attn_mask=attention_mask,
             dropout_p=self.dropout if self.training else 0.0,
             is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
         )
                 f" {attn_output.size()}"
             )
+        attn_output = rearrange(
+            attn_output, "b h t d -> b t (h d)", h=self.num_heads, d=self.head_dim
+        )
         attn_output = self.out_proj(attn_output)
         return attn_output, None, past_key_value
         if self.normalize_before:
             hidden_states = self.self_attn_layer_norm(hidden_states)
         self_attn_past_key_value = (
             past_key_value[:2] if past_key_value is not None else None
         )
         hidden_states, self_attn_weights, present_key_value = self.self_attn(
             hidden_states=hidden_states,
             past_key_value=self_attn_past_key_value,
         if not self.normalize_before:
             hidden_states = self.self_attn_layer_norm(hidden_states)
         cross_attn_present_key_value = None
         cross_attn_weights = None
         if encoder_hidden_states is not None:
             if self.normalize_before:
                 hidden_states = self.encoder_attn_layer_norm(hidden_states)
             cross_attn_past_key_value = (
                 past_key_value[-2:] if past_key_value is not None else None
             )
             if not self.normalize_before:
                 hidden_states = self.encoder_attn_layer_norm(hidden_states)
             present_key_value = present_key_value + cross_attn_present_key_value
         residual = hidden_states
         if self.normalize_before:
             hidden_states = self.final_layer_norm(hidden_states)
 # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100EncoderLayer->RotaryIndicTrans
 class RotaryIndicTransEncoder(RotaryIndicTransPreTrainedModel):
     def __init__(
         self,
         config: RotaryIndicTransConfig,
         self._use_sdpa = config._attn_implementation == "sdpa"
         self.gradient_checkpointing = False
         self.post_init()
     def forward(
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError(
                 "You cannot specify both input_ids and inputs_embeds at the same time"
             if self._use_flash_attention_2:
                 attention_mask = attention_mask if 0 in attention_mask else None
             elif self._use_sdpa and head_mask is None and not output_attentions:
                 attention_mask = _prepare_4d_attention_mask_for_sdpa(
                     attention_mask, inputs_embeds.dtype
                 )
             else:
                 attention_mask = _prepare_4d_attention_mask(
                     attention_mask, inputs_embeds.dtype
                 )
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
         if head_mask is not None:
             if head_mask.size()[0] != len(self.layers):
                 raise ValueError(
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
             dropout_probability = torch.rand([])
             skip_the_layer = (
                 else False
             )
             if not skip_the_layer or deepspeed_zero3_is_enabled:
                 if self.gradient_checkpointing and self.training:
                     def create_custom_forward(module):
                         def custom_forward(*inputs):
                             return module(*inputs, output_attentions)
 # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100DecoderLayer->RotaryIndicTrans
 class RotaryIndicTransDecoder(RotaryIndicTransPreTrainedModel):
     def __init__(
         self,
         config: RotaryIndicTransConfig,
         self._use_sdpa = config._attn_implementation == "sdpa"
         self.gradient_checkpointing = False
         self.post_init()
     def forward(
             return_dict if return_dict is not None else self.config.use_return_dict
         )
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError(
                 "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
                 "You have to specify either decoder_input_ids or decoder_inputs_embeds"
             )
         past_key_values_length = (
             past_key_values[0][0].shape[2] if past_key_values is not None else 0
         )
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
         if self._use_flash_attention_2:
             attention_mask = (
                 attention_mask
                 if (attention_mask is not None and 0 in attention_mask)
                 else None
             )
         elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
             attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                 attention_mask,
                 input_shape,
                 past_key_values_length,
             )
         else:
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask, input_shape, inputs_embeds, past_key_values_length
             )
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
             if self._use_flash_attention_2:
                 encoder_attention_mask = (
                 and cross_attn_head_mask is None
                 and not output_attentions
             ):
                 encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                     encoder_attention_mask,
                     inputs_embeds.dtype,
                     tgt_len=input_shape[-1],
                 )
             else:
                 encoder_attention_mask = _prepare_4d_attention_mask(
                     encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                 )
                 )
                 use_cache = False
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
         all_cross_attentions = () if output_attentions else None
         next_decoder_cache = () if use_cache else None
         for attn_mask, mask_name in zip(
             [head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]
         ):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
             dropout_probability = torch.rand([])
             skip_the_layer = (
                 else False
             )
             if not skip_the_layer or deepspeed_zero3_is_enabled:
                 past_key_value = (
                     past_key_values[idx] if past_key_values is not None else None
                 )
         if self.layer_norm is not None:
             hidden_states = self.layer_norm(hidden_states)
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
         self.encoder = RotaryIndicTransEncoder(config)
         self.decoder = RotaryIndicTransDecoder(config)
         self.post_init()
     def get_encoder(self):
                 output_hidden_states=output_hidden_states,
                 return_dict=return_dict,
             )
         elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
             encoder_outputs = BaseModelOutput(
                 last_hidden_state=encoder_outputs[0],
                 attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
             )
         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
             attention_mask=decoder_attention_mask,
         masked_lm_loss = None
         if labels is not None:
             labels = labels.to(lm_logits.device)
             masked_lm_loss = F.cross_entropy(
                 input=lm_logits.view(-1, self.config.decoder_vocab_size),
         encoder_outputs=None,
         **kwargs,
     ):
         if past_key_values is not None:
             decoder_input_ids = decoder_input_ids[:, -1:]
         return {
+            "input_ids": None,
             "encoder_outputs": encoder_outputs,
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
             "head_mask": head_mask,
             "decoder_head_mask": decoder_head_mask,
             "cross_attn_head_mask": cross_attn_head_mask,
+            "use_cache": use_cache,
         }
     @staticmethod