dinalt committed on
Commit bb91586 · verified · 1 Parent(s): a4d2883

Add support for inference cache to model.
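Usage note (not part of the diff below): with the inference cache in place, generation goes through the standard generate() API. This is a minimal sketch; the repo id is a placeholder, and trust_remote_code=True is needed because walsh-causal-v1 is a custom architecture.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-namespace/your-walsh-model"  # placeholder repo id, illustration only
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

inputs = tokenizer("Walsh codes are", return_tensors="pt")
with torch.no_grad():
    # use_cache=True routes past_key_values through the new forward() path added by this commit.
    output_ids = model.generate(**inputs, max_new_tokens=32, do_sample=False, use_cache=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))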

Files changed (1)
  1. modelling_walsh.py +364 -61
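For reference, the updated forward() converts a legacy tuple-of-tuples cache into a transformers DynamicCache and back. A small standalone sketch of that round trip (shapes are illustrative, not taken from the model config):

import torch
from transformers.cache_utils import DynamicCache

# Legacy format: one (key, value) pair per layer, each shaped
# (batch, num_heads, seq_len, head_dim).
legacy = tuple(
    (torch.zeros(1, 8, 4, 64), torch.zeros(1, 8, 4, 64)) for _ in range(2)
)

cache = DynamicCache.from_legacy_cache(legacy)
assert cache.get_seq_length() == 4

# Attention layers append new states per layer, just as the code below does
# with past_key_values.update(key, value, layer_idx).
new_k = torch.zeros(1, 8, 1, 64)
new_v = torch.zeros(1, 8, 1, 64)
cache.update(new_k, new_v, layer_idx=0)

# Converted back to the legacy format before being returned to older callers.
legacy_again = cache.to_legacy_cache()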
modelling_walsh.py CHANGED
@@ -1,5 +1,5 @@
1
  # See: https://huggingface.co/docs/transformers/custom_models
2
- from typing import Optional, Tuple, Union
3
  import math
4
  import copy
5
  import sys
@@ -9,7 +9,7 @@ import torch
9
  from torch import nn, Tensor
10
  import torch.nn.init as init
11
  from torch.nn import functional as F
12
- from transformers.modeling_outputs import CausalLMOutput
13
  from transformers import (
14
  PreTrainedModel,
15
  PretrainedConfig,
@@ -18,6 +18,10 @@ from transformers import (
18
  AutoModelForCausalLM,
19
  )
20
21
  from transformers.utils import (
22
  is_flash_attn_2_available,
23
  is_flash_attn_greater_or_equal_2_10,
@@ -26,6 +30,8 @@ from transformers.utils import (
26
  if is_flash_attn_2_available():
27
  from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
28
 
 
 
29
  # The model type string to bind.
30
  model_type = "walsh-causal-v1"
31
 
@@ -78,6 +84,10 @@ class Config(PretrainedConfig):
78
  layer_args=dict(),
79
  embedding_args=dict(),
80
  output_proj_args=dict(),
 
 
 
 
81
 
82
  **kwargs,
83
  ):
@@ -113,6 +123,10 @@ class Config(PretrainedConfig):
113
  self.layer_args = layer_args
114
  self.embedding_args = embedding_args
115
  self.output_proj_args = output_proj_args
 
 
 
 
116
 
117
  super().__init__(**kwargs)
118
 
@@ -204,6 +218,8 @@ class HFCausalModel(PreTrainedModel):
204
  _no_split_modules = ["DeepNetLayer"]
205
  _supports_flash_attn_2 = True
206
  _supports_sdpa = True
 
 
207
 
208
  def __init__(self, config):
209
  super().__init__(config)
@@ -221,40 +237,144 @@ class HFCausalModel(PreTrainedModel):
221
  token_type_ids: Optional[torch.LongTensor] = None,
222
  position_ids: Optional[torch.LongTensor] = None,
223
  labels: Optional[torch.LongTensor] = None,
 
 
224
  output_attentions: Optional[bool] = None,
225
  output_hidden_states: Optional[bool] = None,
226
  return_dict: Optional[bool] = None,
227
  **kwargs,
228
  ) -> (Tensor, dict[str, Tensor]):
229
230
  if self.gradient_checkpointing and self.training:
231
  gradient_checkpointing_func = self._gradient_checkpointing_func
232
  else:
233
  gradient_checkpointing_func = None
 
234
 
235
- logits, attentions = self.transformer_head(
236
  input_ids=input_ids,
237
- need_weights=output_attentions,
 
238
  gradient_checkpointing_func=gradient_checkpointing_func,
 
 
 
239
  )
 
 
 
240
 
241
  # Compute loss.
242
  if labels is not None:
243
  loss = self.loss_function(logits=logits, labels=labels, input_ids=input_ids)
244
  else:
245
  loss = None
246
 
247
- return CausalLMOutput(loss=loss, logits=logits, attentions=attentions)
248
-
249
- # Needed for generate() method.
250
- def prepare_inputs_for_generation(self, input_ids, **kwargs):
251
- attention_mask = kwargs.get("attention_mask", None)
252
- model_inputs = {
253
- "input_ids": input_ids,
254
- "attention_mask": attention_mask,
255
- }
256
  return model_inputs
257
258
  def _make_embedding(self, config):
259
  embedding_cls = get_dynamic_class(config.embdding_cls)
260
  return embedding_cls(config.vocab_size, self.d_model, config.pad_index, **config.embedding_args)
@@ -278,7 +398,7 @@ class HFCausalModel(PreTrainedModel):
278
  norm_cls = get_dynamic_class(config.norm_cls)
279
  return norm_cls(self.d_model)
280
 
281
- def _make_self_attention(self, config):
282
  attention_cls = get_dynamic_class(config.attention_cls)
283
  # Map HF _attn_implementation to attn_type
284
  match config._attn_implementation:
@@ -299,28 +419,31 @@ class HFCausalModel(PreTrainedModel):
299
  d_model=self.d_model,
300
  num_heads=config.num_attention_heads,
301
  attn_type=attn_type,
 
302
  **config.attention_args,
303
  )
304
 
305
- def _make_feedforward(self, config):
306
  feedforward_cls = get_dynamic_class(config.feedforward_cls)
307
  return feedforward_cls(
308
  d_model=self.d_model,
309
  feedforward_dim=config.dim_feedforward,
310
  dropout=config.dropout,
311
  activation=self._make_activation(config),
 
312
  **config.feedforward_args,
313
  )
314
 
315
- def _make_layer(self, config):
316
  layer_cls = get_dynamic_class(config.layer_cls)
317
  return layer_cls(
318
  d_model=self.d_model,
319
  dropout=self._make_dropout(config),
320
- attention=self._make_self_attention(config),
321
- feedforward=self._make_feedforward(config),
322
  norm1=self._make_norm(config),
323
  norm2=self._make_norm(config),
 
324
  **config.layer_args,
325
  )
326
 
@@ -328,7 +451,7 @@ class HFCausalModel(PreTrainedModel):
328
  layer_stack_cls = get_dynamic_class(config.layer_stack_cls)
329
  return layer_stack_cls(
330
  layers=nn.ModuleList([
331
- self._make_layer(config) for _ in range(config.num_hidden_layers)
332
  ]),
333
  **config.layer_stack_args,
334
  )
@@ -364,18 +487,29 @@ class Transformer(nn.Module):
364
  self.sqrt_d_model = d_model**0.5
365
  self.reset_parameters()
366
 
367
- def forward(self, input_ids, need_weights, gradient_checkpointing_func):
368
- x = self.positional_encoder(self.embedding(input_ids) * self.sqrt_d_model)
369
-
370
- x, attentions = self.layer_stack(
371
- x,
372
- need_weights,
373
- gradient_checkpointing_func,
374
  )
375
 
376
- # Translate output embedding ot logits.
377
- logits = self.output_projection(x)
378
- return logits, attentions
 
379
 
380
  def reset_parameters(self):
381
  init.xavier_uniform_(self.output_projection.weight)
@@ -472,7 +606,7 @@ class RSWalshPositionalEncoder(nn.Module):
472
  # walsh = (hadamard_walsh_matrix(k)[:bits,:d_embed] -0.5) * self.gain
473
  self.register_buffer('walsh', walsh, persistent=False)
474
 
475
- def forward(self, x):
476
  seq_len = x.size(-2)
477
 
478
  # Get sequence of binary codes...
@@ -486,6 +620,12 @@ class RSWalshPositionalEncoder(nn.Module):
486
  shift = torch.randint(self.max_seq - seq_len + 1, (1,)).item()
487
  seq = self.binary_code[shift:seq_len + shift,:]
488
489
  # Disable shifting when not training. This does not appear to change the evaluation loss, but
490
  # it does make predictions easier to analyse when the attention weights are not shifting with each step.
491
  else:
@@ -508,25 +648,58 @@ class TransformerLayerStack(nn.Module):
508
  super().__init__()
509
  self.layers = layers
510
 
511
- def forward(self, x, need_weights, gradient_checkpointing_func=None):
512
- attentions = []
513
  for layer in self.layers:
514
  if gradient_checkpointing_func is not None:
515
- x, attention_weights = gradient_checkpointing_func(
516
  layer.__call__,
517
- x,
518
- need_weights,
519
- use_reentrant=False
 
 
520
  )
521
  else:
522
- x, attention_weights = layer(x, need_weights=need_weights)
523
- if need_weights:
524
- attentions.append(attention_weights)
525
 
526
- return x, attentions
527
 
528
  # DeepNet: Scaling Transformers to 1,000 Layers
529
  # https://arxiv.org/abs/2203.00555
 
530
  class DeepnetLayer(nn.Module):
531
  def __init__(
532
  self,
@@ -536,6 +709,7 @@ class DeepnetLayer(nn.Module):
536
  norm1,
537
  norm2,
538
  dropout,
 
539
  alpha=1.0,
540
  ):
541
  super().__init__()
@@ -547,27 +721,45 @@ class DeepnetLayer(nn.Module):
547
  self.dropout = dropout
548
  # Deepnet alpha
549
  self.alpha = alpha
 
550
 
551
- def forward(self, x, need_weights=False):
552
  # Keep input as residual
553
- residual = x * self.alpha
554
 
555
  # Compute attention
556
- x, attention_weights = self.attention(x, need_weights)
557
 
558
  # Add attention with residual and normalize.
559
- x = self.norm1(residual + self.dropout(x))
560
 
561
  # Keep output as next residual.
562
- residual = x * self.alpha
563
 
564
  # Pass through feedforward network.
565
- x = self.feedforward(x)
566
 
567
  # Combine residual and ff output, then normalize again.
568
- x = self.norm2(residual + self.dropout(x))
569
 
570
- return x, attention_weights
 
 
 
 
571
 
572
  # A vanilla MLP transformer layer.
573
  class FeedforwardLayer(nn.Module):
@@ -576,6 +768,7 @@ class FeedforwardLayer(nn.Module):
576
  d_model: int,
577
  feedforward_dim: int,
578
  dropout,
 
579
  activation=nn.ReLU(),
580
  beta=1.0,
581
  bias=True,
@@ -605,6 +798,7 @@ class SwiGLUFeedforwardLayer(nn.Module):
605
  self,
606
  d_model,
607
  d_feedforward,
 
608
  beta=1.0,
609
  dropout=0.1
610
  ):
@@ -643,6 +837,7 @@ class CausalSelfAttention(nn.Module):
643
  # torch: Use pytorch "scaled_dot_product_attention()"; faster; generally good compatibility; does not support returning attn weights.
644
  # flash2: Use Flash-Attention2 implementation; fastest; limited to float16 and bfloat16 types; least memory usage.
645
  attn_type,
 
646
  beta=1.0,
647
  dropout=0.1,
648
  ):
@@ -651,6 +846,7 @@ class CausalSelfAttention(nn.Module):
651
  self.num_heads = num_heads
652
  self.beta = beta
653
  self.attn_type = attn_type
 
654
 
655
  assert d_model % num_heads == 0, "d_model must be evenly divisible by num_heads"
656
 
@@ -685,9 +881,18 @@ class CausalSelfAttention(nn.Module):
685
  proj = self.in_proj(qkv)
686
  return proj.chunk(chunks=3, dim=-1)
687
 
688
- def forward(self, qkv, need_weights):
689
  if self.attn_type == "flash2":
690
- return self.flash2_forward(qkv)
 
 
 
691
 
692
  # qkv: (batch_size, seq_len, d_embed)
693
  batch_size, seq_len, d_embed = qkv.shape
@@ -700,8 +905,12 @@ class CausalSelfAttention(nn.Module):
700
  key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
701
  value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
702
 
 
 
 
 
703
  # Default to returning empty attention weights.
704
- attention_weights = None
705
 
706
  if self.attn_type == "torch":
707
  # This context manager can be used to force which implementation to use.
@@ -730,28 +939,40 @@ class CausalSelfAttention(nn.Module):
730
  )
731
 
732
  # Calculate the attention weights; avoid NANs that might emerge from zeros in softmax's denominator
733
- attention_weights = self.dropout(torch.softmax(scores, dim=-1).clamp(min=1e-10))
734
  del scores
735
 
736
  # Use the attention weights to get a weighted combination of value vectors
737
- attended_values = torch.matmul(attention_weights, value)
738
- if not need_weights:
739
- del attention_weights
740
- attention_weights = None
741
 
742
  # Concatenate attention heads and project to original embedding size using the output linear layer
743
  attended_values = attended_values.transpose(1, 2).contiguous().view(batch_size, seq_len, d_embed)
744
 
745
  # Project the concatenated output through the output matrix.
746
  attended_values = self.output_linear(attended_values)
747
- return attended_values, attention_weights
748
 
749
- def flash2_forward(self, qkv):
 
 
 
750
  batch_size, seq_len, d_embed = qkv.shape
751
 
752
  # Feed the inputs through the K, Q, V matrices.
753
  # query : (batch_size, seq_len, d_model)
754
  # qkv : (batch_size, seq_len, 3, num_heads, d_kq)
 
 
 
 
755
  qkv = self.in_proj(qkv).unflatten(
756
  -1,
757
  (3, self.num_heads, self.d_head)
@@ -770,7 +991,89 @@ class CausalSelfAttention(nn.Module):
770
 
771
  # Project the concatenated output through the output matrix.
772
  attended_values = self.output_linear(attended_values)
773
- return attended_values, None
774
 
775
  # Attention layer with ALiBi relative positional encoding
776
  # TRAIN SHORT, TEST LONG: ATTENTION WITH LINEAR BIASES ENABLES INPUT LENGTH EXTRAPOLATION
@@ -907,7 +1210,7 @@ class CausalAlibiAttention(nn.Module):
907
 
908
  # Use the attention weights to get a weighted combination of value vectors
909
  attended_values = torch.matmul(attention_weights, value)
910
- if not need_weights:
911
  attention_weights = None
912
 
913
  # Concatenate attention heads and project to original embedding size using the output linear layer
@@ -946,4 +1249,4 @@ class CausalAlibiAttention(nn.Module):
946
 
947
  # Project the concatenated output through the output matrix.
948
  attended_values = self.output_linear(attended_values)
949
- return attended_values, None
 
1
  # See: https://huggingface.co/docs/transformers/custom_models
2
+ from typing import Optional, Tuple, Union, List
3
  import math
4
  import copy
5
  import sys
 
9
  from torch import nn, Tensor
10
  import torch.nn.init as init
11
  from torch.nn import functional as F
12
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutput, CausalLMOutputWithPast
13
  from transformers import (
14
  PreTrainedModel,
15
  PretrainedConfig,
 
18
  AutoModelForCausalLM,
19
  )
20
 
21
+ from transformers.utils import logging
22
+
23
+ from transformers.cache_utils import Cache, DynamicCache
24
+
25
  from transformers.utils import (
26
  is_flash_attn_2_available,
27
  is_flash_attn_greater_or_equal_2_10,
 
30
  if is_flash_attn_2_available():
31
  from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
32
 
33
+ logger = logging.get_logger(__name__)
34
+
35
  # The model type string to bind.
36
  model_type = "walsh-causal-v1"
37
 
 
84
  layer_args=dict(),
85
  embedding_args=dict(),
86
  output_proj_args=dict(),
87
+
88
+ output_attentions=False,
89
+ output_hidden_states=False,
90
+ use_cache=True,
91
 
92
  **kwargs,
93
  ):
 
123
  self.layer_args = layer_args
124
  self.embedding_args = embedding_args
125
  self.output_proj_args = output_proj_args
126
+
127
+ self.output_attentions = output_attentions
128
+ self.output_hidden_states = output_hidden_states
129
+ self.use_cache = use_cache
130
 
131
  super().__init__(**kwargs)
132
 
 
218
  _no_split_modules = ["DeepNetLayer"]
219
  _supports_flash_attn_2 = True
220
  _supports_sdpa = True
221
+ _supports_cache_class = True
222
+ _skip_keys_device_placement = "past_key_values"
223
 
224
  def __init__(self, config):
225
  super().__init__(config)
 
237
  token_type_ids: Optional[torch.LongTensor] = None,
238
  position_ids: Optional[torch.LongTensor] = None,
239
  labels: Optional[torch.LongTensor] = None,
240
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
241
+ use_cache: Optional[bool] = None,
242
  output_attentions: Optional[bool] = None,
243
  output_hidden_states: Optional[bool] = None,
244
  return_dict: Optional[bool] = None,
245
  **kwargs,
246
  ) -> (Tensor, dict[str, Tensor]):
247
 
248
+ batch_size, seq_len = input_ids.shape
249
+
250
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
251
+ output_hidden_states = (
252
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
253
+ )
254
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
255
+
256
+ if use_cache:
257
+ # If legacy cache, convert to DynamicCache
258
+ use_legacy_cache = not isinstance(past_key_values, Cache)
259
+ if use_legacy_cache:
260
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
261
+
262
+
263
  if self.gradient_checkpointing and self.training:
264
+ if use_cache:
265
+ logger.warning_once(
266
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
267
+ )
268
+ use_cache = False
269
  gradient_checkpointing_func = self._gradient_checkpointing_func
270
  else:
271
  gradient_checkpointing_func = None
272
+
273
 
274
+ outputs = self.transformer_head(
275
  input_ids=input_ids,
276
+ position_ids=position_ids,
277
+ output_attentions=output_attentions,
278
  gradient_checkpointing_func=gradient_checkpointing_func,
279
+ past_key_values=past_key_values,
280
+ use_cache=use_cache,
281
+ output_hidden_states=output_hidden_states,
282
  )
283
+
284
+ logits = outputs["logits"].float()
285
+ attentions = outputs["attentions"]
286
 
287
  # Compute loss.
288
  if labels is not None:
289
  loss = self.loss_function(logits=logits, labels=labels, input_ids=input_ids)
290
  else:
291
  loss = None
292
+
293
+ # Convert back to legacy cache, if that's what we received
294
+ new_cache = outputs["past_key_values"]
295
+ if use_cache and new_cache is not None and use_legacy_cache:
296
+ new_cache = new_cache.to_legacy_cache()
297
 
298
+ return CausalLMOutputWithPast(
299
+ loss=loss,
300
+ logits=logits,
301
+ past_key_values=new_cache,
302
+ hidden_states=outputs["hidden_states"],
303
+ attentions=outputs["attentions"],
304
+ )
305
+
306
+ # Implementation adapted from Hugging Face Transformers:
307
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py
308
+ # Note: We do not implement the attention mask at present, so some of this code is not applicable.
309
+ # TODO: Re-enable attention mask support for batch inference.
310
+ def prepare_inputs_for_generation(
311
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
312
+ ):
313
+ # Omit tokens covered by past_key_values
314
+ if past_key_values is not None:
315
+ if isinstance(past_key_values, Cache):
316
+ cache_length = past_key_values.get_seq_length()
317
+ past_length = past_key_values.seen_tokens
318
+ max_cache_length = past_key_values.get_max_length()
319
+ else:
320
+ cache_length = past_length = past_key_values[0][0].shape[2]
321
+ max_cache_length = None
322
+
323
+ # Keep only the unprocessed tokens:
324
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
325
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
326
+ # input)
327
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
328
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
329
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
330
+ # input_ids based on the past_length.
331
+ elif past_length < input_ids.shape[1]:
332
+ input_ids = input_ids[:, past_length:]
333
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
334
+
335
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
336
+ if (
337
+ max_cache_length is not None
338
+ and attention_mask is not None
339
+ and cache_length + input_ids.shape[1] > max_cache_length
340
+ ):
341
+ attention_mask = attention_mask[:, -max_cache_length:]
342
+
343
+ # NOTE: "RSWalsh" models don't need to have their absolute positions adjusted to zero; they are trained for this.
344
+ position_ids = kwargs.get("position_ids", None)
345
+ if attention_mask is not None and position_ids is None:
346
+ # create position_ids on the fly for batch generation
347
+ position_ids = attention_mask.long().cumsum(-1) - 1
348
+ position_ids.masked_fill_(attention_mask == 0, 1)
349
+ if past_key_values:
350
+ position_ids = position_ids[:, -input_ids.shape[1] :]
351
+
352
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
353
+ # NOTE: Injecting positional embeddings is not yet supported.
354
+ if inputs_embeds is not None and past_key_values is None:
355
+ model_inputs = {"inputs_embeds": inputs_embeds}
356
+ else:
357
+ model_inputs = {"input_ids": input_ids}
358
+
359
+ model_inputs.update(
360
+ {
361
+ "position_ids": position_ids,
362
+ "past_key_values": past_key_values,
363
+ "use_cache": kwargs.get("use_cache"),
364
+ "attention_mask": attention_mask,
365
+ }
366
+ )
367
  return model_inputs
368
 
369
+ @staticmethod
370
+ def _reorder_cache(past_key_values, beam_idx):
371
+ reordered_past = ()
372
+ for layer_past in past_key_values:
373
+ reordered_past += (
374
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
375
+ )
376
+ return reordered_past
377
+
378
  def _make_embedding(self, config):
379
  embedding_cls = get_dynamic_class(config.embdding_cls)
380
  return embedding_cls(config.vocab_size, self.d_model, config.pad_index, **config.embedding_args)
 
398
  norm_cls = get_dynamic_class(config.norm_cls)
399
  return norm_cls(self.d_model)
400
 
401
+ def _make_self_attention(self, layer_idx, config):
402
  attention_cls = get_dynamic_class(config.attention_cls)
403
  # Map HF _attn_implementation to attn_type
404
  match config._attn_implementation:
 
419
  d_model=self.d_model,
420
  num_heads=config.num_attention_heads,
421
  attn_type=attn_type,
422
+ layer_idx=layer_idx,
423
  **config.attention_args,
424
  )
425
 
426
+ def _make_feedforward(self, layer_idx, config):
427
  feedforward_cls = get_dynamic_class(config.feedforward_cls)
428
  return feedforward_cls(
429
  d_model=self.d_model,
430
  feedforward_dim=config.dim_feedforward,
431
  dropout=config.dropout,
432
  activation=self._make_activation(config),
433
+ layer_idx=layer_idx,
434
  **config.feedforward_args,
435
  )
436
 
437
+ def _make_layer(self, layer_idx, config):
438
  layer_cls = get_dynamic_class(config.layer_cls)
439
  return layer_cls(
440
  d_model=self.d_model,
441
  dropout=self._make_dropout(config),
442
+ attention=self._make_self_attention(layer_idx, config),
443
+ feedforward=self._make_feedforward(layer_idx, config),
444
  norm1=self._make_norm(config),
445
  norm2=self._make_norm(config),
446
+ layer_idx=layer_idx,
447
  **config.layer_args,
448
  )
449
 
 
451
  layer_stack_cls = get_dynamic_class(config.layer_stack_cls)
452
  return layer_stack_cls(
453
  layers=nn.ModuleList([
454
+ self._make_layer(layer_idx, config) for layer_idx in range(config.num_hidden_layers)
455
  ]),
456
  **config.layer_stack_args,
457
  )
 
487
  self.sqrt_d_model = d_model**0.5
488
  self.reset_parameters()
489
 
490
+ def forward(
491
+ self,
492
+ input_ids,
493
+ position_ids,
494
+ output_attentions,
495
+ gradient_checkpointing_func,
496
+ past_key_values,
497
+ use_cache,
498
+ output_hidden_states,
499
+ ):
500
+ outputs = self.layer_stack(
501
+ self.positional_encoder(self.embedding(input_ids) * self.sqrt_d_model, position_ids),
502
+ output_attentions=output_attentions,
503
+ gradient_checkpointing_func=gradient_checkpointing_func,
504
+ past_key_values=past_key_values,
505
+ use_cache=use_cache,
506
+ output_hidden_states=output_hidden_states,
507
  )
508
 
509
+ # Translate output states to logits.
510
+ outputs["logits"] = self.output_projection(outputs["last_hidden_state"])
511
+ del outputs["last_hidden_state"]
512
+ return outputs
513
 
514
  def reset_parameters(self):
515
  init.xavier_uniform_(self.output_projection.weight)
 
606
  # walsh = (hadamard_walsh_matrix(k)[:bits,:d_embed] -0.5) * self.gain
607
  self.register_buffer('walsh', walsh, persistent=False)
608
 
609
+ def forward(self, x, position_ids=None):
610
  seq_len = x.size(-2)
611
 
612
  # Get sequence of binary codes...
 
620
  shift = torch.randint(self.max_seq - seq_len + 1, (1,)).item()
621
  seq = self.binary_code[shift:seq_len + shift,:]
622
 
623
+ # When the cache is used for generation, after the first call, we are only passed a single token at a time,
624
+ # with the remaining tokens being in the cache. We need to make sure that the newly injected tokens have the
625
+ # correct relative position by indexing the codes with the position_ids.
626
+ elif position_ids is not None:
627
+ seq = self.binary_code[position_ids, :]
628
+
629
  # Disable shifting when not training. This does not appear to change the evaluation loss, but
630
  # it does make predictions easier to analyse when the attention weights are not shifting with each step.
631
  else:
 
648
  super().__init__()
649
  self.layers = layers
650
 
651
+ def forward(
652
+ self,
653
+ hidden_states,
654
+ output_attentions,
655
+ past_key_values,
656
+ use_cache,
657
+ output_hidden_states,
658
+ gradient_checkpointing_func=None,
659
+ ):
660
+ present_key_value = None
661
+ all_attentions = [] if output_attentions else None
662
+ all_hidden_states = [hidden_states] if output_hidden_states else None
663
+
664
  for layer in self.layers:
665
  if gradient_checkpointing_func is not None:
666
+ layer_outputs = gradient_checkpointing_func(
667
  layer.__call__,
668
+ hidden_states,
669
+ output_attentions,
670
+ past_key_values,
671
+ use_cache,
672
+ use_reentrant=False,
673
  )
674
  else:
675
+ layer_outputs = layer(
676
+ hidden_states,
677
+ output_attentions,
678
+ past_key_values,
679
+ use_cache,
680
+ )
681
+
682
+ hidden_states = layer_outputs["hidden_states"]
683
 
684
+ if output_hidden_states:
685
+ all_hidden_states.append(hidden_states)
686
+
687
+ if use_cache:
688
+ present_key_value = layer_outputs["past_key_values"]
689
+
690
+ if output_attentions:
691
+ all_attentions.append(layer_outputs["attentions"])
692
+
693
+ return dict(
694
+ last_hidden_state=hidden_states,
695
+ past_key_values=present_key_value,
696
+ hidden_states=hidden_states,
697
+ attentions=all_attentions,
698
+ )
699
 
700
  # DeepNet: Scaling Transformers to 1,000 Layers
701
  # https://arxiv.org/abs/2203.00555
702
+ # Note: This is a type of Post-Layer-Norm Transformer layer.
703
  class DeepnetLayer(nn.Module):
704
  def __init__(
705
  self,
 
709
  norm1,
710
  norm2,
711
  dropout,
712
+ layer_idx,
713
  alpha=1.0,
714
  ):
715
  super().__init__()
 
721
  self.dropout = dropout
722
  # Deepnet alpha
723
  self.alpha = alpha
724
+ self.layer_idx = layer_idx
725
 
726
+ def forward(
727
+ self,
728
+ hidden_states,
729
+ output_attentions,
730
+ past_key_values,
731
+ use_cache,
732
+ ):
733
  # Keep input as residual
734
+ residual = hidden_states * self.alpha
735
 
736
  # Compute attention
737
+ attn_outputs = self.attention(
738
+ hidden_states,
739
+ past_key_values=past_key_values,
740
+ use_cache=use_cache,
741
+ output_attentions=output_attentions
742
+ )
743
+
744
+ hidden_states = attn_outputs["hidden_states"]
745
 
746
  # Add attention with residual and normalize.
747
+ hidden_states = self.norm1(residual + self.dropout(hidden_states))
748
 
749
  # Keep output as next residual.
750
+ residual = hidden_states * self.alpha
751
 
752
  # Pass through feedforward network.
753
+ hidden_states = self.feedforward(hidden_states)
754
 
755
  # Combine residual and ff output, then normalize again.
756
+ hidden_states = self.norm2(residual + self.dropout(hidden_states))
757
 
758
+ return dict(
759
+ hidden_states=hidden_states,
760
+ attentions=attn_outputs["attentions"],
761
+ past_key_values=attn_outputs["past_key_values"]
762
+ )
763
 
764
  # A vanilla MLP transformer layer.
765
  class FeedforwardLayer(nn.Module):
 
768
  d_model: int,
769
  feedforward_dim: int,
770
  dropout,
771
+ layer_idx,
772
  activation=nn.ReLU(),
773
  beta=1.0,
774
  bias=True,
 
798
  self,
799
  d_model,
800
  d_feedforward,
801
+ layer_idx,
802
  beta=1.0,
803
  dropout=0.1
804
  ):
 
837
  # torch: Use pytorch "scaled_dot_product_attention()"; faster; generally good compatibility; does not support returning attn weights.
838
  # flash2: Use Flash-Attention2 implementation; fastest; limited to float16 and bfloat16 types; least memory usage.
839
  attn_type,
840
+ layer_idx,
841
  beta=1.0,
842
  dropout=0.1,
843
  ):
 
846
  self.num_heads = num_heads
847
  self.beta = beta
848
  self.attn_type = attn_type
849
+ self.layer_idx = layer_idx
850
 
851
  assert d_model % num_heads == 0, "d_model must be evenly divisible by num_heads"
852
 
 
881
  proj = self.in_proj(qkv)
882
  return proj.chunk(chunks=3, dim=-1)
883
 
884
+ def forward(
885
+ self,
886
+ qkv,
887
+ output_attentions,
888
+ past_key_values,
889
+ use_cache,
890
+ ):
891
  if self.attn_type == "flash2":
892
+ if not use_cache:
893
+ return self.flash2_forward(qkv)
894
+ else:
895
+ return self.flash2_forward_cached(qkv, past_key_values)
896
 
897
  # qkv: (batch_size, seq_len, d_embed)
898
  batch_size, seq_len, d_embed = qkv.shape
 
905
  key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
906
  value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
907
 
908
+ # Update the cache values.
909
+ if past_key_values is not None:
910
+ key, value = past_key_values.update(key, value, self.layer_idx)
911
+
912
  # Default to returning empty attention weights.
913
+ attentions = None
914
 
915
  if self.attn_type == "torch":
916
  # This context manager can be used to force which implementation to use.
 
939
  )
940
 
941
  # Calculate the attention weights; avoid NANs that might emerge from zeros in softmax's denominator
942
+ attentions = self.dropout(torch.softmax(scores, dim=-1).clamp(min=1e-10))
943
  del scores
944
 
945
  # Use the attention weights to get a weighted combination of value vectors
946
+ attended_values = torch.matmul(attentions, value)
947
+ if not output_attentions:
948
+ del attentions
949
+ attentions = None
950
 
951
  # Concatenate attention heads and project to original embedding size using the output linear layer
952
  attended_values = attended_values.transpose(1, 2).contiguous().view(batch_size, seq_len, d_embed)
953
 
954
  # Project the concatenated output through the output matrix.
955
  attended_values = self.output_linear(attended_values)
956
+ return dict(
957
+ hidden_states=attended_values,
958
+ attentions=attentions,
959
+ # Unimplemented...
960
+ past_key_values=None
961
+ )
962
 
963
+ def flash2_forward(
964
+ self,
965
+ qkv,
966
+ ):
967
  batch_size, seq_len, d_embed = qkv.shape
968
 
969
  # Feed the inputs through the K, Q, V matrices.
970
  # query : (batch_size, seq_len, d_model)
971
  # qkv : (batch_size, seq_len, 3, num_heads, d_kq)
972
+
976
  qkv = self.in_proj(qkv).unflatten(
977
  -1,
978
  (3, self.num_heads, self.d_head)
 
991
 
992
  # Project the concatenated output through the output matrix.
993
  attended_values = self.output_linear(attended_values)
994
+ return dict(
995
+ hidden_states=attended_values,
996
+ attentions=None,
997
+ past_key_values=None
998
+ )
999
+
1000
+ # See https://github.com/huggingface/transformers/blob/main/src/transformers/cache_utils.py
1001
+ # https://huggingface.co/docs/transformers/internal/generation_utils
1002
+ def flash2_forward_cached(
1003
+ self,
1004
+ qkv,
1005
+ past_key_values,
1006
+ ):
1007
+ batch_size, seq_len, d_embed = qkv.shape
1008
+
1009
+ # Feed the inputs through the K, Q, V matrices.
1010
+ query, key, value = self.project_input(qkv)
1011
+
1012
+ # TODO: Refactor -- this code is repeated in the baseline implementation.
1013
+ # Split projections into multiple heads and swap position of sequence / heads dimension
1014
+ query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
1015
+ key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
1016
+ value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
1017
+
1018
+ if past_key_values is not None:
1019
+ key, value = past_key_values.update(key, value, self.layer_idx)
1020
+
1021
+ #query, key, value = self._downcast_to_float16(query, key, value)
1022
+
1023
+ # Expected inputs to flash2:
1024
+ # q: (batch_size, seqlen, nheads, headdim)
1025
+ # k: (batch_size, seqlen, nheads_k, headdim)
1026
+ # v: (batch_size, seqlen, nheads_k, headdim)
1027
+ query = query.transpose(1, 2)
1028
+ key = key.transpose(1, 2)
1029
+ value = value.transpose(1, 2)
1030
+
1031
+ attended_values = flash_attn_func(
1032
+ q=query,
1033
+ k=key,
1034
+ v=value,
1035
+ dropout_p=self.dropout.p if self.training else 0.0,
1036
+ softmax_scale=self.dot_product_scale,
1037
+ causal=True,
1038
+ )
1039
+ # attended_values: (batch_size, seqlen, nheads, headdim)
1040
+
1041
+ # Concatenate heads back into d_embed
1042
+ attended_values = attended_values.view(batch_size, seq_len, d_embed)
1043
+
1044
+ # Project the concatenated output through the output matrix.
1045
+ attended_values = self.output_linear(attended_values)
1046
+ return dict(
1047
+ hidden_states=attended_values,
1048
+ attentions=None,
1049
+ past_key_values=past_key_values
1050
+ )
1051
+
1052
+ def _downcast_to_float16(self, query, key, value):
1053
+ # Section copied from Transformers to handle this case.
1054
+ # TODO: Revisit the other Flash2 implementation, above.
1056
+ input_dtype = query.dtype
1057
+ if input_dtype == torch.float32:
1058
+ if torch.is_autocast_enabled():
1059
+ target_dtype = torch.get_autocast_gpu_dtype()
1060
+ # Handle the case where the model is quantized
1061
+ elif hasattr(self.config, "_pre_quantization_dtype"):
1062
+ target_dtype = self.config._pre_quantization_dtype
1063
+ else:
1064
+ target_dtype = self.q_proj.weight.dtype
1065
+ logger.warning_once(
1066
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
1067
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
1068
+ f" {target_dtype}."
1069
+ )
1070
+ query = query.to(target_dtype)
1071
+ key = key.to(target_dtype)
1072
+ value = value.to(target_dtype)
1073
+ return query, key, value
1074
+
1075
+
1076
+ ########### TODO: Update to newer API, with inference cache
1077
 
1078
  # Attention layer with ALiBi relative positional encoding
1079
  # TRAIN SHORT, TEST LONG: ATTENTION WITH LINEAR BIASES ENABLES INPUT LENGTH EXTRAPOLATION
 
1210
 
1211
  # Use the attention weights to get a weighted combination of value vectors
1212
  attended_values = torch.matmul(attention_weights, value)
1213
+ if not output_attentions:
1214
  attention_weights = None
1215
 
1216
  # Concatenate attention heads and project to original embedding size using the output linear layer
 
1249
 
1250
  # Project the concatenated output through the output matrix.
1251
  attended_values = self.output_linear(attended_values)
1252
+ return attended_values, None
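
A hedged sketch (not from the repository) of driving the updated forward() manually for incremental decoding. It assumes `model` and `tokenizer` were loaded as in the first sketch above. The DynamicCache is updated in place by each attention layer, so we keep our own handle on it, and position_ids carries the absolute position of each new token so RSWalshPositionalEncoder selects the matching code.

import torch
from transformers.cache_utils import DynamicCache

model.eval()
input_ids = tokenizer("Walsh codes are", return_tensors="pt").input_ids

past = DynamicCache()  # populated in place by past_key_values.update() in each layer
with torch.no_grad():
    # Prefill: run the full prompt once to fill the cache.
    out = model(input_ids=input_ids, past_key_values=past, use_cache=True)

    generated = input_ids
    for _ in range(16):
        next_token = out.logits[:, -1:].argmax(dim=-1)
        generated = torch.cat([generated, next_token], dim=-1)
        # Only the newest token is fed; its absolute position goes in position_ids.
        position_ids = torch.tensor([[generated.shape[1] - 1]])
        out = model(
            input_ids=next_token,
            position_ids=position_ids,
            past_key_values=past,
            use_cache=True,
        )

print(tokenizer.decode(generated[0], skip_special_tokens=True))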