OpenNLPLab
/

TransNormerLLM2-3B-300B

@@ -53,8 +53,13 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "TransnormerConfig"
 use_triton = eval(os.environ.get("use_triton", default="True"))
 debug = eval(os.environ.get("debug", default="False"))
 if use_triton:
     try:
@@ -80,9 +85,11 @@ if not has_lightning_attention:
         return output
 ########## start Transnormer
 ##### Linearized Relative Positional Encoding: https://openreview.net/forum?id=xoLyps2qWc&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DTMLR%2FAuthors%23your-submissions)
 class Lrpe(nn.Module):
     def __init__(
         self,
         num_heads=8,
@@ -92,9 +99,8 @@ class Lrpe(nn.Module):
         d = num_heads * embed_dim
         self.index = torch.empty(0)
-        self.theta = nn.Parameter(
-            10000 ** (-2 / d * torch.arange(d)).reshape(num_heads, 1, -1)
-        )
     def extra_repr(self):
         return print_module(self)
@@ -113,6 +119,7 @@ class Lrpe(nn.Module):
 class GLU(nn.Module):
     def __init__(self, d1, d2, bias=False):
         super().__init__()
         if debug:
@@ -135,6 +142,7 @@ class GLU(nn.Module):
 class NormLinearAttention(nn.Module):
     def __init__(
         self,
         embed_dim,
@@ -181,7 +189,6 @@ class NormLinearAttention(nn.Module):
         use_cache: bool = False,
         slope_rate: Optional[torch.Tensor] = None,
     ):
-        do_eval = eval(os.environ.get("do_eval", default="False"))
         if (not self.training) and (not do_eval):
             return self.inference(
                 x,
@@ -198,8 +205,8 @@ class NormLinearAttention(nn.Module):
         q, k, v, u = self.qkvu_proj(x).chunk(4, dim=-1)
         # reshape
         q, k, v = map(
-            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
-        )
         # act
         q = self.act(q)
         k = self.act(k)
@@ -217,24 +224,23 @@ class NormLinearAttention(nn.Module):
         # lrpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=q_offset)
-            k = self.lrpe(k)
         if attn_mask == None:
             attn_mask = (torch.tril(torch.ones(n, n))).to(q)
         if attn_padding_mask is not None:
             v = v.masked_fill(
-                (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(torch.bool), 0
-            )
         if not has_lightning_attention:
             if slope_rate != None:
                 attn_mask = torch.exp(slope_rate * attn_mask)
             output = linear_attention(q, k, v, attn_mask)
         else:
-            output = lightning_attention(
-                q, k, v, True, slope_rate.squeeze(-1).squeeze(-1)
-            )
         # reshape
         output = rearrange(output, "b h n d -> b n (h d)")
@@ -253,14 +259,14 @@ class NormLinearAttention(nn.Module):
         return output, attn_weights, past_key_value
     def inference(
-        self,
-        x,
-        attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
-        attn_padding_mask: Optional[torch.Tensor] = None,  # (b, m)
-        output_attentions: bool = False,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        use_cache: bool = False,
-        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         # x: b n d
         n = x.shape[-2]
@@ -268,8 +274,8 @@ class NormLinearAttention(nn.Module):
         q, k, v, u = self.qkvu_proj(x).chunk(4, dim=-1)
         # reshape
         q, k, v = map(
-            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads), [q, k, v]
-        )
         # act
         q = self.act(q)
         k = self.act(k)
@@ -277,7 +283,7 @@ class NormLinearAttention(nn.Module):
         # rpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=self.offset)
-            k = self.lrpe(k)
         if past_key_value == None:
             self.offset = q.shape[-2]
@@ -288,38 +294,47 @@ class NormLinearAttention(nn.Module):
         # only use for the first time
         if past_key_value == None:
-            if attn_mask == None:
-                attn_mask = (torch.tril(torch.ones(n, n))).to(q)
-            if slope_rate != None:
-                attn_mask = torch.exp(slope_rate * attn_mask)
             if attn_padding_mask is not None:
-                attn_mask = attn_mask.masked_fill(
-                    (1 - attn_padding_mask).unsqueeze(1).unsqueeze(2).to(torch.bool),
-                    0,
-                )
-            energy = torch.einsum("... n d, ... m d -> ... n m", q, k)
-            if attn_mask != None:
-                energy = energy * attn_mask
-            output = torch.einsum("... n m, ... m d -> ... n d", energy, v)
-            eval_and_not_generate = eval(
-                os.environ.get("eval_and_not_generate", default="False")
-            )
-            if eval_and_not_generate:
-                kv = None
-            else:
-                # b, h, n, e, d
-                kv_outproduct = torch.einsum("... n e, ... n d -> ... n e d", k, v)
-                # 1, 1, n, 1, 1
-                index = torch.arange(n - 1, -1, -1).reshape(1, 1, -1, 1, 1).to(x)
-                # (h, 1, 1) -> (1, h, 1, 1, 1); (1, h, 1, 1, 1), (1, 1, n, 1, 1) -> (1, h, n, 1, 1)
-                decay = ratio.unsqueeze(0).unsqueeze(-1) ** index
-                kv_outproduct_with_decay = kv_outproduct * decay
-                kv = torch.sum(kv_outproduct_with_decay, dim=-3)
         else:
             kv = past_key_value
@@ -327,12 +342,11 @@ class NormLinearAttention(nn.Module):
             for i in range(n):
                 kv = ratio * kv + torch.einsum(
                     "... n d, ... n e -> ... d e",
-                    k[:, :, i : i + 1],
-                    v[:, :, i : i + 1],
-                )
-                qkv = torch.einsum(
-                    "... n e, ... e d -> ... n d", q[:, :, i : i + 1], kv
                 )
                 output.append(qkv)
             output = torch.concat(output, dim=-2)
@@ -351,6 +365,7 @@ class NormLinearAttention(nn.Module):
 class TransnormerDecoderLayer(nn.Module):
     def __init__(self, config: TransnormerConfig):
         super().__init__()
         self.embed_dim = config.decoder_embed_dim
@@ -389,18 +404,18 @@ class TransnormerDecoderLayer(nn.Module):
         return residual + x
     def forward(
-        self,
-        x,
-        attn_mask: Optional[torch.Tensor] = None,
-        attn_padding_mask: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         residual = x
         input = x
         o1, self_attn_weights, present_key_value = self.token_mixer(
             x=self.token_norm(input),
             attn_mask=attn_mask,
@@ -418,10 +433,10 @@ class TransnormerDecoderLayer(nn.Module):
         outputs = (o, )
         if output_attentions:
-            outputs += (self_attn_weights,)
         if use_cache:
-            outputs += (present_key_value,)
         return outputs
@@ -443,9 +458,7 @@ TRANSNORMER_START_DOCSTRING = r"""
 """
-@add_start_docstrings(
-    TRANSNORMER_START_DOCSTRING,
-)
 class TransnormerPreTrainedModel(PreTrainedModel):
     config_class = TransnormerConfig
     base_model_prefix = "model"
@@ -530,9 +543,7 @@ TRANSNORMER_INPUTS_DOCSTRING = r"""
 """
-@add_start_docstrings(
-    TRANSNORMER_START_DOCSTRING,
-)
 class TransnormerModel(TransnormerPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`TransnormerDecoderLayer`]
@@ -556,29 +567,31 @@ class TransnormerModel(TransnormerPreTrainedModel):
         self.slopes = self._build_slope_tensor(config.decoder_attention_heads)
         # params
-        self.embed_tokens = nn.Embedding(
-            config.vocab_size, config.decoder_embed_dim, self.padding_idx
-        )
         self.layers = nn.ModuleList([])
         for i in range(config.decoder_layers):
             if len(self.linear_use_lrpe_list) > 0:
                 config.linear_use_lrpe = self.linear_use_lrpe_list[i]
             self.layers.append(TransnormerDecoderLayer(config))
-        self.final_norm = get_norm_fn(config.norm_type)(config.decoder_embed_dim)
         self.embed_dim = config.decoder_embed_dim
-        self.embed_scale = (
-            1.0 if config.no_scale_embedding else math.sqrt(self.embed_dim)
-        )
         # Initialize weights and apply final processing
         self.post_init()
     @staticmethod
     def _build_slope_tensor(n_attention_heads: int):
         def get_slopes(n):
             def get_slopes_power_of_2(n):
-                start = 2 ** (-(2 ** -(math.log2(n) - 3)))
                 ratio = start
                 return [start * ratio**i for i in range(n)]
@@ -587,18 +600,15 @@ class TransnormerModel(TransnormerPreTrainedModel):
                     n
                 )  # In the paper, we only train models that have 2^a heads for some a. This function has
             else:  # some good properties that only occur when the input is a power of 2. To maintain that even
-                closest_power_of_2 = 2 ** math.floor(
                     math.log2(n)
                 )  # when the number of heads is not a power of 2, we use this workaround.
-                return (
-                    get_slopes_power_of_2(closest_power_of_2)
-                    + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
-                )
         # h, 1, 1
         slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
-            n_attention_heads, 1, 1
-        )
         return slopes
@@ -611,26 +621,26 @@ class TransnormerModel(TransnormerPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
-    def _prepare_decoder_linear_attn_mask(
-        self, input_shape, inputs_embeds, past_key_values_length
-    ):
         bsz, tgt_len = input_shape
         src_len = tgt_len + past_key_values_length
         def power_log(x):
-            return 2 ** (math.ceil(math.log(x, 2)))
         n = power_log(max(tgt_len, src_len))
         if self._linear_attn_mask.shape[-1] < n:
             def get_mask(n):
-                mask = torch.triu(torch.zeros(n, n).float().fill_(float("-inf")), 1)
                 # no slope version
                 # -n, ..., -2, -1, 0
                 for i in range(n):
                     x = torch.arange(i + 1)
                     y = x
-                    mask[i, : i + 1] = -torch.flip(y, [0])
                 return mask
@@ -642,7 +652,8 @@ class TransnormerModel(TransnormerPreTrainedModel):
         linear_attn_mask = self._linear_attn_mask[:, -tgt_len:, -src_len:]
         num_heads = linear_attn_mask.shape[0]
-        return linear_attn_mask[None, :, :, :].expand(bsz, num_heads, tgt_len, src_len)
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
     def forward(
@@ -656,21 +667,15 @@ class TransnormerModel(TransnormerPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
@@ -692,7 +697,7 @@ class TransnormerModel(TransnormerPreTrainedModel):
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[-2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
         if inputs_embeds is None:
             # !!! use embed_scale
             inputs_embeds = self.embed_scale * self.embed_tokens(input_ids)
@@ -714,23 +719,23 @@ class TransnormerModel(TransnormerPreTrainedModel):
         ##### norm linear layers
         linear_attn_padding_mask = attn_padding_mask
         linear_attn_mask = self._prepare_decoder_linear_attn_mask(
-            (batch_size, seq_length), inputs_embeds, past_key_values_length
-        )
-        slope_rates = [self.slopes.to(input_ids.device) for _ in range(self.num_layers)]
         for idx, layer in enumerate(self.layers):
             if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            past_key_value = (
-                past_key_values[idx] if past_key_values is not None else None
-            )
             slope_rate = slope_rates[idx]
             slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
             mask = linear_attn_mask
             layer_outputs = layer(
                 hidden_states,
                 attn_mask=mask,
@@ -744,27 +749,24 @@ class TransnormerModel(TransnormerPreTrainedModel):
             hidden_states = layer_outputs[0]
             if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
             if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-            # if idx == 0:
-            #     break
         hidden_states = self.final_norm(hidden_states)
         # add hidden states from the last decoder layer
         if output_hidden_states:
-            all_hidden_states += (hidden_states,)
         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
             return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
-                if v is not None
-            )
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
@@ -774,6 +776,7 @@ class TransnormerModel(TransnormerPreTrainedModel):
 class TransnormerForCausalLM(TransnormerPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.model = TransnormerModel(config)
@@ -781,9 +784,9 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
             logging_info(self.model)
         # the lm_head weight is automatically tied to the embed tokens weight
-        self.lm_head = nn.Linear(
-            config.decoder_embed_dim, config.vocab_size, bias=False
-        )
         # Initialize weights and apply final processing
         self.post_init()
@@ -807,9 +810,8 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         return self.model
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
-    @replace_return_docstrings(
-        output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-    )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -847,19 +849,13 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
         ```"""
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
@@ -890,8 +886,8 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
             loss = loss_fct(shift_logits, shift_labels)
         if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
         return CausalLMOutputWithPast(
             loss=loss,
@@ -918,22 +914,18 @@ class TransnormerForCausalLM(TransnormerPreTrainedModel):
         else:
             model_inputs = {"input_ids": input_ids}
-        model_inputs.update(
-            {
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-            }
-        )
         return model_inputs
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         reordered_past = ()
         for layer_past in past_key_values:
-            reordered_past += (
-                tuple(
-                    past_state.index_select(0, beam_idx) for past_state in layer_past
-                ),
-            )
         return reordered_past

 # Config class name handed to `replace_return_docstrings(config_class=...)`.
 _CONFIG_FOR_DOC = "TransnormerConfig"
+# TODO: fix environment: https://huggingface.co/OpenNLPLab/TransNormerLLM-7B/discussions/1
 # NOTE(review): every flag below is the text of an environment variable run
 # through eval() when this module is imported, so whatever expression the
 # variable holds is executed. Only the literals "True"/"False" look intended;
 # consider a strict boolean parser -- confirm accepted values with callers.
 # use_triton: gates the `if use_triton:` block that follows.
 use_triton = eval(os.environ.get("use_triton", default="True"))
 # debug: checked by `if debug:` in GLU.__init__.
 debug = eval(os.environ.get("debug", default="False"))
 # do_eval: when truthy, NormLinearAttention.forward does not take the
 # `self.inference(...)` early return outside training.
+do_eval = eval(os.environ.get("do_eval", default="False"))
 # eval_and_not_generate: no use is visible in this part of the file --
 # TODO confirm it is still read somewhere.
+eval_and_not_generate = eval(
+    os.environ.get("eval_and_not_generate", default="False"))
 # BLOCK: chunk length of the block-wise loop in NormLinearAttention.inference
 # (NUM_BLOCK = ceil(n / BLOCK)).
+BLOCK = 256
 if use_triton:
     try:
         return output
 ########## start Transnormer
 ##### Linearized Relative Positional Encoding: https://openreview.net/forum?id=xoLyps2qWc&referrer=%5BAuthor%20Console%5D(%2Fgroup%3Fid%3DTMLR%2FAuthors%23your-submissions)
 class Lrpe(nn.Module):
     def __init__(
         self,
         num_heads=8,
         d = num_heads * embed_dim
         self.index = torch.empty(0)
+        self.theta = nn.Parameter(10000**(-2 / d * torch.arange(d)).reshape(
+            num_heads, 1, -1))
     def extra_repr(self):
         """Return whatever ``print_module(self)`` produces.

         ``print_module`` is defined outside this view; presumably it builds a
         summary string, since ``nn.Module.extra_repr`` is expected to return
         one -- TODO confirm.
         """
         return print_module(self)
 class GLU(nn.Module):
     def __init__(self, d1, d2, bias=False):
         super().__init__()
         if debug:
 class NormLinearAttention(nn.Module):
     def __init__(
         self,
         embed_dim,
         use_cache: bool = False,
         slope_rate: Optional[torch.Tensor] = None,
     ):
         if (not self.training) and (not do_eval):
             return self.inference(
                 x,
         q, k, v, u = self.qkvu_proj(x).chunk(4, dim=-1)
         # reshape
         q, k, v = map(
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads),
+            [q, k, v])
         # act
         q = self.act(q)
         k = self.act(k)
         # lrpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=q_offset)
+            k = self.lrpe(k, offset=q_offset)
         if attn_mask == None:
             attn_mask = (torch.tril(torch.ones(n, n))).to(q)
         if attn_padding_mask is not None:
             v = v.masked_fill(
+                (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(
+                    torch.bool), 0)
         if not has_lightning_attention:
             if slope_rate != None:
                 attn_mask = torch.exp(slope_rate * attn_mask)
             output = linear_attention(q, k, v, attn_mask)
         else:
+            output = lightning_attention(q, k, v, True,
+                                         slope_rate.squeeze(-1).squeeze(-1))
         # reshape
         output = rearrange(output, "b h n d -> b n (h d)")
         return output, attn_weights, past_key_value
     def inference(
+            self,
+            x,
+            attn_mask: Optional[torch.Tensor] = None,  # (b, h, n, m)
+            attn_padding_mask: Optional[torch.Tensor] = None,  # (b, m)
+            output_attentions: bool = False,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            use_cache: bool = False,
+            slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         # x: b n d
         n = x.shape[-2]
         q, k, v, u = self.qkvu_proj(x).chunk(4, dim=-1)
         # reshape
         q, k, v = map(
+            lambda x: rearrange(x, "b n (h d) -> b h n d", h=self.num_heads),
+            [q, k, v])
         # act
         q = self.act(q)
         k = self.act(k)
         # rpe
         if self.linear_use_lrpe:
             q = self.lrpe(q, offset=self.offset)
+            k = self.lrpe(k, offset=self.offset)
         if past_key_value == None:
             self.offset = q.shape[-2]
         # only use for the first time
         if past_key_value == None:
+            slope_rate = slope_rate.to(torch.float32)
             if attn_padding_mask is not None:
+                v = v.masked_fill(
+                    (1 - attn_padding_mask).unsqueeze(1).unsqueeze(-1).to(
+                        torch.bool), 0)
+            NUM_BLOCK = (n + BLOCK - 1) // BLOCK
+            b, h, n, d = q.shape
+            e = v.shape[-1]
+            # other
+            array = torch.arange(BLOCK).to(q) + 1  ## !!!! important
+            q_decay = torch.exp(-slope_rate * array.reshape(-1, 1))
+            k_decay = torch.exp(-slope_rate * (BLOCK - array.reshape(-1, 1)))
+            index = array[:, None] - array[None, :]
+            s_index = slope_rate * index[
+                None,
+                None,
+            ]
+            s_index = torch.where(index >= 0, -s_index, float("-inf"))
+            diag_decay = torch.exp(s_index)
+            kv = torch.zeros(b, h, d, e).to(torch.float32).to(q.device)
+            output = torch.empty((b, h, n, e), dtype=q.dtype, device=q.device)
+            for i in range(NUM_BLOCK):
+                si = i * BLOCK
+                ei = min(si + BLOCK, n)
+                m = ei - si
+                qi = q[:, :, si:ei].contiguous()
+                ki = k[:, :, si:ei].contiguous()
+                vi = v[:, :, si:ei].contiguous()
+                qkv_none_diag = torch.matmul(qi * q_decay[:, :m],
+                                             kv).to(torch.float32)
+                # diag
+                qk = torch.matmul(qi, ki.transpose(-1, -2)).to(
+                    torch.float32) * diag_decay[:, :, :m, :m]
+                qkv_diag = torch.matmul(qk, vi.to(torch.float32))
+                block_decay = torch.exp(-slope_rate * m)
+                output[:, :, si:ei] = qkv_none_diag + qkv_diag
+                kv = block_decay * kv + torch.matmul(
+                    (ki * k_decay[:, -m:]).transpose(-1, -2).to(vi.dtype), vi)
         else:
             kv = past_key_value
             for i in range(n):
                 kv = ratio * kv + torch.einsum(
                     "... n d, ... n e -> ... d e",
+                    k[:, :, i:i + 1],
+                    v[:, :, i:i + 1],
                 )
+                qkv = torch.einsum("... n e, ... e d -> ... n d",
+                                   q[:, :, i:i + 1], kv)
                 output.append(qkv)
             output = torch.concat(output, dim=-2)
 class TransnormerDecoderLayer(nn.Module):
     def __init__(self, config: TransnormerConfig):
         super().__init__()
         self.embed_dim = config.decoder_embed_dim
         return residual + x
     def forward(
+            self,
+            x,
+            attn_mask: Optional[torch.Tensor] = None,
+            attn_padding_mask: Optional[torch.Tensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: Optional[bool] = False,
+            use_cache: Optional[bool] = False,
+            slope_rate: Optional[torch.Tensor] = None,  # (h, 1, 1)
     ):
         residual = x
         input = x
         o1, self_attn_weights, present_key_value = self.token_mixer(
             x=self.token_norm(input),
             attn_mask=attn_mask,
         outputs = (o, )
         if output_attentions:
+            outputs += (self_attn_weights, )
         if use_cache:
+            outputs += (present_key_value, )
         return outputs
 """
+@add_start_docstrings(TRANSNORMER_START_DOCSTRING, )
 class TransnormerPreTrainedModel(PreTrainedModel):
     config_class = TransnormerConfig
     base_model_prefix = "model"
 """
+@add_start_docstrings(TRANSNORMER_START_DOCSTRING, )
 class TransnormerModel(TransnormerPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`TransnormerDecoderLayer`]
         self.slopes = self._build_slope_tensor(config.decoder_attention_heads)
         # params
+        self.embed_tokens = nn.Embedding(config.vocab_size,
+                                         config.decoder_embed_dim,
+                                         self.padding_idx)
         self.layers = nn.ModuleList([])
         for i in range(config.decoder_layers):
             if len(self.linear_use_lrpe_list) > 0:
                 config.linear_use_lrpe = self.linear_use_lrpe_list[i]
             self.layers.append(TransnormerDecoderLayer(config))
+        self.final_norm = get_norm_fn(config.norm_type)(
+            config.decoder_embed_dim)
         self.embed_dim = config.decoder_embed_dim
+        self.embed_scale = (1.0 if config.no_scale_embedding else math.sqrt(
+            self.embed_dim))
         # Initialize weights and apply final processing
         self.post_init()
     @staticmethod
     def _build_slope_tensor(n_attention_heads: int):
         def get_slopes(n):
             def get_slopes_power_of_2(n):
                 """Return ``n`` geometrically decreasing slopes.

                 The common base is ``2 ** -(2 ** -(log2(n) - 3))`` (0.5 for
                 ``n == 8``); entry ``i`` is ``base * base**i``.
                 """
                 exponent = 2**-(math.log2(n) - 3)
                 base = 2**(-exponent)
                 # First term and common ratio are the same value.
                 return [base * base**power for power in range(n)]
                     n
                 )  # In the paper, we only train models that have 2^a heads for some a. This function has
             else:  # some good properties that only occur when the input is a power of 2. To maintain that even
+                closest_power_of_2 = 2**math.floor(
                     math.log2(n)
                 )  # when the number of heads is not a power of 2, we use this workaround.
+                return (get_slopes_power_of_2(closest_power_of_2) + get_slopes(
+                    2 * closest_power_of_2)[0::2][:n - closest_power_of_2])
         # h, 1, 1
         slopes = torch.tensor(get_slopes(n_attention_heads)).reshape(
+            n_attention_heads, 1, 1)
         return slopes
     def set_input_embeddings(self, value):
         """Replace the module stored in ``self.embed_tokens`` with ``value``."""
         self.embed_tokens = value
+    def _prepare_decoder_linear_attn_mask(self, input_shape, inputs_embeds,
+                                          past_key_values_length):
         bsz, tgt_len = input_shape
         src_len = tgt_len + past_key_values_length
         def power_log(x):
             """Return the smallest power of two that is >= ``x`` (for ``x >= 1``).

             Raises:
                 ValueError: if ``x`` is not positive (from ``math.log2``).
             """
             # math.log2 is exact for powers of two, whereas math.log(x, 2)
             # divides two rounded logarithms and can overshoot (2**29 gives
             # 29.000000000000004), which after ceil() doubles the result.
             return 2**(math.ceil(math.log2(x)))
         n = power_log(max(tgt_len, src_len))
         if self._linear_attn_mask.shape[-1] < n:
             def get_mask(n):
                 """Build the slope-free ``(n, n)`` float32 causal distance mask.

                 Entry ``[i, j]`` is ``-(i - j)`` for ``j <= i`` (each row ends
                 ``..., -2, -1, 0`` on the diagonal) and ``-inf`` for ``j > i``.
                 """
                 # Vectorised form of the former per-row fill: column index
                 # minus row index gives -(i - j) at or below the diagonal.
                 position = torch.arange(n, dtype=torch.float32)
                 mask = position[None, :] - position[:, None]
                 # Strictly-upper entries (j > i) are positive; mask them out.
                 mask.masked_fill_(mask > 0, float("-inf"))
                 return mask
         linear_attn_mask = self._linear_attn_mask[:, -tgt_len:, -src_len:]
         num_heads = linear_attn_mask.shape[0]
+        return linear_attn_mask[None, :, :, :].expand(bsz, num_heads, tgt_len,
+                                                      src_len)
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
     def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = (output_attentions if output_attentions is not None
+                             else self.config.output_attentions)
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else
+                                self.config.output_hidden_states)
         use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (return_dict if return_dict is not None else
+                       self.config.use_return_dict)
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
         if past_key_values is not None:
             past_key_values_length = past_key_values[0][0].shape[-2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
         if inputs_embeds is None:
             # !!! use embed_scale
             inputs_embeds = self.embed_scale * self.embed_tokens(input_ids)
         ##### norm linear layers
         linear_attn_padding_mask = attn_padding_mask
         linear_attn_mask = self._prepare_decoder_linear_attn_mask(
+            (batch_size, seq_length), inputs_embeds, past_key_values_length)
+        slope_rates = [
+            self.slopes.to(input_ids.device) for _ in range(self.num_layers)
+        ]
         for idx, layer in enumerate(self.layers):
             if output_hidden_states:
+                all_hidden_states += (hidden_states, )
+            past_key_value = (past_key_values[idx]
+                              if past_key_values is not None else None)
             slope_rate = slope_rates[idx]
             slope_rate = slope_rate * (1 - idx / (self.num_layers - 1) + 1e-5)
             mask = linear_attn_mask
             layer_outputs = layer(
                 hidden_states,
                 attn_mask=mask,
             hidden_states = layer_outputs[0]
             if use_cache:
+                next_decoder_cache += (
+                    layer_outputs[2 if output_attentions else 1], )
             if output_attentions:
+                all_self_attns += (layer_outputs[1], )
         hidden_states = self.final_norm(hidden_states)
         # add hidden states from the last decoder layer
         if output_hidden_states:
+            all_hidden_states += (hidden_states, )
         next_cache = next_decoder_cache if use_cache else None
         if not return_dict:
             return tuple(
+                v for v in
+                [hidden_states, next_cache, all_hidden_states, all_self_attns]
+                if v is not None)
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=next_cache,
 class TransnormerForCausalLM(TransnormerPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.model = TransnormerModel(config)
             logging_info(self.model)
         # the lm_head weight is automatically tied to the embed tokens weight
+        self.lm_head = nn.Linear(config.decoder_embed_dim,
+                                 config.vocab_size,
+                                 bias=False)
         # Initialize weights and apply final processing
         self.post_init()
         return self.model
     @add_start_docstrings_to_model_forward(TRANSNORMER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast,
+                               config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
         ```"""
+        output_attentions = (output_attentions if output_attentions is not None
+                             else self.config.output_attentions)
+        output_hidden_states = (output_hidden_states
+                                if output_hidden_states is not None else
+                                self.config.output_hidden_states)
+        return_dict = (return_dict if return_dict is not None else
+                       self.config.use_return_dict)
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             loss = loss_fct(shift_logits, shift_labels)
         if not return_dict:
+            output = (logits, ) + outputs[1:]
+            return (loss, ) + output if loss is not None else output
         return CausalLMOutputWithPast(
             loss=loss,
         else:
             model_inputs = {"input_ids": input_ids}
+        model_inputs.update({
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.get("use_cache"),
+            "attention_mask": attention_mask,
+        })
         return model_inputs
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
         reordered_past = ()
         for layer_past in past_key_values:
+            reordered_past += (tuple(
+                past_state.index_select(0, beam_idx)
+                for past_state in layer_past), )
         return reordered_past