Upload 3 files

Browse files

Files changed (3) hide show

configuration_rwkv5.py +22 -27
modeling_rwkv5.py +160 -218
tokenization_rwkv_world.py +91 -91

configuration_rwkv5.py CHANGED Viewed

@@ -21,46 +21,44 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
-RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-}
 class Rwkv5Config(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the RWVK-4
-    [RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
     Args:
-        vocab_size (`int`, *optional*, defaults to 50277):
-            Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`RwkvModel`].
-        context_length (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode
-            lets use any sequence length).
-        hidden_size (`int`, *optional*, defaults to 4096):
             Dimensionality of the embeddings and hidden states.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the model.
         attention_hidden_size (`int`, *optional*):
             Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
         intermediate_size (`int`, *optional*):
             Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer
             as GPTNeoX.
         eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV uses the same tokenizer as
             GPTNeoX.
-        rescale_every (`int`, *optional*, default to 6):
             At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
@@ -72,28 +70,27 @@ class Rwkv5Config(PretrainedConfig):
     Example:
     ```python
-    >>> from transformers import RwkvConfig, RwkvModel
-    >>> # Initializing a Rwkv configuration
-    >>> configuration = RwkvConfig()
     >>> # Initializing a model (with random weights) from the configuration
-    >>> model = RwkvModel(configuration)
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
     model_type = "rwkv5"
-    attribute_map = {"max_position_embeddings": "context_length"}
-    def __init__( #1.5B World
         self,
         vocab_size=65536,
-        context_length=4096,
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
         head_size=64,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
@@ -102,14 +99,13 @@ class Rwkv5Config(PretrainedConfig):
         rescale_every=6,
         tie_word_embeddings=False,
         use_cache=True,
-        model_version="5_2",
         **kwargs,
     ):
         self.vocab_size = vocab_size
-        self.context_length = context_length
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
         self.head_size = head_size
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
@@ -118,7 +114,6 @@ class Rwkv5Config(PretrainedConfig):
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
-        self.model_version = model_version
         super().__init__(
             tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs

 logger = logging.get_logger(__name__)
+RWKV5_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 class Rwkv5Config(PretrainedConfig):
     """
+    This is the configuration class to store the configuration of a [`Rwkv5Model`]. It is used to instantiate a RWKV5
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the RWVK-4
+    [RWKV/rwkv-5-world-1b5](https://huggingface.co/RWKV/rwkv-5-world-1b5) architecture.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
     Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the RWKV5 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Rwkv5Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the embeddings and hidden states.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the model.
         attention_hidden_size (`int`, *optional*):
             Dimensionality of the attention hidden states. Will default to `hidden_size` if unset.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            The attention heads to use in rwkv5 self_attention module.
+        head_size (`int`, *optional*, defaults to 64): head_size of rwkv5 self_attention module.
         intermediate_size (`int`, *optional*):
             Dimensionality of the inner feed-forward layers. Will default to 4 times `hidden_size` if unset.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer
             as GPTNeoX.
         eos_token_id (`int`, *optional*, defaults to 0):
+            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer as
             GPTNeoX.
+        rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
     Example:
     ```python
+    >>> from transformers import Rwkv5Config, Rwkv5Model
+    >>> # Initializing a Rwkv5 configuration
+    >>> configuration = Rwkv5Config()
     >>> # Initializing a model (with random weights) from the configuration
+    >>> model = Rwkv5Model(configuration)
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
     model_type = "rwkv5"
+    def __init__(
         self,
         vocab_size=65536,
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
+        num_attention_heads=64,
         head_size=64,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
         rescale_every=6,
         tie_word_embeddings=False,
         use_cache=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
+        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
         self.bos_token_id = bos_token_id
         self.eos_token_id = eos_token_id
         super().__init__(
             tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs

modeling_rwkv5.py CHANGED Viewed

@@ -15,16 +15,13 @@
 # limitations under the License.
 """PyTorch RWKV5 World model."""
-import math
 from dataclasses import dataclass
-from pathlib import Path
 from typing import List, Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
-import torch.nn.functional as F
-from torch.nn import CrossEntropyLoss
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
@@ -32,77 +29,59 @@ from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
-    is_ninja_available,
-    is_torch_cuda_available,
     logging,
 )
 from .configuration_rwkv5 import Rwkv5Config
 logger = logging.get_logger(__name__)
-_CHECKPOINT_FOR_DOC = "RWKV/rwkv-5-world"
 _CONFIG_FOR_DOC = "Rwkv5Config"
-RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
-def rwkv_linear_attention_v5_0(H, S, T, hidden, time_decay, time_first, receptance, key, value, lxw, lxb, ow, state, return_state=False, seq_mode=True):
-    time_decay = torch.exp(-torch.exp(time_decay.float())).reshape(-1,1,1)
-    time_first = torch.exp(time_first.float()).reshape(-1,1,1)
-    lxw = lxw.float()
-    lxb = lxb.float()
-    if seq_mode:
-        w = time_decay.reshape(-1, 1)
-        u = time_first.reshape(-1, 1)
-        ws = w.pow(T).reshape(H, 1, 1)
-        ind = torch.arange(T-1, -1, -1, device=w.device).unsqueeze(0).repeat(H, 1)
-        w = w.repeat(1, T).pow(ind)
-        wk = w.reshape(H, 1, T)
-        wb = wk.transpose(-2, -1).flip(1)
-        w = torch.cat([w[:, 1:], u], dim=1)
-        w = F.pad(w, (0, T))
-        w = torch.tile(w, [T])
-        w = w[:, :-T].reshape(-1, T, 2 * T - 1)
-        w = w[:, :, T-1:].reshape(H, T, T)
-        out = ((receptance @ key) * w) @ value + (receptance @ state) * wb
-        state = ws * state + (key * wk) @ value
-        out = out.transpose(1, 2).contiguous().reshape(T, H*S)
-        out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb)
-        out = out.to(dtype=hidden.dtype)
-        out = out @ ow
-    else:
-        a = key @ value
-        out = receptance @ (time_first * a + state)
-        state = a + time_decay * state
-        out = out.flatten()
-        out = F.group_norm(out.unsqueeze(0), num_groups=H, weight=lxw, bias=lxb)
-        out = out.to(dtype=hidden.dtype)
-        out = out @ ow
-    return out, state
-cnt = 0
-def rwkv_linear_attention_v5_2(B, H, S, T, n_head, hidden, time_decay, time_first, receptance, key, value, gate, lxw, lxb, ow, state, return_state=False, seq_mode=True):
-    time_decay = torch.exp(-torch.exp(time_decay.float())).reshape(-1,1,1).reshape(n_head, -1, 1)
-    time_first = time_first.float().reshape(-1,1,1).reshape(n_head, -1, 1)
     lxw = lxw.float()
     lxb = lxb.float()
     out = torch.empty((B, T, H, S), dtype=receptance.dtype, device=receptance.device)
     for t in range(T):
-        rt = receptance[:,:,t:t+1,:]
-        kt = key[:,:,:,t:t+1]
-        vt = value[:,:,t:t+1,:]
         at = kt @ vt
         out[:, t] = (rt @ (time_first * at + state)).squeeze(2)
         state = at + time_decay * state
-    out = out.reshape(B*T, H*S)
-    out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H*S)
     out = out.to(dtype=hidden.dtype) * gate
     out = out @ ow
@@ -123,13 +102,9 @@ class RwkvSelfAttention(nn.Module):
         )
         self.attention_hidden_size = attention_hidden_size
-        if self.config.model_version == "5_2":
-            self.time_decay = nn.Parameter(torch.empty(num_attention_heads, config.head_size))
-            self.time_faaaa = nn.Parameter(torch.empty(num_attention_heads, config.head_size))
-            self.time_mix_gate = nn.Parameter(torch.empty(1, 1, hidden_size))
-        else:
-            self.time_decay = nn.Parameter(torch.empty(num_attention_heads))
-            self.time_first = nn.Parameter(torch.empty(num_attention_heads))
         self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size))
@@ -139,8 +114,7 @@ class RwkvSelfAttention(nn.Module):
         self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False)
         self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False)
         self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
-        if self.config.model_version == "5_2":
-            self.gate = nn.Linear(hidden_size, attention_hidden_size, bias=False)
         self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)
         # https://github.com/BlinkDL/RWKV-LM/blob/3db37a72356b736966ddd377268f02b80963af3f/RWKV-v4neo/src/model.py#L190C1-L190C1
         self.ln_x = nn.GroupNorm(hidden_size // config.head_size, hidden_size)
@@ -155,32 +129,22 @@ class RwkvSelfAttention(nn.Module):
             if state is not None:
                 shifted[:, 0] = state[0][:, :, self.layer_id]
         if len(shifted.size()) == 2:
-            shifted = shifted.unsqueeze(1)
         key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
         value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value)
         receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
-        if self.config.model_version == "5_2":
-            gate = hidden* self.time_mix_gate + shifted * (1 - self.time_mix_gate)
-        # if hidden.size(1) == 1 and state is not None:
-        #     receptance = self.receptance(receptance).to(torch.float32).view(B, H, 1, S)
-        #     key = self.key(key).to(torch.float32).view(B, H, S, 1)
-        #     value = self.value(value).to(torch.float32).view(B, H, 1, S)
-        # else:
         # https://github.com/BlinkDL/ChatRWKV/blob/main/rwkv_pip_package/src/rwkv/model.py#L693
         key = self.key(key).to(torch.float32).view(B, T, H, S).transpose(1, 2).transpose(-2, -1)
         value = self.value(value).to(torch.float32).view(B, T, H, S).transpose(1, 2)
         receptance = self.receptance(receptance).to(torch.float32).view(B, T, H, S).transpose(1, 2)
-        if self.config.model_version == "5_2":
-            gate = F.silu(self.gate(gate))
         if state is not None:
             state[0][:, :, self.layer_id] = hidden[:, -1]
-        if self.config.model_version == "5_2":
-            return receptance, key, value, gate, state
-        return receptance, key, value, state
     def forward(self, hidden, state=None, use_cache=False, seq_mode=True):
         B = hidden.shape[0]
@@ -188,13 +152,9 @@ class RwkvSelfAttention(nn.Module):
         S = hidden.shape[-1] // H
         T = hidden.shape[1]
-        if self.config.model_version == "5_2":
-            receptance, key, value, gate, state = self.extract_key_value(B, H, S, T, hidden, state=state)
-        else:
-            receptance, key, value, state = self.extract_key_value(H, S, T, hidden, state=state)
         layer_state = state[1][:, :, :, :, self.layer_id] if state is not None else None
-        if self.config.model_version == "5_2":
-            rwkv, layer_state = rwkv_linear_attention_v5_2(
             B,
             H,
             S,
@@ -214,24 +174,6 @@ class RwkvSelfAttention(nn.Module):
             return_state=use_cache,
             seq_mode=seq_mode,
         )
-        else:
-            rwkv, layer_state = rwkv_linear_attention_v5_0(
-                H,
-                S,
-                T,
-                hidden,
-                self.time_decay,
-                self.time_first,
-                receptance,
-                key,
-                value,
-                self.ln_x.weight,
-                self.ln_x.bias,
-                self.output.weight.t(),
-                state=layer_state,
-                return_state=use_cache,
-                seq_mode=seq_mode,
-            )
         if layer_state is not None:
             state[1][:, :, :, :, self.layer_id] = layer_state
@@ -246,19 +188,16 @@ class RwkvFeedForward(nn.Module):
         self.layer_id = layer_id
         hidden_size = config.hidden_size
         # https://github.com/BlinkDL/RWKV-LM/blob/3db37a72356b736966ddd377268f02b80963af3f/RWKV-v4neo/train.py#L168
-        if self.config.model_version == "5_2":
-            intermediate_size = (
-                config.intermediate_size if config.intermediate_size is not None else int((config.hidden_size * 3.5) // 32 * 32)
-            )
-        else:
-            intermediate_size = (
-                config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size
-            )
         self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
         self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
         self.receptance = nn.Linear(hidden_size, hidden_size, bias=False)
         self.value = nn.Linear(intermediate_size, hidden_size, bias=False)
@@ -301,7 +240,6 @@ class RwkvBlock(nn.Module):
         self.feed_forward = RwkvFeedForward(config, layer_id)
     def forward(self, hidden, state=None, use_cache=False, output_attentions=False, seq_mode=True):
         attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache, seq_mode=seq_mode)
         hidden = hidden + attention
@@ -317,16 +255,18 @@ class RwkvBlock(nn.Module):
         return outputs
-class RwkvPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
     """
     config_class = Rwkv5Config
-    base_model_prefix = "transformer"
     _no_split_modules = ["RwkvBlock"]
     _keep_in_fp32_modules = ["time_decay", "time_first"]
     def _init_weights(self, module):
         """Initialize the weights."""
@@ -335,7 +275,7 @@ class RwkvPreTrainedModel(PreTrainedModel):
             num_hidden_layers = module.config.num_hidden_layers
             hidden_size = module.config.hidden_size
             attention_hidden_size = module.attention_hidden_size
-            num_attention_heads = hidden_size // module.config.head_size
             ratio_0_to_1 = layer_id / (num_hidden_layers - 1)  # 0 to 1
             ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0
@@ -347,43 +287,30 @@ class RwkvPreTrainedModel(PreTrainedModel):
             )
             time_weight = time_weight[None, None, :]
-            if module.config.model_version == "5_2":
-                # https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/src/model.py#L398
-                decay_speed = [
-                    -6.0 + 5.0 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
-                    for h in range(attention_hidden_size)
-                ]
-            else:
-                # https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/src/model.py#L172
-                decay_speed = [
-                    -6.0 + 5.0 * (h / (num_attention_heads - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
-                    for h in range(num_attention_heads)
-                ]
             decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device)
-            if module.config.model_version == "5_2":
-                tmp = (
-                    torch.tensor(
-                        [(1.0 - (i / (attention_hidden_size - 1.0))) * ratio_0_to_1 + 0.1 * ((i + 1) % 3 - 1) for i in range(attention_hidden_size)],
-                        dtype=module.time_faaaa.dtype,
-                        device=module.time_faaaa.device,
-                    )
-                )
-            else:
-                tmp = torch.ones(num_attention_heads) * (-3.0)
             with torch.no_grad():
-                if module.config.model_version == "5_2":
-                    module.time_decay.data = decay_speed.reshape(num_attention_heads, module.config.head_size)
-                    module.time_faaaa.data = tmp.reshape(num_attention_heads, module.config.head_size)
-                else:
-                    module.time_decay.data = decay_speed
-                    module.time_first.data = tmp
                 module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
                 module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1
                 module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)
-                if module.config.model_version == "5_2":
-                    module.time_mix_gate.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)
         elif isinstance(module, RwkvFeedForward):
             layer_id = module.layer_id
             num_hidden_layers = module.config.num_hidden_layers
@@ -402,13 +329,9 @@ class RwkvPreTrainedModel(PreTrainedModel):
                 module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
                 module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, RwkvModel):
-            module.gradient_checkpointing = value
 @dataclass
-class RwkvOutput(ModelOutput):
     """
     Class for the RWKV model outputs.
@@ -420,15 +343,12 @@ class RwkvOutput(ModelOutput):
             avoid providing the old `input_ids`.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
     """
     last_hidden_state: torch.FloatTensor = None
@@ -438,7 +358,7 @@ class RwkvOutput(ModelOutput):
 @dataclass
-class RwkvCausalLMOutput(ModelOutput):
     """
     Base class for causal language model (or autoregressive) outputs.
@@ -452,33 +372,27 @@ class RwkvCausalLMOutput(ModelOutput):
             avoid providing the old `input_ids`.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
     """
     loss: Optional[torch.FloatTensor] = None
     logits: torch.FloatTensor = None
     state: Optional[List[torch.FloatTensor]] = None
-    last_hidden_state: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 RWKV_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
     Parameters:
         config ([`Rwkv5Config`]): Model configuration class with all the parameters of the model.
@@ -491,15 +405,10 @@ RWKV_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
             `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
             `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
-            sequence tokens in the vocabulary.
-            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
-            `input_ids`.
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-            [What are input IDs?](../glossary#input-ids)
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -524,7 +433,7 @@ RWKV_INPUTS_DOCSTRING = r"""
     "The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.",
     RWKV_START_DOCSTRING,
 )
-class RwkvModel(RwkvPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
@@ -535,6 +444,8 @@ class RwkvModel(RwkvPreTrainedModel):
         self.layers_are_rescaled = False
         self.pre_ln_flag = False
         # Initialize weights and apply final processing
         self.post_init()
@@ -547,28 +458,31 @@ class RwkvModel(RwkvPreTrainedModel):
     @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=RwkvOutput,
         config_class=_CONFIG_FOR_DOC,
     )
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         state: Optional[List[torch.FloatTensor]] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, RwkvOutput]:
-        seq_mode = input_ids.shape[1] > 1
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if self.training == self.layers_are_rescaled and (self.embeddings.weight.dtype == torch.float16 or self.embeddings.weight.dtype == torch.bfloat16):
             self._rescale_layers()
         if input_ids is not None and inputs_embeds is not None:
@@ -578,23 +492,55 @@ class RwkvModel(RwkvPreTrainedModel):
         if inputs_embeds is None:
             if not self.pre_ln_flag:
-                normalized_weight = F.layer_norm(self.embeddings.weight, (self.config.hidden_size, ), weight=self.blocks[0].pre_ln.weight, bias=self.blocks[0].pre_ln.bias)
                 self.embeddings.weight = nn.Parameter(normalized_weight)
                 self.pre_ln_flag = True
             inputs_embeds = self.embeddings(input_ids)
         if use_cache and state is None:
             # https://github.com/BlinkDL/ChatRWKV/blob/main/rwkv_pip_package/src/rwkv/model.py#L904-L906
             state = []
-            num_attention_heads = self.config.hidden_size // self.config.head_size
-            state.append(torch.zeros((inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers), dtype=inputs_embeds.dtype, requires_grad=False, device=inputs_embeds.device).contiguous())
-            state.append(torch.zeros((inputs_embeds.size(0), num_attention_heads, self.config.hidden_size // num_attention_heads, self.config.hidden_size // num_attention_heads, self.config.num_hidden_layers), dtype=torch.float32, requires_grad=False, device=inputs_embeds.device).contiguous())
-            state.append(torch.zeros((inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers), dtype=inputs_embeds.dtype, requires_grad=False, device=inputs_embeds.device).contiguous())
         hidden_states = inputs_embeds
-        global cnt
-        cnt += 1
         all_self_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
         for idx, block in enumerate(self.blocks):
@@ -622,11 +568,11 @@ class RwkvModel(RwkvPreTrainedModel):
         if not return_dict:
             return (hidden_states, state, all_hidden_states, all_self_attentions)
-        return RwkvOutput(
             last_hidden_state=hidden_states,
             state=state,
-            hidden_states=all_hidden_states, #None
-            attentions=all_self_attentions, #None
         )
     def _rescale_layers(self):
@@ -645,6 +591,7 @@ class RwkvModel(RwkvPreTrainedModel):
         self.layers_are_rescaled = not self.training
 @add_start_docstrings(
     """
     The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
@@ -652,10 +599,12 @@ class RwkvModel(RwkvPreTrainedModel):
     """,
     RWKV_START_DOCSTRING,
 )
-class RwkvForCausalLM(RwkvPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.rwkv = RwkvModel(config)
         self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         # Initialize weights and apply final processing
@@ -684,7 +633,7 @@ class RwkvForCausalLM(RwkvPreTrainedModel):
     @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=RwkvCausalLMOutput,
         config_class=_CONFIG_FOR_DOC,
     )
     def forward(
@@ -698,7 +647,7 @@ class RwkvForCausalLM(RwkvPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, RwkvCausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
@@ -716,30 +665,23 @@ class RwkvForCausalLM(RwkvPreTrainedModel):
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        last_hidden_state = rwkv_outputs.last_hidden_state
-        state = rwkv_outputs.state
-        logits = self.head(last_hidden_state)
         loss = None
         if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
         if not return_dict:
             output = (logits,) + rwkv_outputs[1:]
             return ((loss,) + output) if loss is not None else output
-        return RwkvCausalLMOutput(
             loss=loss,
             logits=logits,
             state=rwkv_outputs.state,
-            last_hidden_state=rwkv_outputs.last_hidden_state,
             attentions=rwkv_outputs.attentions,
         )

 # limitations under the License.
 """PyTorch RWKV5 World model."""
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 import torch
+import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     logging,
 )
 from .configuration_rwkv5 import Rwkv5Config
 logger = logging.get_logger(__name__)
+_CHECKPOINT_FOR_DOC = "RWKV/rwkv-5-world-1b5"
 _CONFIG_FOR_DOC = "Rwkv5Config"
+RWKV5_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "RWKV/rwkv-5-world-1b5",
+    # See all RWKV models at https://huggingface.co/models?filter=rwkv
 ]
+def rwkv_linear_attention_v5(
+    B,
+    H,
+    S,
+    T,
+    n_head,
+    hidden,
+    time_decay,
+    time_first,
+    receptance,
+    key,
+    value,
+    gate,
+    lxw,
+    lxb,
+    ow,
+    state,
+    return_state=False,
+    seq_mode=True,
+):
+    time_decay = torch.exp(-torch.exp(time_decay.float())).reshape(-1, 1, 1).reshape(n_head, -1, 1)
+    time_first = time_first.float().reshape(-1, 1, 1).reshape(n_head, -1, 1)
     lxw = lxw.float()
     lxb = lxb.float()
+    # if seq_mode:
     out = torch.empty((B, T, H, S), dtype=receptance.dtype, device=receptance.device)
     for t in range(T):
+        rt = receptance[:, :, t : t + 1, :]
+        kt = key[:, :, :, t : t + 1]
+        vt = value[:, :, t : t + 1, :]
         at = kt @ vt
         out[:, t] = (rt @ (time_first * at + state)).squeeze(2)
         state = at + time_decay * state
+    out = out.reshape(B * T, H * S)
+    out = F.group_norm(out, num_groups=H, weight=lxw, bias=lxb).reshape(B, T, H * S)
     out = out.to(dtype=hidden.dtype) * gate
     out = out @ ow
         )
         self.attention_hidden_size = attention_hidden_size
+        self.time_decay = nn.Parameter(torch.empty(num_attention_heads, config.head_size))
+        self.time_faaaa = nn.Parameter(torch.empty(num_attention_heads, config.head_size))
+        self.time_mix_gate = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False)
         self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False)
         self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
+        self.gate = nn.Linear(hidden_size, attention_hidden_size, bias=False)
         self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)
         # https://github.com/BlinkDL/RWKV-LM/blob/3db37a72356b736966ddd377268f02b80963af3f/RWKV-v4neo/src/model.py#L190C1-L190C1
         self.ln_x = nn.GroupNorm(hidden_size // config.head_size, hidden_size)
             if state is not None:
                 shifted[:, 0] = state[0][:, :, self.layer_id]
         if len(shifted.size()) == 2:
+            shifted = shifted.unsqueeze(1)
         key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
         value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value)
         receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
+        gate = hidden * self.time_mix_gate + shifted * (1 - self.time_mix_gate)
         # https://github.com/BlinkDL/ChatRWKV/blob/main/rwkv_pip_package/src/rwkv/model.py#L693
         key = self.key(key).to(torch.float32).view(B, T, H, S).transpose(1, 2).transpose(-2, -1)
         value = self.value(value).to(torch.float32).view(B, T, H, S).transpose(1, 2)
         receptance = self.receptance(receptance).to(torch.float32).view(B, T, H, S).transpose(1, 2)
+        gate = F.silu(self.gate(gate))
         if state is not None:
             state[0][:, :, self.layer_id] = hidden[:, -1]
+        return receptance, key, value, gate, state
     def forward(self, hidden, state=None, use_cache=False, seq_mode=True):
         B = hidden.shape[0]
         S = hidden.shape[-1] // H
         T = hidden.shape[1]
+        receptance, key, value, gate, state = self.extract_key_value(B, H, S, T, hidden, state=state)
         layer_state = state[1][:, :, :, :, self.layer_id] if state is not None else None
+        rwkv, layer_state = rwkv_linear_attention_v5(
             B,
             H,
             S,
             return_state=use_cache,
             seq_mode=seq_mode,
         )
         if layer_state is not None:
             state[1][:, :, :, :, self.layer_id] = layer_state
         self.layer_id = layer_id
         hidden_size = config.hidden_size
         # https://github.com/BlinkDL/RWKV-LM/blob/3db37a72356b736966ddd377268f02b80963af3f/RWKV-v4neo/train.py#L168
+        intermediate_size = (
+            config.intermediate_size
+            if config.intermediate_size is not None
+            else int((config.hidden_size * 3.5) // 32 * 32)
+        )
         self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
         self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
         self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
         self.receptance = nn.Linear(hidden_size, hidden_size, bias=False)
         self.value = nn.Linear(intermediate_size, hidden_size, bias=False)
         self.feed_forward = RwkvFeedForward(config, layer_id)
     def forward(self, hidden, state=None, use_cache=False, output_attentions=False, seq_mode=True):
         attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache, seq_mode=seq_mode)
         hidden = hidden + attention
         return outputs
+class Rwkv5PreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
     """
     config_class = Rwkv5Config
+    base_model_prefix = "rwkv"
     _no_split_modules = ["RwkvBlock"]
     _keep_in_fp32_modules = ["time_decay", "time_first"]
+    supports_gradient_checkpointing = True
+    training = False
     def _init_weights(self, module):
         """Initialize the weights."""
             num_hidden_layers = module.config.num_hidden_layers
             hidden_size = module.config.hidden_size
             attention_hidden_size = module.attention_hidden_size
+            num_attention_heads = hidden_size // module.config.num_attention_heads
             ratio_0_to_1 = layer_id / (num_hidden_layers - 1)  # 0 to 1
             ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0
             )
             time_weight = time_weight[None, None, :]
+            # https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/src/model.py#L398
+            decay_speed = [
+                -6.0 + 5.0 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
+                for h in range(attention_hidden_size)
+            ]
             decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device)
+            tmp = torch.tensor(
+                [
+                    (1.0 - (i / (attention_hidden_size - 1.0))) * ratio_0_to_1 + 0.1 * ((i + 1) % 3 - 1)
+                    for i in range(attention_hidden_size)
+                ],
+                dtype=module.time_faaaa.dtype,
+                device=module.time_faaaa.device,
+            )
             with torch.no_grad():
+                module.time_decay.data = decay_speed.reshape(num_attention_heads, module.config.num_attention_heads)
+                module.time_faaaa.data = tmp.reshape(num_attention_heads, module.config.num_attention_heads)
                 module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
                 module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1
                 module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)
+                module.time_mix_gate.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)
         elif isinstance(module, RwkvFeedForward):
             layer_id = module.layer_id
             num_hidden_layers = module.config.num_hidden_layers
                 module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
                 module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)
 @dataclass
+class Rwkv5Output(ModelOutput):
     """
     Class for the RWKV model outputs.
             avoid providing the old `input_ids`.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
+            the model at the output of each layer plus the optional initial embedding outputs.
         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
     """
     last_hidden_state: torch.FloatTensor = None
 @dataclass
+class Rwkv5CausalLMOutput(ModelOutput):
     """
     Base class for causal language model (or autoregressive) outputs.
             avoid providing the old `input_ids`.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
+            the model at the output of each layer plus the optional initial embedding outputs.
         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
     """
     loss: Optional[torch.FloatTensor] = None
     logits: torch.FloatTensor = None
     state: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 RWKV_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.) This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
     Parameters:
         config ([`Rwkv5Config`]): Model configuration class with all the parameters of the model.
         input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
             `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
             `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary. If `past_key_values` is used, only `input_ids` that do not have their
+            past calculated should be passed as `input_ids`. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
     "The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.",
     RWKV_START_DOCSTRING,
 )
+class Rwkv5Model(Rwkv5PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.layers_are_rescaled = False
         self.pre_ln_flag = False
+        self.gradient_checkpointing = False
         # Initialize weights and apply final processing
         self.post_init()
     @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=Rwkv5Output,
         config_class=_CONFIG_FOR_DOC,
     )
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,  # noqa
         inputs_embeds: Optional[torch.FloatTensor] = None,
         state: Optional[List[torch.FloatTensor]] = None,
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Rwkv5Output]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
+        # rwkv5 only support inference in huggingface.
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if self.training == self.layers_are_rescaled and (
+            self.embeddings.weight.dtype == torch.float16 or self.embeddings.weight.dtype == torch.bfloat16
+        ):
             self._rescale_layers()
         if input_ids is not None and inputs_embeds is not None:
         if inputs_embeds is None:
             if not self.pre_ln_flag:
+                normalized_weight = F.layer_norm(
+                    self.embeddings.weight,
+                    (self.config.hidden_size,),
+                    weight=self.blocks[0].pre_ln.weight,
+                    bias=self.blocks[0].pre_ln.bias,
+                )
                 self.embeddings.weight = nn.Parameter(normalized_weight)
                 self.pre_ln_flag = True
             inputs_embeds = self.embeddings(input_ids)
         if use_cache and state is None:
             # https://github.com/BlinkDL/ChatRWKV/blob/main/rwkv_pip_package/src/rwkv/model.py#L904-L906
             state = []
+            num_attention_heads = self.config.hidden_size // self.config.num_attention_heads
+            state.append(
+                torch.zeros(
+                    (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers),
+                    dtype=inputs_embeds.dtype,
+                    requires_grad=False,
+                    device=inputs_embeds.device,
+                ).contiguous()
+            )
+            state.append(
+                torch.zeros(
+                    (
+                        inputs_embeds.size(0),
+                        num_attention_heads,
+                        self.config.hidden_size // num_attention_heads,
+                        self.config.hidden_size // num_attention_heads,
+                        self.config.num_hidden_layers,
+                    ),
+                    dtype=torch.float32,
+                    requires_grad=False,
+                    device=inputs_embeds.device,
+                ).contiguous()
+            )
+            state.append(
+                torch.zeros(
+                    (inputs_embeds.size(0), self.config.hidden_size, self.config.num_hidden_layers),
+                    dtype=inputs_embeds.dtype,
+                    requires_grad=False,
+                    device=inputs_embeds.device,
+                ).contiguous()
+            )
+        seq_mode = inputs_embeds.shape[1] > 1
         hidden_states = inputs_embeds
         all_self_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
         for idx, block in enumerate(self.blocks):
         if not return_dict:
             return (hidden_states, state, all_hidden_states, all_self_attentions)
+        return Rwkv5Output(
             last_hidden_state=hidden_states,
             state=state,
+            hidden_states=all_hidden_states,  # None
+            attentions=all_self_attentions,  # None
         )
     def _rescale_layers(self):
         self.layers_are_rescaled = not self.training
 @add_start_docstrings(
     """
     The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
     """,
     RWKV_START_DOCSTRING,
 )
+class Rwkv5ForCausalLM(Rwkv5PreTrainedModel):
+    _tied_weights_keys = ["head.weight"]
     def __init__(self, config):
         super().__init__(config)
+        self.rwkv = Rwkv5Model(config)
         self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         # Initialize weights and apply final processing
     @add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=Rwkv5CausalLMOutput,
         config_class=_CONFIG_FOR_DOC,
     )
     def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Rwkv5CausalLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
+        hidden_states = rwkv_outputs[0]
+        logits = self.head(hidden_states)
         loss = None
         if labels is not None:
+            # https://github.com/BlinkDL/ChatRWKV/blob/main/rwkv_pip_package/src/rwkv/model.py#L984
+            loss = torch.tensor(0.0, device=logits.device, dtype=logits.dtype)
         if not return_dict:
             output = (logits,) + rwkv_outputs[1:]
             return ((loss,) + output) if loss is not None else output
+        return Rwkv5CausalLMOutput(
             loss=loss,
             logits=logits,
             state=rwkv_outputs.state,
+            hidden_states=rwkv_outputs.hidden_states,
             attentions=rwkv_outputs.attentions,
         )

tokenization_rwkv_world.py CHANGED Viewed

@@ -12,38 +12,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
 import json
 import os
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
-from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
-from transformers.utils import logging, to_py_obj
-from transformers.tokenization_utils_base import BatchEncoding
-import bisect
-import itertools
-import re
-import unicodedata
-from collections import OrderedDict
-from typing import Any, Dict, List, Optional, Tuple, Union, overload
 from transformers.tokenization_utils_base import (
-    ENCODE_KWARGS_DOCSTRING,
-    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
-    INIT_TOKENIZER_DOCSTRING,
-    AddedToken,
     BatchEncoding,
     EncodedInput,
-    EncodedInputPair,
-    PreTokenizedInput,
-    PreTokenizedInputPair,
-    PreTrainedTokenizerBase,
     TextInput,
-    TextInputPair,
     TruncationStrategy,
 )
-from transformers.utils import PaddingStrategy, TensorType, add_end_docstrings, logging
 if TYPE_CHECKING:
@@ -54,11 +36,18 @@ logger = logging.get_logger(__name__)
 VOCAB_FILES_NAMES = {
     "vocab_file": "rwkv_vocab_v20230424.txt",
 }
 class TRIE:
     __slots__ = tuple("ch,to,values,front".split(","))
-    to:list
-    values:set
     def __init__(self, front=None, ch=None):
         self.ch = ch
         self.to = [None for ch in range(256)]
@@ -68,64 +57,59 @@ class TRIE:
     def __repr__(self):
         fr = self
         ret = []
-        while(fr!=None):
-            if(fr.ch!=None):
                 ret.append(fr.ch)
             fr = fr.front
-        return "<TRIE %s %s>"%(ret[::-1], self.values)
-    def add(self, key:bytes, idx:int=0, val=None):
-        if(idx == len(key)):
-            if(val is None):
                 val = key
             self.values.add(val)
             return self
         ch = key[idx]
-        if(self.to[ch] is None):
             self.to[ch] = TRIE(front=self, ch=ch)
-        return self.to[ch].add(key, idx=idx+1, val=val)
-    def find_longest(self, key:bytes, idx:int=0):
-        u:TRIE = self
-        ch:int = key[idx]
-        while(u.to[ch] is not None):
             u = u.to[ch]
             idx += 1
-            if(u.values):
                 ret = idx, u, u.values
-            if(idx==len(key)):
                 break
             ch = key[idx]
         return ret
 class RWKVWorldTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
-    def __init__(
-            self,
-            vocab_file,
-            errors="replace",
-            pad_token="0",
-            **kwargs
-    ):
         self.add_bos_token = False
         self.encoder = {}
-        sorted = [] # must be already sorted
         with open(vocab_file, "r", encoding="utf-8") as f:
             lines = f.readlines()
         for l in lines:
-            idx = int(l[:l.index(' ')])
-            x = eval(l[l.index(' '):l.rindex(' ')])
             x = x.encode("utf-8") if isinstance(x, str) else x
             assert isinstance(x, bytes)
-            assert len(x) == int(l[l.rindex(' '):])
             sorted += [x]
             self.encoder[idx] = x
         self.decoder = {}
-        for k,v in self.encoder.items():
             self.decoder[v] = int(k)
         self.trie = TRIE()
@@ -134,13 +118,18 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
         self.errors = errors  # how to handle errors in decoding
         self.cache = {}
         self.first_max_length = 0
-        # pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
         super().__init__(
             errors=errors,
-            # pad_token=pad_token,
             **kwargs,
         )
     @property
     def vocab_size(self):
@@ -148,12 +137,12 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
     def get_vocab(self):
         return dict(self.encoder, **self.added_tokens_encoder)
     def add_tokens(self, new_tokens, special_tokens: bool = False):
         for token in new_tokens:
             token_id = self.convert_tokens_to_ids(token)
             self.added_tokens_decoder[token_id] = token
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         if isinstance(ids, int):
             ids = [ids]
@@ -179,8 +168,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
         return output + bos_token_ids + token_ids_1
     def get_special_tokens_mask(
-            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
-            already_has_special_tokens: bool = False
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
@@ -211,19 +199,19 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0))
         return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
-    def encodeBytes(self, src:bytes):
-        idx:int = 0
         tokens = []
-        while (idx < len(src)):
-            _idx:int = idx
             idx, _, values = self.trie.find_longest(src, idx)
-            assert(idx != _idx)
-            _, token = next(iter(values))
             tokens.append(token)
         return tokens
     def decodeBytes(self, tokens):
-        return b''.join(map(lambda i: self.encoder[i], tokens))
     def _tokenize(self, text, **kwargs):
         """Tokenize a string."""
@@ -231,21 +219,21 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
     def _decode_tokens(self, tokens):
         try:
-            return self.decodeBytes(tokens).decode('utf-8')
-        except:
-            return '\ufffd' # bad utf-8
-    def _decode(self,
-               token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
-               skip_special_tokens: bool = False,
-               **kwargs
-               ) -> str:
         def remove_zeros_from_first_segment(token_ids, first_max_length):
             first_segment = token_ids[:first_max_length]
             first_segment_cleaned = [token for token in first_segment if token != 0]
             return first_segment_cleaned + token_ids[first_max_length:]
         # Convert inputs to python lists
         token_ids = to_py_obj(token_ids)
         token_ids = remove_zeros_from_first_segment(token_ids, self.first_max_length)
@@ -263,7 +251,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
                     break
                 out_tokens += [token]
                 tmp = self._decode_tokens(out_tokens[out_last:])
-                if '\ufffd' not in tmp:
                     out_str += tmp
                     out_last = i + 1
             return out_str
@@ -318,16 +306,29 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        **kwargs
     ) -> BatchEncoding:
-        def get_input_ids(text):
             if isinstance(text, str):
-                text_id = self._tokenize(text)
-                return text_id
             elif isinstance(text, list) and len(text) > 0 and isinstance(text[0], str):
-                return [self._tokenize(t) for t in text]
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
             else:
                 raise ValueError(
                     "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
@@ -383,7 +384,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        **kwargs
     ) -> BatchEncoding:
         def get_input_ids(text, max_length=None, pad_token_id=0):
             def pad_sequence(seq, max_len, pad_tok):
@@ -411,7 +412,6 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
                     "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                 )
         if return_offsets_mapping:
             raise NotImplementedError(
                 "return_offset_mapping is not available when using Python tokenizers. "
@@ -462,10 +462,10 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
         )
         return BatchEncoding(batch_outputs)
     def decode(
         self,
-        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = None,
         **kwargs,
@@ -500,7 +500,7 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
     def batch_decode(
         self,
-        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = None,
         **kwargs,
@@ -537,5 +537,5 @@ class RWKVWorldTokenizer(PreTrainedTokenizer):
         for is_user, text in conversation.iter_texts():
             input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
         if len(input_ids) > self.model_max_length:
-            input_ids = input_ids[-self.model_max_length:]
         return input_ids

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Tokenization classes for RWKV5."""
 import json
 import os
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.tokenization_utils_base import (
     BatchEncoding,
     EncodedInput,
     TextInput,
     TruncationStrategy,
 )
+from transformers.utils import PaddingStrategy, TensorType, logging, to_py_obj
 if TYPE_CHECKING:
 VOCAB_FILES_NAMES = {
     "vocab_file": "rwkv_vocab_v20230424.txt",
 }
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "RWKV/rwkv-5-world-169m": "https://huggingface.co/RWKV/rwkv-5-world-169m/blob/main/rwkv_vocab_v20230424.txt",
+    },
+}
 class TRIE:
     __slots__ = tuple("ch,to,values,front".split(","))
+    to: list
+    values: set
     def __init__(self, front=None, ch=None):
         self.ch = ch
         self.to = [None for ch in range(256)]
     def __repr__(self):
         fr = self
         ret = []
+        while fr is not None:
+            if fr.ch is not None:
                 ret.append(fr.ch)
             fr = fr.front
+        return "<TRIE %s %s>" % (ret[::-1], self.values)
+    def add(self, key: bytes, idx: int = 0, val=None):
+        if idx == len(key):
+            if val is None:
                 val = key
             self.values.add(val)
             return self
         ch = key[idx]
+        if self.to[ch] is None:
             self.to[ch] = TRIE(front=self, ch=ch)
+        return self.to[ch].add(key, idx=idx + 1, val=val)
+    def find_longest(self, key: bytes, idx: int = 0):
+        u: TRIE = self
+        ch: int = key[idx]
+        while u.to[ch] is not None:
             u = u.to[ch]
             idx += 1
+            if u.values:
                 ret = idx, u, u.values
+            if idx == len(key):
                 break
             ch = key[idx]
         return ret
 class RWKVWorldTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    def __init__(self, vocab_file, errors="replace", pad_token="0", **kwargs):
         self.add_bos_token = False
         self.encoder = {}
+        sorted = []  # must be already sorted
         with open(vocab_file, "r", encoding="utf-8") as f:
             lines = f.readlines()
         for l in lines:
+            idx = int(l[: l.index(" ")])
+            x = eval(l[l.index(" ") : l.rindex(" ")])
             x = x.encode("utf-8") if isinstance(x, str) else x
             assert isinstance(x, bytes)
+            assert len(x) == int(l[l.rindex(" ") :])
             sorted += [x]
             self.encoder[idx] = x
         self.decoder = {}
+        for k, v in self.encoder.items():
             self.decoder[v] = int(k)
         self.trie = TRIE()
         self.errors = errors  # how to handle errors in decoding
         self.cache = {}
         self.first_max_length = 0
         super().__init__(
             errors=errors,
             **kwargs,
         )
+    @property
+    def eos_token_id(self) -> Optional[int]:
+        return 0
+    @property
+    def eot_token_id(self) -> Optional[int]:
+        return 0
     @property
     def vocab_size(self):
     def get_vocab(self):
         return dict(self.encoder, **self.added_tokens_encoder)
     def add_tokens(self, new_tokens, special_tokens: bool = False):
         for token in new_tokens:
             token_id = self.convert_tokens_to_ids(token)
             self.added_tokens_decoder[token_id] = token
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         if isinstance(ids, int):
             ids = [ids]
         return output + bos_token_ids + token_ids_1
     def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
             return [1] + ([0] * len(token_ids_0))
         return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+    def encodeBytes(self, src: bytes):
+        idx: int = 0
         tokens = []
+        while idx < len(src):
+            _idx: int = idx
             idx, _, values = self.trie.find_longest(src, idx)
+            assert idx != _idx
+            _, token = next(iter(values))
             tokens.append(token)
         return tokens
     def decodeBytes(self, tokens):
+        return b''.join(map(lambda i: self.encoder[i], tokens)) # noqa
     def _tokenize(self, text, **kwargs):
         """Tokenize a string."""
     def _decode_tokens(self, tokens):
         try:
+            return self.decodeBytes(tokens).decode("utf-8")
+        except Exception:
+            return "\ufffd"  # bad utf-8
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
         def remove_zeros_from_first_segment(token_ids, first_max_length):
             first_segment = token_ids[:first_max_length]
             first_segment_cleaned = [token for token in first_segment if token != 0]
             return first_segment_cleaned + token_ids[first_max_length:]
         # Convert inputs to python lists
         token_ids = to_py_obj(token_ids)
         token_ids = remove_zeros_from_first_segment(token_ids, self.first_max_length)
                     break
                 out_tokens += [token]
                 tmp = self._decode_tokens(out_tokens[out_last:])
+                if "\ufffd" not in tmp:
                     out_str += tmp
                     out_last = i + 1
             return out_str
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
+        **kwargs,
     ) -> BatchEncoding:
+        def get_input_ids(text, max_length=None, pad_token_id=0):
+            def pad_sequence(seq, max_len, pad_tok):
+                return [pad_tok] * (max_len - len(seq)) + seq
             if isinstance(text, str):
+                tokens = self._tokenize(text)
+                if max_length is not None:
+                    tokens = pad_sequence(tokens, max_length, pad_token_id)
+                return tokens
             elif isinstance(text, list) and len(text) > 0 and isinstance(text[0], str):
+                tokenized_texts = [self._tokenize(t) for t in text]
+                if max_length is None:
+                    max_length = max(len(t) for t in tokenized_texts)
+                return [pad_sequence(t, max_length, pad_token_id) for t in tokenized_texts]
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                if max_length is not None and len(text) < max_length:
+                    return pad_sequence(text, max_length, pad_token_id)
                 return text
             else:
                 raise ValueError(
                     "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
+        **kwargs,
     ) -> BatchEncoding:
         def get_input_ids(text, max_length=None, pad_token_id=0):
             def pad_sequence(seq, max_len, pad_tok):
                     "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                 )
         if return_offsets_mapping:
             raise NotImplementedError(
                 "return_offset_mapping is not available when using Python tokenizers. "
         )
         return BatchEncoding(batch_outputs)
     def decode(
         self,
+        token_ids: Union[int, List[int]],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = None,
         **kwargs,
     def batch_decode(
         self,
+        sequences: Union[List[int], List[List[int]]],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool = None,
         **kwargs,
         for is_user, text in conversation.iter_texts():
             input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
         if len(input_ids) > self.model_max_length:
+            input_ids = input_ids[-self.model_max_length :]
         return input_ids