Commit 5879f43
KaleiNeely committed
1 parent: 1db31cb

Update configuration_rwkv5.py

Files changed: configuration_rwkv5.py (+4 -6)
configuration_rwkv5.py CHANGED:

@@ -53,11 +53,9 @@ class Rwkv5Config(PretrainedConfig):
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0
-            as GPTNeoX.
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0.
         eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0
-            GPTNeoX.
+            The id of the end of sentence token in the vocabulary. Defaults to 0.
         rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
@@ -90,8 +88,8 @@ class Rwkv5Config(PretrainedConfig):
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
-        num_attention_heads=64,
         head_size=64,
+        head_size_divisor=8,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
         bos_token_id=0,
@@ -105,8 +103,8 @@ class Rwkv5Config(PretrainedConfig):
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
-        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
+        self.head_size_divisor = head_size_divisor
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
         self.rescale_every = rescale_every