Commit 5879f43
KaleiNeely committed
1 parent: 1db31cb

Update configuration_rwkv5.py

Files changed: configuration_rwkv5.py (+4 -6)
configuration_rwkv5.py CHANGED:

@@ -53,11 +53,9 @@ class Rwkv5Config(PretrainedConfig):
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0
-            as GPTNeoX.
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0.
         eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0
-            GPTNeoX.
+            The id of the end of sentence token in the vocabulary. Defaults to 0.
         rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the correponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
@@ -90,8 +88,8 @@ class Rwkv5Config(PretrainedConfig):
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
-        num_attention_heads=64,
         head_size=64,
+        head_size_divisor=8,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
         bos_token_id=0,
@@ -105,8 +103,8 @@ class Rwkv5Config(PretrainedConfig):
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
-        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
+        self.head_size_divisor = head_size_divisor
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
         self.rescale_every = rescale_every