KaleiNeely committed
Commit 5879f43
1 Parent(s): 1db31cb

Update configuration_rwkv5.py

Files changed (1): configuration_rwkv5.py (+4 -6)
configuration_rwkv5.py CHANGED

@@ -53,11 +53,9 @@ class Rwkv5Config(PretrainedConfig):
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers.
         bos_token_id (`int`, *optional*, defaults to 0):
-            The id of the beginning of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer
-            as GPTNeoX.
+            The id of the beginning of sentence token in the vocabulary. Defaults to 0.
         eos_token_id (`int`, *optional*, defaults to 0):
-            The id of the end of sentence token in the vocabulary. Defaults to 0 as RWKV5 uses the same tokenizer as
-            GPTNeoX.
+            The id of the end of sentence token in the vocabulary. Defaults to 0.
         rescale_every (`int`, *optional*, defaults to 6):
             At inference, the hidden states (and weights of the corresponding output layers) are divided by 2 every
             `rescale_every` layer. If set to 0 or a negative number, no rescale is done.
@@ -90,8 +88,8 @@ class Rwkv5Config(PretrainedConfig):
         hidden_size=768,
         num_hidden_layers=24,
         attention_hidden_size=None,
-        num_attention_heads=64,
         head_size=64,
+        head_size_divisor=8,
         intermediate_size=None,
         layer_norm_epsilon=1e-5,
         bos_token_id=0,
@@ -105,8 +103,8 @@ class Rwkv5Config(PretrainedConfig):
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
-        self.num_attention_heads = num_attention_heads
         self.head_size = head_size
+        self.head_size_divisor = head_size_divisor
         self.intermediate_size = None
         self.layer_norm_epsilon = layer_norm_epsilon
         self.rescale_every = rescale_every
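
In practice, the change means callers pass `head_size` (plus the new `head_size_divisor`) instead of a separate `num_attention_heads`. A minimal sketch of the updated constructor, assuming the file is importable locally (e.g. alongside a checkpoint loaded with `trust_remote_code`) and that the model derives its head count from `head_size`; that derivation is an assumption, not something this diff shows:

from configuration_rwkv5 import Rwkv5Config  # assumes the file is on the Python path

config = Rwkv5Config(
    hidden_size=768,
    num_hidden_layers=24,
    head_size=64,          # kept; the separate num_attention_heads=64 was removed
    head_size_divisor=8,   # new parameter introduced by this commit
)

# attention_hidden_size still falls back to hidden_size when left as None.
assert config.attention_hidden_size == config.hidden_size

# If the head count is now attention_hidden_size // head_size (an assumption),
# the removed num_attention_heads is redundant: 768 // 64 == 12 heads here.
print(config.attention_hidden_size // config.head_size)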