damerajee committed on
Commit
2237f41
·
verified ·
1 Parent(s): f35f66c

Update configuration_Llamoe.py

Browse files
Files changed (1) hide show
  1. configuration_Llamoe.py +54 -63
configuration_Llamoe.py CHANGED
@@ -1,72 +1,63 @@
1
- from transformers.configuration_utils import PretrainedConfig
2
- from transformers.utils import logging
3
 
 
4
 
5
- logger = logging.get_logger(__name__)
6
 
7
- GEMMOE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
8
- "damerajee/Llamoe-test": "https://huggingface.co/damerajee/Llamoe-test/resolve/main/config.json",
9
- }
10
 
11
-
12
class LlamoeConfig(PretrainedConfig):
    """Configuration container for the ``llamoe`` model type.

    Every constructor argument is stored on the instance under the same
    name. The special-token ids and ``tie_word_embeddings`` are not stored
    here directly; they are forwarded to ``PretrainedConfig.__init__``
    together with any extra keyword arguments.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        hidden_size: Stored as ``self.hidden_size``.
        intermediate_size: Stored as ``self.intermediate_size``.
        num_hidden_layers: Stored as ``self.num_hidden_layers``.
        num_attention_heads: Stored as ``self.num_attention_heads``.
        num_key_value_heads: Stored as ``self.num_key_value_heads``.
        head_dim: Stored as ``self.head_dim``.
        hidden_act: Stored as ``self.hidden_act``.
        max_position_embeddings: Stored as ``self.max_position_embeddings``.
        initializer_range: Stored as ``self.initializer_range``.
        rms_norm_eps: Stored as ``self.rms_norm_eps``.
        use_cache: Stored as ``self.use_cache``.
        pad_token_id: Forwarded to the base class.
        eos_token_id: Forwarded to the base class.
        bos_token_id: Forwarded to the base class.
        tie_word_embeddings: Forwarded to the base class.
        rope_theta: Stored as ``self.rope_theta``.
        attention_bias: Stored as ``self.attention_bias``.
        attention_dropout: Stored as ``self.attention_dropout``.
        num_experts_per_tok: Stored as ``self.num_experts_per_tok``.
        num_local_experts: Stored as ``self.num_local_experts``.
        router_aux_loss_coef: Stored as ``self.router_aux_loss_coef``.
        output_router_logits: Stored as ``self.output_router_logits``.
        hidden_activation: Stored as ``self.hidden_activation``. Defaults to
            ``None``; how the modeling code interprets ``None`` is not
            visible from this file -- TODO confirm against the model.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "llamoe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-06,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=8,
        router_aux_loss_coef=0.02,
        output_router_logits=False,
        # Added last (before **kwargs) so existing positional callers are
        # unaffected. The body already assigned ``hidden_activation`` but the
        # name was never a parameter, which raised NameError on every
        # construction.
        hidden_activation=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.hidden_activation = hidden_activation
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.router_aux_loss_coef = router_aux_loss_coef
        self.output_router_logits = output_router_logits

        # Token ids and weight tying are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
 
 
 
1
+ import math
2
+ from typing import Optional
3
 
4
+ from transformers import PretrainedConfig
5
 
 
6
 
7
class PhiConfig(PretrainedConfig):
    """Phi configuration.

    Stores the constructor arguments as instance attributes. Two values are
    normalised before being stored: ``vocab_size`` is rounded up to a
    multiple of ``pad_vocab_size_multiple`` and an integer ``rotary_dim`` is
    capped at ``n_embd // n_head``. ``attribute_map`` declares the names on
    the left (``hidden_size``, ``num_hidden_layers``, ...) as aliases of the
    attributes on the right (``n_embd``, ``n_layer``, ...).

    Args:
        vocab_size: Requested vocabulary size; stored rounded up to the next
            multiple of ``pad_vocab_size_multiple``.
        n_positions: Stored as ``self.n_positions``.
        n_embd: Stored as ``self.n_embd``.
        n_layer: Stored as ``self.n_layer``.
        n_inner: Stored as ``self.n_inner`` (may be ``None``).
        n_head: Stored as ``self.n_head``; also the divisor used to cap
            ``rotary_dim``.
        n_head_kv: Stored as ``self.n_head_kv`` (may be ``None``).
        num_experts_per_tok: Stored as ``self.num_experts_per_tok``.
        num_local_experts: Stored as ``self.num_local_experts``.
        rotary_dim: Stored as ``min(rotary_dim, n_embd // n_head)``; ``None``
            is stored unchanged.
        activation_function: Stored as ``self.activation_function``.
        flash_attn: Stored as ``self.flash_attn``.
        flash_rotary: Stored as ``self.flash_rotary``.
        fused_dense: Stored as ``self.fused_dense``.
        attn_pdrop: Stored as ``self.attn_pdrop``.
        embd_pdrop: Stored as ``self.embd_pdrop``.
        resid_pdrop: Stored as ``self.resid_pdrop``.
        layer_norm_epsilon: Stored as ``self.layer_norm_epsilon``.
        initializer_range: Stored as ``self.initializer_range``.
        tie_word_embeddings: Forwarded to ``PretrainedConfig.__init__``.
        pad_vocab_size_multiple: Multiple that ``vocab_size`` is rounded up to.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    # NOTE(review): the class is called PhiConfig but registers itself as
    # "llama" -- confirm this is the intended model type before relying on it.
    model_type = "llama"
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size: int = 50304,
        n_positions: int = 2048,
        n_embd: int = 1024,
        n_layer: int = 20,
        n_inner: Optional[int] = None,
        n_head: int = 16,
        n_head_kv: Optional[int] = None,
        num_experts_per_tok: int = 2,
        num_local_experts: int = 4,
        rotary_dim: Optional[int] = 32,
        activation_function: Optional[str] = "gelu_new",
        flash_attn: bool = False,
        flash_rotary: bool = False,
        fused_dense: bool = False,
        attn_pdrop: float = 0.0,
        embd_pdrop: float = 0.0,
        resid_pdrop: float = 0.0,
        layer_norm_epsilon: float = 1e-5,
        initializer_range: float = 0.02,
        tie_word_embeddings: bool = False,
        pad_vocab_size_multiple: int = 64,
        **kwargs,
    ) -> None:
        # Round the vocabulary size up to the next padding multiple.
        self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_inner = n_inner
        self.n_head = n_head
        self.n_head_kv = n_head_kv
        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts

        # The signature allows ``rotary_dim=None``, but ``min(None, int)``
        # raises TypeError, so only integer values are capped at the per-head
        # width. NOTE(review): what ``None`` should mean to the modeling code
        # is not visible here -- confirm against the model implementation.
        head_width = n_embd // n_head
        self.rotary_dim = None if rotary_dim is None else min(rotary_dim, head_width)

        self.activation_function = activation_function
        self.flash_attn = flash_attn
        self.flash_rotary = flash_rotary
        self.fused_dense = fused_dense
        self.attn_pdrop = attn_pdrop
        self.embd_pdrop = embd_pdrop
        self.resid_pdrop = resid_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)