damerajee committed
Commit b9f07f0 · verified · 1 Parent(s): 0b0bb29

Update configuration_Llamoe.py

Files changed (1):
  1. configuration_Llamoe.py  +102 -51
configuration_Llamoe.py CHANGED
@@ -1,63 +1,114 @@
-import math
-from typing import Optional
 
-from transformers import PretrainedConfig
 
 
 class LlamoeConfig(PretrainedConfig):
-    """Phi configuration."""
 
     model_type = "llama"
-    attribute_map = {
-        "max_position_embeddings": "n_positions",
-        "hidden_size": "n_embd",
-        "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
-    }
 
     def __init__(
         self,
-        vocab_size: int = 50304,
-        n_positions: int = 2048,
-        n_embd: int = 1024,
-        n_layer: int = 20,
-        n_inner: Optional[int] = None,
-        n_head: int = 16,
-        n_head_kv: Optional[int] = None,
         num_experts_per_tok: int = 2,
-        num_local_experts: int = 4,
-        rotary_dim: Optional[int] = 32,
-        activation_function: Optional[str] = "gelu_new",
-        flash_attn: bool = False,
-        flash_rotary: bool = False,
-        fused_dense: bool = False,
-        attn_pdrop: float = 0.0,
-        embd_pdrop: float = 0.0,
-        resid_pdrop: float = 0.0,
-        layer_norm_epsilon: float = 1e-5,
-        initializer_range: float = 0.02,
-        tie_word_embeddings: bool = False,
-        pad_vocab_size_multiple: int = 64,
-        **kwargs
-    ) -> None:
-        self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_inner = n_inner
-        self.n_head = n_head
-        self.n_head_kv = n_head_kv
-        self.num_experts_per_tok = num_experts_per_tok
-        self.num_local_experts = num_local_experts
-        self.rotary_dim = min(rotary_dim, n_embd // n_head)
-        self.activation_function = activation_function
-        self.flash_attn = flash_attn
-        self.flash_rotary = flash_rotary
-        self.fused_dense = fused_dense
-        self.attn_pdrop = attn_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.resid_pdrop = resid_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
 
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" LLaMA model configuration"""
 
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+from transformers.deprecated._archive_maps import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP  # noqa: F401, E402
 
 
 class LlamoeConfig(PretrainedConfig):
+
 
     model_type = "llama"
+    keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
         self,
+        vocab_size=48064,
+        hidden_size=4096,
+        intermediate_size=11008,
         num_experts_per_tok: int = 2,
+        num_local_experts: int = 2,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
         self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
 
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
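
Quick usage sketch (not part of this commit): the updated class follows the standard transformers LlamaConfig layout while keeping the two MoE fields, num_experts_per_tok and num_local_experts, from the previous version. The sketch below assumes configuration_Llamoe.py sits on the Python path and that transformers is installed; the override values are illustrative, not taken from the commit.

# Minimal sketch, assuming configuration_Llamoe.py is importable from the
# current directory; parameter values are illustrative assumptions.
from configuration_Llamoe import LlamoeConfig

# Instantiate with a few overrides; unspecified fields keep the defaults above.
config = LlamoeConfig(
    vocab_size=48064,
    hidden_size=4096,
    num_hidden_layers=32,
    num_experts_per_tok=2,
    num_local_experts=2,
    rope_scaling={"type": "linear", "factor": 2.0},  # checked by _rope_scaling_validation
)

print(config.num_key_value_heads)  # falls back to num_attention_heads (32) when not set

# An invalid rope_scaling dict is rejected at construction time.
try:
    LlamoeConfig(rope_scaling={"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)  # factor must be a float > 1

Because _rope_scaling_validation() runs inside __init__, a malformed rope_scaling dict fails fast when the config is built rather than later during model construction.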