# coding=utf-8
from transformers.configuration_utils import PretrainedConfig

class SmallThinkerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`SmallThinkerModel`].
    It is used to instantiate a SmallThinker model according to the specified arguments, defining the model architecture.
    The default value of each parameter matches the one used in the original SmallThinker 4B model.

    General configs:
    - model_type: "smallthinker"
    - model_name
    - num_hidden_layers
    - hidden_size

    Tokenizer configs:
    - pad_token_id
    - bos_token_id
    - eos_token_id

    Embedding configs:
    - vocab_size

    RMSNorm configs:
    - rms_norm_eps

    Attention configs:
    - num_attention_heads
    - num_key_value_heads
    - head_dim
    - use_cache
    - rope_layout: per-layer array of 0s and 1s, 0 for NoPE (no positional embedding), 1 for RoPE
    - rope_theta
    - max_position_embeddings
    - sliding_window_layout: per-layer array of 0s and 1s, 0 for full attention, 1 for sliding window attention (SWA)
    - sliding_window_size

    MoE FFN configs:
    - moe_num_primary_experts
    - moe_ffn_hidden_size
    - moe_primary_router_apply_softmax: use top-k softmax in routing instead of top-k sigmoid followed by normalization
    - moe_num_active_primary_experts

    LM Head configs:
    - tie_word_embeddings

    Other configs:
    - initializer_range
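
    Example (illustrative sketch, not from the upstream model card; assumes this module is importable as
    `configuration_smallthinker`):

        >>> from configuration_smallthinker import SmallThinkerConfig
        >>> config = SmallThinkerConfig()  # SmallThinker 4B defaults
        >>> (config.num_hidden_layers, config.hidden_size, config.moe_num_primary_experts)
        (32, 1536, 32)
        >>> # rope_layout and sliding_window_layout must match num_hidden_layers (see the sanitizers below)
        >>> shallow = SmallThinkerConfig(num_hidden_layers=8, rope_layout=[1] * 8, sliding_window_layout=[0] * 8)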
    """
    def __init__(self,
        model_type="smallthinker",
        model_name="smallthinker_4b_base",
        num_hidden_layers=32,
        hidden_size=1536,
        pad_token_id=None,
        bos_token_id=151643,
        eos_token_id=[151643, 151645],
        vocab_size=151936,
        rms_norm_eps=1e-6,
        num_attention_heads=12,
        num_key_value_heads=2,
        head_dim=128,
        use_cache=True,
        rope_layout=[1]*32,
        rope_theta=1e6,
        max_position_embeddings=4096 * 32,
        sliding_window_layout=[0]*32,
        sliding_window_size=4096,
        moe_num_primary_experts=32,
        moe_ffn_hidden_size=768,
        moe_primary_router_apply_softmax=False,
        moe_num_active_primary_experts=4,
        tie_word_embeddings=True,
        initializer_range=0.02,
        **kwargs,
    ):
        # Configuration sanitizers
        assert num_attention_heads % num_key_value_heads == 0,      "[SmallThinker config sanitizer] num_attention_heads must be divisible by num_key_value_heads"
        assert len(rope_layout) == num_hidden_layers,               "[SmallThinker config sanitizer] rope_layout must have the same length as num_hidden_layers"
        assert len(sliding_window_layout) == num_hidden_layers,     "[SmallThinker config sanitizer] sliding_window_layout must have the same length as num_hidden_layers"
        
        # General configs
        self.model_type = model_type
        self.model_name = model_name
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size

        # Tokenizer configs
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # Embedding configs
        self.vocab_size = vocab_size

        # RMSNorm configs
        self.rms_norm_eps = rms_norm_eps

        # Attention configs
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.use_cache = use_cache
        self.rope_layout = rope_layout
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings
        self.sliding_window_layout = sliding_window_layout
        self.sliding_window_size = sliding_window_size

        # MoE FFN configs
        self.moe_num_primary_experts = moe_num_primary_experts
        self.moe_ffn_hidden_size = moe_ffn_hidden_size
        self.moe_primary_router_apply_softmax = moe_primary_router_apply_softmax
        self.moe_num_active_primary_experts = moe_num_active_primary_experts

        # Other configs
        self.initializer_range = initializer_range

        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)

        # vLLM compatibility: these attributes are not used by transformers, but vLLM requires them to run correctly. DO NOT DELETE!
        self.sliding_window = sliding_window_size
        self.sliding_window_pattern = sliding_window_layout

        self._attn_implementation = "sdpa"
        
__all__ = ["SmallThinkerConfig"]
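
# Illustrative sketch (not part of the upstream file): round-trip the config through
# PretrainedConfig.save_pretrained / from_pretrained to check that the custom SmallThinker
# fields (rope_layout, sliding-window settings, MoE sizes) survive JSON serialization.
if __name__ == "__main__":
    import tempfile

    cfg = SmallThinkerConfig()
    with tempfile.TemporaryDirectory() as tmp:
        cfg.save_pretrained(tmp)  # writes config.json into tmp
        reloaded = SmallThinkerConfig.from_pretrained(tmp)

    assert reloaded.rope_layout == cfg.rope_layout
    assert reloaded.sliding_window_size == cfg.sliding_window_size
    assert reloaded.moe_num_primary_experts == cfg.moe_num_primary_experts
    print("SmallThinkerConfig round-trip OK")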