Update app.py
app.py CHANGED
@@ -44,9 +44,9 @@ def Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func,
     # attention:
     # layernorm: h/2h
     if act_func == "LLaMA":
-
+        num_parameters_attention = hidden_size # RMSNorm
     else:
-
+        num_parameters_attention = 2 * hidden_size # LayerNorm
     # QKV weight: 3h*h/tp, bias: 3h/tp
     # output linear weight: h*h/tp, bias: h
     num_parameters_attention_Q_weight = hidden_size * hidden_size / tp
@@ -85,7 +85,7 @@ def Compute_Parameters(seq_length, vocab_size, layer_num, hidden_size, ffn_size,
     kv_hidden_size = hidden_size / head_num * group_query_num

     # input part
-    num_parameters_input = Compute_Parameters_input(seq_length, hidden_size, vocab_size, tp)
+    num_parameters_input = Compute_Parameters_input(seq_length, hidden_size, vocab_size, act_func, tp)

     # middle layers part
     num_parameters_attention = Compute_Parameters_attention(hidden_size, kv_hidden_size, is_bias, act_func, tp)
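For context, the first hunk fills in the norm term that the surrounding comments already describe (RMSNorm: h, LayerNorm: 2h; QKV weight 3h*h/tp; output linear weight h*h/tp). Below is a minimal sketch of how those pieces could combine into a per-layer attention count. Everything beyond the lines shown in the diff (the K/V, output, and bias terms and the final sum) is an assumption, and the helper name `compute_parameters_attention_sketch` is hypothetical, not the function in app.py.

```python
def compute_parameters_attention_sketch(hidden_size, kv_hidden_size, is_bias, act_func, tp):
    # Pre-attention norm: RMSNorm carries one weight vector (h),
    # LayerNorm carries weight + bias (2h) -- matches the diff above.
    if act_func == "LLaMA":
        num_parameters_norm = hidden_size       # RMSNorm
    else:
        num_parameters_norm = 2 * hidden_size   # LayerNorm

    # Projections, sharded over tensor parallelism (tp). Q is h*h; K and V
    # are sized by kv_hidden_size so grouped-query attention is covered
    # (the in-file comment quotes 3h*h/tp for the full multi-head case).
    num_parameters_q_weight = hidden_size * hidden_size / tp
    num_parameters_kv_weight = 2 * hidden_size * kv_hidden_size / tp

    # Output linear: h*h/tp weight; its bias (size h) is not sharded.
    num_parameters_out_weight = hidden_size * hidden_size / tp

    num_parameters_bias = 0
    if is_bias:
        # Assumption: QKV biases are sharded across tp, output bias is not.
        num_parameters_bias = (hidden_size + 2 * kv_hidden_size) / tp + hidden_size

    return (num_parameters_norm + num_parameters_q_weight
            + num_parameters_kv_weight + num_parameters_out_weight
            + num_parameters_bias)
```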
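The second hunk only threads act_func through to Compute_Parameters_input; that function's body is not part of this diff. One plausible reading, sketched below purely as an assumption (the helper name and the position-embedding rule are hypothetical), is that the flag decides whether a learned position-embedding table is counted on top of the token embedding.

```python
def compute_parameters_input_sketch(seq_length, hidden_size, vocab_size, act_func, tp):
    # Token-embedding table, sharded over tensor parallelism (tp).
    num_parameters_word_embedding = vocab_size * hidden_size / tp

    # Hypothetical use of act_func: LLaMA-style models rely on rotary
    # position embeddings (no learned table), while other architectures
    # may count a learned table of seq_length * h parameters.
    if act_func == "LLaMA":
        num_parameters_position_embedding = 0
    else:
        num_parameters_position_embedding = seq_length * hidden_size

    return num_parameters_word_embedding + num_parameters_position_embedding
```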