Commit ad987a1
Parent(s): 2319002
update engine with inflight options for 7b-sq-int8kv-tp8
Files changed:
- 7b-sq-int8kv-tp8/config.json +3 -3
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank0.engine +2 -2
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank1.engine +2 -2
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank2.engine +2 -2
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank3.engine +2 -2
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank4.engine +2 -2
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank5.engine +2 -2
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank6.engine +2 -2
- 7b-sq-int8kv-tp8/llama_float16_tp8_rank7.engine +2 -2
- 7b-sq-int8kv-tp8/model.cache +0 -0
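The changed engine files are Git LFS objects of roughly 1.1 GB per rank, so they are fetched as LFS blobs rather than ordinary git content. A minimal download sketch follows, assuming the repository is hosted on the Hugging Face Hub; the repo_id is a hypothetical placeholder, not taken from this page.

# Minimal sketch, assuming a Hugging Face Hub-hosted repository.
# The repo_id below is a hypothetical placeholder.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="your-org/llama-7b-trt-engines",   # placeholder, replace with the actual repo
    revision="ad987a1",                        # the commit shown on this page
    allow_patterns=["7b-sq-int8kv-tp8/*"],     # fetch only the updated engine directory
)
print("downloaded to:", local_dir)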
7b-sq-int8kv-tp8/config.json CHANGED
@@ -24,7 +24,7 @@
   "plugin_config": {
     "attention_qk_half_accumulation": false,
     "bert_attention_plugin": false,
-    "context_fmha_type":
+    "context_fmha_type": 1,
     "gemm_plugin": "float16",
     "gpt_attention_plugin": "float16",
     "identity_plugin": false,
@@ -32,14 +32,14 @@
     "layernorm_quantization_plugin": false,
     "lookup_plugin": false,
     "nccl_plugin": "float16",
-    "paged_kv_cache":
+    "paged_kv_cache": true,
     "quantize_per_token_plugin": true,
     "quantize_tensor_plugin": true,
     "remove_input_padding": true,
     "rmsnorm_plugin": false,
     "rmsnorm_quantization_plugin": "float16",
     "smooth_quant_gemm_plugin": "float16",
-    "tokens_per_block":
+    "tokens_per_block": 64,
     "use_custom_all_reduce": false,
     "weight_only_groupwise_quant_matmul_plugin": false,
     "weight_only_quant_matmul_plugin": false
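This commit fills in the plugin options that in-flight batching with a paged KV cache relies on: paged_kv_cache is set to true, each KV-cache block holds tokens_per_block: 64 tokens, and fused context attention is enabled via context_fmha_type: 1. A minimal sketch that loads the updated config and checks these fields follows; the local path, and the assumption that "plugin_config" is a top-level key of config.json, are illustrative rather than taken from this page.

# Minimal sketch: confirm the in-flight batching options written by this commit.
# The local path and the top-level "plugin_config" key are assumptions.
import json
from pathlib import Path

plugin_cfg = json.loads(Path("7b-sq-int8kv-tp8/config.json").read_text())["plugin_config"]

expected = {
    "paged_kv_cache": True,      # store the KV cache in fixed-size blocks
    "tokens_per_block": 64,      # tokens held by each KV-cache block
    "context_fmha_type": 1,      # fused multi-head attention in the context phase
}

for key, value in expected.items():
    actual = plugin_cfg.get(key)
    status = "OK" if actual == value else "MISMATCH (got {!r})".format(actual)
    print("{}: expected {!r} -> {}".format(key, value, status))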
7b-sq-int8kv-tp8/llama_float16_tp8_rank0.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1a9da2ee1058623607c82ace17218cad0286c6f2ca44fe88e30102d94f03d628
+size 1108268092
7b-sq-int8kv-tp8/llama_float16_tp8_rank1.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d53e39342a9509140705bf3653af81da9801ab6294cda636719d875df0a9f077
+size 1108268092
7b-sq-int8kv-tp8/llama_float16_tp8_rank2.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:1b7ebd8d690d26d4ea687fe4d3b386d064f1aa040b86625a176d6e445d25c77f
+size 1108268092
7b-sq-int8kv-tp8/llama_float16_tp8_rank3.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:29ad1b286a33080ee502b612df18a8a148b681b7465d15fee108608e54fa247c
+size 1108268092
7b-sq-int8kv-tp8/llama_float16_tp8_rank4.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:61cc22967c7a6f2096e6e470bb6c2e9536d16f48613f4c8d2741f0d014a07163
+size 1108268092
7b-sq-int8kv-tp8/llama_float16_tp8_rank5.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:ecdbeeb825e1698ac4517fbba1b2278d6a6ac268a8196c8e5dde4c2127b1ca44
+size 1108268092
7b-sq-int8kv-tp8/llama_float16_tp8_rank6.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:94012683837e5b7535c5fd6be53be471742874f1d5fec218a497b2345909542f
+size 1108268092
7b-sq-int8kv-tp8/llama_float16_tp8_rank7.engine CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:bb028fb0dab36064f098ba6c4613c6978a0bd2dffcd34275c909627c53d69068
+size 1108268092
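Each .engine file is tracked with Git LFS, so the diffs above only update pointer files containing the object's sha256 and byte size. A minimal sketch for checking a downloaded engine against its pointer follows; both paths are illustrative assumptions.

# Minimal sketch: verify a downloaded rank-0 engine against its Git LFS pointer.
# Both paths are illustrative assumptions.
import hashlib
from pathlib import Path

pointer_path = Path("7b-sq-int8kv-tp8/llama_float16_tp8_rank0.engine")  # LFS pointer text
engine_path = Path("downloads/llama_float16_tp8_rank0.engine")          # resolved binary

# Parse the "oid sha256:<hex>" and "size <bytes>" lines of the pointer file.
fields = dict(line.split(" ", 1) for line in pointer_path.read_text().splitlines() if " " in line)
expected_sha = fields["oid"].split(":", 1)[1]
expected_size = int(fields["size"])

sha = hashlib.sha256()
with engine_path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

print("size ok:", engine_path.stat().st_size == expected_size)
print("sha256 ok:", sha.hexdigest() == expected_sha)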
7b-sq-int8kv-tp8/model.cache CHANGED
Binary files a/7b-sq-int8kv-tp8/model.cache and b/7b-sq-int8kv-tp8/model.cache differ