pankajroark
/

llama-fp16-engine

pankajroark committed on Oct 26, 2023

Commit

2319002

1 Parent(s): 18b067e

inflight batching engine for 7b-sq-int8kv-tp1

Files changed (3) hide show

7b-sq-int8kv-tp1/config.json CHANGED Viewed

@@ -24,7 +24,7 @@
   "plugin_config": {
     "attention_qk_half_accumulation": false,
     "bert_attention_plugin": false,
-    "context_fmha_type": 0,
     "gemm_plugin": "float16",
     "gpt_attention_plugin": "float16",
     "identity_plugin": false,
@@ -32,14 +32,14 @@
     "layernorm_quantization_plugin": false,
     "lookup_plugin": false,
     "nccl_plugin": false,
-    "paged_kv_cache": false,
     "quantize_per_token_plugin": true,
     "quantize_tensor_plugin": true,
-    "remove_input_padding": false,
     "rmsnorm_plugin": false,
     "rmsnorm_quantization_plugin": "float16",
     "smooth_quant_gemm_plugin": "float16",
-    "tokens_per_block": 0,
     "use_custom_all_reduce": false,
     "weight_only_groupwise_quant_matmul_plugin": false,
     "weight_only_quant_matmul_plugin": false

   "plugin_config": {
     "attention_qk_half_accumulation": false,
     "bert_attention_plugin": false,
+    "context_fmha_type": 1,
     "gemm_plugin": "float16",
     "gpt_attention_plugin": "float16",
     "identity_plugin": false,
     "layernorm_quantization_plugin": false,
     "lookup_plugin": false,
     "nccl_plugin": false,
+    "paged_kv_cache": true,
     "quantize_per_token_plugin": true,
     "quantize_tensor_plugin": true,
+    "remove_input_padding": true,
     "rmsnorm_plugin": false,
     "rmsnorm_quantization_plugin": "float16",
     "smooth_quant_gemm_plugin": "float16",
+    "tokens_per_block": 64,
     "use_custom_all_reduce": false,
     "weight_only_groupwise_quant_matmul_plugin": false,
     "weight_only_quant_matmul_plugin": false

7b-sq-int8kv-tp1/llama_float16_tp1_rank0.engine CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d3bd67a974e1ba09ce35f4de7d03010dfdb4089e72aec6a74c1c2e3714c0bca1
-size 7006262500

 version https://git-lfs.github.com/spec/v1
+oid sha256:55022cb34074c613e09f1fd4d42109c0a375a04bf71d02106e92ef47c2fc795f
+size 7006227084

7b-sq-int8kv-tp1/model.cache CHANGED Viewed

Binary files a/7b-sq-int8kv-tp1/model.cache and b/7b-sq-int8kv-tp1/model.cache differ