haoyang-amd committed
Commit 9480e2d
1 Parent(s): 40b0f03

Update README.md

Files changed (1)
  1. README.md +5 -3
README.md CHANGED
@@ -24,7 +24,8 @@ python3 quantize_quark.py \
     --quant_scheme w_fp8_a_fp8 \
     --kv_cache_dtype fp8 \
     --num_calib_data 128 \
-    --model_export quark_safetensors
+    --model_export quark_safetensors \
+    --no_weight_matrix_merge
 
 # If model size is too large for single GPU, please use multi GPU instead.
 python3 quantize_quark.py \
@@ -34,6 +35,7 @@ python3 quantize_quark.py \
     --kv_cache_dtype fp8 \
     --num_calib_data 128 \
     --model_export quark_safetensors \
+    --no_weight_matrix_merge \
     --multi_gpu
 ```
 ## Deployment
@@ -56,9 +58,9 @@ The quantization evaluation results are conducted in pseudo-quantization mode, w
 <tr>
 <td>Perplexity-wikitext2
 </td>
-<td>7.2169
+<td>7.2171
 </td>
-<td>7.2752
+<td>7.2720
 </td>
 </tr>
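With this change, both documented quantization commands export via `--model_export quark_safetensors` together with `--no_weight_matrix_merge`. For reference, a minimal sketch of the assembled commands after this commit; the `--model_dir` and `--output_dir` path arguments are assumptions (the leading arguments are truncated in the hunks above and are not part of this diff):

```bash
# Single-GPU FP8 quantization with FP8 KV cache.
# NOTE: --model_dir/--output_dir are assumed placeholders; only the flags
# shown in the diff above are confirmed by this commit.
python3 quantize_quark.py \
    --model_dir <path/to/model> \
    --output_dir <path/to/output> \
    --quant_scheme w_fp8_a_fp8 \
    --kv_cache_dtype fp8 \
    --num_calib_data 128 \
    --model_export quark_safetensors \
    --no_weight_matrix_merge

# Multi-GPU variant, for models too large for a single GPU.
python3 quantize_quark.py \
    --model_dir <path/to/model> \
    --output_dir <path/to/output> \
    --quant_scheme w_fp8_a_fp8 \
    --kv_cache_dtype fp8 \
    --num_calib_data 128 \
    --model_export quark_safetensors \
    --no_weight_matrix_merge \
    --multi_gpu
```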