haoyang-amd committed on
Commit 5baa836 (1 parent: e1e366d)

Update README.md

Files changed (1): README.md (+5, -3)
README.md CHANGED

````diff
@@ -25,7 +25,8 @@ python3 quantize_quark.py \
     --quant_scheme w_fp8_a_fp8 \
     --kv_cache_dtype fp8 \
     --num_calib_data 128 \
-    --model_export quark_safetensors
+    --model_export quark_safetensors \
+    --no_weight_matrix_merge
 
 # If model size is too large for single GPU, please use multi GPU instead.
 python3 quantize_quark.py \
@@ -35,6 +36,7 @@ python3 quantize_quark.py \
     --kv_cache_dtype fp8 \
     --num_calib_data 128 \
     --model_export quark_safetensors \
+    --no_weight_matrix_merge \
     --multi_gpu
 ```
 ## Deployment
@@ -57,9 +59,9 @@ The quantization evaluation results are conducted in pseudo-quantization mode, w
   <tr>
    <td>Perplexity-wikitext2
    </td>
-   <td>1.8561
+   <td>1.8562
    </td>
-   <td>1.8951
+   <td>1.8927
    </td>
   </tr>
 
````
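After this change, the single-GPU export command in the README reads as follows (arguments before `--quant_scheme`, such as the model path, fall outside the diff context and are elided here):

```sh
python3 quantize_quark.py \
    ... \
    --quant_scheme w_fp8_a_fp8 \
    --kv_cache_dtype fp8 \
    --num_calib_data 128 \
    --model_export quark_safetensors \
    --no_weight_matrix_merge
```

The multi-GPU variant is identical except that `--no_weight_matrix_merge \` is followed by `--multi_gpu`.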
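For context on the Perplexity-wikitext2 rows: wikitext2 perplexity is conventionally measured by chunking the test split into fixed-length windows and exponentiating the average token-level negative log-likelihood. Below is a minimal sketch of such a measurement, not the script behind the table; the model path, window length, and dtype are assumptions.

```python
# Hypothetical sketch of a wikitext2 perplexity measurement.
# "path/to/quantized-model" is a placeholder, not a real checkpoint.
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "path/to/quantized-model"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)
model.eval()

# Concatenate the raw test split and tokenize it once.
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

window = 2048  # assumed context length
nlls, n_tokens = [], 0
for begin in range(0, encodings.input_ids.size(1), window):
    ids = encodings.input_ids[:, begin:begin + window].to(model.device)
    if ids.size(1) < 2:  # need at least one shifted target token
        break
    with torch.no_grad():
        # out.loss is the mean NLL over the (len - 1) predicted tokens
        out = model(ids, labels=ids)
    n = ids.size(1) - 1
    nlls.append(out.loss.float() * n)
    n_tokens += n

ppl = torch.exp(torch.stack(nlls).sum() / n_tokens)
print(f"wikitext2 perplexity: {ppl.item():.4f}")
```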