Update README.md
Browse files
README.md
CHANGED
@@ -125,8 +125,8 @@ TODO
|
|
125 |
| Benchmark | | |
|
126 |
|----------------------------------|----------------|-------------------------------|
|
127 |
| | Qwen3-32B | Qwen3-32B-float8dq |
|
128 |
-
| latency (batch_size=1) | 9.1s |
|
129 |
-
| latency (batch_size=128) | 12.45s |
|
130 |
| serving (num_prompts=1) | TODO | TODO |
|
131 |
| serving (num_prompts=1000) | TODO | TODO |
|
132 |
|
@@ -156,7 +156,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
|
|
156 |
Server:
|
157 |
```shell
|
158 |
export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-float8dq
|
159 |
-
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve MODEL --tokenizer Qwen/Qwen3-32B -O3
|
160 |
```
|
161 |
|
162 |
Client:
|
|
|
125 |
| Benchmark | | |
|
126 |
|----------------------------------|----------------|-------------------------------|
|
127 |
| | Qwen3-32B | Qwen3-32B-float8dq |
|
128 |
+
| latency (batch_size=1) | 9.1s | 5.77s (-36.6%) |
|
129 |
+
| latency (batch_size=128) | 12.45s | 8.40s (-32.5%) |
|
130 |
| serving (num_prompts=1) | TODO | TODO |
|
131 |
| serving (num_prompts=1000) | TODO | TODO |
|
132 |
|
|
|
156 |
Server:
|
157 |
```shell
|
158 |
export MODEL=Qwen/Qwen3-32B # or pytorch/Qwen3-32B-float8dq
|
159 |
+
VLLM_DISABLE_COMPILE_CACHE=1 vllm serve "$MODEL" --tokenizer Qwen/Qwen3-32B -O3
|
160 |
```
|
161 |
|
162 |
Client:
|