update readme for fp8 (#1)
Browse files- update readme for fp8 (57b5f94f4c4e7facb6014752d5de02404a1c3858)
README.md
CHANGED
@@ -153,7 +153,7 @@ docker pull hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm
|
|
153 |
|
154 |
- Download Model file:
|
155 |
- Huggingface: will download automatically by vllm.
|
156 |
-
- ModelScope: `modelscope download --model Tencent-Hunyuan/Hunyuan-A13B-Instruct`
|
157 |
|
158 |
|
159 |
- Start the API server:
|
@@ -165,7 +165,7 @@ docker run --privileged --user root --net=host --ipc=host \
|
|
165 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm
|
166 |
\
|
167 |
-m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 \
|
168 |
-
--tensor-parallel-size
|
169 |
|
170 |
```
|
171 |
|
@@ -174,8 +174,9 @@ model downloaded by modelscope:
|
|
174 |
docker run --privileged --user root --net=host --ipc=host \
|
175 |
-v ~/.cache/modelscope:/root/.cache/modelscope \
|
176 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm \
|
177 |
-
-m vllm.entrypoints.openai.api_server --host 0.0.0.0
|
178 |
-
|
|
|
179 |
```
|
180 |
|
181 |
|
@@ -190,7 +191,13 @@ To get started:
|
|
190 |
- Pull the Docker image
|
191 |
|
192 |
```
|
193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
```
|
195 |
|
196 |
- Start the API server:
|
@@ -200,8 +207,8 @@ docker run --gpus all \
|
|
200 |
--shm-size 32g \
|
201 |
-p 30000:30000 \
|
202 |
--ipc=host \
|
203 |
-
|
204 |
-
-m sglang.launch_server --model-path hunyuan/
|
205 |
```
|
206 |
|
207 |
|
|
|
153 |
|
154 |
- Download Model file:
|
155 |
- Huggingface: will download automatically by vllm.
|
156 |
+
- ModelScope: `modelscope download --model Tencent-Hunyuan/Hunyuan-A13B-Instruct-FP8`
|
157 |
|
158 |
|
159 |
- Start the API server:
|
|
|
165 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm
|
166 |
\
|
167 |
-m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 \
|
168 |
+
--tensor-parallel-size 2 --dtype bfloat16 --kv-cache-dtype fp8 --model tencent/Hunyuan-A13B-Instruct-FP8 --trust-remote-code
|
169 |
|
170 |
```
|
171 |
|
|
|
174 |
docker run --privileged --user root --net=host --ipc=host \
|
175 |
-v ~/.cache/modelscope:/root/.cache/modelscope \
|
176 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm \
|
177 |
+
-m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 \
|
178 |
+
--tensor-parallel-size 2 --dtype bfloat16 --kv-cache-dtype fp8 \
|
179 |
+
--model /root/.cache/modelscope/hub/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct-FP8 --trust-remote-code
|
180 |
```
|
181 |
|
182 |
|
|
|
191 |
- Pull the Docker image
|
192 |
|
193 |
```
|
194 |
+
|
195 |
+
# china mirror
|
196 |
+
docker pull docker.cnb.cool/tencent/hunyuan/hunyuan-a13b:hunyuan-moe-A13B-sglang
|
197 |
+
|
198 |
+
# docker hub:
|
199 |
+
docker pull hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-sglang
|
200 |
+
|
201 |
```
|
202 |
|
203 |
- Start the API server:
|
|
|
207 |
--shm-size 32g \
|
208 |
-p 30000:30000 \
|
209 |
--ipc=host \
|
210 |
+
hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-sglang \
|
211 |
+
-m sglang.launch_server --model-path hunyuan/Hunyuan-A13B-Instruct-FP8 --tp 2 --trust-remote-code --host 0.0.0.0 --port 30000
|
212 |
```
|
213 |
|
214 |
|