fix(--token): remove nonexistent --token flag
Dockerfile CHANGED (+4, -2)
@@ -28,14 +28,16 @@ USER myuser
 RUN mkdir -p /tmp/.cache/huggingface

 # to be set at runtime from secrets
-ENV
+ENV TASK="generate"
+ENV MAX_MODEL_LEN=32768
+ENV MAX_NUM_BATCHED_TOKENS=32768
 ENV MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct"
 ENV MODEL_REVISION="0cb88a4f764b7a12671c53f0838cd831a0843b95"
 ENV HF_HOME="/tmp/.cache/huggingface"

 EXPOSE 7860

-ENTRYPOINT ["/bin/bash", "-c", "vllm serve ${MODEL_NAME} --
+ENTRYPOINT ["/bin/bash", "-c", "vllm serve ${MODEL_NAME} --task ${TASK} --revision ${MODEL_REVISION} --code-revision ${MODEL_REVISION} --tokenizer-revision ${MODEL_REVISION} --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} --max-model-len ${MAX_MODEL_LEN} --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]

 # # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
 # FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
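With this ENTRYPOINT, vllm serve exposes an OpenAI-compatible HTTP API on port 7860. A minimal smoke test against a running container, assuming the port is published on localhost:7860; the docker run mapping and the request payload below are illustrative and not part of this commit:

# Illustrative only: run the image built from this Dockerfile, then query the
# OpenAI-compatible chat completions route that vllm serve exposes.
# docker run --gpus all -p 7860:7860 <image-built-from-this-Dockerfile>
curl -s http://localhost:7860/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64
      }'

The "model" field matches MODEL_NAME because vLLM serves the model under its --model name by default.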