Spaces:
Paused
Paused
feat(token): using openai compatible server token
Browse files- Dockerfile +10 -2
Dockerfile
CHANGED
@@ -14,7 +14,7 @@ RUN uv pip install --system --index-strategy unsafe-best-match vllm[audio]==0.10
|
|
14 |
# Downgrade triton because the following error occurred when using triton==3.3.1
|
15 |
# https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
|
16 |
# https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
|
17 |
-
RUN uv pip install --system --index-strategy unsafe-best-match triton==3.
|
18 |
|
19 |
# Create a user and group with the specified ID
|
20 |
RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser
|
@@ -27,7 +27,15 @@ USER myuser
|
|
27 |
|
28 |
RUN mkdir -p /tmp/.cache/huggingface
|
29 |
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
# # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
|
33 |
# FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
|
|
|
14 |
# Downgrade triton because the following error occurred when using triton==3.3.1
|
15 |
# https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
|
16 |
# https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
|
17 |
+
RUN uv pip install --system --index-strategy unsafe-best-match triton==3.4.0 --extra-index-url https://download.pytorch.org/whl/cu128
|
18 |
|
19 |
# Create a user and group with the specified ID
|
20 |
RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser
|
|
|
27 |
|
28 |
RUN mkdir -p /tmp/.cache/huggingface
|
29 |
|
30 |
+
# to be set at runtime from secrets
|
31 |
+
ENV TOKEN="xxx"
|
32 |
+
ENV MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct"
|
33 |
+
ENV MODEL_REVISION="0cb88a4f764b7a12671c53f0838cd831a0843b95"
|
34 |
+
ENV HF_HOME="/tmp/.cache/huggingface"
|
35 |
+
|
36 |
+
EXPOSE 7860
|
37 |
+
|
38 |
+
ENTRYPOINT ["/bin/bash", "-c", "vllm serve ${MODEL_NAME} --api-key ${TOKEN} --task generate --revision ${MODEL_REVISION} --code-revision ${MODEL_REVISION} --tokenizer-revision ${MODEL_REVISION} --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens 32768 --max-model-len 32768 --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]
|
39 |
|
40 |
# # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
|
41 |
# FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
|