yusufs committed on
Commit
524a82b
·
1 Parent(s): b1959b7

feat(token): using openai compatible server token

Browse files
Files changed (1) hide show
  1. Dockerfile +10 -2
Dockerfile CHANGED
@@ -14,7 +14,7 @@ RUN uv pip install --system --index-strategy unsafe-best-match vllm[audio]==0.10
14
  # Downgrade triton because the following error occurred when using triton==3.3.1
15
  # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
16
  # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
17
- RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128
18
 
19
  # Create a user and group with the specified ID
20
  RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser
@@ -27,7 +27,15 @@ USER myuser
27
 
28
  RUN mkdir -p /tmp/.cache/huggingface
29
 
30
- ENTRYPOINT ["/bin/bash", "-c", "vllm serve meta-llama/Llama-3.2-3B-Instruct --task generate --revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --code-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --tokenizer-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens 32768 --max-model-len 32768 --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]
 
 
 
 
 
 
 
 
31
 
32
  # # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
33
  # FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
 
14
  # Downgrade triton because the following error occurred when using triton==3.3.1
15
  # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
16
  # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
17
+ RUN uv pip install --system --index-strategy unsafe-best-match triton==3.4.0 --extra-index-url https://download.pytorch.org/whl/cu128
18
 
19
  # Create a user and group with the specified ID
20
  RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser
 
27
 
28
  RUN mkdir -p /tmp/.cache/huggingface
29
 
30
+ # Placeholder values only — override at runtime from secrets; never bake a real token into the image (ENV values persist in image history)
31
+ ENV TOKEN="xxx"
32
+ ENV MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct"
33
+ ENV MODEL_REVISION="0cb88a4f764b7a12671c53f0838cd831a0843b95"
34
+ ENV HF_HOME="/tmp/.cache/huggingface"
35
+
36
+ EXPOSE 7860
37
+
38
+ ENTRYPOINT ["/bin/bash", "-c", "exec vllm serve ${MODEL_NAME} --api-key ${TOKEN} --task generate --revision ${MODEL_REVISION} --code-revision ${MODEL_REVISION} --tokenizer-revision ${MODEL_REVISION} --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens 32768 --max-model-len 32768 --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]
39
 
40
  # # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
41
  # FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04