yusufs committed on
Commit
524a82b
·
1 Parent(s): b1959b7

feat(token): using openai compatible server token

Browse files
Files changed (1) hide show
  1. Dockerfile +10 -2
Dockerfile CHANGED
@@ -14,7 +14,7 @@ RUN uv pip install --system --index-strategy unsafe-best-match vllm[audio]==0.10
14
  # Downgrade triton because the following error occurred when using triton==3.3.1
15
  # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
16
  # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
17
- RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128
18
 
19
  # Create a user and group with the specified ID
20
  RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser
@@ -27,7 +27,15 @@ USER myuser
27
 
28
  RUN mkdir -p /tmp/.cache/huggingface
29
 
30
- ENTRYPOINT ["/bin/bash", "-c", "vllm serve meta-llama/Llama-3.2-3B-Instruct --task generate --revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --code-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --tokenizer-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens 32768 --max-model-len 32768 --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]
 
 
 
 
 
 
 
 
31
 
32
  # # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
33
  # FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
 
14
  # Downgrade triton because the following error occurred when using triton==3.3.1
15
  # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
16
  # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
17
+ RUN uv pip install --system --index-strategy unsafe-best-match triton==3.4.0 --extra-index-url https://download.pytorch.org/whl/cu128
18
 
19
  # Create a user and group with the specified ID
20
  RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser
 
27
 
28
  RUN mkdir -p /tmp/.cache/huggingface
29
 
30
+ # Placeholder values only — override at runtime from secrets; never bake a real token into the image (ENV values persist in image history)
31
+ ENV TOKEN="xxx"
32
+ ENV MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct"
33
+ ENV MODEL_REVISION="0cb88a4f764b7a12671c53f0838cd831a0843b95"
34
+ ENV HF_HOME="/tmp/.cache/huggingface"
35
+
36
+ EXPOSE 7860
37
+
38
+ ENTRYPOINT ["/bin/bash", "-c", "exec vllm serve ${MODEL_NAME} --api-key ${TOKEN} --task generate --revision ${MODEL_REVISION} --code-revision ${MODEL_REVISION} --tokenizer-revision ${MODEL_REVISION} --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens 32768 --max-model-len 32768 --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]
39
 
40
  # # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
41
  # FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04