Commit a99632e
Parent(s): b1174ec

Started switching all models over to Modal Labs inference.

Files changed:
- .devcontainer/devcontainer.json  +1 -1
- configuration.py  +8 -2
- functions/job_call.py  +2 -2
- inference_endpoints/deepseek-v3.py  +79 -0
- inference_endpoints/llama-3-1-8B-instruct.py  +79 -0
- requirements.txt  +1 -0
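
Both new endpoints expose OpenAI-compatible APIs, so once deployed they can be exercised with any OpenAI client. A minimal smoke test, a sketch assuming the DeepSeek endpoint URL configured below and that MODAL_TOKEN_SECRET holds the same value the server is given as --api-key:

import os

from openai import OpenAI

client = OpenAI(
    api_key=os.environ["MODAL_TOKEN_SECRET"],
    base_url="https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1",
)

# vLLM registers the model under its Hugging Face repo name by default
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-32B",
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)
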
.devcontainer/devcontainer.json CHANGED
@@ -15,6 +15,6 @@
     }
   },
 
-
+  "postCreateCommand": "modal deploy inference_endpoints/llama-3-1-8B-instruct.py",
   "postAttachCommand": "gradio resumate.py"
 }
configuration.py CHANGED
@@ -5,9 +5,15 @@ from smolagents import OpenAIServerModel
 
 DEFAULT_GITHUB_PROFILE = "https://github.com/gperdrizet"
 
+# AGENT_MODEL = OpenAIServerModel(
+#     model_id="gpt-4.1",
+#     max_tokens=8000
+# )
+
 AGENT_MODEL = OpenAIServerModel(
-    model_id="gpt-4.1",
-    max_tokens=8000
+    model_id="vllm-deepseek-v3",
+    api_base="https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1",
+    api_key=os.environ["MODAL_TOKEN_SECRET"],
 )
 
 INSTRUCTIONS = """
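
One caveat worth checking after deployment: vLLM serves the model under whatever name /v1/models reports (by default the Hugging Face repo name, unless vllm serve is passed --served-model-name), so the model_id above may need to match that rather than the Modal app name. A quick check, reusing the same credentials:

import os

from openai import OpenAI

client = OpenAI(
    api_key=os.environ["MODAL_TOKEN_SECRET"],
    base_url="https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1",
)
# prints the model ids the server will actually accept
print([model.id for model in client.models.list()])
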
functions/job_call.py CHANGED
@@ -59,10 +59,10 @@ def summarize_job_call(job_call: str) -> str:
 
     logger.info("Summarizing job call (%d characters)", len(job_call))
 
-    client = OpenAI(api_key=os.environ['
+    client = OpenAI(api_key=os.environ['MODAL_TOKEN_SECRET'])
 
     client.base_url = (
-        'https://gperdrizet--vllm-
+        'https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1'
    )
 
     # Default to first available model
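
Assigning client.base_url after construction works with the openai SDK, but the same thing can be done in one step at construction time, which avoids a window where the client still points at api.openai.com. An equivalent sketch:

import os

from openai import OpenAI

# same endpoint and key as the diff above, passed at construction time
client = OpenAI(
    api_key=os.environ['MODAL_TOKEN_SECRET'],
    base_url='https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1',
)
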
inference_endpoints/deepseek-v3.py ADDED
@@ -0,0 +1,79 @@
+# Run OpenAI-compatible LLM inference with DeepSeek (R1-Distill-Qwen-32B) and vLLM
+# Usage: modal deploy deepseek-v3.py
+
+## Set up the container image
+
+import modal
+
+vllm_image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .pip_install(
+        "vllm==0.7.2",
+        "huggingface_hub[hf_transfer]==0.26.2",
+        "flashinfer-python==0.2.0.post2",  # pinned, very unstable
+        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
+)
+
+
+# Turn on the V1 backend engine. Note: the NVIDIA T4 does not seem to support
+# it due to CUDA incompatibility (needs compute capability >= 8.0, excluding
+# 8.6 and 8.9). For the V1 backend, use an L40S.
+vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
+
+# Model to serve (weights download on first start and are cached below)
+MODELS_DIR = "/models"
+MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+# MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
+
+# Cache model weights
+hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
+
+
+## Build a vLLM engine and serve it
+app = modal.App("vllm-deepseek-v3")
+
+N_GPU = 1
+MINUTES = 60  # seconds
+VLLM_PORT = 8000
+
+
+@app.function(
+    image=vllm_image,
+    gpu=f"L40S:{N_GPU}",
+    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
+    timeout=10 * MINUTES,  # how long should we wait for container start?
+    volumes={
+        "/root/.cache/huggingface": hf_cache_vol,
+        "/root/.cache/vllm": vllm_cache_vol,
+    },
+    secrets=[modal.Secret.from_name("api_key")],
+)
+
+@modal.concurrent(
+    max_inputs=100
+)  # how many requests can one replica handle? tune carefully!
+
+@modal.web_server(port=VLLM_PORT, startup_timeout=15 * MINUTES)
+def serve():
+    import os
+    import subprocess
+
+    cmd = [
+        "vllm",
+        "serve",
+        "--uvicorn-log-level=info",
+        MODEL_NAME,
+        # "--revision",
+        # MODEL_REVISION,
+        "--host",
+        "0.0.0.0",
+        "--port",
+        str(VLLM_PORT),
+        "--api-key",
+        os.environ["API_KEY"],
+    ]
+
+    subprocess.Popen(" ".join(cmd), shell=True)
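
A note on the launch: as I understand Modal's web_server contract, the container counts as ready once the port starts accepting connections within startup_timeout, so it is fine for serve() to return right after spawning the process. Since cmd is already an argument list, the shell round-trip on the last line is also avoidable; a sketch of an equivalent final line, assuming no shell features are needed:

    # spawn vllm directly; the list form passes arguments verbatim,
    # avoiding quoting issues if any argument contains shell metacharacters
    subprocess.Popen(cmd)
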
inference_endpoints/llama-3-1-8B-instruct.py ADDED
@@ -0,0 +1,79 @@
+# Run OpenAI-compatible LLM text summarization with Llama 3.1 8B Instruct and vLLM
+# Usage: modal deploy llama-3-1-8B-instruct.py
+
+## Set up the container image
+
+import modal
+
+vllm_image = (
+    modal.Image.debian_slim(python_version="3.12")
+    .pip_install(
+        "vllm==0.7.2",
+        "huggingface_hub[hf_transfer]==0.26.2",
+        "flashinfer-python==0.2.0.post2",  # pinned, very unstable
+        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
+)
+
+
+# Turn on the V1 backend engine. Note: the NVIDIA T4 does not seem to support
+# it due to CUDA incompatibility (needs compute capability >= 8.0, excluding
+# 8.6 and 8.9). For the V1 backend, use an L40S.
+vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
+
+# Model to serve (weights download on first start and are cached below)
+MODELS_DIR = "/llamas"
+MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
+MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
+
+# Cache model weights
+hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
+
+
+## Build a vLLM engine and serve it
+app = modal.App("vllm-openai-compatible-summarization")
+
+N_GPU = 1
+MINUTES = 60  # seconds
+VLLM_PORT = 8000
+
+
+@app.function(
+    image=vllm_image,
+    gpu=f"L40S:{N_GPU}",
+    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
+    timeout=10 * MINUTES,  # how long should we wait for container start?
+    volumes={
+        "/root/.cache/huggingface": hf_cache_vol,
+        "/root/.cache/vllm": vllm_cache_vol,
+    },
+    secrets=[modal.Secret.from_name("api_key")],
+)
+
+@modal.concurrent(
+    max_inputs=100
+)  # how many requests can one replica handle? tune carefully!
+
+@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
+def serve():
+    import os
+    import subprocess
+
+    cmd = [
+        "vllm",
+        "serve",
+        "--uvicorn-log-level=info",
+        MODEL_NAME,
+        "--revision",
+        MODEL_REVISION,
+        "--host",
+        "0.0.0.0",
+        "--port",
+        str(VLLM_PORT),
+        "--api-key",
+        os.environ["API_KEY"],
+    ]
+
+    subprocess.Popen(" ".join(cmd), shell=True)
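
This is the endpoint the devcontainer's new postCreateCommand deploys on container creation. Its serving URL is not pinned anywhere in this commit; assuming Modal's usual {workspace}--{app-name}-{function-name}.modal.run naming (verify against the deploy output or `modal app list`), a summarization call would look like:

import os

from openai import OpenAI

client = OpenAI(
    api_key=os.environ["MODAL_TOKEN_SECRET"],
    # hypothetical URL derived from the app and function names above
    base_url=(
        "https://gperdrizet--vllm-openai-compatible-summarization"
        "-serve.modal.run/v1"
    ),
)
response = client.chat.completions.create(
    model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
    messages=[{"role": "user", "content": "Summarize this job posting: ..."}],
)
print(response.choices[0].message.content)
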
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 gradio==5.35.0
+modal
 PyPDF2
 requests
 smolagents[openai]