gperdrizet commited on
Commit
a99632e
·
1 Parent(s): b1174ec

Started switching all models over to Modal Labs inference.

Browse files
.devcontainer/devcontainer.json CHANGED
@@ -15,6 +15,6 @@
15
  }
16
  },
17
 
18
- // "postCreateCommand": "",
19
  "postAttachCommand": "gradio resumate.py"
20
  }
 
15
  }
16
  },
17
 
18
+ "postCreateCommand": "modal deploy inference_endpoints/llama-3-1-8B-instruct.py",
19
  "postAttachCommand": "gradio resumate.py"
20
  }
configuration.py CHANGED
@@ -5,9 +5,15 @@ from smolagents import OpenAIServerModel
5
 
6
  DEFAULT_GITHUB_PROFILE = "https://github.com/gperdrizet"
7
 
 
 
 
 
 
8
  AGENT_MODEL = OpenAIServerModel(
9
- model_id="gpt-4.1",
10
- max_tokens=8000
 
11
  )
12
 
13
  INSTRUCTIONS = """
 
5
 
6
  DEFAULT_GITHUB_PROFILE = "https://github.com/gperdrizet"
7
 
8
+ # AGENT_MODEL = OpenAIServerModel(
9
+ # model_id="gpt-4.1",
10
+ # max_tokens=8000
11
+ # )
12
+
13
  AGENT_MODEL = OpenAIServerModel(
14
+ model_id="vllm-deepseek-v3",
15
+ api_base="https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1",
16
+ api_key=os.environ["MODAL_TOKEN_SECRET"],
17
  )
18
 
19
  INSTRUCTIONS = """
functions/job_call.py CHANGED
@@ -59,10 +59,10 @@ def summarize_job_call(job_call: str) -> str:
59
 
60
  logger.info("Summarizing job call (%d characters)", len(job_call))
61
 
62
- client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
63
 
64
  client.base_url = (
65
- 'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
66
  )
67
 
68
  # Default to first available model
 
59
 
60
  logger.info("Summarizing job call (%d characters)", len(job_call))
61
 
62
+ client = OpenAI(api_key=os.environ['MODAL_TOKEN_SECRET'])
63
 
64
  client.base_url = (
65
+ 'https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1'
66
  )
67
 
68
  # Default to first available model
inference_endpoints/deepseek-v3.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Run OpenAI-compatible LLM inference with vLLM on Modal.
# NOTE(review): the app is named "vllm-deepseek-v3" but MODEL_NAME below is
# DeepSeek-R1-Distill-Qwen-32B, a different model — confirm which is intended.
# The app name is left unchanged because callers hit the derived *.modal.run URL.
# Usage: modal deploy deepseek-v3.py

## Set up the container image

import modal

# Pinned vLLM stack; hf_transfer speeds up weight downloads.
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)


# Turn on V1 backend engine. Note: NVIDIA T4 does not seem to support
# this due to CUDA incompatibility. Needs CUDA >=8, excluding 8.6 and 8.9.
# For V1 backend use L40S
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})

# Download the model weights
MODELS_DIR = "/models"  # NOTE(review): unused below — confirm before removing
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
# MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"

# Cache model weights and vLLM compile artifacts across container restarts.
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


## Build a vLLM engine and serve it
app = modal.App("vllm-deepseek-v3")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    gpu=f"L40S:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("api_key")],
)
@modal.concurrent(
    max_inputs=100
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=15 * MINUTES)
def serve():
    """Launch a vLLM OpenAI-compatible API server inside the Modal container.

    The web_server decorator exposes VLLM_PORT; vLLM itself handles the
    OpenAI-compatible routes. Requires the API_KEY env var from the
    "api_key" Modal secret.
    """
    import os
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        # "--revision",
        # MODEL_REVISION,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    # Pass the argv list directly rather than joining into a shell string:
    # shell=True with an unquoted join breaks (or allows injection) if the
    # API key ever contains shell metacharacters.
    subprocess.Popen(cmd)
inference_endpoints/llama-3-1-8B-instruct.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Run OpenAI-compatible LLM text summarization with LLaMA 3.1-8B and vLLM
# Usage: modal deploy llama-3-1-8B-instruct.py

## Set up the container image

import modal

# Pinned vLLM stack; hf_transfer speeds up weight downloads.
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)


# Turn on V1 backend engine. Note: NVIDIA T4 does not seem to support
# this due to CUDA incompatibility. Needs CUDA >=8, excluding 8.6 and 8.9.
# For V1 backend use L40S
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})

# Download the model weights
MODELS_DIR = "/llamas"  # NOTE(review): unused below — confirm before removing
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"  # pin for reproducibility

# Cache model weights and vLLM compile artifacts across container restarts.
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


## Build a vLLM engine and serve it
app = modal.App("vllm-openai-compatible-summarization")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    gpu=f"L40S:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("api_key")],
)
@modal.concurrent(
    max_inputs=100
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    """Launch a vLLM OpenAI-compatible API server inside the Modal container.

    Serves the pinned quantized Llama 3.1 8B Instruct checkpoint on
    VLLM_PORT. Requires the API_KEY env var from the "api_key" Modal secret.
    """
    import os
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    # Pass the argv list directly rather than joining into a shell string:
    # shell=True with an unquoted join breaks (or allows injection) if the
    # API key ever contains shell metacharacters.
    subprocess.Popen(cmd)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio==5.35.0
 
2
  PyPDF2
3
  requests
4
  smolagents[openai]
 
1
  gradio==5.35.0
2
+ modal
3
  PyPDF2
4
  requests
5
  smolagents[openai]