gperdrizet commited on
Commit
a99632e
·
1 Parent(s): b1174ec

Started switching all models over to Modal Labs inference.

Browse files
.devcontainer/devcontainer.json CHANGED
@@ -15,6 +15,6 @@
15
  }
16
  },
17
 
18
- // "postCreateCommand": "",
19
  "postAttachCommand": "gradio resumate.py"
20
  }
 
15
  }
16
  },
17
 
18
+ "postCreateCommand": "modal deploy inference_endpoints/llama-3-1-8B-instruct.py",
19
  "postAttachCommand": "gradio resumate.py"
20
  }
configuration.py CHANGED
@@ -5,9 +5,15 @@ from smolagents import OpenAIServerModel
5
 
6
  DEFAULT_GITHUB_PROFILE = "https://github.com/gperdrizet"
7
 
 
 
 
 
 
8
  AGENT_MODEL = OpenAIServerModel(
9
- model_id="gpt-4.1",
10
- max_tokens=8000
 
11
  )
12
 
13
  INSTRUCTIONS = """
 
5
 
6
  DEFAULT_GITHUB_PROFILE = "https://github.com/gperdrizet"
7
 
8
+ # AGENT_MODEL = OpenAIServerModel(
9
+ # model_id="gpt-4.1",
10
+ # max_tokens=8000
11
+ # )
12
+
13
  AGENT_MODEL = OpenAIServerModel(
14
+ model_id="vllm-deepseek-v3",
15
+ api_base="https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1",
16
+ api_key=os.environ["MODAL_TOKEN_SECRET"],
17
  )
18
 
19
  INSTRUCTIONS = """
functions/job_call.py CHANGED
@@ -59,10 +59,10 @@ def summarize_job_call(job_call: str) -> str:
59
 
60
  logger.info("Summarizing job call (%d characters)", len(job_call))
61
 
62
- client = OpenAI(api_key=os.environ['MODAL_API_KEY'])
63
 
64
  client.base_url = (
65
- 'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
66
  )
67
 
68
  # Default to first available model
 
59
 
60
  logger.info("Summarizing job call (%d characters)", len(job_call))
61
 
62
+ client = OpenAI(api_key=os.environ['MODAL_TOKEN_SECRET'])
63
 
64
  client.base_url = (
65
+ 'https://gperdrizet--vllm-deepseek-v3-serve.modal.run/v1'
66
  )
67
 
68
  # Default to first available model
inference_endpoints/deepseek-v3.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Run OpenAI-compatible LLM inference with vLLM on Modal.
# NOTE(review): the app is named "vllm-deepseek-v3" but MODEL_NAME below is
# DeepSeek-R1-Distill-Qwen-32B, a different model — confirm which is intended.
# The app name is left unchanged because callers hit the derived *.modal.run URL.
# Usage: modal deploy deepseek-v3.py

## Set up the container image

import modal

# Pinned vLLM stack; hf_transfer speeds up weight downloads.
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)


# Turn on V1 backend engine. Note: NVIDIA T4 does not seem to support
# this due to CUDA incompatibility. Needs CUDA >=8, excluding 8.6 and 8.9.
# For V1 backend use L40S
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})

# Download the model weights
MODELS_DIR = "/models"  # NOTE(review): unused below — confirm before removing
MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
# MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"

# Cache model weights and vLLM compile artifacts across container restarts.
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


## Build a vLLM engine and serve it
app = modal.App("vllm-deepseek-v3")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    gpu=f"L40S:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("api_key")],
)
@modal.concurrent(
    max_inputs=100
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=15 * MINUTES)
def serve():
    """Launch a vLLM OpenAI-compatible API server inside the Modal container.

    The web_server decorator exposes VLLM_PORT; vLLM itself handles the
    OpenAI-compatible routes. Requires the API_KEY env var from the
    "api_key" Modal secret.
    """
    import os
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        # "--revision",
        # MODEL_REVISION,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    # Pass the argv list directly rather than joining into a shell string:
    # shell=True with an unquoted join breaks (or allows injection) if the
    # API key ever contains shell metacharacters.
    subprocess.Popen(cmd)
inference_endpoints/llama-3-1-8B-instruct.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Run OpenAI-compatible LLM text summarization with LLaMA 3.1-8B and vLLM
# Usage: modal deploy llama-3-1-8B-instruct.py

## Set up the container image

import modal

# Pinned vLLM stack; hf_transfer speeds up weight downloads.
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)


# Turn on V1 backend engine. Note: NVIDIA T4 does not seem to support
# this due to CUDA incompatibility. Needs CUDA >=8, excluding 8.6 and 8.9.
# For V1 backend use L40S
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})

# Download the model weights
MODELS_DIR = "/llamas"  # NOTE(review): unused below — confirm before removing
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"  # pin for reproducibility

# Cache model weights and vLLM compile artifacts across container restarts.
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)


## Build a vLLM engine and serve it
app = modal.App("vllm-openai-compatible-summarization")

N_GPU = 1
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    gpu=f"L40S:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
    secrets=[modal.Secret.from_name("api_key")],
)
@modal.concurrent(
    max_inputs=100
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    """Launch a vLLM OpenAI-compatible API server inside the Modal container.

    Serves the pinned quantized Llama 3.1 8B Instruct checkpoint on
    VLLM_PORT. Requires the API_KEY env var from the "api_key" Modal secret.
    """
    import os
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    # Pass the argv list directly rather than joining into a shell string:
    # shell=True with an unquoted join breaks (or allows injection) if the
    # API key ever contains shell metacharacters.
    subprocess.Popen(cmd)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio==5.35.0
 
2
  PyPDF2
3
  requests
4
  smolagents[openai]
 
1
  gradio==5.35.0
2
+ modal
3
  PyPDF2
4
  requests
5
  smolagents[openai]