Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,5 +1,7 @@
 import os
-
+# Disable chunked prefill and asynchronous output before importing vllm.
+os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False"
+os.environ["VLLM_ENABLE_ASYNC_OUTPUT"] = "False"
 
 import re
 import uuid
@@ -86,16 +88,16 @@ def load_system_prompt(repo_id: str, filename: str) -> str:
 
 # Model details (adjust as needed)
 MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-# Load the system prompt from HF Hub (
+# Load the system prompt from HF Hub (ensure SYSTEM_PROMPT.txt exists in the repo)
 SYSTEM_PROMPT = load_system_prompt(MODEL_ID, "SYSTEM_PROMPT.txt")
-#
+# Alternatively, you can hardcode the system prompt:
 # SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, and ends with an ASCII cat."
 
-# Set the device explicitly
+# Set the device explicitly.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize the Mistral LLM via vllm.
-# The
+# The enforce_eager flag ensures synchronous (eager) output.
 llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)
 
 # -----------------------------------------------------------------------------
@@ -124,7 +126,7 @@ def generate(
         {"role": "system", "content": SYSTEM_PROMPT}
     ]
 
-    # Check if any file is provided
+    # Check if any file is provided.
     video_extensions = (".mp4", ".mov", ".avi", ".mkv", ".webm")
     if files:
         # If any file is a video, use video inference branch.
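Note: the second hunk's context line references load_system_prompt(repo_id, filename), whose body is not shown in this diff. As a sketch only (the actual helper in app.py may differ), a function with that signature could download the prompt file from the Hub with huggingface_hub and read it:

from huggingface_hub import hf_hub_download

def load_system_prompt(repo_id: str, filename: str) -> str:
    # Sketch: fetch the prompt file (e.g. SYSTEM_PROMPT.txt) from the model repo
    # on the Hugging Face Hub and return its contents as a plain string.
    path = hf_hub_download(repo_id=repo_id, filename=filename)
    with open(path, "r", encoding="utf-8") as f:
        return f.read()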
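The last hunk only shows the entry to the file-handling logic in generate(). One plausible way the video branch could be selected, given the video_extensions tuple above (the helper name below is hypothetical, not taken from the diff):

def is_video_file(path: str, video_extensions=(".mp4", ".mov", ".avi", ".mkv", ".webm")) -> bool:
    # Hypothetical helper: treat a file as a video if its name ends with a
    # known video extension; generate() would route such files to video inference.
    return path.lower().endswith(video_extensions)

# Example usage inside generate():
# use_video_branch = any(is_video_file(f) for f in files)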