Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,4 +1,6 @@
 import os
+os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False"  # Disable chunked prefill as a workaround
+
 import re
 import uuid
 import json
@@ -93,9 +95,7 @@ SYSTEM_PROMPT = load_system_prompt(MODEL_ID, "SYSTEM_PROMPT.txt")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize the Mistral LLM via vllm.
-#
-# The 'enforce_eager=True' parameter disables asynchronous output,
-# which avoids the NotImplementedError on platforms that do not support it.
+# The 'enforce_eager=True' parameter disables asynchronous output.
 llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)
 
 # -----------------------------------------------------------------------------
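For context, a minimal sketch of how the touched region of app.py reads after this commit. MODEL_ID and load_system_prompt are assumed from the hunk context; the model name shown here is a hypothetical placeholder, and the comments about why the environment variable sits at the top of the file are an interpretation of the commit, not a statement of vLLM's documented behavior.

# app.py (sketch of the relevant region after this commit)
import os

# Placed before any vllm import, matching the commit, so the setting is
# already in the environment when the library is first loaded.
os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False"  # Disable chunked prefill as a workaround

import torch
from vllm import LLM

MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"  # hypothetical placeholder; the Space defines its own MODEL_ID

device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the Mistral LLM via vllm.
# The 'enforce_eager=True' parameter disables asynchronous output.
llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)

The placement right after import os (rather than next to the LLM(...) call) appears deliberate: an environment variable only takes effect if it is set before the library that reads it is imported and initialized.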