prithivMLmods committed
Commit 42f9ebc · verified · 1 Parent(s): 0ed1602

Update app.py

Files changed (1)
  1. app.py +3 -3
app.py CHANGED
@@ -1,4 +1,6 @@
 import os
+os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False"  # Disable chunked prefill as a workaround
+
 import re
 import uuid
 import json
@@ -93,9 +95,7 @@ SYSTEM_PROMPT = load_system_prompt(MODEL_ID, "SYSTEM_PROMPT.txt")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize the Mistral LLM via vllm.
-# Note: Running this model on GPU may require very high VRAM.
-# The 'enforce_eager=True' parameter disables asynchronous output,
-# which avoids the NotImplementedError on platforms that do not support it.
+# The 'enforce_eager=True' parameter disables asynchronous output.
 llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)
 
 # -----------------------------------------------------------------------------
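For context, a minimal sketch of how the patched initialization fits together end to end. The environment variable and enforce_eager=True come straight from this commit; everything else (the placeholder MODEL_ID, the SamplingParams values, and the generate call) is an illustrative assumption, not part of the diff.

import os

# From the commit: disable chunked prefill before vllm constructs the engine
# (environment variables must be set before the LLM object is created).
os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False"

import torch
from vllm import LLM, SamplingParams

MODEL_ID = "your/model-id"  # hypothetical placeholder; app.py defines the real MODEL_ID

device = "cuda" if torch.cuda.is_available() else "cpu"

# From the commit: enforce_eager=True disables asynchronous output, which
# avoids the NotImplementedError on platforms that do not support it.
llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)

# Illustrative usage (not in the diff): one blocking generate call.
sampling = SamplingParams(temperature=0.7, max_tokens=128)
outputs = llm.generate(["Hello!"], sampling)
print(outputs[0].outputs[0].text)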