prithivMLmods committed (verified)
Commit 803afd5 · Parent(s): 42f9ebc

Update app.py

Files changed (1)
  1. app.py +8 -6
app.py CHANGED
@@ -1,5 +1,7 @@
 import os
-os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False" # Disable chunked prefill as a workaround
+# Disable chunked prefill and asynchronous output before importing vllm.
+os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False"
+os.environ["VLLM_ENABLE_ASYNC_OUTPUT"] = "False"
 
 import re
 import uuid
@@ -86,16 +88,16 @@ def load_system_prompt(repo_id: str, filename: str) -> str:
 
 # Model details (adjust as needed)
 MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
-# Load the system prompt from HF Hub (make sure SYSTEM_PROMPT.txt exists in the repo)
+# Load the system prompt from HF Hub (ensure SYSTEM_PROMPT.txt exists in the repo)
 SYSTEM_PROMPT = load_system_prompt(MODEL_ID, "SYSTEM_PROMPT.txt")
-# If you prefer a hardcoded system prompt, you can use:
+# Alternatively, you can hardcode the system prompt:
 # SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, and ends with an ASCII cat."
 
-# Set the device explicitly
+# Set the device explicitly.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize the Mistral LLM via vllm.
-# The 'enforce_eager=True' parameter disables asynchronous output.
+# The enforce_eager flag ensures synchronous (eager) output.
 llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device, enforce_eager=True)
 
 # -----------------------------------------------------------------------------
@@ -124,7 +126,7 @@ def generate(
         {"role": "system", "content": SYSTEM_PROMPT}
     ]
 
-    # Check if any file is provided
+    # Check if any file is provided.
     video_extensions = (".mp4", ".mov", ".avi", ".mkv", ".webm")
     if files:
         # If any file is a video, use video inference branch.
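
For context, a minimal smoke-test sketch of the pattern this commit relies on: the VLLM_* environment variables are set before vllm is imported, and the model is constructed with enforce_eager=True. This is not part of app.py; the LLM.chat call and SamplingParams settings below are assumptions about the vllm API, shown for illustration only.

import os
# Mirror the commit: configure vllm via environment variables before importing it.
os.environ["VLLM_ENABLE_CHUNKED_PREFILL"] = "False"
os.environ["VLLM_ENABLE_ASYNC_OUTPUT"] = "False"

from vllm import LLM, SamplingParams  # imported only after the variables are set

# Same model and eager flag as app.py; device handling is omitted here for brevity.
llm = LLM(
    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    tokenizer_mode="mistral",
    enforce_eager=True,
)

params = SamplingParams(temperature=0.2, max_tokens=64)
outputs = llm.chat(
    [{"role": "user", "content": "Reply with a one-sentence greeting."}],
    sampling_params=params,
)
print(outputs[0].outputs[0].text)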