Update app.py
app.py CHANGED
@@ -9,6 +9,7 @@ import cv2
 from datetime import datetime, timedelta
 from threading import Thread
 
+import torch
 import gradio as gr
 import spaces
 import numpy as np
@@ -88,9 +89,12 @@ SYSTEM_PROMPT = load_system_prompt(MODEL_ID, "SYSTEM_PROMPT.txt")
 # If you prefer a hardcoded system prompt, you can use:
 # SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, and ends with an ASCII cat."
 
+# Set the device explicitly (vLLM requires an explicit device specification)
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 # Initialize the Mistral LLM via vllm.
 # Note: Running this model on GPU may require very high VRAM.
-llm = LLM(model=MODEL_ID, tokenizer_mode="mistral")
+llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device)
 
 # -----------------------------------------------------------------------------
 # Main Generation Function
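For context, here is roughly what the affected section of app.py reads like after this commit. This is a minimal sketch, not the full file: the MODEL_ID value below is a hypothetical placeholder (the real one is defined earlier in app.py), and it assumes the vllm LLM constructor accepts the device argument exactly as it is used in the diff.

    import torch
    from vllm import LLM

    # Hypothetical placeholder -- the real MODEL_ID is defined earlier in app.py.
    MODEL_ID = "mistralai/<model-name>"

    # Choose the device at startup, falling back to CPU when no GPU is visible.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Initialize the Mistral LLM via vLLM, passing the device explicitly.
    llm = LLM(model=MODEL_ID, tokenizer_mode="mistral", device=device)

The net effect of the commit is that the Space no longer relies on vLLM's implicit device detection: torch is imported, the device string is computed once at startup, and it is threaded through to the LLM constructor.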