Update app.py
app.py CHANGED
@@ -1,27 +1,41 @@
-import spaces
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import random
+import spaces
+import torch
 
-#
+# Get the number of available CPU cores
+import multiprocessing
+n_cores = multiprocessing.cpu_count()
+
+# Initialize model with optimized parameters
 model_path = hf_hub_download(
     repo_id="AstroMLab/AstroSage-8B-GGUF",
     filename="AstroSage-8B-Q8_0.gguf"
 )
 
+# Optimized LLaMA parameters for A100
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,
-    n_threads=
+    n_ctx=2048,  # Keep context window reasonable
+    n_threads=n_cores,  # Use all available CPU cores
+    n_batch=512,  # Increase batch size for faster processing
+    n_gpu_layers=35,  # Offload more layers to GPU
     chat_format="llama-3",
     seed=42,
-    f16_kv=True,
+    f16_kv=True,  # Use FP16 for key/value cache
     logits_all=False,
-    use_mmap=
-    use_gpu=True
+    use_mmap=False,  # Disable memory mapping for faster loading
+    use_gpu=True,
+    tensor_split=None,  # Let the model handle tensor splitting
 )
 
+# Optimize CUDA settings if available
+if torch.cuda.is_available():
+    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for faster matrix multiplication
+    torch.backends.cudnn.benchmark = True  # Enable cudnn autotuner
+
 # Placeholder responses for when context is empty
 GREETING_MESSAGES = [
     "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
@@ -38,20 +52,24 @@ def user(user_message, history):
 
 @spaces.GPU
 def bot(history):
-    """Generate and stream the bot's response."""
+    """Generate and stream the bot's response with optimized parameters."""
     if not history:
         history = []
-
+
+    # Optimize context by limiting history
+    max_history_tokens = 1024  # Reserve half of context for response
+    recent_history = history[-5:]  # Keep only last 5 messages for context
+
     # Prepare the messages for the model
     messages = [
         {
             "role": "system",
-            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science.
+            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy."
         }
     ]
 
-    # Add chat history
-    for message in
+    # Add optimized chat history
+    for message in recent_history[:-1]:
         messages.append({"role": message["role"], "content": message["content"]})
 
     # Add the current user message
@@ -60,13 +78,18 @@ def bot(history):
     # Start generating the response
     history.append({"role": "assistant", "content": ""})
 
-    #
+    # Optimized streaming parameters
    response = llm.create_chat_completion(
         messages=messages,
         max_tokens=512,
         temperature=0.7,
         top_p=0.95,
-        stream=True
+        stream=True,
+        top_k=40,  # Add top-k sampling
+        repeat_penalty=1.1,  # Slight penalty for repetition
+        mirostat_mode=2,  # Enable Mirostat sampling
+        mirostat_tau=5.0,
+        mirostat_eta=0.1,
     )
 
     for chunk in response:
@@ -93,7 +116,7 @@ custom_css = """
 }
 """
 
-# Create the Gradio interface
+# Create the Gradio interface with optimized queue settings
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     gr.Markdown(
         """
@@ -143,7 +166,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
         label="Example Questions"
     )
 
-    # Set up the message chain with
+    # Set up the message chain with optimized queuing
     msg.submit(
         user,
         [msg, chatbot],
@@ -152,7 +175,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
     ).then(
         bot,
         chatbot,
-        chatbot
+        chatbot,
+        queue=True,  # Enable queuing for bot responses
+        batch=True,  # Enable batching
+        max_batch_size=4  # Process up to 4 requests together
     )
 
     # Clear button functionality
@@ -161,6 +187,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
     # Initial greeting
     demo.load(initial_greeting, None, chatbot, queue=False)
 
-# Launch the app
+# Launch the app with optimized settings
 if __name__ == "__main__":
+    demo.queue(concurrency_count=2)  # Allow 2 concurrent requests
     demo.launch()
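
The body of the streaming loop is outside the hunks shown above. For reference, here is a minimal sketch of how "for chunk in response:" typically continues with llama-cpp-python's streaming chat API and the messages-style history used in bot(); the actual lines in app.py are not part of this diff, so the exact handling below is an assumption:

    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            # Accumulate streamed text into the assistant message and push an update to the chatbot
            history[-1]["content"] += delta["content"]
            yield history

Because bot() yields the updated history on every chunk, Gradio can stream the reply into the Chatbot component as it is generated.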
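
The user and initial_greeting callbacks wired into the Blocks section are likewise untouched by this commit and not shown here. Assuming the submit outputs are [msg, chatbot] and the same role/content message dicts seen in bot(), minimal versions consistent with the visible calls (msg.submit(user, [msg, chatbot], ...) and demo.load(initial_greeting, None, chatbot)) could look like the following sketch, which is illustrative rather than the file's actual code:

def user(user_message, history):
    # Append the user's message to the chat history and clear the input textbox
    return "", history + [{"role": "user", "content": user_message}]

def initial_greeting():
    # Seed the chat with one of the canned GREETING_MESSAGES
    return [{"role": "assistant", "content": random.choice(GREETING_MESSAGES)}]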