import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import random
import spaces
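# `spaces` provides the @spaces.GPU decorator used by Hugging Face ZeroGPU Spaces
# to request GPU time for GPU-bound functions.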
import torch

# Get the number of available CPU cores
import multiprocessing
n_cores = multiprocessing.cpu_count()

# Initialize model with optimized parameters
model_path = hf_hub_download(
    repo_id="AstroMLab/AstroSage-8B-GGUF",
    filename="AstroSage-8B-Q8_0.gguf"
)
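# hf_hub_download stores the file in the local Hugging Face cache and returns
# the path to the downloaded GGUF weights.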
# Optimized LLaMA parameters for A100
llm = Llama(
    model_path=model_path,
    n_ctx=2048,            # Keep context window reasonable
    n_threads=n_cores,     # Use all available CPU cores
    n_batch=512,           # Increase batch size for faster processing
    n_gpu_layers=35,       # Offload more layers to GPU
    chat_format="llama-3",
    seed=42,
    f16_kv=True,           # Use FP16 for key/value cache
    logits_all=False,
    use_mmap=False,        # Disable memory mapping for faster loading
    use_gpu=True,
    tensor_split=None,     # Let the model handle tensor splitting
)
# Optimize CUDA settings if available
if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for faster matrix multiplication
    torch.backends.cudnn.benchmark = True         # Enable cuDNN autotuner
# Greeting messages used to seed a new chat
GREETING_MESSAGES = [
    "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
    "Welcome to our cosmic journey! I am AstroSage. How may I assist you in understanding the universe?",
    "AstroSage here. Ready to explore the mysteries of space and time. How may I be of assistance?",
    "The universe awaits! I'm AstroSage. What astronomical wonders shall we discuss?",
]
def user(user_message, history):
    """Add user message to chat history."""
    if history is None:
        history = []
    return "", history + [{"role": "user", "content": user_message}]
def bot(history):
    """Generate and stream the bot's response with optimized parameters."""
    if not history:
        history = []

    # Keep the prompt short: reserve roughly half of the 2048-token context for the
    # response and pass only the most recent turns to the model.
    max_history_tokens = 1024      # Response budget (trimming below is message-based, not token-based)
    recent_history = history[-5:]  # Keep only the last 5 messages for context
    # Prepare the messages for the model
    messages = [
        {
            "role": "system",
            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy."
        }
    ]

    # Add the trimmed chat history (everything except the message being answered)
    for message in recent_history[:-1]:
        messages.append({"role": message["role"], "content": message["content"]})

    # Add the current user message
    messages.append({"role": "user", "content": history[-1]["content"]})

    # Append an empty assistant message that will be filled in as tokens stream back
    history.append({"role": "assistant", "content": ""})
    # Optimized streaming parameters
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
        top_k=40,            # Add top-k sampling
        repeat_penalty=1.1,  # Slight penalty for repetition
        mirostat_mode=2,     # Enable Mirostat 2.0 sampling
        mirostat_tau=5.0,    # Target entropy; lower values give more focused output
        mirostat_eta=0.1,    # Learning rate of the Mirostat controller
    )
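    # Stream the reply: each chunk carries a small "delta" of text that is appended
    # to the assistant message, and the updated history is yielded to Gradio.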
    for chunk in response:
        if chunk and "content" in chunk["choices"][0]["delta"]:
            history[-1]["content"] += chunk["choices"][0]["delta"]["content"]
            yield history
def initial_greeting():
    """Return a properly formatted initial greeting."""
    return [{"role": "assistant", "content": random.choice(GREETING_MESSAGES)}]
# Custom CSS for a space theme
custom_css = """
#component-0 {
    background-color: #1a1a2e;
    border-radius: 15px;
    padding: 20px;
}
.dark {
    background-color: #0f0f1a;
}
.contain {
    max-width: 1200px !important;
}
"""
# Create the Gradio interface with optimized queue settings
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
    gr.Markdown(
        """
# AstroSage: Your Cosmic AI Companion

Welcome to AstroSage, an advanced AI assistant specializing in astronomy, astrophysics, and cosmology.
Powered by the AstroSage-8B model, I'm here to help you explore the wonders of the universe!

### What Can I Help You With?

- Explanations of astronomical phenomena
- Space exploration and missions
- Stars, galaxies, and cosmology
- Planetary science and exoplanets
- Astrophysics concepts and theories
- Astronomical instruments and observations

Just type your question below and let's embark on a cosmic journey together!
"""
    )
    chatbot = gr.Chatbot(
        label="Chat with AstroSage",
        bubble_full_width=False,
        show_label=True,
        height=450,
        type="messages"
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Type your message here",
            placeholder="Ask me anything about space and astronomy...",
            scale=9
        )
        clear = gr.Button("Clear Chat", scale=1)
    # Example questions for quick start
    gr.Examples(
        examples=[
            "What is a black hole and how does it form?",
            "Can you explain the life cycle of a star?",
            "What are exoplanets and how do we detect them?",
            "Tell me about the James Webb Space Telescope.",
            "What is dark matter and why is it important?"
        ],
        inputs=msg,
        label="Example Questions"
    )
    # Set up the message chain: add the user message without queuing,
    # then stream the bot's reply through the queue
    msg.submit(
        user,
        [msg, chatbot],
        [msg, chatbot],
        queue=False
    ).then(
        bot,
        chatbot,
        chatbot,
        queue=True  # Enable queuing for the streamed bot response
    )
    # Clear button functionality
    clear.click(lambda: None, None, chatbot, queue=False)

    # Initial greeting
    demo.load(initial_greeting, None, chatbot, queue=False)
# Launch the app with optimized settings
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=2)  # Allow up to 2 concurrent generations
    demo.launch()