Tijmen2 committed
Commit 87b2e49 · verified · 1 Parent(s): 1116052

Update app.py

Files changed (1):
  app.py  +45 -18
app.py CHANGED
@@ -1,27 +1,41 @@
-import spaces
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import random
+import spaces
+import torch
 
-# Initialize model
+# Get the number of available CPU cores
+import multiprocessing
+n_cores = multiprocessing.cpu_count()
+
+# Initialize model with optimized parameters
 model_path = hf_hub_download(
     repo_id="AstroMLab/AstroSage-8B-GGUF",
     filename="AstroSage-8B-Q8_0.gguf"
 )
 
+# Optimized LLaMA parameters for A100
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,
-    n_threads=4,
+    n_ctx=2048,           # Keep context window reasonable
+    n_threads=n_cores,    # Use all available CPU cores
+    n_batch=512,          # Increase batch size for faster processing
+    n_gpu_layers=35,      # Offload more layers to GPU
     chat_format="llama-3",
     seed=42,
-    f16_kv=True,
+    f16_kv=True,          # Use FP16 for key/value cache
     logits_all=False,
-    use_mmap=True,
-    use_gpu=True
+    use_mmap=False,       # Disable memory mapping for faster loading
+    use_gpu=True,
+    tensor_split=None,    # Let the model handle tensor splitting
 )
 
+# Optimize CUDA settings if available
+if torch.cuda.is_available():
+    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for faster matrix multiplication
+    torch.backends.cudnn.benchmark = True         # Enable cudnn autotuner
+
 # Placeholder responses for when context is empty
 GREETING_MESSAGES = [
     "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
@@ -38,20 +52,24 @@ def user(user_message, history):
 
 @spaces.GPU
 def bot(history):
-    """Generate and stream the bot's response."""
+    """Generate and stream the bot's response with optimized parameters."""
     if not history:
         history = []
-
+
+    # Optimize context by limiting history
+    max_history_tokens = 1024      # Reserve half of context for response
+    recent_history = history[-5:]  # Keep only last 5 messages for context
+
     # Prepare the messages for the model
     messages = [
         {
             "role": "system",
-            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. You provide accurate, scientific information while making complex concepts accessible. You're enthusiastic about space exploration and maintain a sense of wonder about the cosmos."
+            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy."
         }
     ]
 
-    # Add chat history
-    for message in history[:-1]:  # Exclude the last message which we just added
+    # Add optimized chat history
+    for message in recent_history[:-1]:
         messages.append({"role": message["role"], "content": message["content"]})
 
     # Add the current user message
@@ -60,13 +78,18 @@ def bot(history):
     # Start generating the response
     history.append({"role": "assistant", "content": ""})
 
-    # Stream the response
+    # Optimized streaming parameters
    response = llm.create_chat_completion(
         messages=messages,
         max_tokens=512,
         temperature=0.7,
         top_p=0.95,
-        stream=True
+        stream=True,
+        top_k=40,            # Add top-k sampling
+        repeat_penalty=1.1,  # Slight penalty for repetition
+        mirostat_mode=2,     # Enable Mirostat sampling
+        mirostat_tau=5.0,
+        mirostat_eta=0.1,
     )
 
     for chunk in response:
@@ -93,7 +116,7 @@ custom_css = """
 }
 """
 
-# Create the Gradio interface
+# Create the Gradio interface with optimized queue settings
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     gr.Markdown(
         """
@@ -143,7 +166,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
         label="Example Questions"
     )
 
-    # Set up the message chain with streaming
+    # Set up the message chain with optimized queuing
     msg.submit(
         user,
         [msg, chatbot],
@@ -152,7 +175,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
     ).then(
         bot,
         chatbot,
-        chatbot
+        chatbot,
+        queue=True,        # Enable queuing for bot responses
+        batch=True,        # Enable batching
+        max_batch_size=4   # Process up to 4 requests together
     )
 
     # Clear button functionality
@@ -161,6 +187,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
     # Initial greeting
     demo.load(initial_greeting, None, chatbot, queue=False)
 
-# Launch the app
+# Launch the app with optimized settings
 if __name__ == "__main__":
+    demo.queue(concurrency_count=2)  # Allow 2 concurrent requests
     demo.launch()
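
Note: bot() streams its reply back to the Chatbot, but the body of the `for chunk in response:` loop sits outside the changed hunks and is not visible in this diff. As a minimal sketch only (assuming llama-cpp-python's OpenAI-style streaming chunks and the role/content history entries used above; `stream_reply` is a hypothetical helper, not part of the committed file), the stream is typically drained like this:

# Hypothetical sketch, not the committed code: accumulate streamed deltas from
# llm.create_chat_completion(..., stream=True) into the last history entry.
def stream_reply(llm, messages, history):
    response = llm.create_chat_completion(messages=messages, max_tokens=512, stream=True)
    for chunk in response:
        delta = chunk["choices"][0]["delta"]   # OpenAI-style delta chunks
        if "content" in delta:
            history[-1]["content"] += delta["content"]
            yield history                      # lets gr.Chatbot update as tokens arrive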