Tijmen2 committed
Commit 87b2e49 · verified · 1 Parent(s): 1116052

Update app.py

Files changed (1):
  app.py  +45 -18
app.py CHANGED
@@ -1,27 +1,41 @@
-import spaces
 import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import random
+import spaces
+import torch
 
-# Initialize model
+# Get the number of available CPU cores
+import multiprocessing
+n_cores = multiprocessing.cpu_count()
+
+# Initialize model with optimized parameters
 model_path = hf_hub_download(
     repo_id="AstroMLab/AstroSage-8B-GGUF",
     filename="AstroSage-8B-Q8_0.gguf"
 )
 
+# Optimized LLaMA parameters for A100
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,
-    n_threads=4,
+    n_ctx=2048,           # Keep context window reasonable
+    n_threads=n_cores,    # Use all available CPU cores
+    n_batch=512,          # Increase batch size for faster processing
+    n_gpu_layers=35,      # Offload more layers to GPU
     chat_format="llama-3",
     seed=42,
-    f16_kv=True,
+    f16_kv=True,          # Use FP16 for key/value cache
     logits_all=False,
-    use_mmap=True,
-    use_gpu=True
+    use_mmap=False,       # Disable memory mapping for faster loading
+    use_gpu=True,
+    tensor_split=None,    # Let the model handle tensor splitting
 )
 
+# Optimize CUDA settings if available
+if torch.cuda.is_available():
+    torch.backends.cuda.matmul.allow_tf32 = True  # Allow TF32 for faster matrix multiplication
+    torch.backends.cudnn.benchmark = True         # Enable cudnn autotuner
+
 # Placeholder responses for when context is empty
 GREETING_MESSAGES = [
     "Greetings! I am AstroSage, your guide to the cosmos. What would you like to explore today?",
@@ -38,20 +52,24 @@ def user(user_message, history):
 
 @spaces.GPU
 def bot(history):
-    """Generate and stream the bot's response."""
+    """Generate and stream the bot's response with optimized parameters."""
     if not history:
         history = []
-
+
+    # Optimize context by limiting history
+    max_history_tokens = 1024      # Reserve half of context for response
+    recent_history = history[-5:]  # Keep only last 5 messages for context
+
     # Prepare the messages for the model
     messages = [
         {
             "role": "system",
-            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. You provide accurate, scientific information while making complex concepts accessible. You're enthusiastic about space exploration and maintain a sense of wonder about the cosmos."
+            "content": "You are AstroSage, an intelligent AI assistant specializing in astronomy, astrophysics, and space science. Be concise and direct in your responses while maintaining accuracy."
         }
     ]
 
-    # Add chat history
-    for message in history[:-1]:  # Exclude the last message which we just added
+    # Add optimized chat history
+    for message in recent_history[:-1]:
         messages.append({"role": message["role"], "content": message["content"]})
 
     # Add the current user message
@@ -60,13 +78,18 @@ def bot(history):
     # Start generating the response
     history.append({"role": "assistant", "content": ""})
 
-    # Stream the response
+    # Optimized streaming parameters
    response = llm.create_chat_completion(
         messages=messages,
         max_tokens=512,
         temperature=0.7,
         top_p=0.95,
-        stream=True
+        stream=True,
+        top_k=40,            # Add top-k sampling
+        repeat_penalty=1.1,  # Slight penalty for repetition
+        mirostat_mode=2,     # Enable Mirostat sampling
+        mirostat_tau=5.0,
+        mirostat_eta=0.1,
     )
 
     for chunk in response:
@@ -93,7 +116,7 @@ custom_css = """
 }
 """
 
-# Create the Gradio interface
+# Create the Gradio interface with optimized queue settings
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")) as demo:
     gr.Markdown(
         """
@@ -143,7 +166,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
         label="Example Questions"
     )
 
-    # Set up the message chain with streaming
+    # Set up the message chain with optimized queuing
     msg.submit(
         user,
         [msg, chatbot],
@@ -152,7 +175,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
     ).then(
         bot,
         chatbot,
-        chatbot
+        chatbot,
+        queue=True,        # Enable queuing for bot responses
+        batch=True,        # Enable batching
+        max_batch_size=4   # Process up to 4 requests together
     )
 
     # Clear button functionality
@@ -161,6 +187,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="indigo", neutra
     # Initial greeting
     demo.load(initial_greeting, None, chatbot, queue=False)
 
-# Launch the app
+# Launch the app with optimized settings
 if __name__ == "__main__":
+    demo.queue(concurrency_count=2)  # Allow 2 concurrent requests
     demo.launch()
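
Note: bot() streams its reply back to the Chatbot, but the body of the `for chunk in response:` loop sits outside the changed hunks and is not visible in this diff. As a minimal sketch only (assuming llama-cpp-python's OpenAI-style streaming chunks and the role/content history entries used above; `stream_reply` is a hypothetical helper, not part of the committed file), the stream is typically drained like this:

# Hypothetical sketch, not the committed code: accumulate streamed deltas from
# llm.create_chat_completion(..., stream=True) into the last history entry.
def stream_reply(llm, messages, history):
    response = llm.create_chat_completion(messages=messages, max_tokens=512, stream=True)
    for chunk in response:
        delta = chunk["choices"][0]["delta"]   # OpenAI-style delta chunks
        if "content" in delta:
            history[-1]["content"] += delta["content"]
            yield history                      # lets gr.Chatbot update as tokens arrive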