jwu323 committed
Commit 43b7c77 · verified · 1 Parent(s): e8e6330

Update app.py

Files changed (1):
  1. app.py +61 -34
app.py CHANGED
@@ -4,7 +4,6 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Keep original template and descriptions
 DESCRIPTION = '''
 # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
 SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
@@ -22,46 +21,74 @@ LICENSE = """
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 class OptimizedLLMInterface:
+    _model_instance = None  # Class-level model instance for singleton pattern
+
     def __init__(
         self,
         model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
         model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
-        context_size: int = 32768,
-        num_threads: int = 8,
     ):
-        """Initialize optimized LLM interface"""
-        self.model = Llama(
-            model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
-            n_ctx=context_size,
-            n_threads=num_threads,
-            n_batch=512,  # Increased batch size for better CPU utilization
-            logits_all=False,  # Disable unnecessary logit calculations
-            embedding=False,  # Disable embedding cache
-            offload_kqv=True  # Enable memory optimizations
-        )
+        """Initialize optimized LLM interface with aggressive performance settings"""
+        # Only create model instance once
+        if OptimizedLLMInterface._model_instance is None:
+            OptimizedLLMInterface._model_instance = Llama(
+                model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
+                n_ctx=512,        # Reduced context size for speed
+                n_threads=4,      # Fixed thread count
+                n_batch=32,       # Smaller batch size for faster responses
+                logits_all=False,
+                embedding=False,
+                seed=-1,          # Disable seed for performance
+                verbose=False,    # Disable logging
+                offload_kqv=True,
+            )
+        self.model = OptimizedLLMInterface._model_instance
+
+        # Pre-compute template parts
+        template_parts = template.split("{content}")
+        self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
+        self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
 
     def generate_response(
         self,
         message: str,
         history: Optional[list] = None,
-        max_tokens: int = 512,
-        temperature: float = 0.9,
+        max_tokens: int = 256,  # Reduced max tokens
+        temperature: float = 0.7,
         top_p: float = 0.95,
     ) -> Generator[str, None, None]:
-        """Generate response with optimized streaming"""
-        input_text = template.format(content=message)
-        input_tokens = self.model.tokenize(input_text.encode('utf-8'))
+        """Optimized response generation"""
+        # Fast token combination
+        message_tokens = self.model.tokenize(message.encode())
+        input_tokens = []
+        input_tokens.extend(self._prefix_tokens)
+        input_tokens.extend(message_tokens)
+        input_tokens.extend(self._suffix_tokens)
+
+        # Batch output processing
+        output = ""
+        batch = []
+        batch_size = 8  # Process tokens in small batches
 
-        temp = ""
         for token in self.model.generate(
             input_tokens,
             top_p=top_p,
             temp=temperature,
-            repeat_penalty=1.1
+            top_k=1,  # Minimal sampling for speed
+            repeat_penalty=1.0,  # Disable repeat penalty
         ):
-            text = self.model.detokenize([token]).decode('utf-8')
-            temp += text
-            yield temp
+            batch.append(token)
+            if len(batch) >= batch_size:
+                text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
+                output += text
+                yield output
+                batch = []
+
+        # Handle remaining tokens
+        if batch:
+            text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
+            output += text
+            yield output
 
 def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
     """Create the Gradio interface"""
@@ -77,29 +104,29 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
             ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=False,
+        cache_examples=True,  # Enable example caching
        fill_height=True
     )
 
     with gr.Accordion("Adjust Parameters", open=False):
-        gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens")
-        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
-        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
+        gr.Slider(minimum=128, maximum=2048, value=256, step=128, label="Max Tokens")
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
+        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p")
 
     gr.Markdown(LICENSE)
 
     return demo
 
 def main():
-    # Initialize the optimized LLM interface
-    llm = OptimizedLLMInterface(
-        num_threads=os.cpu_count() or 8  # Automatically use available CPU cores
-    )
+    # Initialize with performance settings
+    llm = OptimizedLLMInterface()
 
-    # Create and launch the demo
+    # Create and launch the demo with minimal overhead
     demo = create_demo(llm)
-    demo.queue(max_size=10)  # Limit queue size to prevent overload
-    demo.launch(quiet=True)
+    demo.queue(max_size=10)
+    demo.launch(
+        quiet=True,
+    )
 
 if __name__ == "__main__":
     main()
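
The rewritten `__init__` front-loads two pieces of work so they happen once per process rather than once per request: the GGUF weights are loaded into a class-level `_model_instance`, and the fixed halves of `template` are tokenized up front so each request only has to tokenize the user message. A minimal sketch of that startup path, using the repo and filename from the diff (the shortened `template` and the `add_bos=False` flag on the suffix are illustrative assumptions, not part of the commit):

```python
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Shortened stand-in for the full reasoning template defined in app.py.
template = "<start_of_thought><problem>{content}<end_of_thought><start_of_thought><expansion>"

# Load the quantized weights once; in app.py this object is cached on the class.
model = Llama(
    model_path=hf_hub_download(
        repo_id="Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        filename="llama-o1-supervised-1129-q4_k_m.gguf",
    ),
    n_ctx=512,
    verbose=False,
)

# Tokenize the fixed template halves once at startup.
prefix_text, suffix_text = template.split("{content}")
prefix_tokens = model.tokenize(prefix_text.encode("utf-8"))
# Assumption: skip the extra BOS token that tokenize() inserts by default;
# the commit itself tokenizes both halves with default arguments.
suffix_tokens = model.tokenize(suffix_text.encode("utf-8"), add_bos=False)

def build_prompt_tokens(message: str) -> list[int]:
    # Per-request work is only tokenizing the message and concatenating lists.
    message_tokens = model.tokenize(message.encode("utf-8"), add_bos=False)
    return prefix_tokens + message_tokens + suffix_tokens
```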
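Because `_model_instance` is stored on the class rather than on the instance, constructing `OptimizedLLMInterface` again in the same process (for example across Gradio reloads) reuses the already-loaded weights. A quick, hypothetical check:

```python
llm_a = OptimizedLLMInterface()
llm_b = OptimizedLLMInterface()
assert llm_a.model is llm_b.model  # both wrap the same loaded Llama object
```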
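On the generation side, `generate_response` now buffers eight tokens at a time and detokenizes them in one call, trimming the per-token detokenize/decode overhead of the old loop. The low-level `Llama.generate()` iterator keeps producing tokens until the caller stops consuming it, so the `max_tokens` argument and an end-of-sequence check have to be enforced inside the loop; the sketch below layers both on top of the batching pattern from the diff (the stop conditions are an assumption, not part of the commit):

```python
from typing import Generator, List

from llama_cpp import Llama

def stream_batched(
    model: Llama,
    input_tokens: List[int],
    max_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.95,
    batch_size: int = 8,
) -> Generator[str, None, None]:
    output = ""
    batch: List[int] = []
    produced = 0
    eos = model.token_eos()
    for token in model.generate(input_tokens, top_p=top_p, temp=temperature,
                                top_k=1, repeat_penalty=1.0):
        # Assumption: stop on EOS or after max_tokens generated tokens.
        if token == eos or produced >= max_tokens:
            break
        batch.append(token)
        produced += 1
        if len(batch) >= batch_size:
            # Detokenize the whole batch at once instead of token by token.
            output += model.detokenize(batch).decode("utf-8", errors="ignore")
            yield output
            batch = []
    if batch:
        # Flush whatever is left when generation stops mid-batch.
        output += model.detokenize(batch).decode("utf-8", errors="ignore")
        yield output
```

One caveat carried over from the diff: decoding each batch independently with `errors='ignore'` silently drops a multi-byte UTF-8 character that straddles a batch boundary; accumulating raw bytes and decoding the growing buffer on each yield avoids that.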