jwu323 committed
Commit 93fb83c · verified · 1 parent: f5f11cd

Update app.py

Files changed (1):
  app.py: +38 -28
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from typing import Generator, Optional
 import gradio as gr
-from llama_cpp import Llama
+from llama_cpp import Llama, LlamaGrammar
 from huggingface_hub import hf_hub_download
 
 DESCRIPTION = '''
@@ -21,7 +21,7 @@ LICENSE = """
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 class OptimizedLLMInterface:
-    _model_instance = None
+    _model_instance = None  # Singleton pattern
 
     def __init__(
         self,
@@ -29,19 +29,24 @@ class OptimizedLLMInterface:
         model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
     ):
         if OptimizedLLMInterface._model_instance is None:
+            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
             OptimizedLLMInterface._model_instance = Llama(
-                model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
-                n_ctx=512,
-                n_threads=4,
-                n_batch=32,
-                logits_all=False,
-                embedding=False,
-                seed=-1,
-                verbose=False,
-                offload_kqv=True,
+                model_path=model_path,
+                n_ctx=256,             # Minimal context for speed
+                n_threads=4,           # Fixed thread count
+                n_batch=1,             # Single batch for low latency
+                verbose=False,         # Disable logging
+                seed=-1,               # Disable random seed
+                logits_all=False,      # Disable logits
+                embedding=False,       # Disable embeddings
+                tensor_split=None,     # No tensor splitting
+                rope_freq_base=10000,  # Default RoPE settings
+                rope_freq_scale=1.0,
+                main_gpu=0,
             )
         self.model = OptimizedLLMInterface._model_instance
 
+        # Pre-tokenize template parts
         template_parts = template.split("{content}")
         self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
         self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
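A note on the pre-tokenization in this hunk: in llama-cpp-python, Llama.tokenize() prepends a BOS token by default, so tokenizing the template suffix (and, later, the user message) separately and concatenating the token lists can leave stray BOS tokens in the middle of the assembled prompt. A minimal BOS-safe sketch, not part of the commit; names mirror app.py and `model` stands for the loaded Llama instance:

# Sketch, not from the commit: only the first fragment keeps the BOS token.
prefix, suffix = template.split("{content}")
prefix_tokens = model.tokenize(prefix.encode())                  # BOS kept
suffix_tokens = model.tokenize(suffix.encode(), add_bos=False)   # no stray BOS

def build_prompt_tokens(message):
    msg_tokens = model.tokenize(message.encode(), add_bos=False)
    return prefix_tokens + msg_tokens + suffix_tokens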
@@ -50,28 +55,33 @@ class OptimizedLLMInterface:
         self,
         message: str,
         history: Optional[list] = None,
-        max_tokens: int = 256,
+        max_tokens: int = 128,  # Reduced max tokens
         temperature: float = 0.7,
         top_p: float = 0.95,
     ) -> Generator[str, None, None]:
-        message_tokens = self.model.tokenize(message.encode())
-        input_tokens = []
-        input_tokens.extend(self._prefix_tokens)
-        input_tokens.extend(message_tokens)
-        input_tokens.extend(self._suffix_tokens)
-
-        output = ""
-        batch = []
-        batch_size = 8
-
         try:
+            # Fast token preparation
+            message_tokens = self.model.tokenize(message.encode())
+            input_tokens = []
+            input_tokens.extend(self._prefix_tokens)
+            input_tokens.extend(message_tokens)
+            input_tokens.extend(self._suffix_tokens)
+
+            output = ""
+            batch = []
+            batch_size = 4  # Small batch size for faster responses
+
             for token in self.model.generate(
                 input_tokens,
                 top_p=top_p,
                 temp=temperature,
-                top_k=1,
-                repeat_penalty=1.0,
-                max_tokens=max_tokens, # Added max_tokens limit
+                top_k=1,             # Minimal top_k
+                repeat_penalty=1.0,  # No repeat penalty
+                mirostat_mode=0,     # Disable mirostat
+                min_p=0.05,          # Allow more diversity
+                typical_p=1.0,       # Disable typical sampling
+                presence_penalty=0,
+                frequency_penalty=0,
             ):
                 batch.append(token)
                 if len(batch) >= batch_size:
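Two notes on this hunk. First, the removed max_tokens argument is plausibly the bug being fixed: in llama-cpp-python, Llama.generate() is the low-level token loop and, unlike the higher-level create_completion(), does not accept a max_tokens keyword, so passing it raises a TypeError; with it gone, generation stops only at end-of-sequence or the (now 256-token) context limit. Second, the view cuts off right after the batch-size check. The flush-and-yield step that follows presumably looks like this sketch (assumed shape, not verbatim from app.py):

# Sketch of the flush-and-yield step (assumed, not verbatim from app.py).
def stream_tokens(model, input_tokens, batch_size=4):
    output = ""
    batch = []
    for token in model.generate(input_tokens, top_k=1, top_p=0.95, temp=0.7):
        if token == model.token_eos():   # stop at end-of-sequence
            break
        batch.append(token)
        if len(batch) >= batch_size:
            output += model.detokenize(batch).decode("utf-8", errors="ignore")
            batch.clear()
            yield output                 # Gradio streams the growing string
    if batch:                            # flush any remaining tokens
        output += model.detokenize(batch).decode("utf-8", errors="ignore")
        yield output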
@@ -101,12 +111,12 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
             ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=False, # Disabled example caching to fix the error
+        cache_examples=False,
         fill_height=True
     )
 
     with gr.Accordion("Adjust Parameters", open=False):
-        gr.Slider(minimum=128, maximum=2048, value=256, step=128, label="Max Tokens")
+        gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens")
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
         gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p")
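As committed, the three sliders are created inside the accordion but appear never to be bound to the generation method, so adjusting them has no effect. If the demo is built on gr.ChatInterface, which the examples, cache_examples, and fill_height keywords above suggest, one conventional wiring passes them as additional_inputs. A sketch; the method name is a guess, since the def line is not visible in this diff:

# Sketch (assumes gr.ChatInterface; generate_response is a hypothetical name).
import gradio as gr

demo = gr.ChatInterface(
    fn=llm_interface.generate_response,  # receives (message, history, max_tokens, temperature, top_p)
    additional_inputs=[
        gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    cache_examples=False,
    fill_height=True,
)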
 
@@ -118,8 +128,8 @@ def main():
    llm = OptimizedLLMInterface()
    demo = create_demo(llm)
 
-    # Simplified launch configuration
    demo.launch(
+        share=False,
        quiet=True
    )
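A quick way to sanity-check the singleton behavior this commit annotates (a sketch; the first construction downloads the GGUF from the Hub, so run it only where that is acceptable):

a = OptimizedLLMInterface()
b = OptimizedLLMInterface()
assert a.model is b.model  # both wrappers share the one cached Llama instance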
 
 