jwu323 committed on
Commit f5f11cd · verified · 1 Parent(s): 43b7c77

Update app.py

Files changed (1):
  1. app.py +32 -37
app.py CHANGED
@@ -21,30 +21,27 @@ LICENSE = """
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 class OptimizedLLMInterface:
-    _model_instance = None  # Class-level model instance for singleton pattern
+    _model_instance = None
 
     def __init__(
         self,
         model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
         model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
     ):
-        """Initialize optimized LLM interface with aggressive performance settings"""
-        # Only create model instance once
         if OptimizedLLMInterface._model_instance is None:
             OptimizedLLMInterface._model_instance = Llama(
                 model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
-                n_ctx=512,  # Reduced context size for speed
-                n_threads=4,  # Fixed thread count
-                n_batch=32,  # Smaller batch size for faster responses
+                n_ctx=512,
+                n_threads=4,
+                n_batch=32,
                 logits_all=False,
                 embedding=False,
-                seed=-1,  # Disable seed for performance
-                verbose=False,  # Disable logging
+                seed=-1,
+                verbose=False,
                 offload_kqv=True,
             )
         self.model = OptimizedLLMInterface._model_instance
 
-        # Pre-compute template parts
         template_parts = template.split("{content}")
         self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
         self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
@@ -53,45 +50,45 @@ class OptimizedLLMInterface:
         self,
         message: str,
         history: Optional[list] = None,
-        max_tokens: int = 256,  # Reduced max tokens
+        max_tokens: int = 256,
         temperature: float = 0.7,
         top_p: float = 0.95,
     ) -> Generator[str, None, None]:
-        """Optimized response generation"""
-        # Fast token combination
         message_tokens = self.model.tokenize(message.encode())
         input_tokens = []
         input_tokens.extend(self._prefix_tokens)
         input_tokens.extend(message_tokens)
         input_tokens.extend(self._suffix_tokens)
 
-        # Batch output processing
         output = ""
         batch = []
-        batch_size = 8  # Process tokens in small batches
+        batch_size = 8
 
-        for token in self.model.generate(
-            input_tokens,
-            top_p=top_p,
-            temp=temperature,
-            top_k=1,  # Minimal sampling for speed
-            repeat_penalty=1.0,  # Disable repeat penalty
-        ):
-            batch.append(token)
-            if len(batch) >= batch_size:
+        try:
+            for token in self.model.generate(
+                input_tokens,
+                top_p=top_p,
+                temp=temperature,
+                top_k=1,
+                repeat_penalty=1.0,
+                max_tokens=max_tokens,  # Added max_tokens limit
+            ):
+                batch.append(token)
+                if len(batch) >= batch_size:
+                    text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
+                    output += text
+                    yield output
+                    batch = []
+
+            if batch:
                 text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
                 output += text
                 yield output
-                batch = []
-
-        # Handle remaining tokens
-        if batch:
-            text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
-            output += text
-            yield output
+
+        except Exception as e:
+            yield f"Error: {str(e)}"
 
 def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
-    """Create the Gradio interface"""
     with gr.Blocks() as demo:
         gr.Markdown(DESCRIPTION)
 
@@ -104,7 +101,7 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
            ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=True,  # Enable example caching
+        cache_examples=False,  # Disabled example caching to fix the error
         fill_height=True
     )
 
@@ -118,14 +115,12 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
     return demo
 
 def main():
-    # Initialize with performance settings
     llm = OptimizedLLMInterface()
-
-    # Create and launch the demo with minimal overhead
     demo = create_demo(llm)
-    demo.queue(max_size=10)
+
+    # Simplified launch configuration
    demo.launch(
-        quiet=True,
+        quiet=True
    )
 
 if __name__ == "__main__":
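For reference, a minimal sketch of how the streaming generator changed above might be driven outside the Gradio UI. The method name generate_response and the direct import of app are assumptions (the diff hunk does not show the method's name line); the parameters and the 8-token batching behavior are taken from the diff.

    # Hypothetical usage sketch: assumes the generator method in the second hunk
    # is named generate_response and that app.py is importable as a module.
    from app import OptimizedLLMInterface

    llm = OptimizedLLMInterface()  # first call downloads the GGUF and caches the class-level Llama instance

    # The generator yields the accumulated text after every 8-token batch,
    # so each value is a progressively longer prefix of the final answer.
    latest = ""
    for partial in llm.generate_response(
        "Find the least odd prime factor of $2019^8+1$.",
        max_tokens=128,
        temperature=0.7,
    ):
        latest = partial
    print(latest)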
 
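One caveat, sketched under an assumption about the library version: if the installed llama-cpp-python exposes a low-level Llama.generate() without a max_tokens keyword (that parameter belongs to create_completion()), the max_tokens=max_tokens argument added in this commit would raise a TypeError, which the new except block would surface as an "Error: ..." message. Under that assumption, the budget can be enforced in the consuming loop instead; the helper below is a hypothetical standalone sketch, not part of the commit.

    from typing import Generator, List

    from llama_cpp import Llama

    def stream_batched(
        model: Llama,
        prompt_tokens: List[int],
        max_tokens: int = 256,
        batch_size: int = 8,
        temperature: float = 0.7,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        # Hypothetical helper: yields the accumulated text, detokenizing every
        # batch_size tokens, and enforces the token budget in Python instead of
        # passing max_tokens to Llama.generate().
        output = ""
        batch: List[int] = []
        produced = 0
        for token in model.generate(
            prompt_tokens,
            top_p=top_p,
            temp=temperature,
            top_k=1,
            repeat_penalty=1.0,
        ):
            batch.append(token)
            produced += 1
            if len(batch) >= batch_size:
                output += model.detokenize(batch).decode("utf-8", errors="ignore")
                yield output
                batch = []
            # Stop at end-of-sequence or once the budget is spent.
            if token == model.token_eos() or produced >= max_tokens:
                break
        if batch:
            output += model.detokenize(batch).decode("utf-8", errors="ignore")
            yield output

Inside the method shown in the diff, the equivalent call would be along the lines of yield from stream_batched(self.model, input_tokens, max_tokens=max_tokens).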