jwu323 committed
Commit 43b7c77 · verified · 1 Parent(s): e8e6330

Update app.py

Files changed (1):
  1. app.py +61 -34
app.py CHANGED
@@ -4,7 +4,6 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Keep original template and descriptions
 DESCRIPTION = '''
 # SimpleBerry/LLaMA-O1-Supervised-1129 | Duplicate the space and set it to private for faster & personal inference for free.
 SimpleBerry/LLaMA-O1-Supervised-1129: an experimental research model developed by the SimpleBerry.
@@ -22,46 +21,74 @@ LICENSE = """
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 class OptimizedLLMInterface:
+    _model_instance = None  # Class-level model instance for singleton pattern
+
     def __init__(
         self,
         model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
         model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
-        context_size: int = 32768,
-        num_threads: int = 8,
     ):
-        """Initialize optimized LLM interface"""
-        self.model = Llama(
-            model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
-            n_ctx=context_size,
-            n_threads=num_threads,
-            n_batch=512,  # Increased batch size for better CPU utilization
-            logits_all=False,  # Disable unnecessary logit calculations
-            embedding=False,  # Disable embedding cache
-            offload_kqv=True  # Enable memory optimizations
-        )
+        """Initialize optimized LLM interface with aggressive performance settings"""
+        # Only create model instance once
+        if OptimizedLLMInterface._model_instance is None:
+            OptimizedLLMInterface._model_instance = Llama(
+                model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
+                n_ctx=512,        # Reduced context size for speed
+                n_threads=4,      # Fixed thread count
+                n_batch=32,       # Smaller batch size for faster responses
+                logits_all=False,
+                embedding=False,
+                seed=-1,          # Disable seed for performance
+                verbose=False,    # Disable logging
+                offload_kqv=True,
+            )
+        self.model = OptimizedLLMInterface._model_instance
+
+        # Pre-compute template parts
+        template_parts = template.split("{content}")
+        self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
+        self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
 
     def generate_response(
         self,
         message: str,
         history: Optional[list] = None,
-        max_tokens: int = 512,
-        temperature: float = 0.9,
+        max_tokens: int = 256,  # Reduced max tokens
+        temperature: float = 0.7,
         top_p: float = 0.95,
     ) -> Generator[str, None, None]:
-        """Generate response with optimized streaming"""
-        input_text = template.format(content=message)
-        input_tokens = self.model.tokenize(input_text.encode('utf-8'))
+        """Optimized response generation"""
+        # Fast token combination
+        message_tokens = self.model.tokenize(message.encode())
+        input_tokens = []
+        input_tokens.extend(self._prefix_tokens)
+        input_tokens.extend(message_tokens)
+        input_tokens.extend(self._suffix_tokens)
+
+        # Batch output processing
+        output = ""
+        batch = []
+        batch_size = 8  # Process tokens in small batches
 
-        temp = ""
         for token in self.model.generate(
             input_tokens,
             top_p=top_p,
             temp=temperature,
-            repeat_penalty=1.1
+            top_k=1,  # Minimal sampling for speed
+            repeat_penalty=1.0,  # Disable repeat penalty
         ):
-            text = self.model.detokenize([token]).decode('utf-8')
-            temp += text
-            yield temp
+            batch.append(token)
+            if len(batch) >= batch_size:
+                text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
+                output += text
+                yield output
+                batch = []
+
+        # Handle remaining tokens
+        if batch:
+            text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
+            output += text
+            yield output
 
 def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
     """Create the Gradio interface"""
@@ -77,29 +104,29 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
             ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=False,
+        cache_examples=True,  # Enable example caching
        fill_height=True
     )
 
     with gr.Accordion("Adjust Parameters", open=False):
-        gr.Slider(minimum=128, maximum=8192, value=512, step=1, label="Max Tokens")
-        gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature")
-        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
+        gr.Slider(minimum=128, maximum=2048, value=256, step=128, label="Max Tokens")
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
+        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p")
 
     gr.Markdown(LICENSE)
 
     return demo
 
 def main():
-    # Initialize the optimized LLM interface
-    llm = OptimizedLLMInterface(
-        num_threads=os.cpu_count() or 8  # Automatically use available CPU cores
-    )
+    # Initialize with performance settings
+    llm = OptimizedLLMInterface()
 
-    # Create and launch the demo
+    # Create and launch the demo with minimal overhead
     demo = create_demo(llm)
-    demo.queue(max_size=10)  # Limit queue size to prevent overload
-    demo.launch(quiet=True)
+    demo.queue(max_size=10)
+    demo.launch(
+        quiet=True,
+    )
 
 if __name__ == "__main__":
     main()
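
The rewritten `__init__` front-loads two pieces of work so they happen once per process rather than once per request: the GGUF weights are loaded into a class-level `_model_instance`, and the fixed halves of `template` are tokenized up front so each request only has to tokenize the user message. A minimal sketch of that startup path, using the repo and filename from the diff (the shortened `template` and the `add_bos=False` flag on the suffix are illustrative assumptions, not part of the commit):

```python
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Shortened stand-in for the full reasoning template defined in app.py.
template = "<start_of_thought><problem>{content}<end_of_thought><start_of_thought><expansion>"

# Load the quantized weights once; in app.py this object is cached on the class.
model = Llama(
    model_path=hf_hub_download(
        repo_id="Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
        filename="llama-o1-supervised-1129-q4_k_m.gguf",
    ),
    n_ctx=512,
    verbose=False,
)

# Tokenize the fixed template halves once at startup.
prefix_text, suffix_text = template.split("{content}")
prefix_tokens = model.tokenize(prefix_text.encode("utf-8"))
# Assumption: skip the extra BOS token that tokenize() inserts by default;
# the commit itself tokenizes both halves with default arguments.
suffix_tokens = model.tokenize(suffix_text.encode("utf-8"), add_bos=False)

def build_prompt_tokens(message: str) -> list[int]:
    # Per-request work is only tokenizing the message and concatenating lists.
    message_tokens = model.tokenize(message.encode("utf-8"), add_bos=False)
    return prefix_tokens + message_tokens + suffix_tokens
```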
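Because `_model_instance` is stored on the class rather than on the instance, constructing `OptimizedLLMInterface` again in the same process (for example across Gradio reloads) reuses the already-loaded weights. A quick, hypothetical check:

```python
llm_a = OptimizedLLMInterface()
llm_b = OptimizedLLMInterface()
assert llm_a.model is llm_b.model  # both wrap the same loaded Llama object
```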
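On the generation side, `generate_response` now buffers eight tokens at a time and detokenizes them in one call, trimming the per-token detokenize/decode overhead of the old loop. The low-level `Llama.generate()` iterator keeps producing tokens until the caller stops consuming it, so the `max_tokens` argument and an end-of-sequence check have to be enforced inside the loop; the sketch below layers both on top of the batching pattern from the diff (the stop conditions are an assumption, not part of the commit):

```python
from typing import Generator, List

from llama_cpp import Llama

def stream_batched(
    model: Llama,
    input_tokens: List[int],
    max_tokens: int = 256,
    temperature: float = 0.7,
    top_p: float = 0.95,
    batch_size: int = 8,
) -> Generator[str, None, None]:
    output = ""
    batch: List[int] = []
    produced = 0
    eos = model.token_eos()
    for token in model.generate(input_tokens, top_p=top_p, temp=temperature,
                                top_k=1, repeat_penalty=1.0):
        # Assumption: stop on EOS or after max_tokens generated tokens.
        if token == eos or produced >= max_tokens:
            break
        batch.append(token)
        produced += 1
        if len(batch) >= batch_size:
            # Detokenize the whole batch at once instead of token by token.
            output += model.detokenize(batch).decode("utf-8", errors="ignore")
            yield output
            batch = []
    if batch:
        # Flush whatever is left when generation stops mid-batch.
        output += model.detokenize(batch).decode("utf-8", errors="ignore")
        yield output
```

One caveat carried over from the diff: decoding each batch independently with `errors='ignore'` silently drops a multi-byte UTF-8 character that straddles a batch boundary; accumulating raw bytes and decoding the growing buffer on each yield avoids that.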