jwu323 committed on
Commit f5f11cd · verified · 1 Parent(s): 43b7c77

Update app.py

Files changed (1):
  1. app.py +32 -37
app.py CHANGED
@@ -21,30 +21,27 @@ LICENSE = """
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 class OptimizedLLMInterface:
-    _model_instance = None  # Class-level model instance for singleton pattern
+    _model_instance = None
 
     def __init__(
         self,
         model_repo_id: str = "Lyte/LLaMA-O1-Supervised-1129-Q4_K_M-GGUF",
         model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
     ):
-        """Initialize optimized LLM interface with aggressive performance settings"""
-        # Only create model instance once
         if OptimizedLLMInterface._model_instance is None:
             OptimizedLLMInterface._model_instance = Llama(
                 model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
-                n_ctx=512,  # Reduced context size for speed
-                n_threads=4,  # Fixed thread count
-                n_batch=32,  # Smaller batch size for faster responses
+                n_ctx=512,
+                n_threads=4,
+                n_batch=32,
                 logits_all=False,
                 embedding=False,
-                seed=-1,  # Disable seed for performance
-                verbose=False,  # Disable logging
+                seed=-1,
+                verbose=False,
                 offload_kqv=True,
             )
         self.model = OptimizedLLMInterface._model_instance
 
-        # Pre-compute template parts
         template_parts = template.split("{content}")
         self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
         self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
@@ -53,45 +50,45 @@ class OptimizedLLMInterface:
         self,
         message: str,
         history: Optional[list] = None,
-        max_tokens: int = 256,  # Reduced max tokens
+        max_tokens: int = 256,
         temperature: float = 0.7,
         top_p: float = 0.95,
     ) -> Generator[str, None, None]:
-        """Optimized response generation"""
-        # Fast token combination
         message_tokens = self.model.tokenize(message.encode())
         input_tokens = []
         input_tokens.extend(self._prefix_tokens)
         input_tokens.extend(message_tokens)
         input_tokens.extend(self._suffix_tokens)
 
-        # Batch output processing
         output = ""
         batch = []
-        batch_size = 8  # Process tokens in small batches
+        batch_size = 8
 
-        for token in self.model.generate(
-            input_tokens,
-            top_p=top_p,
-            temp=temperature,
-            top_k=1,  # Minimal sampling for speed
-            repeat_penalty=1.0,  # Disable repeat penalty
-        ):
-            batch.append(token)
-            if len(batch) >= batch_size:
+        try:
+            for token in self.model.generate(
+                input_tokens,
+                top_p=top_p,
+                temp=temperature,
+                top_k=1,
+                repeat_penalty=1.0,
+                max_tokens=max_tokens,  # Added max_tokens limit
+            ):
+                batch.append(token)
+                if len(batch) >= batch_size:
+                    text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
+                    output += text
+                    yield output
+                    batch = []
+
+            if batch:
                 text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
                 output += text
                 yield output
-                batch = []
-
-        # Handle remaining tokens
-        if batch:
-            text = self.model.detokenize(batch).decode('utf-8', errors='ignore')
-            output += text
-            yield output
+
+        except Exception as e:
+            yield f"Error: {str(e)}"
 
 def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
-    """Create the Gradio interface"""
     with gr.Blocks() as demo:
         gr.Markdown(DESCRIPTION)
 
@@ -104,7 +101,7 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
            ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=True,  # Enable example caching
+        cache_examples=False,  # Disabled example caching to fix the error
         fill_height=True
     )
 
@@ -118,14 +115,12 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
     return demo
 
 def main():
-    # Initialize with performance settings
     llm = OptimizedLLMInterface()
-
-    # Create and launch the demo with minimal overhead
     demo = create_demo(llm)
-    demo.queue(max_size=10)
+
+    # Simplified launch configuration
    demo.launch(
-        quiet=True,
+        quiet=True
    )
 
 if __name__ == "__main__":
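For reference, a minimal sketch of how the streaming generator changed above might be driven outside the Gradio UI. The method name generate_response and the direct import of app are assumptions (the diff hunk does not show the method's name line); the parameters and the 8-token batching behavior are taken from the diff.

    # Hypothetical usage sketch: assumes the generator method in the second hunk
    # is named generate_response and that app.py is importable as a module.
    from app import OptimizedLLMInterface

    llm = OptimizedLLMInterface()  # first call downloads the GGUF and caches the class-level Llama instance

    # The generator yields the accumulated text after every 8-token batch,
    # so each value is a progressively longer prefix of the final answer.
    latest = ""
    for partial in llm.generate_response(
        "Find the least odd prime factor of $2019^8+1$.",
        max_tokens=128,
        temperature=0.7,
    ):
        latest = partial
    print(latest)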
 
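One caveat, sketched under an assumption about the library version: if the installed llama-cpp-python exposes a low-level Llama.generate() without a max_tokens keyword (that parameter belongs to create_completion()), the max_tokens=max_tokens argument added in this commit would raise a TypeError, which the new except block would surface as an "Error: ..." message. Under that assumption, the budget can be enforced in the consuming loop instead; the helper below is a hypothetical standalone sketch, not part of the commit.

    from typing import Generator, List

    from llama_cpp import Llama

    def stream_batched(
        model: Llama,
        prompt_tokens: List[int],
        max_tokens: int = 256,
        batch_size: int = 8,
        temperature: float = 0.7,
        top_p: float = 0.95,
    ) -> Generator[str, None, None]:
        # Hypothetical helper: yields the accumulated text, detokenizing every
        # batch_size tokens, and enforces the token budget in Python instead of
        # passing max_tokens to Llama.generate().
        output = ""
        batch: List[int] = []
        produced = 0
        for token in model.generate(
            prompt_tokens,
            top_p=top_p,
            temp=temperature,
            top_k=1,
            repeat_penalty=1.0,
        ):
            batch.append(token)
            produced += 1
            if len(batch) >= batch_size:
                output += model.detokenize(batch).decode("utf-8", errors="ignore")
                yield output
                batch = []
            # Stop at end-of-sequence or once the budget is spent.
            if token == model.token_eos() or produced >= max_tokens:
                break
        if batch:
            output += model.detokenize(batch).decode("utf-8", errors="ignore")
            yield output

Inside the method shown in the diff, the equivalent call would be along the lines of yield from stream_batched(self.model, input_tokens, max_tokens=max_tokens).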