Update app.py
app.py
CHANGED
@@ -34,7 +34,10 @@ class OptimizedLLMInterface:
             model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
             n_ctx=context_size,
             n_threads=num_threads,
-            n_batch=512  # Increased batch size for better CPU utilization
+            n_batch=512,  # Increased batch size for better CPU utilization
+            logits_all=False,  # Disable unnecessary logit calculations
+            embedding=False,  # Disable embedding output (not needed for generation)
+            offload_kqv=True  # Enable memory optimizations
         )
 
     def generate_response(
@@ -96,7 +99,7 @@ def main():
     # Create and launch the demo
     demo = create_demo(llm)
     demo.queue(max_size=10)  # Limit queue size to prevent overload
-    demo.launch()
+    demo.launch(quiet=True)
 
 if __name__ == "__main__":
     main()
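For context, the change amounts to passing a few extra options to the model constructor and to the Gradio launch call. Below is a minimal sketch of the resulting initialization, assuming the app wraps llama-cpp-python's `Llama` class (inferred from the `n_ctx`/`n_batch`/`offload_kqv` parameter names); the `load_model` helper and its default argument values are illustrative, not part of app.py:

```python
from huggingface_hub import hf_hub_download
from llama_cpp import Llama  # assumed backend, inferred from the parameter names


def load_model(model_repo_id: str, model_filename: str,
               context_size: int = 2048, num_threads: int = 4) -> Llama:
    """Illustrative wrapper mirroring the updated constructor call in app.py."""
    return Llama(
        model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
        n_ctx=context_size,     # context window (prompt + generated tokens)
        n_threads=num_threads,  # CPU threads used for inference
        n_batch=512,            # prompt-eval batch size; larger batches help CPU throughput
        logits_all=False,       # compute logits for the last token only
        embedding=False,        # skip embedding output; not needed for text generation
        offload_kqv=True,       # offload KV-cache work to the GPU when one is available
    )
```

In recent llama-cpp-python releases these values (`n_batch=512`, `logits_all=False`, `embedding=False`, `offload_kqv=True`) match the constructor defaults, so the added arguments mostly make the intended configuration explicit. On the Gradio side, `launch(quiet=True)` suppresses most of Gradio's startup console output, while the existing `queue(max_size=10)` continues to cap pending requests.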