Update app.py
app.py
CHANGED
@@ -34,7 +34,10 @@ class OptimizedLLMInterface:
             model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
             n_ctx=context_size,
             n_threads=num_threads,
-            n_batch=512  # Increased batch size for better CPU utilization
+            n_batch=512,  # Increased batch size for better CPU utilization
+            logits_all=False,  # Disable unnecessary logit calculations
+            embedding=False,  # Disable embedding output (not needed for generation)
+            offload_kqv=True  # Enable memory optimizations
         )
 
     def generate_response(
@@ -96,7 +99,7 @@ def main():
     # Create and launch the demo
     demo = create_demo(llm)
     demo.queue(max_size=10)  # Limit queue size to prevent overload
-    demo.launch()
+    demo.launch(quiet=True)
 
 if __name__ == "__main__":
     main()
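For context, the change amounts to passing a few extra options to the model constructor and to the Gradio launch call. Below is a minimal sketch of the resulting initialization, assuming the app wraps llama-cpp-python's `Llama` class (inferred from the `n_ctx`/`n_batch`/`offload_kqv` parameter names); the `load_model` helper and its default argument values are illustrative, not part of app.py:

```python
from huggingface_hub import hf_hub_download
from llama_cpp import Llama  # assumed backend, inferred from the parameter names


def load_model(model_repo_id: str, model_filename: str,
               context_size: int = 2048, num_threads: int = 4) -> Llama:
    """Illustrative wrapper mirroring the updated constructor call in app.py."""
    return Llama(
        model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
        n_ctx=context_size,     # context window (prompt + generated tokens)
        n_threads=num_threads,  # CPU threads used for inference
        n_batch=512,            # prompt-eval batch size; larger batches help CPU throughput
        logits_all=False,       # compute logits for the last token only
        embedding=False,        # skip embedding output; not needed for text generation
        offload_kqv=True,       # offload KV-cache work to the GPU when one is available
    )
```

In recent llama-cpp-python releases these values (`n_batch=512`, `logits_all=False`, `embedding=False`, `offload_kqv=True`) match the constructor defaults, so the added arguments mostly make the intended configuration explicit. On the Gradio side, `launch(quiet=True)` suppresses most of Gradio's startup console output, while the existing `queue(max_size=10)` continues to cap pending requests.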