jwu323 committed on
Commit
e8e6330
·
verified ·
1 Parent(s): 4d7e82f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -2
app.py CHANGED
@@ -34,7 +34,10 @@ class OptimizedLLMInterface:
34
  model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
35
  n_ctx=context_size,
36
  n_threads=num_threads,
37
- n_batch=512 # Increased batch size for better CPU utilization
 
 
 
38
  )
39
 
40
  def generate_response(
@@ -96,7 +99,7 @@ def main():
96
  # Create and launch the demo
97
  demo = create_demo(llm)
98
  demo.queue(max_size=10) # Limit queue size to prevent overload
99
- demo.launch()
100
 
101
  if __name__ == "__main__":
102
  main()
 
34
  model_path=hf_hub_download(repo_id=model_repo_id, filename=model_filename),
35
  n_ctx=context_size,
36
  n_threads=num_threads,
37
+ n_batch=512, # Increased batch size for better CPU utilization
38
+ logits_all=False, # Disable unnecessary logit calculations
39
+ embedding=False, # Disable embedding cache
40
+ offload_kqv=True # Enable memory optimizations
41
  )
42
 
43
  def generate_response(
 
99
  # Create and launch the demo
100
  demo = create_demo(llm)
101
  demo.queue(max_size=10) # Limit queue size to prevent overload
102
+ demo.launch(quiet=True)
103
 
104
  if __name__ == "__main__":
105
  main()