Damien Benveniste committed on
Commit
ae23345
·
1 Parent(s): a959d74
Files changed (1) hide show
  1. app.py +1 -1
app.py CHANGED
@@ -16,7 +16,7 @@ engine = AsyncLLMEngine.from_engine_args(
16
  max_num_batched_tokens=512, # Reduced for T4
17
  max_num_seqs=16, # Reduced for T4
18
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
19
- max_model_len=4096, # Phi-3-mini-4k context length
20
  quantization='awq', # Enable quantization if supported by the model
21
  enforce_eager=True, # Disable CUDA graph
22
  dtype='half', # Use half precision
 
16
  max_num_batched_tokens=512, # Reduced for T4
17
  max_num_seqs=16, # Reduced for T4
18
  gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
19
+ max_model_len=512, # Reduced from 4096 (Phi-3-mini-4k max) to fit T4 memory
20
  quantization='awq', # Enable quantization if supported by the model
21
  enforce_eager=True, # Disable CUDA graph
22
  dtype='half', # Use half precision