Update app.py
Browse files
app.py
CHANGED
@@ -34,7 +34,7 @@ tokenizer = AutoTokenizer.from_pretrained(model)
 # max_tokens is for the maximum length for generation.

 # Input the model name or path. Can be GPTQ or AWQ models.
-llm = LLM(model=model)
+llm = LLM(model=model, quantization="gptq", kv_cache_dtype="fp8_e5m2")

 @spaces.GPU
 def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):