vilarin committed on
Commit
710394e
·
verified ·
1 Parent(s): 9443a16

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -1
app.py CHANGED
@@ -34,7 +34,7 @@ tokenizer = AutoTokenizer.from_pretrained(model)
34
  # max_tokens is for the maximum length for generation.
35
 
36
  # Input the model name or path. Can be GPTQ or AWQ models.
37
- llm = LLM(model=model)
38
 
39
  @spaces.GPU
40
  def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):
 
34
  # max_tokens is for the maximum length for generation.
35
 
36
  # Input the model name or path. Can be GPTQ or AWQ models.
37
+ llm = LLM(model=model, quantization="gptq", kv_cache_dtype="fp8_e5m2")
38
 
39
  @spaces.GPU
40
  def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):