Update app.py
Browse files
app.py
CHANGED
@@ -34,7 +34,7 @@ tokenizer = AutoTokenizer.from_pretrained(model)
 # max_tokens is for the maximum length for generation.

 # Input the model name or path. Can be GPTQ or AWQ models.
-llm = LLM(model=model)
+llm = LLM(model=model, quantization="gptq", kv_cache_dtype="fp8_e5m2")

 @spaces.GPU
 def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):