Spaces:

vilarin
/

vllm-chat

Paused

vilarin committed on Jun 7, 2024

Commit

98ca206

verified ·

1 Parent(s): 8b4873c

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -30,11 +30,7 @@ footer {
 # Initialize the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model)
-# Pass the default decoding hyperparameters of Qwen2-7B-Instruct
-# max_tokens is for the maximum length for generation.
-# Input the model name or path. Can be GPTQ or AWQ models.
-llm = LLM(model=model, kv_cache_dtype="fp8_e5m2")
 @spaces.GPU
 def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):

 # Initialize the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(model)
+llm = LLM(model=model)
 @spaces.GPU
 def generate(message, history, system, max_tokens, temperature, top_p, top_k, penalty):