Spaces:

msy127
/

TheBloke-Llama-2-7b-Chat-GPTQ

Runtime error

MS-YUN committed on Sep 24, 2023

Commit

be2dcb9

1 Parent(s): 1a9352f

Add application file6

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,13 +1,25 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
-model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
-model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
-                                             device_map="auto",
-                                             trust_remote_code=False,
-                                             revision="main")
-tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 def predict(message, chatbot, temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0,):

+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_name ="NousResearch/Llama-2-7b-chat-hf"
+bnb_config = BitsAndBytesConfig(
+ load_in_4bit=True,
+ bnb_4bit_quant_type="nf4",
+ bnb_4bit_compute_dtype=torch.bfloat16,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    quantization_config=bnb_config)
+model.config.use_cache = False
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
 def predict(message, chatbot, temperature=0.9, max_new_tokens=256, top_p=0.6, repetition_penalty=1.0,):