Qwen-VL-Chat

Paused

Tonic committed on Nov 14, 2023

Commit

ea7c9d2

1 Parent(s): 7c96374

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -21,7 +21,7 @@ model_name = "OpenLLM-France/Claire-7B-0.1"
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
 model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
     device_map="auto",
-    torch_dtype=torch.bfloat16.to("cuda"),
     load_in_4bit=True                          # For efficient inference, if supported by the GPU card
 )
 model = model.to_bettertransformer()
@@ -58,6 +58,7 @@ class FalconChatBot:
         conversation = f"{self.system_prompt}\n {assistant_message if assistant_message else ''}\n {user_message}\n "
         # Encode the conversation using the tokenizer
         input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
         # Generate a response using the Falcon model
         response = model.generate(
             input_ids=input_ids,
@@ -76,7 +77,6 @@ class FalconChatBot:
         # Decode the generated response to text
         response_text = tokenizer.decode(response[0], skip_special_tokens=True)
         # Update and return the history with the new conversation
         updated_history = processed_history + [{"user": user_message, "assistant": response_text}]
         return response_text, updated_history

 tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
 model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
     device_map="auto",
+    torch_dtype=torch.bfloat16
     load_in_4bit=True                          # For efficient inference, if supported by the GPU card
 )
 model = model.to_bettertransformer()
         conversation = f"{self.system_prompt}\n {assistant_message if assistant_message else ''}\n {user_message}\n "
         # Encode the conversation using the tokenizer
         input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
+        input_ids = input_ids.to(device)
         # Generate a response using the Falcon model
         response = model.generate(
             input_ids=input_ids,
         # Decode the generated response to text
         response_text = tokenizer.decode(response[0], skip_special_tokens=True)
         # Update and return the history with the new conversation
         updated_history = processed_history + [{"user": user_message, "assistant": response_text}]
         return response_text, updated_history