Tonic committed on
Commit 7c96374 · 1 Parent(s): f465778

Update app.py

Files changed (1)
app.py +23 -9
app.py CHANGED
@@ -21,9 +21,10 @@ model_name = "OpenLLM-France/Claire-7B-0.1"
 tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
 model = transformers.AutoModelForCausalLM.from_pretrained(model_name,
     device_map="auto",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.bfloat16.to("cuda"),
     load_in_4bit=True # For efficient inference, if supported by the GPU card
 )
+model = model.to_bettertransformer()

 # Class to encapsulate the Falcon chatbot
 class FalconChatBot:
@@ -50,22 +51,35 @@ class FalconChatBot:
         return filtered_history

     def predict(self, user_message, assistant_message, history, temperature=0.4, max_new_tokens=700, top_p=0.99, repetition_penalty=1.9):
-
+        input_ids = input_ids.to(device)
         # Process the history to remove special commands
         processed_history = self.process_history(history)
         # Combine the user and assistant messages into a conversation
-        conversation = f"{self.system_prompt}\nFalcon: {assistant_message if assistant_message else ''} User: {user_message}\nFalcon:\n"
+        conversation = f"{self.system_prompt}\n {assistant_message if assistant_message else ''}\n {user_message}\n "
         # Encode the conversation using the tokenizer
         input_ids = tokenizer.encode(conversation, return_tensors="pt", add_special_tokens=False)
         # Generate a response using the Falcon model
-        response = model.generate(input_ids=input_ids, max_length=max_length, use_cache=False, early_stopping=False, bos_token_id=model.config.bos_token_id, eos_token_id=model.config.eos_token_id, pad_token_id=model.config.eos_token_id, temperature=0.4, do_sample=True)
+        response = model.generate(
+            input_ids=input_ids,
+            max_length=max_length,
+            use_cache=False,
+            early_stopping=False,
+            bos_token_id=model.config.bos_token_id,
+            eos_token_id=model.config.eos_token_id,
+            pad_token_id=model.config.eos_token_id,
+            temperature=temperature,
+            do_sample=True,
+            max_new_tokens=max_new_tokens,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty
+        ) # Decode the generated response to text
+
         # Decode the generated response to text
         response_text = tokenizer.decode(response[0], skip_special_tokens=True)
-        # Append the Falcon-like conversation to the history
-        self.history.append(conversation)
-        self.history.append(response_text)
-
-        return response_text
+
+        # Update and return the history with the new conversation
+        updated_history = processed_history + [{"user": user_message, "assistant": response_text}]
+        return response_text, updated_history


     # Create the Falcon chatbot instance
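A note on the loading change in the first hunk: `torch.bfloat16` is a `torch.dtype`, not a tensor or module, so `torch.bfloat16.to("cuda")` raises an AttributeError at import time; `device_map="auto"` already handles GPU placement, so the dtype can be passed as-is. Below is a minimal sketch of a loading path that runs as written. It assumes `bitsandbytes` is installed for `load_in_4bit` and `optimum` for `to_bettertransformer()`; whether BetterTransformer accepts this architecture, and a 4-bit-quantized model in particular, depends on the installed versions.

# Sketch only (not the committed code): a loading path that runs as written.
# Assumes torch, transformers, bitsandbytes (for load_in_4bit) and
# optimum (for to_bettertransformer) are installed and a CUDA GPU is present.
import torch
import transformers

model_name = "OpenLLM-France/Claire-7B-0.1"

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",           # accelerate places the weights on the GPU
    torch_dtype=torch.bfloat16,  # pass the dtype itself; a dtype has no .to()
    load_in_4bit=True,           # efficient inference, if the GPU supports it
)
model = model.to_bettertransformer()  # optional; requires optimum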
 
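In the new predict(), `input_ids = input_ids.to(device)` runs before `input_ids` is assigned, and neither `device` nor `max_length` is defined anywhere in the diff shown, so the method would raise at runtime; `generate()` is also given both `max_length` and `max_new_tokens`. Below is a sketch of the method body with the encode/move order fixed and only `max_new_tokens` kept. It preserves the committed names and, like the file, uses the module-level `tokenizer` and `model`; `model.device` stands in for the undefined `device`.

# Sketch only: predict() reordered so it runs, keeping the committed behavior.
class FalconChatBot:
    # ... __init__ and process_history as elsewhere in app.py ...

    def predict(self, user_message, assistant_message, history,
                temperature=0.4, max_new_tokens=700, top_p=0.99,
                repetition_penalty=1.9):
        # Process the history to remove special commands
        processed_history = self.process_history(history)
        # Combine the system prompt and messages into a conversation
        conversation = f"{self.system_prompt}\n {assistant_message if assistant_message else ''}\n {user_message}\n "
        # Encode the conversation first, then move the ids to the model's device
        input_ids = tokenizer.encode(conversation, return_tensors="pt",
                                     add_special_tokens=False)
        input_ids = input_ids.to(model.device)
        # Generate a response using the model
        response = model.generate(
            input_ids=input_ids,
            use_cache=False,
            bos_token_id=model.config.bos_token_id,
            eos_token_id=model.config.eos_token_id,
            pad_token_id=model.config.eos_token_id,
            temperature=temperature,
            do_sample=True,
            max_new_tokens=max_new_tokens,  # max_length dropped: undefined in the diff
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # Decode the generated response to text
        response_text = tokenizer.decode(response[0], skip_special_tokens=True)
        # Update and return the history with the new turn
        updated_history = processed_history + [{"user": user_message, "assistant": response_text}]
        return response_text, updated_history

Note the changed signature: predict() now returns a (response_text, updated_history) tuple rather than a bare string, so any caller wired to the old single return value needs to unpack both.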