Braszczynski committed
Commit e1c82eb · verified · 1 Parent(s): a9938e0

Update app.py

Files changed (1)
  1. app.py +29 -40
app.py CHANGED
@@ -11,22 +11,16 @@ max_seq_length = 512 # Adjust as needed
 dtype = None # Example dtype, adjust based on your setup
 load_in_4bit = True # Set to True if you want to use 4-bit quantization
 
-# Load the model and tokenizer using FastLanguageModel
-model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=model_name,
-    max_seq_length=max_seq_length,
-    dtype=dtype,
-    load_in_4bit=load_in_4bit,
-)
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
 
-# Load the adapter
+# Load the base model with adapters
+model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True)
 model.load_adapter(lora_adapter)
 
-# Enable native 2x faster inference
-FastLanguageModel.for_inference(model)
-
-# Optional: Initialize TextStreamer if you plan to use streaming
-# text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+# Move the model to CPU
+device = torch.device("cpu")
+model.to(device)
 
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -34,34 +28,29 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     for user_msg, bot_reply in history:
         chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
     chat_history += f"User: {message}\nAssistant:"
-
-    # Apply chat template and tokenize the input
-    inputs = tokenizer.apply_chat_template(
-        [{"role": "user", "content": message}] if not history else [
-            {"role": "system", "content": system_message}] + [
-            {"role": "user", "content": msg} for msg, _ in history
-        ] + [{"role": "assistant", "content": reply} for _, reply in history] + [
-            {"role": "user", "content": message}
-        ],
-        tokenize=True,
-        add_generation_prompt=True, # Must add for generation
+
+    # Prepare the input for the model
+    inputs = tokenizer(
+        chat_history,
         return_tensors="pt",
-    ).to("cuda")
-
-    # Generate response
-    outputs = model.generate(
-        input_ids=inputs["input_ids"],
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        pad_token_id=tokenizer.eos_token_id,
-        use_cache=True
-        # streamer=text_streamer # Uncomment if using streaming
-    )
-
-    # Decode and format the output
+        truncation=True,
+        max_length=max_seq_length,
+    ).to(device)
+
+    # Generate the response
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids=inputs["input_ids"],
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            pad_token_id=tokenizer.eos_token_id,
+            use_cache=True
+        )
+
+    # Decode and format the response
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    response = response[len(chat_history):].strip() # Remove input context from output
+    response = response[len(chat_history):].strip() # Remove the input context
     return response
 
 # Define the Gradio interface
@@ -76,4 +65,4 @@ demo = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
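For readers who want to run the updated Space locally: the hunks start at line 11 of app.py, so the import/configuration block and the gr.ChatInterface argument list never appear in the diff. The sketch below is a hedged reconstruction of the whole file after this commit; the import of AutoAdapterModel from the adapters package, the model_name and lora_adapter placeholders, the chat_history initialization, and the ChatInterface inputs are illustrative guesses, not part of the commit.

# app.py -- hedged reconstruction of the file after this commit.
# Everything marked "assumed" below is a guess filled in around the diff.
import torch
import gradio as gr
from transformers import AutoTokenizer
from adapters import AutoAdapterModel  # assumed origin of AutoAdapterModel

model_name = "your-base-model-id"      # assumed placeholder
lora_adapter = "your-lora-adapter-id"  # assumed placeholder
max_seq_length = 512  # Adjust as needed
dtype = None  # Example dtype, adjust based on your setup
load_in_4bit = True  # Set to True if you want to use 4-bit quantization

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load the base model with adapters
model = AutoAdapterModel.from_pretrained(model_name, low_cpu_mem_usage=True)
model.load_adapter(lora_adapter)

# Move the model to CPU
device = torch.device("cpu")
model.to(device)

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Combine system message and chat history
    chat_history = f"{system_message}\n"  # assumed: this line sits between the two hunks
    for user_msg, bot_reply in history:
        chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
    chat_history += f"User: {message}\nAssistant:"

    # Tokenize the flattened prompt, truncating to the configured context length
    inputs = tokenizer(
        chat_history,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
    ).to(device)

    # Generate on CPU without tracking gradients
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode the full sequence and strip the prompt text from the front
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response[len(chat_history):].strip()
    return response

# Define the Gradio interface (argument list is assumed; only the opening line
# and the closing parenthesis appear as context in the diff)
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Compared with the previous Unsloth/FastLanguageModel setup, this version trades 4-bit GPU inference for a plain CPU path, which is why the dtype and load_in_4bit settings remain defined but are no longer passed anywhere in the shown code.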