Braszczynski committed
Commit a9938e0
1 Parent(s): cec8b5d

Update app.py

Files changed (1)
  1. app.py +39 -14
app.py CHANGED
@@ -1,19 +1,32 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer
-from adapters import AutoAdapterModel
+from unsloth import FastLanguageModel
+from transformers import TextStreamer

-model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
+# Configuration Variables
+model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"  # Replace with your actual model name
+lora_adapter = "Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps"

-# Load tokenizer with 4-bit quantization
-tokenizer = AutoTokenizer.from_pretrained(model_name, load_in_4bit=True)
+max_seq_length = 512  # Adjust as needed
+dtype = None  # Example dtype, adjust based on your setup
+load_in_4bit = True  # Set to True if you want to use 4-bit quantization

-# Load the base model with adapters, ensuring it's loaded in 4-bit
-model = AutoAdapterModel.from_pretrained(model_name, load_in_4bit=True)
+# Load the model and tokenizer using FastLanguageModel
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=model_name,
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
+)

 # Load the adapter
-model.load_adapter("Braszczynski/Llama-3.2-3B-Instruct-bnb-4bit-460steps")
+model.load_adapter(lora_adapter)
+
+# Enable native 2x faster inference
+FastLanguageModel.for_inference(model)

+# Optional: Initialize TextStreamer if you plan to use streaming
+# text_streamer = TextStreamer(tokenizer, skip_prompt=True)

 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Combine system message and chat history
@@ -22,16 +35,28 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     chat_history += f"User: {user_msg}\nAssistant: {bot_reply}\n"
     chat_history += f"User: {message}\nAssistant:"

-    # Tokenize the input
-    inputs = tokenizer(chat_history, return_tensors="pt", truncation=True).to("cuda")
+    # Apply chat template and tokenize the input
+    inputs = tokenizer.apply_chat_template(
+        [{"role": "user", "content": message}] if not history else [
+            {"role": "system", "content": system_message}] + [
+            {"role": "user", "content": msg} for msg, _ in history
+        ] + [{"role": "assistant", "content": reply} for _, reply in history] + [
+            {"role": "user", "content": message}
+        ],
+        tokenize=True,
+        add_generation_prompt=True,  # Must add for generation
+        return_tensors="pt",
+    ).to("cuda")

-    # Generate response with reduced max tokens if necessary
+    # Generate response
     outputs = model.generate(
-        inputs["input_ids"],
-        max_new_tokens=max_tokens,  # Consider setting a lower default
+        input_ids=inputs["input_ids"],
+        max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
-        pad_token_id=tokenizer.eos_token_id
+        pad_token_id=tokenizer.eos_token_id,
+        use_cache=True
+        # streamer=text_streamer  # Uncomment if using streaming
     )

     # Decode and format the output
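
Note on the new tokenization block (a sketch, not part of the commit): in recent transformers releases, tokenizer.apply_chat_template(..., tokenize=True, return_tensors="pt") returns a bare tensor of token ids unless return_dict=True is also passed, so the inputs["input_ids"] lookup above can fail at runtime; the committed message list also appends all user turns before all assistant turns instead of interleaving them. A minimal alternative respond() body, assuming the model and tokenizer objects created above (build_messages is an illustrative helper, not from the commit):

# Sketch only: alternative respond() body, assuming the model/tokenizer above
# and a transformers version that supports return_dict=True in apply_chat_template.
import torch


def build_messages(message, history, system_message):
    # Interleave past (user, assistant) turns so the chat template sees the
    # conversation in order, then append the new user message.
    messages = [{"role": "system", "content": system_message}] if system_message else []
    for user_msg, bot_reply in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_reply})
    messages.append({"role": "user", "content": message})
    return messages


def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = build_messages(message, history, system_message)

    # return_dict=True yields {"input_ids", "attention_mask"}; without it,
    # apply_chat_template returns a bare tensor of token ids.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # append the assistant header for generation
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode only the newly generated tokens, not the prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()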
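
The Gradio wiring is outside this diff, but the respond(message, history, system_message, max_tokens, temperature, top_p) signature matches gr.ChatInterface with additional_inputs, so the unchanged tail of app.py presumably looks roughly like the sketch below (labels and default values are placeholders, not taken from the commit):

# Sketch only: typical gr.ChatInterface wiring for a respond() function with
# extra controls; the actual launch code is not shown in this diff.
import gradio as gr

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()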