GSridhar1982 committed
Commit 3f577ae · verified · 1 Parent(s): 29365ca

Updated with unsloth inference pipeline

Files changed (1):
  app.py (+40, -17)
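
The change swaps the app's plain PEFT/transformers loading path for Unsloth's FastLanguageModel inference pipeline. As a rough orientation before the diff, a minimal sketch of that pattern — placeholder model id, CUDA GPU assumed, not code taken from this commit:

from unsloth import FastLanguageModel

# Load a fine-tuned model the Unsloth way; "your-user/your-finetuned-model" is a placeholder.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "your-user/your-finetuned-model",
    max_seq_length = 2048,   # matches the constant added in this commit
    dtype = None,            # auto-detect: float16 on T4/V100, bfloat16 on Ampere+
    load_in_4bit = True,     # 4-bit quantization to reduce memory usage
)
FastLanguageModel.for_inference(model)   # enable Unsloth's faster inference mode

# Tokenize a prompt, generate, and decode.
inputs = tokenizer(["Please provide the answer for the question: What is photosynthesis?"],
                   return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])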
app.py CHANGED
@@ -1,9 +1,27 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-import nltk
-nltk.download('punkt')
 from peft import AutoPeftModelForCausalLM
 from transformers import AutoTokenizer
+from unsloth import FastLanguageModel
+import torch
+import nltk
+nltk.download('punkt')
+import re
+
+qa_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
+
+### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
+dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+
 
 def preprocess_text(text):
     # Convert to lowercase
@@ -14,27 +32,32 @@ def preprocess_text(text):
     text = ' '.join(text.split())
     return text
 
-def generate_answer(model_name,question):
-    model = AutoPeftModelForCausalLM.from_pretrained(
-        model_name, # YOUR MODEL YOU USED FOR TRAINING
+def generate_answer(model_name, question, load_in_4bit=True): # Added load_in_4bit as a parameter with a default value
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name = model_name, # YOUR MODEL YOU USED FOR TRAINING
+        max_seq_length = max_seq_length,
+        dtype = dtype,
         load_in_4bit = load_in_4bit,
+        #load_in_8bit_fp32_cpu_offload=True, # Add this line to enable CPU offloading
+        device_map = {"": 0}, # Add this line to specify GPU 0 for model placement
     )
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    question_preprocessed = preprocess_text(question)
+    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
     inputs = tokenizer(
     [
-    qa_prompt.format(
+        qa_prompt.format(
            "Please provide the answer for the question", # instruction
-           question_preprocessed, # input
+           question, # input
            "", # output - leave this blank for generation!
-    )
-    ], return_tensors = "pt")
-    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
-    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-    predicted_title = nltk.sent_tokenize(decoded_output.strip())[0]
-    return predicted_title
+        )
+    ], return_tensors = "pt").to("cuda")
 
+    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
+    predicted_answer = tokenizer.batch_decode(outputs)
+    response = predicted_answer[0]
+    response = response.split("### Response:")[-1].strip()
+    response = "".join(response)
+    response = response.replace("</s>", "")
+    return response
 
 iface = gr.Interface(
     fn=generate_answer,
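
A usage note, not part of the commit: inside the app, generate_answer inserts the question into the Alpaca-style qa_prompt and recovers the answer by splitting the decoded output on "### Response:". It can also be exercised directly, outside the Gradio interface, roughly like this (placeholder model id; a CUDA device is required since the inputs are moved to "cuda"):

# Hypothetical direct call; the model id is a placeholder, not taken from this commit.
answer = generate_answer(
    "your-user/your-finetuned-model",    # fine-tuned adapter/model repo (placeholder)
    "What causes the seasons on Earth?"  # example question
)
print(answer)  # text after the "### Response:" marker, with "</s>" removed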