import re
import torch
import gradio as gr
import nltk
from unsloth import FastLanguageModel
from peft import AutoPeftModelForCausalLM  # not used directly below
from transformers import AutoTokenizer     # not used directly; FastLanguageModel also returns the tokenizer

nltk.download('punkt')  # NLTK tokenizer data; not required by the code below
qa_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse extra whitespace
    text = ' '.join(text.split())
    return text
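# preprocess_text is not wired into the pipeline below; a sketch of how it could normalize a
# question before prompting, using an illustrative input string:
#
#     question = preprocess_text("What is Back-Propagation?")  # -> "what is backpropagation"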
def generate_answer(model_name, question, load_in_4bit=True):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,      # the fine-tuned model selected in the UI
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # load_in_8bit_fp32_cpu_offload = True,  # enable CPU offloading if GPU memory is tight
        device_map = {"": 0},         # place the model on GPU 0
    )
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    inputs = tokenizer(
        [
            qa_prompt.format(
                "Please provide the answer for the question",  # instruction
                question,                                       # input
                "",                                             # output - leave blank for generation
            )
        ],
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    predicted_answer = tokenizer.batch_decode(outputs)

    # Keep only the text after the "### Response:" marker and drop the EOS token.
    response = predicted_answer[0]
    response = response.split("### Response:")[-1].strip()
    response = response.replace("</s>", "")
    return response
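# A minimal sketch of calling generate_answer outside the Gradio UI, assuming a CUDA GPU is
# available and the fine-tuned model can be downloaded; the question is only an example:
#
#     answer = generate_answer(
#         "GSridhar1982/AIML_QA_Mistral7B_FineTuned_Unsloth",
#         "What is overfitting in machine learning?",
#         load_in_4bit=True,
#     )
#     print(answer)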
iface = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.Dropdown(
            choices=["GSridhar1982/AIML_QA_Mistral7B_FineTuned_Unsloth"],
            label="Select Model",
        ),
        gr.Textbox(lines=5, label="Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

iface.launch(debug=True)