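# An earlier iteration of this app, kept in the string below for reference.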
'''
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM
import torch


# Load the model and tokenizer
def load_model():
    # base_model_name = "unsloth/llama-3.2-1b-instruct-bnb-4bit"  # Replace with your base model name
    lora_model_name = "sreyanghosh/lora_model"  # Replace with your LoRA model path

    # tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    # model = AutoModelForCausalLM.from_pretrained(
    #     base_model_name,
    #     device_map="auto" if torch.cuda.is_available() else None,
    #     load_in_8bit=not torch.cuda.is_available(),
    # )
    # model = PeftModel.from_pretrained(model, lora_model_name)
    model = AutoPeftModelForCausalLM.from_pretrained(
        lora_model_name,  # the model used for training
        load_in_4bit=False,
    )
    tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
    model.eval()
    return tokenizer, model


tokenizer, model = load_model()


# Define the respond function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Prepare the conversation history
    messages = [{"role": "system", "content": system_message}]
    for user_input, bot_response in history:
        if user_input:
            messages.append({"role": "user", "content": user_input})
        if bot_response:
            messages.append({"role": "assistant", "content": bot_response})
    messages.append({"role": "user", "content": message})

    # Format the input for the model
    conversation_text = "\n".join(
        f"{msg['role']}: {msg['content']}" for msg in messages
    )
    inputs = tokenizer(conversation_text, return_tensors="pt", truncation=True)

    # Generate the model's response
    outputs = model.generate(
        inputs.input_ids,
        max_length=len(inputs.input_ids[0]) + max_tokens,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the new response
    new_response = response[len(conversation_text):].strip()
    yield new_response


# Gradio app configuration
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
'''
import gradio as gr
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
import torch
# Load the model and tokenizer
def load_model():
    lora_model_name = "sreyanghosh/lora_model"  # Replace with your LoRA model path

    # Try loading without 4-bit quantization
    model = AutoPeftModelForCausalLM.from_pretrained(
        lora_model_name,
        torch_dtype=torch.float32,  # ensure no low-bit quantization
        device_map="auto" if torch.cuda.is_available() else None,  # standard device mapping
        load_in_4bit=False,  # redundant, but safe to state explicitly
    )
    tokenizer = AutoTokenizer.from_pretrained(lora_model_name)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    return tokenizer, model


# Load once at startup so respond() can reuse the same tokenizer and model
tokenizer, model = load_model()
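# Optional: on a GPU Space with bitsandbytes installed, the adapter could be
# loaded in 4-bit instead to cut memory use. A minimal sketch under that
# assumption (not used by this app):
#
#   from transformers import BitsAndBytesConfig
#   model = AutoPeftModelForCausalLM.from_pretrained(
#       "sreyanghosh/lora_model",
#       quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#       device_map="auto",
#   )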
# Define the respond function
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Prepare the conversation history
    messages = [{"role": "system", "content": system_message}]
    for user_input, bot_response in history:
        if user_input:
            messages.append({"role": "user", "content": user_input})
        if bot_response:
            messages.append({"role": "assistant", "content": bot_response})
    messages.append({"role": "user", "content": message})

    # Format the input for the model as a plain "role: content" transcript
    conversation_text = "\n".join(
        f"{msg['role']}: {msg['content']}" for msg in messages
    )
    inputs = tokenizer(conversation_text, return_tensors="pt", truncation=True).to(model.device)

    # Generate the model's response; do_sample=True is needed for the
    # temperature and top-p sliders to have any effect
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Keep only the newly generated assistant turn
    new_response = response.split("assistant:")[-1].strip()
    yield new_response
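# Alternative prompt construction (not used above): Llama-3.x instruct
# tokenizers usually ship a chat template, so the plain "role: content"
# transcript could be replaced with the model's native chat format. A minimal
# sketch, assuming this tokenizer carries such a template:
#
#   input_ids = tokenizer.apply_chat_template(
#       messages, add_generation_prompt=True, return_tensors="pt"
#   ).to(model.device)
#   outputs = model.generate(input_ids, max_new_tokens=max_tokens, do_sample=True)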
# Gradio app configuration
demo = gr.ChatInterface(
    fn=respond,
    chatbot=gr.Chatbot(label="Assistant"),  # Use a Gradio Chatbot component
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
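# Optional: for slow generations, requests can be queued before launching,
# e.g. demo.queue().launch(). This is a suggested variant, not part of the
# original configuration.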
if __name__ == "__main__":
    demo.launch()