import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr
# Load the base model
base_model_name = "unsloth/llama-3.2-3b-instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)  # Llama 3.x ships a fast tokenizer only
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",          # Automatically map layers to available devices
    torch_dtype=torch.float16,  # Ensure compatibility with 4-bit quantization
)

# Load the LoRA adapter
adapter_path = "Grandediw/lora_model" # Replace with your model path
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval() # Set the model to evaluation mode

# Define the inference function
def respond(
    message,
    history: list[tuple[str, str]],
    max_tokens,
    temperature,
    top_p,
):
    # Build the prompt from the conversation history
    context = ""
    for user_message, assistant_message in history:
        context += f"User: {user_message}\nAssistant: {assistant_message}\n"
    context += f"User: {message}\nAssistant:"

    # Tokenize the input and move it to the model's device
    inputs = tokenizer(context, return_tensors="pt").to(model.device)

    # Generate a response
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # Llama has no pad token; fall back to EOS
    )

    # Decode only the newly generated tokens and return them
    response = tokenizer.decode(
        outputs[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True
    )
    return response
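
# Note: Llama 3.2 Instruct models ship a chat template, so an alternative to the
# hand-rolled "User:/Assistant:" prompt above is to let the tokenizer format the
# conversation. This helper is a sketch, not part of the original app; it assumes
# the unsloth tokenizer preserves the base model's chat template.
def build_templated_prompt(message, history):
    messages = []
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})
    messages.append({"role": "user", "content": message})
    # apply_chat_template returns a single prompt string ready for tokenization
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )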

# Build the Gradio ChatInterface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=1.5, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()