# LoRA Inference Gradio Space Demo (running on ZeroGPU)
import spaces  # ZeroGPU helper; import before torch/CUDA-touching libraries
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the 4-bit quantized base model (requires bitsandbytes and accelerate)
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="auto",
    torch_dtype="auto",
)

# Attach the LoRA adapter to the base model
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B",
)

# Load the tokenizer that matches the base model
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")

# @spaces.GPU requests a ZeroGPU slice for the duration of each call;
# without it, `import spaces` does nothing and inference never gets a GPU.
@spaces.GPU
def generate_response(prompt):
    # Tokenize the prompt and move it to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
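# Optional sketch (not part of the original demo): the base checkpoint is an
# instruct-tuned model, so wrapping the prompt in the model's chat template
# usually gives better completions than feeding raw text. `apply_chat_template`
# is a standard transformers tokenizer method; the helper name
# `generate_chat_response` is illustrative, not from the original Space.
@spaces.GPU
def generate_chat_response(prompt):
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # append the assistant-turn header
        return_tensors="pt",
    ).to(model.device)
    outputs = model.generate(input_ids, max_new_tokens=50)
    # Decode only the newly generated tokens, not the echoed prompt
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)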
# Build a simple Gradio UI around the generate function
zk_qwen = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Inference",
    description="Demo your LoRA model with Hugging Face Gradio.",
)

zk_qwen.launch()
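# Deployment note (our assumption, not stated in the original Space): the Space
# repo also needs a requirements.txt so the build installs the libraries this
# script imports. A plausible minimal set, versions unpinned:
#   torch
#   transformers
#   peft
#   accelerate      # required for device_map="auto"
#   bitsandbytes    # required for the bnb-4bit quantized checkpoint
# gradio and the `spaces` package are provided by the ZeroGPU Space runtime.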