# LoRA Inference Gradio Space Demo
import spaces
import gradio as gr
from threading import Thread

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="auto",
    torch_dtype="auto",
)

# Load the LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B",
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")

@spaces.GPU
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)
    # Streamed response: run generate() in a background thread and read
    # decoded text from a TextIteratorStreamer as tokens are produced
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=300,  # Increased token limit
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    # Yield the accumulated output so the textbox updates in real time
    response = ""
    for new_text in streamer:
        response += new_text
        yield response

demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo of a LoRA-adapted model answering with step-by-step reasoning in Hugging Face Gradio.",
)

demo.launch()