ZennyKenny's picture
Update app.py
3f3d24b verified
import spaces
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import threading
import torch
# Hub repositories: the base checkpoint and the LoRA adapter layered on top.
BASE_MODEL_ID = "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit"
LORA_ADAPTER_ID = "ZennyKenny/GPRO_LoRA_Qwen_3B"

# The base weights are requested on CPU in float32; the intent is to
# sidestep bitsandbytes problems.
# NOTE(review): the repo id names a "bnb-4bit" checkpoint, which may still
# require bitsandbytes at load time -- confirm this actually loads
# unquantized as intended.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.float32,
    device_map="cpu",
)

# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_ID)

# Pin the wrapped model to CPU explicitly (in case peft did not place it
# there) and switch it to inference mode.
model.to("cpu").eval()

# The tokenizer comes from the same repository as the base model.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
@spaces.GPU
def generate_response(prompt):
    """Stream a step-by-step answer to ``prompt`` from the LoRA-adapted model.

    This is a generator. Every yielded value is the *whole* response decoded
    so far, because Gradio replaces the output component's content with each
    yielded value instead of appending to it.

    Args:
        prompt: The user's question as plain text.

    Yields:
        The accumulated response text, growing as new tokens are decoded.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )

    # Tokenize and place the tensors on whatever device the model lives on,
    # rather than hard-coding a device name.
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)

    # skip_prompt=True keeps the instruction text out of the streamed output,
    # so the "Response" box only shows what the model generated.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        # Pass the mask explicitly so generate() does not have to guess it.
        attention_mask=inputs["attention_mask"],
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    worker_errors = []

    def _run_generation():
        """Run generate(); on failure, unblock the consumer and record why."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            # Without this the loop below would wait forever for a stop
            # signal that generate() never got to send.
            streamer.end()

    # generate() blocks until completion, so it runs in a background thread
    # while this generator consumes the streamer.
    thread = threading.Thread(target=_run_generation, daemon=True)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    thread.join()
    if worker_errors:
        # Surface the generation failure to the caller instead of ending
        # silently with a truncated answer.
        raise worker_errors[0]
# Gradio front end: one prompt box in, one response box out.
prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...")
response_box = gr.Textbox(label="Response")

# generate_response is a generator, so the response box is refreshed with
# every value it yields. Flagging is turned off for this demo.
demo = gr.Interface(
    fn=generate_response,
    inputs=prompt_box,
    outputs=response_box,
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never",
)

# Start the app; share=True additionally requests a public share link.
demo.launch(share=True)