import spaces
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import threading
import torch

# Load the base model without quantization to avoid bitsandbytes issues
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",  # Ensure it runs on CPU to avoid bitsandbytes issues
    torch_dtype=torch.float32  # Explicitly set dtype
)

# Load the LoRA adapter
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B"
)
# Move model to CPU explicitly (since peft sometimes does not move it automatically)
model.to("cpu")
model.eval() # Ensure the model is in inference mode

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")

@spaces.GPU
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )

    # Tokenize and move to correct device
    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cpu")  # Ensure tensor is on the correct device

    # Using TextIteratorStreamer for streaming responses
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

    # Adjust generation parameters
    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer
    )
    # Run generation in a separate thread so tokens can be streamed as they arrive
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Accumulate streamed chunks: Gradio replaces the output with each yielded value,
    # so yield the full text so far rather than only the newest fragment
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text
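    # Optional cleanup: by the time the streamer is exhausted, generate() has already
    # finished, so joining the worker thread here returns immediately.
    thread.join()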

# Define Gradio UI
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never"
)
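
# Streaming output relies on Gradio's request queue; recent Gradio versions enable it
# by default for generator functions, but calling queue() explicitly is harmless.
demo.queue()
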
# Launch the Gradio app
demo.launch(share=True)