import spaces
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import threading
import torch

# Load the base model on CPU in full precision so no bitsandbytes kernels are required
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",          # keep everything on CPU to avoid bitsandbytes issues
    torch_dtype=torch.float32  # explicit dtype for CPU inference
)

# Attach the LoRA adapter to the base model
model = PeftModel.from_pretrained(
    base_model,
    "ZennyKenny/GPRO_LoRA_Qwen_3B"
)

# Move the model to CPU explicitly (PEFT does not always move the adapter weights)
# and switch to inference mode
model.to("cpu")
model.eval()

# Load the tokenizer that matches the base model
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")


@spaces.GPU
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )

    # Tokenize the prompt and keep the tensors on the same device as the model
    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cpu")
    attention_mask = inputs["attention_mask"].to("cpu")

    # TextIteratorStreamer yields decoded text as it is generated;
    # skip_prompt=True keeps the input prompt out of the streamed output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Sampling-based generation parameters
    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer
    )

    # Run generation in a background thread so the streamer can be consumed here
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Gradio replaces the textbox contents on every yield, so stream the
    # accumulated response rather than the individual chunks
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()


# Define the Gradio UI
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never"
)

# Launch the Gradio app
demo.launch(share=True)