import spaces
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import threading
import torch

# Load the base model without quantization to avoid bitsandbytes issues
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit",
    device_map="cpu",  # Ensure it runs on CPU to avoid bitsandbytes issues
    torch_dtype=torch.float32  # Explicitly set dtype
)

# Load the LoRA adapter
model = PeftModel.from_pretrained(
    base_model, 
    "ZennyKenny/GPRO_LoRA_Qwen_3B"
)
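
# Optionally merge the LoRA weights into the base model for faster CPU inference
# (a sketch, assuming merging is acceptable for this demo): PeftModel.merge_and_unload()
# folds the adapter into the base weights and returns a plain transformers model.
# model = model.merge_and_unload()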

# Move model to CPU explicitly (since peft sometimes does not move it automatically)
model.to("cpu")
model.eval()  # Ensure the model is in inference mode

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")

@spaces.GPU
def generate_response(prompt):
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )
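
    # For an instruct-tuned Qwen checkpoint, wrapping the prompt with the chat template
    # is usually closer to how the model was trained. A sketch, assuming the tokenizer
    # ships a chat template:
    # reasoning_prompt = tokenizer.apply_chat_template(
    #     [{"role": "user", "content": reasoning_prompt}],
    #     tokenize=False,
    #     add_generation_prompt=True,
    # )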

    # Tokenize and keep tensors on CPU to match the model's device
    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to("cpu")
    attention_mask = inputs["attention_mask"].to("cpu")

    # TextIteratorStreamer yields text chunks as they are generated;
    # skip_prompt=True keeps the echoed prompt out of the streamed output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters
    generation_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer
    )

    # Run generation in a background thread so the streamer can be consumed here
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Accumulate chunks so the Gradio Textbox shows the growing response, not just the latest chunk
    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    thread.join()

# Define Gradio UI
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(label="Response"),
    title="LoRA Model Reasoning Inference",
    description="Demo a LoRA-tuned model with step-by-step reasoning, streamed through a Gradio interface.",
    allow_flagging="never"
)

# Launch the Gradio app
demo.launch(share=True)
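
# Optional local sanity check (a sketch, assuming the model and adapter load on this
# machine): consume the generator directly to confirm streaming works without the UI.
# Comment out demo.launch() above before running, since launch() blocks.
# for partial in generate_response("What is the capital of France?"):
#     print(partial)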