Update app.py
app.py
CHANGED
@@ -5,6 +5,7 @@ import gradio as gr
 from peft import PeftModel
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
+import threading

 # Load the base model
 base_model = AutoModelForCausalLM.from_pretrained(
@@ -22,14 +23,13 @@ model = PeftModel.from_pretrained(
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit")

-@spaces.GPU
 def generate_response(prompt):
     reasoning_prompt = (
         "Answer the following question and explain your reasoning step by step.\n"
         f"Question: {prompt}\nReasoning:"
     )
     inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)
-
+
     # Using TextIteratorStreamer for streaming responses
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

@@ -42,7 +42,7 @@ def generate_response(prompt):
         streamer=streamer
     )

-    thread =
+    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()

     for new_text in streamer:
@@ -54,7 +54,7 @@ demo = gr.Interface(
     outputs=gr.Textbox(label="Response"),
     title="LoRA Model Reasoning Inference",
     description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
-
+    allow_flagging="never"
 )

 demo.launch(share=True)
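For context, the completed code follows the standard transformers streaming pattern: model.generate blocks until generation finishes, so it is launched on a background thread with threading.Thread, while the main thread iterates over the TextIteratorStreamer and yields partial text to Gradio. Below is a minimal, self-contained sketch of that pattern; it uses a small placeholder model ("gpt2") and a hypothetical stream_response helper rather than the LoRA checkpoint and generate_response function in app.py.

import threading

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder model for illustration; app.py loads a 4-bit base model plus a PEFT adapter instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

def stream_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=64)

    # generate() blocks until it finishes, so it runs on a background thread;
    # the streamer then yields decoded text chunks here as they are produced.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio re-renders the output textbox with each partial string

    thread.join()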