Update app.py
app.py CHANGED
@@ -28,15 +28,28 @@ def generate_response(prompt):
         f"Question: {prompt}\nReasoning:"
     )
     inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)
+
+    # Streamed response
+    stream = model.generate(
+        **inputs,
+        max_new_tokens=300,  # Increased token limit
+        do_sample=True,
+        temperature=0.8,
+        top_p=0.95,
+        stream=True
+    )
+
+    # Yield output tokens in real-time
+    for chunk in stream:
+        yield tokenizer.decode(chunk[0], skip_special_tokens=True)
 
+demo = gr.Interface(
     fn=generate_response,
     inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
     outputs=gr.Textbox(label="Response"),
     title="LoRA Model Reasoning Inference",
-    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio."
+    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
+    live=True
 )
 
+demo.launch()
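Note on the streaming loop: `stream=True` is not a documented argument of `transformers`' `model.generate()`, so the added loop may not stream as intended. A minimal sketch of the same generator written with `TextIteratorStreamer` is shown below; it assumes `model` and `tokenizer` are the LoRA model and tokenizer already loaded earlier in app.py, keeps the same sampling settings, and simplifies the prompt construction.

# Sketch only: generate() has no documented `stream=True` flag, so this uses
# TextIteratorStreamer instead. `model` and `tokenizer` are assumed to be the
# LoRA model and tokenizer loaded earlier in app.py.
from threading import Thread
from transformers import TextIteratorStreamer

def generate_response(prompt):
    reasoning_prompt = f"Question: {prompt}\nReasoning:"  # simplified; app.py builds a longer prompt
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    # generate() blocks until finished, so run it in a background thread
    # and read decoded text off the streamer as it arrives.
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # Gradio shows the latest yielded value in the output Textbox

Yielding the accumulated text rather than each chunk keeps the output Textbox showing the full response as it grows, since Gradio replaces the displayed output with every yielded value.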