Ggyy / app.py
Athspi's picture
Create app.py
63faa06 verified
raw
history blame
1.43 kB
import gradio as gr
from transformers import AutoTokenizer
import onnxruntime as ort
import numpy as np
# Path to the local ONNX export (int4 RTN-quantized, CPU/mobile variant).
# NOTE(review): relative path — the app must be started from the directory
# that contains `cpu_and_mobile/`; verify deployment layout.
model_dir = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
# Load the tokenizer (from the same directory as the model) and create an
# ONNX Runtime inference session pinned to the CPU execution provider.
# Both loads happen once, at import time, and are reused for every request.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
session = ort.InferenceSession(f"{model_dir}/model.onnx", providers=["CPUExecutionProvider"])
# Inference function
def generate_response(prompt):
    """Run the local ONNX model on *prompt* and return the assistant's reply.

    The prompt is wrapped in the chat template ``<|user|> ... <|assistant|>``,
    tokenized, fed through the ONNX session, and the model output is decoded
    back to text. Anything before the final ``<|assistant|>`` marker is
    stripped from the decoded transcript.

    Args:
        prompt: The user's question as plain text.

    Returns:
        The decoded model response as a string.
    """
    full_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
    inputs = tokenizer(full_prompt, return_tensors="np")

    # The ONNX graph expects int64 input_ids and attention_mask tensors.
    ort_inputs = {
        "input_ids": inputs["input_ids"].astype(np.int64),
        "attention_mask": inputs["attention_mask"].astype(np.int64),
    }

    # Single forward pass through the model.
    # NOTE(review): a plain causal-LM ONNX export normally returns *logits*
    # (batch, seq, vocab) rather than generated token ids, and one forward
    # pass is not an autoregressive generation loop — confirm this export's
    # output signature via session.get_outputs().
    outputs = session.run(None, ort_inputs)
    generated_ids = outputs[0]

    # Fix: if the model emitted a 3-D logits tensor, reduce it to token ids
    # with a greedy argmax so tokenizer.decode receives integers, not floats.
    if generated_ids.ndim == 3:
        generated_ids = generated_ids.argmax(axis=-1)

    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Keep only the text after the last assistant marker, if it survived
    # decoding (skip_special_tokens may already have removed it).
    if "<|assistant|>" in response:
        response = response.split("<|assistant|>")[-1].strip()
    return response
# Gradio interface
# Assemble the web UI: a multi-line prompt box in, a read-only text box out.
prompt_box = gr.Textbox(label="Your Prompt", placeholder="Type your question here...", lines=4)
answer_box = gr.Textbox(label="AI Response")

demo = gr.Interface(
    fn=generate_response,
    title="Phi-4-Mini ONNX Chatbot",
    description="Runs locally with ONNX for fast inference (int4 optimized).",
    inputs=prompt_box,
    outputs=answer_box,
)

# Start the Gradio server (blocks until shut down).
demo.launch()