import gradio as gr
from transformers import AutoTokenizer
import onnxruntime as ort
import numpy as np
# Local model directory
model_dir = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"
# Load tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained(model_dir)
session = ort.InferenceSession(f"{model_dir}/model.onnx", providers=["CPUExecutionProvider"])
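
# Optional sanity check (not required for the app): decoder-style ONNX exports
# sometimes expect more inputs than input_ids/attention_mask (e.g. past key/value
# tensors). Listing the graph inputs confirms what this particular model.onnx
# wants before generate_response() calls session.run().
for model_input in session.get_inputs():
    print(model_input.name, model_input.shape, model_input.type)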
# Inference function
def generate_response(prompt):
    full_prompt = f"<|user|>\n{prompt}\n<|assistant|>\n"
    inputs = tokenizer(full_prompt, return_tensors="np")

    # ONNX model expects input_ids and attention_mask
    ort_inputs = {
        "input_ids": inputs["input_ids"].astype(np.int64),
        "attention_mask": inputs["attention_mask"].astype(np.int64)
    }

    # Run model
    outputs = session.run(None, ort_inputs)
    generated_ids = outputs[0]

    # Decode output
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Clean response
    if "<|assistant|>" in response:
        response = response.split("<|assistant|>")[-1].strip()
    return response
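
# Alternative path (sketch, not wired into the UI below): the int4 Phi ONNX
# packages are usually driven through the onnxruntime-genai library, which
# manages the KV cache and the token-by-token decoding loop instead of a
# single session.run() call. The function below assumes onnxruntime-genai
# >= 0.5 (`pip install onnxruntime-genai`); older releases set
# `params.input_ids` instead of calling `append_tokens`.
def generate_response_genai(prompt, max_length=512):
    import onnxruntime_genai as og  # imported lazily so the Gradio app runs without it

    og_model = og.Model(model_dir)
    og_tokenizer = og.Tokenizer(og_model)

    params = og.GeneratorParams(og_model)
    params.set_search_options(max_length=max_length)

    generator = og.Generator(og_model, params)
    generator.append_tokens(og_tokenizer.encode(f"<|user|>\n{prompt}\n<|assistant|>\n"))

    # Decode until the model emits an end-of-sequence token or max_length is reached
    while not generator.is_done():
        generator.generate_next_token()

    return og_tokenizer.decode(generator.get_sequence(0))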
# Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Your Prompt", placeholder="Type your question here...", lines=4),
    outputs=gr.Textbox(label="AI Response"),
    title="Phi-4-Mini ONNX Chatbot",
    description="Runs locally with ONNX for fast inference (int4 optimized)."
)
# Launch the app
demo.launch()
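
# Running this script (assuming it is saved as app.py) starts a local Gradio
# server; by default the UI is served at http://127.0.0.1:7860. Pass
# share=True to demo.launch() to get a temporary public link.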