import gradio as gr
from transformers import AutoTokenizer
import onnxruntime as ort
import numpy as np
|

# Local folder with model.onnx plus the tokenizer files, laid out like the
# official int4 CPU exports on Hugging Face (e.g. microsoft/Phi-4-mini-instruct-onnx).
model_dir = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
session = ort.InferenceSession(f"{model_dir}/model.onnx", providers=["CPUExecutionProvider"])
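
# The exported decoder is stateful: besides input_ids and attention_mask it
# expects one past_key_values.<layer>.key/.value tensor per layer, and returns
# the updated cache as matching present.* outputs. The helper below is a
# minimal sketch that assumes the official exports' conventions (float32 cache
# with layout (batch, num_kv_heads, seq_len, head_dim)); adjust it if your
# export names or shapes its KV inputs differently.
def empty_past():
    """Zero-length KV-cache tensors for the first decoding step."""
    past = {}
    for inp in session.get_inputs():
        if inp.name.startswith("past_key_values."):
            num_heads, head_dim = inp.shape[1], inp.shape[3]  # concrete ints in the graph
            past[inp.name] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
    return past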

def generate_response(prompt, max_new_tokens=256):
    # Phi chat format; the original prompt omitted the <|end|> turn terminator.
    full_prompt = f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
    inputs = tokenizer(full_prompt, return_tensors="np")
    input_ids = inputs["input_ids"].astype(np.int64)
    attention_mask = inputs["attention_mask"].astype(np.int64)

    # A single session.run() returns logits, not generated token ids, so decode
    # greedily: argmax the last position, feed the returned KV cache back in,
    # and repeat until an end-of-turn token (or the token budget) is hit.
    past = empty_past()
    stop_ids = {tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|end|>")}
    generated = []
    for _ in range(max_new_tokens):
        outputs = session.run(None, {"input_ids": input_ids,
                                     "attention_mask": attention_mask, **past})
        next_token = int(np.argmax(outputs[0][0, -1]))  # logits: (batch, seq, vocab)
        if next_token in stop_ids:
            break
        generated.append(next_token)
        # The 'present.*' outputs become next step's 'past_key_values.*' inputs;
        # with the cache in place, only the new token is fed forward.
        past = {out.name.replace("present", "past_key_values"): val
                for out, val in zip(session.get_outputs(), outputs)
                if out.name.startswith("present")}
        input_ids = np.array([[next_token]], dtype=np.int64)
        attention_mask = np.concatenate(
            [attention_mask, np.ones((1, 1), dtype=np.int64)], axis=1)

    # Only newly generated tokens are decoded, so no prompt-stripping is needed.
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
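
# Alternative: if the model folder ships a genai_config.json (the official
# exports do), the hand-rolled loop above can be replaced with the
# onnxruntime-genai package, which owns the KV cache and sampling. A minimal
# sketch, assuming onnxruntime-genai >= 0.5 (the API has changed across versions):
#
#   import onnxruntime_genai as og
#   og_model = og.Model(model_dir)
#   og_tok = og.Tokenizer(og_model)
#   params = og.GeneratorParams(og_model)
#   params.set_search_options(max_length=512)
#   generator = og.Generator(og_model, params)
#   generator.append_tokens(og_tok.encode(full_prompt))
#   while not generator.is_done():
#       generator.generate_next_token()
#   response = og_tok.decode(generator.get_sequence(0))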

demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Your Prompt", placeholder="Type your question here...", lines=4),
    outputs=gr.Textbox(label="AI Response"),
    title="Phi-4-Mini ONNX Chatbot",
    description="Runs locally with ONNX for fast inference (int4 optimized).",
)
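
# launch() serves the UI on a local server (http://127.0.0.1:7860 by default);
# pass share=True to expose a temporary public link.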
demo.launch()