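"""Minimal Gradio chat UI for a Phi-mini ONNX model running locally on CPU.

Loads the int4-quantized decoder from Microsoft's ONNX release layout with
onnxruntime, tokenizes with transformers, and decodes greedily one token at
a time. Run with `python app.py` (requires gradio, transformers,
onnxruntime, numpy).
"""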
import gradio as gr
from transformers import AutoTokenizer
import onnxruntime as ort
import numpy as np

# Local model directory
model_dir = "cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4"

# Load tokenizer and ONNX model
tokenizer = AutoTokenizer.from_pretrained(model_dir)
session = ort.InferenceSession(f"{model_dir}/model.onnx", providers=["CPUExecutionProvider"])
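
# The exported decoder also declares past_key_values.* (KV-cache) inputs, so
# session.run needs placeholder tensors for them. Build zero-length caches
# from the session's own metadata rather than hard-coding layer shapes.
# Heuristic: symbolic dims containing "batch" -> 1, other symbolic dims -> 0.
def empty_past():
    past = {}
    for inp in session.get_inputs():
        if not inp.name.startswith("past_key_values"):
            continue
        shape = []
        for d in inp.shape:
            if isinstance(d, int):
                shape.append(d)  # concrete dims (num_heads, head_size)
            elif d is not None and "batch" in str(d):
                shape.append(1)  # symbolic batch dimension
            else:
                shape.append(0)  # symbolic sequence dimension -> empty cache
        dtype = np.float16 if "float16" in inp.type else np.float32
        past[inp.name] = np.zeros(shape, dtype=dtype)
    return past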

# Inference function: one session.run is a single forward pass that returns
# logits, not generated text, so decode greedily by re-running the growing
# sequence and appending the argmax token each step. Simple but O(n^2);
# onnxruntime-genai is the efficient, KV-cached path for these models.
def generate_response(prompt, max_new_tokens=256):
    # Phi-style chat format; <|end|> closes the user turn
    full_prompt = f"<|user|>\n{prompt}<|end|>\n<|assistant|>\n"
    input_ids = tokenizer(full_prompt, return_tensors="np")["input_ids"].astype(np.int64)
    prompt_len = input_ids.shape[1]

    # Stop on the tokenizer's EOS or on Phi's <|end|> turn marker
    eos_ids = {tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|end|>")}
    past = empty_past()

    for _ in range(max_new_tokens):
        ort_inputs = {
            "input_ids": input_ids,
            "attention_mask": np.ones_like(input_ids),
            **past,
        }
        # Logits are the first output in this export: [batch, seq_len, vocab]
        logits = session.run(None, ort_inputs)[0]
        next_id = int(np.argmax(logits[0, -1]))  # greedy pick at the last position
        if next_id in eos_ids:
            break
        input_ids = np.concatenate(
            [input_ids, np.array([[next_id]], dtype=np.int64)], axis=1
        )

    # Decode only the newly generated tokens
    return tokenizer.decode(input_ids[0][prompt_len:], skip_special_tokens=True).strip()

# Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Your Prompt", placeholder="Type your question here...", lines=4),
    outputs=gr.Textbox(label="AI Response"),
    title="Phi-4-Mini ONNX Chatbot",
    description="Runs locally with ONNX for fast inference (int4 optimized)."
)

# Launch the app
demo.launch()
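# demo.launch(share=True) would also expose a temporary public URL;
# server_name/server_port can pin the local address (see the Gradio docs).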