from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM
model_id = "microsoft/Phi-3.5-mini-instruct-onnx"
# Set file_name to the .onnx graph file, not the .onnx.data file; the
# .onnx.data file holds the external weights and is picked up automatically
# from the same folder.
model = ORTModelForCausalLM.from_pretrained(
    model_id,
    subfolder="cpu_and_mobile/cpu-int4-awq-block-128-acc-level-4",
    file_name="phi-3.5-mini-instruct-cpu-int4-awq-block-128-acc-level-4.onnx",
)
# Load the tokenizer from the original model id (without the -onnx suffix):
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe("Who is Napoleon Bonaparte?")
print(result)
# Output (cut short by the pipeline's default generation length):
[{'generated_text': 'Who is Napoleon Bonaparte?\n\nNapoleon Bonaparte was a French military and political leader who rose to prom'}]
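The answer above stops mid-word because the pipeline uses its default generation length. As a minimal follow-up sketch, you can pass generation arguments directly to the pipeline call; max_new_tokens is a standard transformers generation argument, and 200 here is an arbitrary choice:

# Request a longer completion by raising the generation budget
result = pipe("Who is Napoleon Bonaparte?", max_new_tokens=200)
print(result[0]["generated_text"])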