# Optional: inspect an ONNX model's input and output shapes.
# import onnx
#
# # Load the ONNX model
# model_path = "model_uint8.onnx"  # Replace with the path to your ONNX model
# onnx_model = onnx.load(model_path)
#
# # Print the model's input and output shapes; dynamic axes print their
# # symbolic name (dim_param) instead of 0
# for input_tensor in onnx_model.graph.input:
#     print(f"Input Name: {input_tensor.name}")
#     print(
#         f"Input Shape: {[dim.dim_param or dim.dim_value for dim in input_tensor.type.tensor_type.shape.dim]}"
#     )
# for output_tensor in onnx_model.graph.output:
#     print(f"Output Name: {output_tensor.name}")
#     print(
#         f"Output Shape: {[dim.dim_param or dim.dim_value for dim in output_tensor.type.tensor_type.shape.dim]}"
#     )
from onnxruntime.quantization import quantize_dynamic, QuantType

# Paths to the original ONNX model and the quantized output model
onnx_model_path = "./granite_embedding_model.onnx"  # Original FP32 ONNX model
quantized_model_path = "./model_uint8.onnx"  # Where to save the quantized model

# Perform dynamic quantization, storing the weights as UInt8
quantize_dynamic(
    model_input=onnx_model_path,  # Input ONNX model path
    model_output=quantized_model_path,  # Output quantized model path
    weight_type=QuantType.QUInt8,  # Use UInt8 for the weights
)

# Print confirmation of quantization
print(f"Quantized model saved to {quantized_model_path}")