# import onnx
#
# # Load the ONNX model
# model_path = "model_uint8.onnx"  # Replace with the path to your ONNX model
# onnx_model = onnx.load(model_path)
#
# # Print the model's input and output shapes
# for input_tensor in onnx_model.graph.input:
#     print(f"Input Name: {input_tensor.name}")
#     print(
#         f"Input Shape: {[dim.dim_value for dim in input_tensor.type.tensor_type.shape.dim]}"
#     )
# for output_tensor in onnx_model.graph.output:
#     print(f"Output Name: {output_tensor.name}")
#     print(
#         f"Output Shape: {[dim.dim_value for dim in output_tensor.type.tensor_type.shape.dim]}"
#     )

from onnxruntime.quantization import quantize_dynamic, QuantType

# Define the paths to the original ONNX model and the quantized output model
onnx_model_path = "./granite_embedding_model.onnx"  # Original ONNX model
quantized_model_path = "./model_uint8.onnx"  # Where the quantized model is saved

# Perform dynamic quantization, storing weights as unsigned 8-bit integers
quantize_dynamic(
    model_input=onnx_model_path,  # Input ONNX model path
    model_output=quantized_model_path,  # Output quantized model path
    weight_type=QuantType.QUInt8,  # Use UInt8 for weights
)

# Print confirmation of quantization
print(f"Quantized model saved to {quantized_model_path}")
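
# Optional sanity check: a minimal sketch, not part of the quantization step
# itself. It assumes only the full onnxruntime package (already required by
# onnxruntime.quantization above). It loads the quantized model to confirm the
# file is a valid, runnable graph, prints its input/output metadata, and shows
# the on-disk size reduction from quantization.
import os

import onnxruntime as ort

session = ort.InferenceSession(
    quantized_model_path, providers=["CPUExecutionProvider"]
)
for inp in session.get_inputs():
    print(f"Input: {inp.name}, shape: {inp.shape}, type: {inp.type}")
for out in session.get_outputs():
    print(f"Output: {out.name}, shape: {out.shape}, type: {out.type}")

# Dynamic UInt8 quantization typically shrinks weight storage to roughly a
# quarter of the FP32 size; compare the two files to verify.
original_mb = os.path.getsize(onnx_model_path) / 1e6
quantized_mb = os.path.getsize(quantized_model_path) / 1e6
print(f"Model size: {original_mb:.1f} MB -> {quantized_mb:.1f} MB")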