# Convert & Optimize model with Optimum 


Steps:
1. Convert model to ONNX
2. Optimize & quantize model with Optimum
3. Create Custom Handler for Inference Endpoints

Helpful links:
* [Accelerate Sentence Transformers with Hugging Face Optimum](https://www.philschmid.de/optimize-sentence-transformers)
* [Create a custom handler for Inference Endpoints](https://huggingface.co/docs/inference-endpoints/guides/custom_handler)

## Setup & Installation

In [2]:
%%writefile requirements.txt
optimum[onnxruntime]==1.3.0
mkl-include
mkl

Writing requirements.txt


In [None]:
!pip install -r requirements.txt

## 1. Convert model to ONNX

In [6]:
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
from pathlib import Path


# Hub id of the checkpoint that is converted, optimized and quantized in this notebook
model_id="sentence-transformers/all-MiniLM-L6-v2"
# output directory for the ONNX checkpoint and tokenizer files (the current working directory)
onnx_path = Path(".")

# load vanilla transformers and convert to onnx
# (from_transformers=True requests the conversion while loading)
model = ORTModelForFeatureExtraction.from_pretrained(model_id, from_transformers=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# save onnx checkpoint and tokenizer
# both go into the same directory; the optimization step reads onnx_path / "model.onnx"
# and the handler loads the tokenizer from the same path
model.save_pretrained(onnx_path)
tokenizer.save_pretrained(onnx_path)

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json',
 './tokenizer.json')

## 2. Optimize & quantize model with Optimum

In [7]:
from optimum.onnxruntime import ORTOptimizer, ORTQuantizer
from optimum.onnxruntime.configuration import OptimizationConfig, AutoQuantizationConfig

# create ORTOptimizer and define optimization configuration
optimizer = ORTOptimizer.from_pretrained(model_id, feature=model.pipeline_task)
# NOTE: when serializing at this level ONNX Runtime warns that the generated model
# may contain hardware-specific optimizations and should only be used in the same
# environment it was optimized in (see the warning printed by this cell).
optimization_config = OptimizationConfig(optimization_level=99) # enable all optimizations

# apply the optimization configuration to the model
# reads the exported "model.onnx" and writes "model-optimized.onnx" next to it
optimizer.export(
    onnx_model_path=onnx_path / "model.onnx",
    onnx_optimized_model_output_path=onnx_path / "model-optimized.onnx",
    optimization_config=optimization_config,
)


# create ORTQuantizer and define quantization configuration
dynamic_quantizer = ORTQuantizer.from_pretrained(model_id, feature=model.pipeline_task)
# is_static=False selects dynamic quantization (no calibration dataset is passed here)
# NOTE(review): the avx512_vnni preset presumably targets CPUs with AVX512-VNNI support —
# confirm the deployment hardware matches.
dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

# apply the quantization configuration to the model
# quantizes the already optimized graph; "model-quantized.onnx" is the file the handler loads
model_quantized_path = dynamic_quantizer.export(
    onnx_model_path=onnx_path / "model-optimized.onnx",
    onnx_quantized_model_output_path=onnx_path / "model-quantized.onnx",
    quantization_config=dqconfig,
)



2022-08-31 19:22:18.331832429 [W:onnxruntime:, inference_session.cc:1488 Initialize] Serializing optimized model with Graph Optimization level greater than ORT_ENABLE_EXTENDED and the NchwcTransformer enabled. The generated model may contain hardware specific optimizations, and should only be used in the same environment the model was optimized in.


## 3. Create Custom Handler for Inference Endpoints


In [2]:
%%writefile pipeline.py
from typing import  Dict, List, Any
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer
import torch.nn.functional as F
import torch

# copied from the model card
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (assumed shape: batch x tokens x hidden).
        attention_mask: Tensor with one entry per token; it is broadcast to
            the shape of the token embeddings and used as a weight.

    Returns:
        Tensor with the mask-weighted mean over dimension 1 for every sequence.
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and stretch it to the embedding shape so
    # it can be multiplied element-wise with the hidden states.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    # Clamp the divisor so a fully masked sequence does not divide by zero.
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_counts


class PreTrainedPipeline():
    """Sentence-embedding handler backed by the quantized ONNX model.

    The model file ``model-quantized.onnx`` and the tokenizer are loaded once
    from ``path``; every call tokenizes the inputs, runs the model, mean-pools
    the token embeddings and L2-normalizes the result.
    """

    def __init__(self, path=""):
        # load the optimized model
        self.model = ORTModelForFeatureExtraction.from_pretrained(path, file_name="model-quantized.onnx")
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Any) -> Dict[str, List[List[float]]]:
        """
        Args:
            data (:obj:`Any`):
                either a request payload whose ``"inputs"`` entry holds the
                text(s) to embed, or the text(s) themselves.
        Return:
            A :obj:`dict` with a single key ``"embeddings"``. Its value is a
            list containing one embedding (a list of floats) per input.
        """
        # Accept the payload form ({"inputs": ...}) as well as bare inputs:
        # objects without .get() (e.g. a str or a list) are used as they are
        # instead of failing with AttributeError.
        try:
            inputs = data.get("inputs", data)
        except AttributeError:
            inputs = data

        # tokenize the input
        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors='pt')
        # run the model
        outputs = self.model(**encoded_inputs)
        # Perform pooling
        sentence_embeddings = mean_pooling(outputs, encoded_inputs['attention_mask'])
        # Normalize embeddings
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        # postprocess the prediction
        return {"embeddings": sentence_embeddings.tolist()}

Overwriting pipeline.py


Test the custom pipeline locally:

In [1]:
from pipeline import PreTrainedPipeline

# init handler
# loads "model-quantized.onnx" and the tokenizer from the current directory
my_handler = PreTrainedPipeline(path=".")

# prepare sample payload
# the handler reads the text to embed from the "inputs" key
request = {"inputs": "I am quite excited how this will turn out"}

# test the handler
# %timeit is an IPython magic: it calls the handler repeatedly and reports the mean latency
%timeit my_handler(request)


1.55 ms ± 2.04 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [2]:
# call the handler once to inspect the returned {"embeddings": [...]} payload
my_handler(request)

{'embeddings': [[-0.021580450236797333,
   0.021715054288506508,
   0.00979710929095745,
   -0.0005379787762649357,
   0.04682469740509987,
   -0.013600599952042103,
   -0.003064213553443551,
   0.007061154581606388,
   0.026638098061084747,
   -0.011613409034907818,
   -0.06916121393442154,
   0.061429575085639954,
   0.013463253155350685,
   -0.022426923736929893,
   0.04116947948932648,
   0.03925771266222,
   0.014005577191710472,
   -0.07909698039293289,
   -0.028196798637509346,
   -0.003196786157786846,
   0.013688302598893642,
   -0.044537559151649475,
   -0.04594269394874573,
   -0.04054776579141617,
   -0.038281939923763275,
   0.06411226838827133,
   -0.013305696658790112,
   -0.02935652621090412,
   -0.0150306923314929,
   -0.0434146448969841,
   0.03218410909175873,
   0.018695568665862083,
   -0.012916717678308487,
   0.009855723939836025,
   -0.022609280422329903,
   -0.08628173172473907,
   0.03853229060769081,
   -0.03584187850356102,
   0.05425931513309479,
   -0.0029