import torch
import transformers
from sentence_transformers import SentenceTransformer, models


class OnnxEncoder:
    """OnnxEncoder dedicated to running a SentenceTransformer under ONNX Runtime."""

    def __init__(self, session, tokenizer, pooling, normalization):
        self.session = session
        self.tokenizer = tokenizer
        self.max_length = tokenizer.model_max_length
        self.pooling = pooling
        self.normalization = normalization

    def encode(self, sentences: list):
        sentences = [sentences] if isinstance(sentences, str) else sentences

        inputs = {
            k: v.numpy()
            for k, v in self.tokenizer(
                sentences,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            ).items()
        }

        # The first session output is the last hidden state with shape
        # (batch_size, seq_len, hidden_size).
        hidden_state = self.session.run(None, inputs)

        sentence_embedding = self.pooling.forward(
            features={
                "token_embeddings": torch.Tensor(hidden_state[0]),
                "attention_mask": torch.Tensor(inputs.get("attention_mask")),
                # CLS pooling expects one vector per sentence, i.e. the first
                # token of every sequence, not the first sequence of the batch.
                "cls_token_embeddings": torch.Tensor(hidden_state[0][:, 0]),
            },
        )

        if self.normalization is not None:
            sentence_embedding = self.normalization.forward(features=sentence_embedding)

        sentence_embedding = sentence_embedding["sentence_embedding"]

        if sentence_embedding.shape[0] == 1:
            sentence_embedding = sentence_embedding[0]

        return sentence_embedding


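# Not part of the pipeline above: a minimal sketch of the mean pooling that
# sentence_transformers' Pooling layer applies when pooling_mode_mean_tokens
# is set, handy for checking raw ONNX outputs by hand. The helper name is
# hypothetical and the inputs are assumed to be numpy arrays.
def _mean_pool_sketch(token_embeddings, attention_mask):
    """Average token embeddings over non-padded positions."""
    import numpy as np

    mask = np.expand_dims(attention_mask, axis=-1)  # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(axis=1)  # (batch, hidden)
    counts = np.clip(mask.sum(axis=1), a_min=1e-9, a_max=None)
    return summed / counts

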
def sentence_transformers_onnx(
    model,
    path,
    do_lower_case=True,
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    providers=["CPUExecutionProvider"],
):
    """Export a SentenceTransformer model and run it under ONNX Runtime.

    Parameters
    ----------
    model
        SentenceTransformer model.
    path
        Model file dedicated to session inference.
    do_lower_case
        Whether or not the model is lower-cased.
    input_names
        Input fields needed by the Transformer.
    providers
        Execution providers, e.g. ["CPUExecutionProvider", "CUDAExecutionProvider"],
        to run the model on CPU or GPU.
    """
    try:
        import onnxruntime
    except ImportError:
        raise ImportError("You need to install onnxruntime.")

    model.save(path)

    configuration = transformers.AutoConfig.from_pretrained(
        path, from_tf=False, local_files_only=True
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        path, do_lower_case=do_lower_case, from_tf=False, local_files_only=True
    )

    encoder = transformers.AutoModel.from_pretrained(
        path, from_tf=False, config=configuration, local_files_only=True
    )

    # Dummy input ("hello" in Vietnamese) used to trace the model during export.
    st = ["xin chào"]
    print("max_length:", tokenizer.model_max_length)
    inputs = tokenizer(
        st,
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    )
    print(inputs)
    model.eval()

    with torch.no_grad():
        symbolic_names = {0: "batch_size", 1: "max_seq_len"}

        torch.onnx.export(
            encoder,
            # The tokenizer already returns long tensors; pass them in the
            # order declared by input_names (each field must be present).
            args=tuple(inputs[name] for name in input_names),
            f=f"{path}.onnx",
            opset_version=13,
            do_constant_folding=True,
            input_names=input_names,
            output_names=["output"],
            dynamic_axes={
                "input_ids": symbolic_names,
                "attention_mask": symbolic_names,
                "token_type_ids": symbolic_names,
                "output": symbolic_names,
            },
        )

    pooling = None
    normalization = None

    # A SentenceTransformer pipeline is a torch Sequential: index 0 is the
    # Transformer, index 1 the Pooling layer, index 2 (optional) Normalize.
    for idx, module in enumerate(model):
        if idx == 1:
            pooling = module
        elif idx == 2:
            normalization = module

    return OnnxEncoder(
        session=onnxruntime.InferenceSession(f"{path}.onnx", providers=providers),
        tokenizer=tokenizer,
        pooling=pooling,
        normalization=normalization,
    )


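# Hypothetical GPU usage, assuming the onnxruntime-gpu package is installed:
# providers is a priority list, so ONNX Runtime falls back to the CPU provider
# when CUDA is unavailable.
#
#   onnx_encoder = sentence_transformers_onnx(
#       model,
#       path,
#       providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
#   )

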
def infer_onnx(lst_input):
    import onnxruntime
    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("tensorRT/models/paraphrase-mpnet-base-v2")

    test_data = tokenizer(lst_input, padding="max_length", truncation=True)
    ort_session = onnxruntime.InferenceSession("tensorRT/models/paraphrase-mpnet-base-v2.onnx")
    print(len(ort_session.get_inputs()))
    ort_inputs = {
        ort_session.get_inputs()[0].name: np.array(test_data["input_ids"], dtype=np.int64),
        ort_session.get_inputs()[1].name: np.array(test_data["attention_mask"], dtype=np.int64),
    }
    print(ort_inputs)
    net_out = ort_session.run(None, ort_inputs)

    # net_out[1] assumes this particular graph exposes a second, pooled output.
    print(net_out[1])
    print(net_out[1].shape)


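# Parity-check helper (not in the original script): cosine similarity close
# to 1.0 between the PyTorch and ONNX embeddings indicates a faithful export.
# Inputs are assumed to be 1-D vectors.
def _cosine_similarity_sketch(a, b):
    import numpy as np

    a = np.asarray(a, dtype=np.float32)
    b = np.asarray(b, dtype=np.float32)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

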
if __name__ == "__main__":
    lst_input = ["Pham Minh Chinh is Vietnam's Prime Minister"]

    # Alternative checkpoints; uncomment one pair to export it instead.
    # outpath = "tensorRT/models/distiluse-base-multilingual-cased-v2"
    # model_name = "model/distiluse-base-multilingual-cased-v2"
    # outpath = "tensorRT/models/paraphrase-multilingual-MiniLM-L12-v2"
    # model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # outpath = "tensorRT/models/msmacro-base-4"
    # model_name = "model/msmacro-base-4"
    outpath = "tensorRT/models/model-sup-simcse-vn"
    model_name = "model/model-sup-simcse-vn"

    model = SentenceTransformer(model_name)
    embs = model.encode(lst_input)
    print(embs)
    print("-----------------")

    onnx_encoder = sentence_transformers_onnx(
        model,
        outpath,
        input_names=["input_ids", "attention_mask", "token_type_ids"],
        do_lower_case=False,
    )

    embs2 = onnx_encoder.encode(lst_input)
    print(embs2)

    print(embs.shape)
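    # Hedged parity check using the helper above, assuming a single input
    # sentence: embs[0] is the PyTorch embedding, embs2 the ONNX one.
    print("cosine similarity:", _cosine_similarity_sketch(embs[0], embs2))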