import torch
import transformers
from sentence_transformers import SentenceTransformer, models
class OnnxEncoder:
    """OnnxEncoder runs a SentenceTransformer model under ONNX Runtime."""

    def __init__(self, session, tokenizer, pooling, normalization):
        self.session = session
        self.tokenizer = tokenizer
        self.max_length = tokenizer.model_max_length
        self.pooling = pooling
        self.normalization = normalization
    def encode(self, sentences: list):
        sentences = [sentences] if isinstance(sentences, str) else sentences

        # Tokenize and convert the PyTorch tensors to NumPy arrays for ONNX Runtime.
        inputs = {
            k: v.numpy()
            for k, v in self.tokenizer(
                sentences,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            ).items()
        }

        # The first output of the exported graph holds the token embeddings,
        # shaped [batch_size, seq_len, hidden_size].
        hidden_state = self.session.run(None, inputs)
        token_embeddings = torch.Tensor(hidden_state[0])

        sentence_embedding = self.pooling.forward(
            features={
                "token_embeddings": token_embeddings,
                "attention_mask": torch.Tensor(inputs.get("attention_mask")),
                # CLS pooling uses the first token of each sequence.
                "cls_token_embeddings": token_embeddings[:, 0],
            },
        )

        if self.normalization is not None:
            sentence_embedding = self.normalization.forward(features=sentence_embedding)

        sentence_embedding = sentence_embedding["sentence_embedding"]

        # Squeeze the batch dimension when encoding a single sentence.
        if sentence_embedding.shape[0] == 1:
            sentence_embedding = sentence_embedding[0]

        return sentence_embedding
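# A minimal sketch of a parity check between the PyTorch model and its ONNX
# counterpart. `cosine_parity` is a hypothetical helper (not part of
# sentence-transformers); it assumes both encoders return a 1-D vector for a
# single input sentence, as OnnxEncoder.encode above does.
def cosine_parity(st_model, onnx_encoder, sentence: str) -> float:
    """Cosine similarity between the PyTorch and ONNX embeddings; ~1.0 means parity."""
    import numpy as np

    a = np.asarray(st_model.encode(sentence))
    b = np.asarray(onnx_encoder.encode(sentence))
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))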
def sentence_transformers_onnx(
    model,
    path,
    do_lower_case=True,
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    providers=["CPUExecutionProvider"],
):
    """Export a SentenceTransformer model and wrap it for ONNX Runtime inference.

    Parameters
    ----------
    model
        SentenceTransformer model.
    path
        Model file dedicated to session inference.
    do_lower_case
        Whether or not the tokenizer lower-cases its input.
    input_names
        Input fields needed by the Transformer.
    providers
        Whether to run the model on CPU or GPU:
        ["CPUExecutionProvider", "CUDAExecutionProvider"].
    """
    try:
        import onnxruntime
    except ImportError:
        raise ImportError("You need to install onnxruntime.")
    model.save(path)

    configuration = transformers.AutoConfig.from_pretrained(
        path, from_tf=False, local_files_only=True
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        path, do_lower_case=do_lower_case, from_tf=False, local_files_only=True
    )

    encoder = transformers.AutoModel.from_pretrained(
        path, from_tf=False, config=configuration, local_files_only=True
    )

    # Dummy input used to trace the graph ("xin chào" is Vietnamese for "hello").
    st = ["xin chào"]

    inputs = tokenizer(
        st,
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    )
    model.eval()

    with torch.no_grad():
        # Mark batch and sequence dimensions as dynamic so the exported graph
        # accepts inputs of any batch size and length.
        symbolic_names = {0: "batch_size", 1: "max_seq_len"}

        torch.onnx.export(
            encoder,
            # Order the traced inputs to match input_names; the tokenizer
            # already returns long tensors, so no re-wrapping is needed.
            args=tuple(inputs[name] for name in input_names),
            f=f"{path}.onnx",
            opset_version=13,  # Opset >= 13 is required for sentence transformers.
            do_constant_folding=True,
            input_names=input_names,
            output_names=["output"],
            dynamic_axes={
                "input_ids": symbolic_names,
                "attention_mask": symbolic_names,
                "token_type_ids": symbolic_names,
                "output": symbolic_names,
            },
        )
    # Pick up the pooling (and optional normalization) modules from the
    # SentenceTransformer pipeline so they can be re-applied after inference.
    pooling = None
    normalization = None
    for module in model:
        if isinstance(module, models.Pooling):
            pooling = module
        elif isinstance(module, models.Normalize):
            normalization = module
    return OnnxEncoder(
        session=onnxruntime.InferenceSession(f"{path}.onnx", providers=providers),
        tokenizer=tokenizer,
        pooling=pooling,
        normalization=normalization,
    )
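# A minimal sketch of building an ONNX Runtime session with optional GPU
# support. The graph-optimization setting is an assumption (not required by
# the export above), and CUDAExecutionProvider is only available with
# onnxruntime-gpu installed; otherwise ONNX Runtime falls back to CPU.
def make_session(onnx_path, use_gpu=False):
    import onnxruntime

    options = onnxruntime.SessionOptions()
    options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    providers = (
        ["CUDAExecutionProvider", "CPUExecutionProvider"]
        if use_gpu
        else ["CPUExecutionProvider"]
    )
    return onnxruntime.InferenceSession(onnx_path, options, providers=providers)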
def infer_onnx(lst_input):
    """Run a previously exported ONNX model directly, without OnnxEncoder."""
    import numpy as np
    import onnxruntime
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("tensorRT/models/paraphrase-mpnet-base-v2")
    test_data = tokenizer(lst_input, padding="max_length", truncation=True)

    ort_session = onnxruntime.InferenceSession("tensorRT/models/paraphrase-mpnet-base-v2.onnx")

    # Feed the inputs in the order the exported graph declares them.
    ort_inputs = {
        ort_session.get_inputs()[0].name: np.array(test_data["input_ids"], dtype=np.int64),
        ort_session.get_inputs()[1].name: np.array(test_data["attention_mask"], dtype=np.int64),
    }

    net_out = ort_session.run(None, ort_inputs)
    print(net_out[1])
    print(net_out[1].shape)
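# A minimal sketch of mean pooling over raw ONNX token embeddings in pure
# NumPy, mirroring what sentence-transformers' Pooling module does when
# pooling_mode_mean_tokens is set. It assumes the exported model's first
# output is token embeddings of shape [batch, seq_len, hidden]; adjust for
# your model if its outputs differ.
def mean_pool(token_embeddings, attention_mask):
    import numpy as np

    # Zero out padding positions, then average over the real tokens only.
    mask = np.expand_dims(np.asarray(attention_mask), -1).astype(np.float32)
    summed = (np.asarray(token_embeddings) * mask).sum(axis=1)
    counts = np.clip(mask.sum(axis=1), 1e-9, None)
    return summed / counts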
if __name__ == "__main__":
    lst_input = ["Pham Minh Chinh is Vietnam's Prime Minister"]

    # Alternative model/output pairs used during experimentation:
    # outpath = "tensorRT/models/distiluse-base-multilingual-cased-v2"
    # model_name = "model/distiluse-base-multilingual-cased-v2"
    # outpath = "tensorRT/models/paraphrase-multilingual-MiniLM-L12-v2"
    # model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # outpath = "tensorRT/models/msmacro-base-4"
    # model_name = "model/msmacro-base-4"
    # model_name = "model/paraphrase-mpnet-base-v2"
    outpath = "tensorRT/models/model-sup-simcse-vn"
    model_name = "model/model-sup-simcse-vn"

    model = SentenceTransformer(model_name)
    embs = model.encode(lst_input)
    print(embs)
    print("-----------------")

    onnx_encoder = sentence_transformers_onnx(
        model,
        outpath,
        input_names=["input_ids", "attention_mask", "token_type_ids"],
        do_lower_case=False,
    )
    embs2 = onnx_encoder.encode(lst_input)
    print(embs2)
    # infer_onnx(lst_input)
    print(embs.shape)
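    # A minimal sketch of an automated parity check between the two encoders.
    # The tolerance is an assumption; exported graphs typically match the
    # PyTorch model to within roughly 1e-4.
    import numpy as np

    np.testing.assert_allclose(
        np.asarray(embs).ravel(), np.asarray(embs2).ravel(), atol=1e-4
    )
    print("PyTorch and ONNX embeddings match within tolerance.")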