import torch
import transformers
from sentence_transformers import SentenceTransformer


class OnnxEncoder:
    """OnnxEncoder dedicated to running a SentenceTransformer under OnnxRuntime."""

    def __init__(self, session, tokenizer, pooling, normalization):
        self.session = session
        self.tokenizer = tokenizer
        self.max_length = tokenizer.model_max_length
        self.pooling = pooling
        self.normalization = normalization

    def encode(self, sentences: list):
        sentences = [sentences] if isinstance(sentences, str) else sentences

        # Tokenize with PyTorch tensors, then hand plain numpy arrays to the
        # ONNX session.
        inputs = {
            k: v.numpy()
            for k, v in self.tokenizer(
                sentences,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            ).items()
        }

        hidden_state = self.session.run(None, inputs)

        # Re-apply the SentenceTransformer pooling layer on top of the raw
        # token embeddings produced by the ONNX graph.
        token_embeddings = torch.Tensor(hidden_state[0])
        sentence_embedding = self.pooling.forward(
            features={
                "token_embeddings": token_embeddings,
                "attention_mask": torch.Tensor(inputs.get("attention_mask")),
                # CLS token embedding for each sentence in the batch.
                "cls_token_embeddings": token_embeddings[:, 0],
            },
        )

        if self.normalization is not None:
            sentence_embedding = self.normalization.forward(features=sentence_embedding)

        sentence_embedding = sentence_embedding["sentence_embedding"]
        if sentence_embedding.shape[0] == 1:
            sentence_embedding = sentence_embedding[0]
        return sentence_embedding


def sentence_transformers_onnx(
    model,
    path,
    do_lower_case=True,
    input_names=["input_ids", "attention_mask", "segment_ids"],
    providers=["CPUExecutionProvider"],
):
    """Export a SentenceTransformer model for OnnxRuntime inference.

    Parameters
    ----------
    model
        SentenceTransformer model.
    path
        Model file dedicated to session inference.
    do_lower_case
        Whether or not the tokenizer lowercases its input.
    input_names
        Fields needed by the Transformer.
    providers
        Either run the model on CPU or GPU:
        ["CPUExecutionProvider", "CUDAExecutionProvider"].
    """
    try:
        import onnxruntime
    except ImportError:
        raise ImportError("You need to install onnxruntime.")

    model.save(path)

    configuration = transformers.AutoConfig.from_pretrained(
        path, from_tf=False, local_files_only=True
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        path, do_lower_case=do_lower_case, from_tf=False, local_files_only=True
    )

    encoder = transformers.AutoModel.from_pretrained(
        path, from_tf=False, config=configuration, local_files_only=True
    )

    # A dummy batch is enough to trace the graph; dynamic_axes below makes the
    # exported model accept other shapes at inference time.
    st = ["xin chào"]  # "hello" in Vietnamese.
    print("max_length:", tokenizer.model_max_length)
    inputs = tokenizer(
        st,
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    )

    model.eval()
    with torch.no_grad():
        symbolic_names = {0: "batch_size", 1: "max_seq_len"}
        torch.onnx.export(
            encoder,
            args=(
                inputs["input_ids"].long(),
                inputs["attention_mask"].long(),
                inputs["token_type_ids"].long(),
            ),
            f=f"{path}.onnx",
            opset_version=13,  # Opset needs to be >= 13 for sentence transformers.
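            # Editor's note: dynamic_axes (below) marks the batch and sequence
            # dimensions as symbolic, so the exported graph accepts any batch
            # size and sequence length even though it was traced on one dummy
            # sentence padded to model_max_length.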
            do_constant_folding=True,
            input_names=input_names,
            output_names=["output"],
            dynamic_axes={
                "input_ids": symbolic_names,
                "attention_mask": symbolic_names,
                "token_type_ids": symbolic_names,
                "output": symbolic_names,
            },
        )

    # A SentenceTransformer is an nn.Sequential: module 0 is the Transformer,
    # module 1 the Pooling layer, and module 2 (when present) a Normalize layer.
    pooling = None
    normalization = None
    for idx, module in enumerate(model):
        if idx == 1:
            pooling = module
        if idx == 2:
            normalization = module

    return OnnxEncoder(
        session=onnxruntime.InferenceSession(f"{path}.onnx", providers=providers),
        tokenizer=tokenizer,
        pooling=pooling,
        normalization=normalization,
    )


def infer_onnx(lst_input):
    import onnxruntime
    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("tensorRT/models/paraphrase-mpnet-base-v2")
    test_data = tokenizer(lst_input, padding="max_length", truncation=True)
    ort_session = onnxruntime.InferenceSession(
        "tensorRT/models/paraphrase-mpnet-base-v2.onnx"
    )
    ort_inputs = {
        ort_session.get_inputs()[0].name: np.array(test_data["input_ids"], dtype=np.int64),
        ort_session.get_inputs()[1].name: np.array(test_data["attention_mask"], dtype=np.int64),
    }
    net_out = ort_session.run(None, ort_inputs)
    print(net_out[1])
    print(net_out[1].shape)


if __name__ == "__main__":
    lst_input = ["Pham Minh Chinh is Vietnam's Prime Minister"]

    # Alternative model/output pairs, kept for reference:
    # outpath = "tensorRT/models/distiluse-base-multilingual-cased-v2"
    # model_name = "model/distiluse-base-multilingual-cased-v2"
    # outpath = "tensorRT/models/paraphrase-multilingual-MiniLM-L12-v2"
    # model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # outpath = "tensorRT/models/msmacro-base-4"
    # model_name = "model/msmacro-base-4"
    # model_name = "model/paraphrase-mpnet-base-v2"
    outpath = "tensorRT/models/model-sup-simcse-vn"
    model_name = "model/model-sup-simcse-vn"

    model = SentenceTransformer(model_name)
    embs = model.encode(lst_input)
    print(embs)
    print("-----------------")

    onnx_encoder = sentence_transformers_onnx(
        model,
        outpath,
        input_names=["input_ids", "attention_mask", "token_type_ids"],
        do_lower_case=False,
    )
    embs2 = onnx_encoder.encode(lst_input)
    print(embs2)
    # infer_onnx(lst_input)
    print(embs.shape)
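
# --- Sanity-check sketch (editor's addition, not part of the original script) ---
# A minimal helper to verify that the ONNX encoder reproduces the PyTorch
# embeddings, comparing the two (flattened) vectors by cosine similarity. The
# name `compare_embeddings` and the tolerance are illustrative choices. To use
# it, move this definition above the __main__ guard and call e.g.
# `compare_embeddings(model, onnx_encoder, lst_input)`.
def compare_embeddings(model, onnx_encoder, sentences, atol=1e-4):
    import numpy as np

    reference = np.asarray(model.encode(sentences), dtype=np.float32).reshape(-1)
    candidate = np.asarray(onnx_encoder.encode(sentences), dtype=np.float32).reshape(-1)
    cosine = float(
        reference @ candidate / (np.linalg.norm(reference) * np.linalg.norm(candidate))
    )
    print(f"cosine(pytorch, onnx) = {cosine:.6f}")
    assert cosine > 1 - atol, "ONNX embeddings diverge from the PyTorch reference."
    return cosine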