import torch
import transformers
from sentence_transformers import SentenceTransformer, models


class OnnxEncoder:
    """Encoder that runs a SentenceTransformer model under ONNX Runtime."""

    def __init__(self, session, tokenizer, pooling, normalization):
        self.session = session
        self.tokenizer = tokenizer
        self.max_length = tokenizer.model_max_length
        self.pooling = pooling
        self.normalization = normalization

    def encode(self, sentences: list):
        sentences = [sentences] if isinstance(sentences, str) else sentences

        inputs = {
            k: v.numpy()
            for k, v in self.tokenizer(
                sentences,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
            ).items()
        }

        # Drop tokenizer outputs the ONNX graph does not expect
        # (e.g. token_type_ids for MPNet-style models).
        session_input_names = {i.name for i in self.session.get_inputs()}
        inputs = {k: v for k, v in inputs.items() if k in session_input_names}

        hidden_state = self.session.run(None, inputs)
        token_embeddings = torch.from_numpy(hidden_state[0])

        sentence_embedding = self.pooling.forward(
            features={
                "token_embeddings": token_embeddings,
                "attention_mask": torch.from_numpy(inputs["attention_mask"]),
                # The CLS embedding is the first token of every sequence.
                "cls_token_embeddings": token_embeddings[:, 0],
            },
        )

        if self.normalization is not None:
            sentence_embedding = self.normalization.forward(features=sentence_embedding)

        sentence_embedding = sentence_embedding["sentence_embedding"]

        if sentence_embedding.shape[0] == 1:
            sentence_embedding = sentence_embedding[0]

        return sentence_embedding
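

# For reference, a minimal sketch of the mean pooling that sentence-transformers'
# Pooling module applies by default (an assumption: CLS- or max-pooling models
# configure the module differently). Padding positions are masked out before
# averaging.
def mean_pooling_sketch(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Broadcast the attention mask over the hidden dimension, zero out padding,
    # then average the remaining token embeddings per sentence.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)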


def sentence_transformers_onnx(
    model,
    path,
    do_lower_case=True,
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    providers=["CPUExecutionProvider"],
):
    """OnxRuntime for sentence transformers.



    Parameters

    ----------

    model

        SentenceTransformer model.

    path

        Model file dedicated to session inference.

    do_lower_case

        Either or not the model is cased.

    input_names

        Fields needed by the Transformer.

    providers

        Either run the model on CPU or GPU: ["CPUExecutionProvider", "CUDAExecutionProvider"].



    """
    try:
        import onnxruntime
    except ImportError:
        raise ImportError("You need to install onnxruntime.")

    model.save(path)

    configuration = transformers.AutoConfig.from_pretrained(
        path, from_tf=False, local_files_only=True
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        path, do_lower_case=do_lower_case, from_tf=False, local_files_only=True
    )

    encoder = transformers.AutoModel.from_pretrained(
        path, from_tf=False, config=configuration, local_files_only=True
    )

    st = ["xin chào"]
    print("max_length: ",tokenizer.__dict__["model_max_length"])
    inputs = tokenizer(
        st,
        padding='max_length',
        truncation=True,
        max_length=tokenizer.__dict__["model_max_length"],
        return_tensors="pt",
    )
    print(inputs)
    model.eval()

    with torch.no_grad():
        symbolic_names = {0: "batch_size", 1: "max_seq_len"}

        # Keep only the fields this tokenizer actually produced
        # (e.g. MPNet-style tokenizers return no token_type_ids).
        input_names = [name for name in input_names if name in inputs]

        torch.onnx.export(
            encoder,
            # Dummy inputs must be passed in the same order as input_names.
            args=tuple(inputs[name] for name in input_names),
            f=f"{path}.onnx",
            opset_version=13,  # ONNX opset needs to be >= 13 for sentence transformers.
            do_constant_folding=True,
            input_names=input_names,
            output_names=["output"],
            # Let batch size and sequence length vary at inference time.
            dynamic_axes={name: symbolic_names for name in input_names + ["output"]},
        )

        # A SentenceTransformer is a Sequential of
        # [transformer, pooling, (optional normalization)] modules.
        pooling = model[1]
        normalization = model[2] if len(model) > 2 else None

        return OnnxEncoder(
            session=onnxruntime.InferenceSession(f"{path}.onnx", providers=providers),
            tokenizer=tokenizer,
            pooling=pooling,
            normalization=normalization,
        )
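
# A minimal sketch of opening the exported graph on GPU (an assumption:
# onnxruntime-gpu is installed). Providers are tried left to right, so the
# session falls back to CPU when CUDA is unavailable:
#
#     session = onnxruntime.InferenceSession(
#         f"{path}.onnx",
#         providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
#     )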

def infer_onnx(lst_input):
    """Run raw ONNX inference for debugging, without the OnnxEncoder wrapper."""
    import onnxruntime
    import numpy as np
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("tensorRT/models/paraphrase-mpnet-base-v2")

    test_data = tokenizer(lst_input, padding="max_length", truncation=True)
    ort_session = onnxruntime.InferenceSession(
        "tensorRT/models/paraphrase-mpnet-base-v2.onnx"
    )
    ort_inputs = {
        ort_session.get_inputs()[0].name: np.array(test_data["input_ids"], dtype=np.int64),
        ort_session.get_inputs()[1].name: np.array(test_data["attention_mask"], dtype=np.int64),
    }
    net_out = ort_session.run(None, ort_inputs)

    # The output layout depends on how the graph was exported; this model
    # exposes the pooled embedding as its second output.
    print(net_out[1])
    print(net_out[1].shape)

if __name__ == "__main__":
    lst_input = ["Pham Minh Chinh is Vietnam's Prime Minister"]

    outpath = 'tensorRT/models/distiluse-base-multilingual-cased-v2'
    model_name = "model/distiluse-base-multilingual-cased-v2"

    outpath = 'tensorRT/models/paraphrase-multilingual-MiniLM-L12-v2'
    model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    outpath = 'tensorRT/models/msmacro-base-4'
    model_name = "model/msmacro-base-4"
    
    outpath = 'tensorRT/models/model-sup-simcse-vn'
    model_name = "model/model-sup-simcse-vn"
    
    # model_name = "model/paraphrase-mpnet-base-v2"
    model = SentenceTransformer(model_name)
    embs = model.encode(lst_input)
    print(embs)
    print('-----------------')
    
    onnx_encoder = sentence_transformers_onnx(model, outpath,input_names=["input_ids", "attention_mask", "token_type_ids"], do_lower_case=False)
    # onnx_encoder = sentence_transformers_onnx(model, outpath)
    embs2 = onnx_encoder.encode(lst_input)
    print(embs2)
    # infer_onnx(lst_input)
    print(embs.shape)
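
    # Sanity check: if the export is faithful, the PyTorch and ONNX embeddings
    # should match closely, so their cosine similarity should be ~1.0.
    # (Assumes a single input sentence, so both embeddings are single vectors.)
    import numpy as np

    a = np.asarray(embs).reshape(-1)
    b = np.asarray(embs2).reshape(-1)
    print("cosine similarity:", float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b))))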