# Copyright 2022, Lefebvre Dalloz Services
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from fastapi import FastAPI
from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
from transformers import AutoTokenizer, BatchEncoding, TensorType

app = FastAPI()

# Enable all ONNX Runtime graph optimizations (node fusion, constant folding, etc.).
options = SessionOptions()
options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

# Load the exported ONNX model on GPU; this requires the onnxruntime-gpu
# package and a CUDA-capable device.
model = InferenceSession("triton_models/model.onnx", options, providers=["CUDAExecutionProvider"])
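# Note: the session above pins CUDAExecutionProvider. On a machine without a
# GPU, one possible variant (not part of the original code) is to list both
# providers, e.g. providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
# so ONNX Runtime falls back to the first provider that is actually available.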
# Tokenizer matching the checkpoint the ONNX model was exported from.
tokenizer = AutoTokenizer.from_pretrained("philschmid/MiniLM-L6-H384-uncased-sst2")


# Expose prediction as an HTTP GET endpoint; `query` is read from the query string.
@app.get("/predict")
def predict(query: str):
    # Tokenize the input text into NumPy tensors, the format ONNX Runtime accepts.
    encode_dict: BatchEncoding = tokenizer(
        text=query,
        max_length=128,
        truncation=True,
        return_tensors=TensorType.NUMPY,
    )
    # Run inference; the first output holds the raw classification logits.
    result: np.ndarray = model.run(None, dict(encode_dict))[0]
    return result.tolist()
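

# A minimal usage sketch, assuming this file is saved as server.py and uvicorn
# is installed (both assumptions, not stated in the original):
#
#     uvicorn server:app --port 8000
#
# The endpoint can then be queried from a shell:
#
#     curl "http://localhost:8000/predict?query=This+movie+was+great"
#
# The response is the raw SST-2 logits as a nested JSON list, e.g.
# [[-2.1, 2.3]] (illustrative values).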