otherhalf-dev committed · Commit f9a3600 · verified · Parent: fb63d75

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+whisper/1/whisper_large-v3-turbo/decoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
+whisper/1/whisper_large-v3-turbo/encoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
infer_bls/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (4.92 kB)
 
infer_bls/1/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (4.59 kB)
 
infer_bls/1/model.py ADDED
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+import triton_python_backend_utils as pb_utils
+import numpy as np
+import json
+import torch
+from torch.utils.dlpack import to_dlpack
+import re
+from .tokenizer import get_tokenizer
+from collections import OrderedDict
+from pathlib import Path
+
+def read_config(component, engine_dir):
+    config_path = engine_dir / component / 'config.json'
+    with open(config_path, 'r') as f:
+        config = json.load(f)
+    model_config = OrderedDict()
+    model_config.update(config['pretrained_config'])
+    model_config.update(config['build_config'])
+    return model_config
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        self.model_config = model_config = json.loads(args['model_config'])
+
+        # Get OUTPUT0 configuration
+        output0_config = pb_utils.get_output_config_by_name(
+            model_config, "TRANSCRIPTS")
+        # Convert Triton types to numpy types
+        self.out0_dtype = pb_utils.triton_string_to_numpy(
+            output0_config['data_type'])
+        encoder_config = read_config('encoder', Path(self.model_config['parameters']['engine_dir']["string_value"]))
+        self.tokenizer = get_tokenizer(num_languages=encoder_config['num_languages'])
+        self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
+        self.device = torch.device("cuda")
+
+    def process_batch(self, wav, wav_len, prompt_id):
+        wav = torch.from_numpy(wav[0]).to(self.device)
+        wav_tensor = pb_utils.Tensor.from_dlpack("WAV", to_dlpack(wav.unsqueeze(0)))
+        wav_len_tensor = pb_utils.Tensor("WAV_LENS", np.array([[wav_len]], np.int32))
+        prompt_id = torch.tensor(prompt_id).unsqueeze(0)
+
+        prompt_id = pb_utils.Tensor("DECODER_INPUT_IDS", prompt_id.numpy().astype(np.int32))
+        infer_request = pb_utils.InferenceRequest(
+            model_name="whisper",
+            requested_output_names=["OUTPUT_IDS"],
+            inputs=[wav_tensor, wav_len_tensor, prompt_id]
+        )
+        inference_response = infer_request.exec()
+        if inference_response.has_error():
+            raise pb_utils.TritonModelException(inference_response.error().message())
+        else:
+            output_ids = pb_utils.get_output_tensor_by_name(inference_response, "OUTPUT_IDS")
+            return output_ids.as_numpy()
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference is requested
+        for this model.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+        # Every Python backend must iterate through the list of requests and
+        # create an instance of pb_utils.InferenceResponse for each of them.
+        # Avoid storing any of the input Tensors in class attributes, as they
+        # will be overridden in subsequent inference requests. Make a copy of
+        # the underlying NumPy array and store it if needed.
+        responses = []
+        for request in requests:
+            # Perform inference on the request and append it to the responses list
+            in_0 = pb_utils.get_input_tensor_by_name(request, "TEXT_PREFIX")
+            text_prefix = in_0.as_numpy().tolist()[0][0].decode('utf-8')
+            if text_prefix == "":
+                text_prefix = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
+            prompt_id = self.tokenizer.encode(text_prefix, allowed_special=self.tokenizer.special_tokens_set)
+
+            wav = pb_utils.get_input_tensor_by_name(request, "WAV").as_numpy()
+            assert wav.shape[0] == 1, "Only batch size 1 is supported for now"
+            wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy()
+            wav_len = wav_len.item()
+
+            output_ids = self.process_batch(wav, wav_len, prompt_id)
+            s = self.tokenizer.decode(output_ids)
+            # Strip special tokens such as <|en|> or timestamp markers from the text
+            s = re.sub(r'<\|.*?\|>', '', s)
+            sentence = np.array([s])
+            out0 = pb_utils.Tensor("TRANSCRIPTS", sentence.astype(self.out0_dtype))
+            inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
+            responses.append(inference_response)
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+        Implementing `finalize` function is optional. This function allows
+        the model to perform any necessary clean ups before exit.
+        """
+        print('Cleaning up...')
infer_bls/1/multilingual.tiktoken ADDED
The diff for this file is too large to render.
 
infer_bls/1/tokenizer.py ADDED
@@ -0,0 +1,184 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+import base64
+import os
+
+import tiktoken
+
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+    "yue": "cantonese",
+}
+
+
+def get_tokenizer(name: str = "multilingual",
+                  num_languages: int = 99,
+                  tokenizer_dir: str = None):
+    if tokenizer_dir is None:
+        vocab_path = os.path.join(os.path.dirname(__file__),
+                                  f"{name}.tiktoken")
+    else:
+        vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken")
+    ranks = {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in open(vocab_path) if line)
+    }
+    n_vocab = len(ranks)
+    special_tokens = {}
+
+    specials = [
+        "<|endoftext|>",
+        "<|startoftranscript|>",
+        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nospeech|>",
+        "<|notimestamps|>",
+        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
+    ]
+
+    for token in specials:
+        special_tokens[token] = n_vocab
+        n_vocab += 1
+
+    return tiktoken.Encoding(
+        name=os.path.basename(vocab_path),
+        explicit_n_vocab=n_vocab,
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        mergeable_ranks=ranks,
+        special_tokens=special_tokens,
+    )
+
+
+if __name__ == "__main__":
+    enc = get_tokenizer()
+    mytest_str = "<|startofprev|> Nvidia<|startoftranscript|><|en|><|transcribe|>"
+    encoding = enc.encode(mytest_str, allowed_special=enc.special_tokens_set)
+    mystr = enc.decode([50361, 45, 43021, 50258, 50259, 50359])
+    mystr2 = enc.decode([50361, 46284, 50258, 50259, 50359])
+    #print(encoding, mystr, mystr2)
+    print(
+        enc.encode("<|startoftranscript|>",
+                   allowed_special=enc.special_tokens_set)[0])
+    print(
+        enc.encode("<|endoftext|>",
+                   allowed_special=enc.special_tokens_set)[0])
+    my_zh_str = "好好学习"
+    encoding = enc.encode(my_zh_str, allowed_special=enc.special_tokens_set)
+    decoding = enc.decode(encoding)
+    print(type(decoding))
+    #print(encoding, decoding)
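
Note: the BLS model above calls get_tokenizer with num_languages taken from the encoder config (100 for large-v3-turbo), which shifts the special-token IDs by one relative to the 99-language default exercised in the __main__ block. A minimal sketch of building a non-default decoder prompt with this tokenizer (the module path and the "zh" language choice are illustrative, not part of the commit):

# Sketch: build a Mandarin transcription prompt; assumes tokenizer.py and
# multilingual.tiktoken are on the import path.
from tokenizer import get_tokenizer

enc = get_tokenizer(num_languages=100)  # large-v3(-turbo) has 100 language tokens
prompt = "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>"
prompt_ids = enc.encode(prompt, allowed_special=enc.special_tokens_set)
print(prompt_ids)  # expected [50258, 50260, 50360, 50364] given the ordering above
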
infer_bls/config.pbtxt ADDED
@@ -0,0 +1,61 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "infer_bls"
+backend: "python"
+max_batch_size: 64
+
+parameters [
+  {
+    key: "engine_dir"
+    value: { string_value: "/data/whisper/model_repo_whisper/whisper/1/whisper_large-v3-turbo" }
+  }
+]
+
+input [
+  {
+    name: "TEXT_PREFIX"
+    data_type: TYPE_STRING
+    dims: [1]
+  },
+  {
+    name: "WAV"
+    data_type: TYPE_FP32
+    dims: [-1]
+  },
+  {
+    name: "WAV_LENS"
+    data_type: TYPE_INT32
+    dims: [1]
+    optional: true
+  }
+]
+
+output [
+  {
+    name: "TRANSCRIPTS"
+    data_type: TYPE_STRING
+    dims: [1]
+  }
+]
+
+dynamic_batching {
+  max_queue_delay_microseconds: 100
+}
+instance_group [
+  {
+    count: 8
+    kind: KIND_CPU
+  }
+]
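
As a usage sketch, a tritonclient call along these lines would exercise this entry point (the server URL, the silent placeholder waveform, and the empty TEXT_PREFIX default are assumptions, not part of the commit):

# Hedged client sketch for the "infer_bls" model; assumes a Triton server on
# localhost:8000 and a 16 kHz mono float32 waveform in `samples`.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

samples = np.zeros(16000, dtype=np.float32)               # placeholder: 1 s of silence
wav = samples[np.newaxis, :]                              # shape [1, num_samples]
wav_len = np.array([[samples.shape[0]]], dtype=np.int32)  # shape [1, 1]
text_prefix = np.array([[b""]], dtype=object)             # empty -> default English prompt

inputs = [
    httpclient.InferInput("TEXT_PREFIX", text_prefix.shape, "BYTES"),
    httpclient.InferInput("WAV", wav.shape, "FP32"),
    httpclient.InferInput("WAV_LENS", wav_len.shape, "INT32"),
]
inputs[0].set_data_from_numpy(text_prefix)
inputs[1].set_data_from_numpy(wav)
inputs[2].set_data_from_numpy(wav_len)

result = client.infer("infer_bls", inputs,
                      outputs=[httpclient.InferRequestedOutput("TRANSCRIPTS")])
print(result.as_numpy("TRANSCRIPTS").flatten()[0].decode("utf-8"))
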
whisper/1/__pycache__/fbank.cpython-310.pyc ADDED
Binary file (3.22 kB)
 
whisper/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (3.16 kB)
 
whisper/1/fbank.py ADDED
@@ -0,0 +1,98 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
+import numpy as np
+import torch
+import torch.nn.functional as F
+import os
+
+
+def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
+    """
+    Load the mel filterbank matrix for projecting the STFT into a Mel spectrogram.
+    Allows decoupling the librosa dependency; saved using:
+
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
+        )
+    """
+    assert n_mels in (80, 128), f"Unsupported n_mels: {n_mels}"
+    with np.load(
+        os.path.join(os.path.dirname(__file__), "mel_filters.npz")
+    ) as f:
+        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+
+
+def log_mel_spectrogram(
+    audio: torch.Tensor,
+    filters: torch.Tensor,
+    n_mels: int = 128,
+    n_fft: int = 400,
+    hop_length: int = 160,
+):
+    """
+    Compute the log-Mel spectrogram of an audio waveform.
+
+    Parameters
+    ----------
+    audio: torch.Tensor, shape = (*)
+        A Tensor containing the audio waveform at 16 kHz
+
+    filters: torch.Tensor
+        The mel filterbank matrix returned by `mel_filters`
+
+    n_mels: int
+        The number of Mel-frequency filters; only 80 or 128 is supported
+
+    Returns
+    -------
+    torch.Tensor, shape = (n_mels, n_frames)
+        A Tensor that contains the log-Mel spectrogram
+    """
+    window = torch.hann_window(n_fft).to(audio.device)
+    stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    # cast to float16
+    log_spec = log_spec.half()
+    return log_spec
+
+
+class FeatureExtractor(torch.nn.Module):
+    """Computes log-Mel spectrogram features on the GPU."""
+
+    def __init__(self, n_mels: int = 128):
+        super().__init__()
+        self.device = torch.device("cuda")
+        self.n_mels = n_mels
+        self.filters = mel_filters(self.device, n_mels=self.n_mels)
+
+    def compute_feature(self, wav, padding_target_len: int = 3000):
+        """
+        Compute the log-Mel spectrogram of the input audio waveform.
+        mel: [1, feature_dim, seq_len]
+        """
+        mel = log_mel_spectrogram(wav, self.filters)
+        assert padding_target_len <= 3000, \
+            f"padding must be at most 3000 frames, got {padding_target_len}"
+        if mel.shape[1] < padding_target_len:
+            mel = F.pad(mel, (0, padding_target_len - mel.shape[1]), mode='constant')
+        if mel.shape[1] % 2:
+            # pad to even length for the remove_padding case, since conv1d requires even length
+            mel = torch.nn.functional.pad(mel, (0, 1))
+        mel = mel.unsqueeze(0)
+        return mel
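
A minimal usage sketch for FeatureExtractor, assuming a CUDA device is available and fbank.py sits next to mel_filters.npz on the import path (the random waveform is illustrative):

# Sketch: compute padded log-Mel features for 5 s of fake 16 kHz audio.
import torch
from fbank import FeatureExtractor

extractor = FeatureExtractor(n_mels=128)
wav = torch.randn(16000 * 5, device="cuda")  # 5 s of fake 16 kHz audio
mel = extractor.compute_feature(wav)         # padded to 3000 frames by default
print(mel.shape)                             # torch.Size([1, 128, 3000])
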
whisper/1/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
+size 4271
whisper/1/model.py ADDED
@@ -0,0 +1,101 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import json
+from pathlib import Path
+
+import torch
+from torch.utils.dlpack import from_dlpack
+
+import triton_python_backend_utils as pb_utils
+from tensorrt_llm.runtime import ModelRunnerCpp
+from tensorrt_llm.bindings import GptJsonConfig
+from .fbank import FeatureExtractor
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        parameters = json.loads(args['model_config'])['parameters']
+        for key, value in parameters.items():
+            parameters[key] = value["string_value"]
+        engine_dir = parameters["engine_dir"]
+        json_config = GptJsonConfig.parse_file(Path(engine_dir) / 'decoder' / 'config.json')
+        assert json_config.model_config.supports_inflight_batching
+        runner_kwargs = dict(engine_dir=engine_dir,
+                             is_enc_dec=True,
+                             max_batch_size=64,
+                             max_input_len=3000,
+                             max_output_len=96,
+                             max_beam_width=1,
+                             debug_mode=False,
+                             kv_cache_free_gpu_memory_fraction=0.5)
+        self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs)
+        self.feature_extractor = FeatureExtractor(n_mels=int(parameters["n_mels"]))
+        self.zero_pad = parameters["zero_pad"] == "true"
+        self.eot_id = 50257
+
+    def execute(self, requests):
+        """
+        This function receives a list of requests (`pb_utils.InferenceRequest`),
+        performs inference on every request, and appends the result to responses.
+        """
+        responses, batch_mel_list, decoder_input_ids = [], [], []
+        for request in requests:
+            wav_tensor = pb_utils.get_input_tensor_by_name(request, "WAV")
+            wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy().item()
+            prompt_ids = pb_utils.get_input_tensor_by_name(request, "DECODER_INPUT_IDS").as_numpy()
+            wav = from_dlpack(wav_tensor.to_dlpack())
+            wav = wav[:, :wav_len]
+            padding = 0 if self.zero_pad else 3000
+            mel = self.feature_extractor.compute_feature(wav[0].to('cuda'), padding_target_len=padding).transpose(1, 2)
+            batch_mel_list.append(mel.squeeze(0))
+            decoder_input_ids.append(torch.tensor(prompt_ids, dtype=torch.int32, device='cuda').squeeze(0))
+
+        decoder_input_ids = torch.nn.utils.rnn.pad_sequence(decoder_input_ids, batch_first=True, padding_value=self.eot_id)
+        mel_input_lengths = torch.tensor([mel.shape[0] for mel in batch_mel_list], dtype=torch.int32, device='cuda')
+
+        outputs = self.model_runner_cpp.generate(
+            batch_input_ids=decoder_input_ids,
+            encoder_input_features=batch_mel_list,
+            encoder_output_lengths=mel_input_lengths // 2,
+            max_new_tokens=96,
+            end_id=self.eot_id,
+            pad_id=self.eot_id,
+            num_beams=1,
+            output_sequence_lengths=True,
+            return_dict=True)
+        torch.cuda.synchronize()
+
+        output_ids = outputs['output_ids'].cpu().numpy()
+
+        for i, output_id in enumerate(output_ids):
+            response = pb_utils.InferenceResponse(output_tensors=[
+                pb_utils.Tensor("OUTPUT_IDS", output_id[0])
+            ])
+            responses.append(response)
+        assert len(responses) == len(requests)
+        return responses
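
The `mel_input_lengths // 2` above reflects the Whisper encoder front end, whose second convolution has stride 2 and halves the number of mel frames before the transformer layers. A worked check of the frame arithmetic (a sketch; the values follow from the constants used above: 16 kHz sampling, hop_length=160):

# For a full 30 s window, the numbers line up with the engine configs below.
sample_rate, hop_length = 16000, 160
samples = 30 * sample_rate            # 480000 samples
mel_frames = samples // hop_length    # 3000 mel frames (encoder max_input_len)
encoder_frames = mel_frames // 2      # 1500 positions (max_position_embeddings)
print(mel_frames, encoder_frames)     # 3000 1500
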
whisper/1/whisper_large-v3-turbo/decoder/config.json ADDED
@@ -0,0 +1,170 @@
+{
+    "version": "0.15.0.dev2024101500",
+    "pretrained_config": {
+        "architecture": "DecoderModel",
+        "dtype": "float16",
+        "vocab_size": 51866,
+        "hidden_size": 1280,
+        "num_hidden_layers": 4,
+        "num_attention_heads": 20,
+        "hidden_act": "gelu",
+        "logits_dtype": "float16",
+        "norm_epsilon": 1e-05,
+        "position_embedding_type": "learned_absolute",
+        "max_position_embeddings": 448,
+        "num_key_value_heads": 20,
+        "intermediate_size": 5120,
+        "mapping": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cp_size": 1,
+            "tp_size": 1,
+            "pp_size": 1,
+            "moe_tp_size": 1,
+            "moe_ep_size": 1
+        },
+        "quantization": {
+            "quant_algo": null,
+            "kv_cache_quant_algo": null,
+            "group_size": 128,
+            "smoothquant_val": 0.5,
+            "clamp_val": null,
+            "has_zero_point": false,
+            "pre_quant_scale": false,
+            "exclude_modules": null
+        },
+        "use_parallel_embedding": false,
+        "embedding_sharding_dim": 0,
+        "share_embedding_table": false,
+        "head_size": 64,
+        "qk_layernorm": false,
+        "use_prompt_tuning": false,
+        "has_position_embedding": true,
+        "layernorm_type": 0,
+        "has_attention_qkvo_bias": true,
+        "has_mlp_bias": true,
+        "has_model_final_layernorm": true,
+        "has_embedding_layernorm": false,
+        "has_embedding_scale": false,
+        "ffn_hidden_size": 5120,
+        "q_scaling": 1.0,
+        "layernorm_position": 0,
+        "relative_attention": false,
+        "max_distance": 0,
+        "num_buckets": 0,
+        "model_type": "whisper",
+        "rescale_before_lm_head": false,
+        "encoder_hidden_size": 1280,
+        "encoder_num_heads": 20,
+        "encoder_head_size": null,
+        "skip_cross_kv": false,
+        "type_vocab_size": null,
+        "encoder_num_kv_heads": null,
+        "skip_cross_qkv": false,
+        "mlp_type": 0,
+        "residual_scaling": 1.0,
+        "has_lm_head_bias": false
+    },
+    "build_config": {
+        "max_input_len": 14,
+        "max_seq_len": 114,
+        "opt_batch_size": null,
+        "max_batch_size": 64,
+        "max_beam_width": 4,
+        "max_num_tokens": 7296,
+        "opt_num_tokens": 256,
+        "max_prompt_embedding_table_size": 0,
+        "kv_cache_type": "PAGED",
+        "gather_context_logits": false,
+        "gather_generation_logits": false,
+        "strongly_typed": true,
+        "force_num_profiles": null,
+        "profiling_verbosity": "layer_names_only",
+        "enable_debug_output": false,
+        "max_draft_len": 0,
+        "speculative_decoding_mode": 1,
+        "use_refit": false,
+        "input_timing_cache": null,
+        "output_timing_cache": "model.cache",
+        "lora_config": {
+            "lora_dir": [],
+            "lora_ckpt_source": "hf",
+            "max_lora_rank": 64,
+            "lora_target_modules": [],
+            "trtllm_modules_to_hf_modules": {}
+        },
+        "auto_parallel_config": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cluster_key": "H100-PCIe",
+            "cluster_info": null,
+            "sharding_cost_model": "alpha_beta",
+            "comm_cost_model": "alpha_beta",
+            "enable_pipeline_parallelism": false,
+            "enable_shard_unbalanced_shape": false,
+            "enable_shard_dynamic_shape": false,
+            "enable_reduce_scatter": true,
+            "builder_flags": null,
+            "debug_mode": false,
+            "infer_shape": true,
+            "validation_mode": false,
+            "same_buffer_io": {
+                "past_key_value_(\\d+)": "present_key_value_\\1"
+            },
+            "same_spec_io": {},
+            "sharded_io_allowlist": [
+                "past_key_value_\\d+",
+                "present_key_value_\\d*"
+            ],
+            "fill_weights": false,
+            "parallel_config_cache": null,
+            "profile_cache": null,
+            "dump_path": null,
+            "debug_outputs": []
+        },
+        "weight_sparsity": false,
+        "weight_streaming": false,
+        "plugin_config": {
+            "dtype": "float16",
+            "bert_attention_plugin": "float16",
+            "gpt_attention_plugin": "float16",
+            "gemm_plugin": "float16",
+            "gemm_swiglu_plugin": null,
+            "fp8_rowwise_gemm_plugin": null,
+            "smooth_quant_gemm_plugin": null,
+            "identity_plugin": null,
+            "layernorm_quantization_plugin": null,
+            "rmsnorm_quantization_plugin": null,
+            "nccl_plugin": null,
+            "lookup_plugin": null,
+            "lora_plugin": null,
+            "weight_only_groupwise_quant_matmul_plugin": null,
+            "weight_only_quant_matmul_plugin": null,
+            "smooth_quant_plugins": true,
+            "quantize_per_token_plugin": false,
+            "quantize_tensor_plugin": false,
+            "moe_plugin": null,
+            "mamba_conv1d_plugin": "auto",
+            "low_latency_gemm_plugin": null,
+            "context_fmha": true,
+            "bert_context_fmha_fp32_acc": false,
+            "paged_kv_cache": true,
+            "remove_input_padding": true,
+            "reduce_fusion": false,
+            "enable_xqa": false,
+            "tokens_per_block": 64,
+            "use_paged_context_fmha": false,
+            "use_fp8_context_fmha": false,
+            "multiple_profiles": false,
+            "paged_state": false,
+            "streamingllm": false,
+            "manage_weights": false,
+            "use_fused_mlp": true,
+            "pp_reduce_scatter": false
+        },
+        "use_strip_plan": false,
+        "max_encoder_input_len": 3000,
+        "use_fused_mlp": "enable",
+        "monitor_memory": false
+    }
+}
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3371d5242ccff35901b2dad4ec911ca6bff53d33e021b4af3af674f1cadcf49
3
+ size 477983452
whisper/1/whisper_large-v3-turbo/encoder/config.json ADDED
@@ -0,0 +1,147 @@
+{
+    "version": "0.15.0.dev2024101500",
+    "pretrained_config": {
+        "architecture": "WhisperEncoder",
+        "dtype": "float16",
+        "vocab_size": 51866,
+        "hidden_size": 1280,
+        "num_hidden_layers": 32,
+        "num_attention_heads": 20,
+        "hidden_act": "gelu",
+        "logits_dtype": "float32",
+        "norm_epsilon": 1e-05,
+        "position_embedding_type": "learned_absolute",
+        "max_position_embeddings": 1500,
+        "num_key_value_heads": 20,
+        "intermediate_size": 5120,
+        "mapping": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cp_size": 1,
+            "tp_size": 1,
+            "pp_size": 1,
+            "moe_tp_size": 1,
+            "moe_ep_size": 1
+        },
+        "quantization": {
+            "quant_algo": null,
+            "kv_cache_quant_algo": null,
+            "group_size": 128,
+            "smoothquant_val": 0.5,
+            "clamp_val": null,
+            "has_zero_point": false,
+            "pre_quant_scale": false,
+            "exclude_modules": null
+        },
+        "use_parallel_embedding": false,
+        "embedding_sharding_dim": 0,
+        "share_embedding_table": false,
+        "head_size": 64,
+        "qk_layernorm": false,
+        "has_position_embedding": true,
+        "n_mels": 128,
+        "num_languages": 100
+    },
+    "build_config": {
+        "max_input_len": 3000,
+        "max_seq_len": 3000,
+        "opt_batch_size": null,
+        "max_batch_size": 64,
+        "max_beam_width": 1,
+        "max_num_tokens": 8192,
+        "opt_num_tokens": 64,
+        "max_prompt_embedding_table_size": 0,
+        "kv_cache_type": "PAGED",
+        "gather_context_logits": false,
+        "gather_generation_logits": false,
+        "strongly_typed": true,
+        "force_num_profiles": null,
+        "profiling_verbosity": "layer_names_only",
+        "enable_debug_output": false,
+        "max_draft_len": 0,
+        "speculative_decoding_mode": 1,
+        "use_refit": false,
+        "input_timing_cache": null,
+        "output_timing_cache": "model.cache",
+        "lora_config": {
+            "lora_dir": [],
+            "lora_ckpt_source": "hf",
+            "max_lora_rank": 64,
+            "lora_target_modules": [],
+            "trtllm_modules_to_hf_modules": {}
+        },
+        "auto_parallel_config": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cluster_key": "H100-PCIe",
+            "cluster_info": null,
+            "sharding_cost_model": "alpha_beta",
+            "comm_cost_model": "alpha_beta",
+            "enable_pipeline_parallelism": false,
+            "enable_shard_unbalanced_shape": false,
+            "enable_shard_dynamic_shape": false,
+            "enable_reduce_scatter": true,
+            "builder_flags": null,
+            "debug_mode": false,
+            "infer_shape": true,
+            "validation_mode": false,
+            "same_buffer_io": {
+                "past_key_value_(\\d+)": "present_key_value_\\1"
+            },
+            "same_spec_io": {},
+            "sharded_io_allowlist": [
+                "past_key_value_\\d+",
+                "present_key_value_\\d*"
+            ],
+            "fill_weights": false,
+            "parallel_config_cache": null,
+            "profile_cache": null,
+            "dump_path": null,
+            "debug_outputs": []
+        },
+        "weight_sparsity": false,
+        "weight_streaming": false,
+        "plugin_config": {
+            "dtype": "float16",
+            "bert_attention_plugin": "float16",
+            "gpt_attention_plugin": "auto",
+            "gemm_plugin": null,
+            "gemm_swiglu_plugin": null,
+            "fp8_rowwise_gemm_plugin": null,
+            "smooth_quant_gemm_plugin": null,
+            "identity_plugin": null,
+            "layernorm_quantization_plugin": null,
+            "rmsnorm_quantization_plugin": null,
+            "nccl_plugin": null,
+            "lookup_plugin": null,
+            "lora_plugin": null,
+            "weight_only_groupwise_quant_matmul_plugin": null,
+            "weight_only_quant_matmul_plugin": null,
+            "smooth_quant_plugins": true,
+            "quantize_per_token_plugin": false,
+            "quantize_tensor_plugin": false,
+            "moe_plugin": null,
+            "mamba_conv1d_plugin": "auto",
+            "low_latency_gemm_plugin": null,
+            "context_fmha": true,
+            "bert_context_fmha_fp32_acc": false,
+            "paged_kv_cache": true,
+            "remove_input_padding": true,
+            "reduce_fusion": false,
+            "enable_xqa": false,
+            "tokens_per_block": 64,
+            "use_paged_context_fmha": false,
+            "use_fp8_context_fmha": false,
+            "multiple_profiles": false,
+            "paged_state": false,
+            "streamingllm": false,
+            "manage_weights": false,
+            "use_fused_mlp": true,
+            "pp_reduce_scatter": false
+        },
+        "use_strip_plan": false,
+        "max_encoder_input_len": 1024,
+        "use_fused_mlp": "enable",
+        "monitor_memory": false
+    }
+}
whisper/1/whisper_large-v3-turbo/encoder/rank0.engine ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d807775c5cbd033e1ee8a06845a9f89877f96039a3d570776dfcfd71eee8e59
+size 1287816708
whisper/config.pbtxt ADDED
@@ -0,0 +1,71 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "whisper"
+backend: "python"
+max_batch_size: 64
+
+dynamic_batching {
+  max_queue_delay_microseconds: 100
+}
+
+parameters [
+  {
+    key: "engine_dir"
+    value: { string_value: "/data/whisper/model_repo_whisper/whisper/1/whisper_large-v3-turbo" }
+  },
+  {
+    key: "n_mels"
+    value: { string_value: "128" }  # 128 dims for large-v3, 80 dims for large-v2
+  },
+  {
+    key: "zero_pad"
+    value: { string_value: "false" }
+  }
+]
+
+input [
+  {
+    name: "WAV"
+    data_type: TYPE_FP32
+    dims: [-1]
+    optional: true
+  },
+  {
+    name: "WAV_LENS"
+    data_type: TYPE_INT32
+    dims: [1]
+    optional: true
+  },
+  {
+    name: "DECODER_INPUT_IDS"
+    data_type: TYPE_INT32
+    dims: [-1]
+    optional: true
+  }
+]
+output [
+  {
+    name: "OUTPUT_IDS"
+    data_type: TYPE_INT32
+    dims: [-1]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind: KIND_GPU
+  }
+]