Upload folder using huggingface_hub
- .gitattributes +2 -0
- infer_bls/1/__pycache__/model.cpython-310.pyc +0 -0
- infer_bls/1/__pycache__/tokenizer.cpython-310.pyc +0 -0
- infer_bls/1/model.py +126 -0
- infer_bls/1/multilingual.tiktoken +0 -0
- infer_bls/1/tokenizer.py +184 -0
- infer_bls/config.pbtxt +61 -0
- whisper/1/__pycache__/fbank.cpython-310.pyc +0 -0
- whisper/1/__pycache__/model.cpython-310.pyc +0 -0
- whisper/1/fbank.py +98 -0
- whisper/1/mel_filters.npz +3 -0
- whisper/1/model.py +101 -0
- whisper/1/whisper_large-v3-turbo/decoder/config.json +170 -0
- whisper/1/whisper_large-v3-turbo/decoder/rank0.engine +3 -0
- whisper/1/whisper_large-v3-turbo/encoder/config.json +147 -0
- whisper/1/whisper_large-v3-turbo/encoder/rank0.engine +3 -0
- whisper/config.pbtxt +71 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+whisper/1/whisper_large-v3-turbo/decoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
+whisper/1/whisper_large-v3-turbo/encoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
infer_bls/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (4.92 kB).

infer_bls/1/__pycache__/tokenizer.cpython-310.pyc
ADDED
Binary file (4.59 kB).

infer_bls/1/model.py
ADDED
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
import triton_python_backend_utils as pb_utils
import numpy as np
import json
import torch
from torch.utils.dlpack import to_dlpack
import re
from .tokenizer import get_tokenizer
from collections import OrderedDict
from pathlib import Path


def read_config(component, engine_dir):
    config_path = engine_dir / component / 'config.json'
    with open(config_path, 'r') as f:
        config = json.load(f)
    model_config = OrderedDict()
    model_config.update(config['pretrained_config'])
    model_config.update(config['build_config'])
    return model_config


class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        self.model_config = model_config = json.loads(args['model_config'])

        # Get OUTPUT0 configuration
        output0_config = pb_utils.get_output_config_by_name(
            model_config, "TRANSCRIPTS")
        # Convert Triton types to numpy types
        self.out0_dtype = pb_utils.triton_string_to_numpy(
            output0_config['data_type'])
        encoder_config = read_config('encoder', Path(self.model_config['parameters']['engine_dir']["string_value"]))
        self.tokenizer = get_tokenizer(num_languages=encoder_config['num_languages'])
        self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
        self.device = torch.device("cuda")

    def process_batch(self, wav, wav_len, prompt_id):
        wav = torch.from_numpy(wav[0]).to(self.device)
        wav_tensor = pb_utils.Tensor.from_dlpack("WAV", to_dlpack(wav.unsqueeze(0)))
        wav_len_tensor = pb_utils.Tensor("WAV_LENS", np.array([[wav_len]], np.int32))
        prompt_id = torch.tensor(prompt_id).unsqueeze(0)

        prompt_id = pb_utils.Tensor("DECODER_INPUT_IDS", prompt_id.numpy().astype(np.int32))
        infer_request = pb_utils.InferenceRequest(
            model_name="whisper",
            requested_output_names=["OUTPUT_IDS"],
            inputs=[wav_tensor, wav_len_tensor, prompt_id]
        )
        inference_response = infer_request.exec()
        if inference_response.has_error():
            raise pb_utils.TritonModelException(inference_response.error().message())
        else:
            output_ids = pb_utils.get_output_tensor_by_name(inference_response, "OUTPUT_IDS")
            return output_ids.as_numpy()

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        # Every Python backend must iterate through list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests. You can
        # make a copy of the underlying NumPy array and store it if it is
        # required.
        responses = []
        for request in requests:
            # Perform inference on the request and append it to responses list...
            in_0 = pb_utils.get_input_tensor_by_name(request, "TEXT_PREFIX")
            prompt_ids = in_0.as_numpy().tolist()
            prompt_ids = prompt_ids[0][0].decode('utf-8')
            if prompt_ids == "":
                prompt_ids = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
            prompt_id = self.tokenizer.encode(prompt_ids, allowed_special=self.tokenizer.special_tokens_set)

            wav = pb_utils.get_input_tensor_by_name(request, "WAV").as_numpy()
            assert wav.shape[0] == 1, "Only support batch size 1 for now"
            wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy()
            wav_len = wav_len.item()

            output_ids = self.process_batch(wav, wav_len, prompt_id)
            s = self.tokenizer.decode(output_ids)
            s = re.sub(r'<\|.*?\|>', '', s)
            sentence = np.array([s])
            out0 = pb_utils.Tensor("TRANSCRIPTS", sentence.astype(self.out0_dtype))
            inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
            responses.append(inference_response)
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
infer_bls/1/multilingual.tiktoken
ADDED
The diff for this file is too large to render.
infer_bls/1/tokenizer.py
ADDED
@@ -0,0 +1,184 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
import base64
import os

import tiktoken

LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
}


def get_tokenizer(name: str = "multilingual",
                  num_languages: int = 99,
                  tokenizer_dir: str = None):
    if tokenizer_dir is None:
        vocab_path = os.path.join(os.path.dirname(__file__),
                                  f"./{name}.tiktoken")
    else:
        vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken")
    ranks = {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in open(vocab_path) if line)
    }
    n_vocab = len(ranks)
    special_tokens = {}

    specials = [
        "<|endoftext|>",
        "<|startoftranscript|>",
        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
    ]

    for token in specials:
        special_tokens[token] = n_vocab
        n_vocab += 1

    return tiktoken.Encoding(
        name=os.path.basename(vocab_path),
        explicit_n_vocab=n_vocab,
        pat_str=
        r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
        mergeable_ranks=ranks,
        special_tokens=special_tokens,
    )


if __name__ == "__main__":
    enc = get_tokenizer()
    mytest_str = "<|startofprev|> Nvidia<|startoftranscript|><|en|><|transcribe|>"
    encoding = enc.encode(mytest_str, allowed_special=enc.special_tokens_set)
    mystr = enc.decode([50361, 45, 43021, 50258, 50259, 50359])
    mystr2 = enc.decode([50361, 46284, 50258, 50259, 50359])
    # print(encoding, mystr, mystr2)
    print(
        enc.encode("<|startoftranscript|>",
                   allowed_special=enc.special_tokens_set)[0])
    print(
        enc.encode("<|endoftext|>",
                   allowed_special=enc.special_tokens_set)[0])
    my_zh_str = "好好学习"
    encoding = enc.encode(my_zh_str, allowed_special=enc.special_tokens_set)
    decoding = enc.decode(encoding)
    print(type(decoding))
    # print(encoding, decoding)
infer_bls/config.pbtxt
ADDED
@@ -0,0 +1,61 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: "infer_bls"
backend: "python"
max_batch_size: 64

parameters [
  {
    key: "engine_dir"
    value: { string_value: "/data/whisper/model_repo_whisper/whisper/1/whisper_large-v3-turbo"}
  }
]

input [
  {
    name: "TEXT_PREFIX"
    data_type: TYPE_STRING
    dims: [1]
  },
  {
    name: "WAV"
    data_type: TYPE_FP32
    dims: [-1]
  },
  {
    name: "WAV_LENS"
    data_type: TYPE_INT32
    dims: [1]
    optional: True
  }
]

output [
  {
    name: "TRANSCRIPTS"
    data_type: TYPE_STRING
    dims: [1]
  }
]

dynamic_batching {
  max_queue_delay_microseconds: 100
}
instance_group [
  {
    count: 8
    kind: KIND_CPU
  }
]
whisper/1/__pycache__/fbank.cpython-310.pyc
ADDED
Binary file (3.22 kB).

whisper/1/__pycache__/model.cpython-310.pyc
ADDED
Binary file (3.16 kB).

whisper/1/fbank.py
ADDED
@@ -0,0 +1,98 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
import numpy as np
import torch
import torch.nn.functional as F
from typing import Union
import os


def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
    """
    Load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
        )
    """
    assert n_mels == 80 or n_mels == 128, f"Unsupported n_mels: {n_mels}"
    with np.load(
            os.path.join(os.path.dirname(__file__), "mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)


def log_mel_spectrogram(
    audio: Union[torch.Tensor],
    filters: torch.Tensor,
    n_mels: int = 128,
    n_fft: int = 400,
    hop_length: int = 160,
):
    """
    Compute the log-Mel spectrogram of the input audio.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 or 128 is supported

    filters: torch.Tensor

    Returns
    -------
    torch.Tensor, shape = (128, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    window = torch.hann_window(n_fft).to(audio.device)
    stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
    magnitudes = stft[..., :-1].abs() ** 2

    mel_spec = filters @ magnitudes
    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    # cast to float16
    log_spec = log_spec.half()
    return log_spec


class FeatureExtractor(torch.nn.Module):
    """Computes log-Mel spectrogram features on the GPU for 16 kHz waveforms."""

    def __init__(self, n_mels: int = 128):
        self.device = torch.device("cuda")
        self.n_mels = n_mels
        self.filters = mel_filters(self.device, n_mels=self.n_mels)

    def compute_feature(self, wav, padding_target_len: int = 3000):
        """
        Compute the log-Mel spectrogram of the input audio waveform.
        mel: [1, feature_dim, seq_len]
        """
        mel = log_mel_spectrogram(wav, self.filters)
        assert padding_target_len <= 3000, f"padding must be less than 3000, got {padding_target_len}"
        if mel.shape[1] < padding_target_len:
            mel = F.pad(mel, (0, padding_target_len - mel.shape[1]), mode='constant')
        if mel.shape[1] % 2:
            # pad to even length for remove_padding case, since conv1d requires even length
            mel = torch.nn.functional.pad(mel, (0, 1))
        mel = mel.unsqueeze(0)
        return mel
whisper/1/mel_filters.npz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
size 4271
whisper/1/model.py
ADDED
@@ -0,0 +1,101 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import json
from pathlib import Path

import torch
from torch.utils.dlpack import from_dlpack

import triton_python_backend_utils as pb_utils
from tensorrt_llm.runtime import ModelRunnerCpp
from tensorrt_llm.bindings import GptJsonConfig
from .fbank import FeatureExtractor


class TritonPythonModel:
    def initialize(self, args):
        parameters = json.loads(args['model_config'])['parameters']
        for key, value in parameters.items():
            parameters[key] = value["string_value"]
        engine_dir = parameters["engine_dir"]
        json_config = GptJsonConfig.parse_file(Path(engine_dir) / 'decoder' / 'config.json')
        assert json_config.model_config.supports_inflight_batching
        runner_kwargs = dict(engine_dir=engine_dir,
                             is_enc_dec=True,
                             max_batch_size=64,
                             max_input_len=3000,
                             max_output_len=96,
                             max_beam_width=1,
                             debug_mode=False,
                             kv_cache_free_gpu_memory_fraction=0.5)
        self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs)
        self.feature_extractor = FeatureExtractor(n_mels=int(parameters["n_mels"]))
        self.zero_pad = True if parameters["zero_pad"] == "true" else False
        self.eot_id = 50257

    def execute(self, requests):
        """
        This function receives a list of requests (`pb_utils.InferenceRequest`),
        performs inference on every request and appends it to responses.
        """
        responses, batch_mel_list, decoder_input_ids = [], [], []
        for request in requests:
            wav_tensor = pb_utils.get_input_tensor_by_name(request, "WAV")
            wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy().item()
            prompt_ids = pb_utils.get_input_tensor_by_name(request, "DECODER_INPUT_IDS").as_numpy()
            wav = from_dlpack(wav_tensor.to_dlpack())
            wav = wav[:, :wav_len]
            padding = 0 if self.zero_pad else 3000
            mel = self.feature_extractor.compute_feature(wav[0].to('cuda'), padding_target_len=padding).transpose(1, 2)
            batch_mel_list.append(mel.squeeze(0))
            decoder_input_ids.append(torch.tensor(prompt_ids, dtype=torch.int32, device='cuda').squeeze(0))

        decoder_input_ids = torch.nn.utils.rnn.pad_sequence(decoder_input_ids, batch_first=True, padding_value=self.eot_id)
        mel_input_lengths = torch.tensor([mel.shape[0] for mel in batch_mel_list], dtype=torch.int32, device='cuda')

        outputs = self.model_runner_cpp.generate(
            batch_input_ids=decoder_input_ids,
            encoder_input_features=batch_mel_list,
            encoder_output_lengths=mel_input_lengths // 2,
            max_new_tokens=96,
            end_id=self.eot_id,
            pad_id=self.eot_id,
            num_beams=1,
            output_sequence_lengths=True,
            return_dict=True)
        torch.cuda.synchronize()

        output_ids = outputs['output_ids'].cpu().numpy()

        for i, output_id in enumerate(output_ids):
            response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor("OUTPUT_IDS", output_id[0])
            ])
            responses.append(response)
        assert len(responses) == len(requests)
        return responses
whisper/1/whisper_large-v3-turbo/decoder/config.json
ADDED
@@ -0,0 +1,170 @@
{
    "version": "0.15.0.dev2024101500",
    "pretrained_config": {
        "architecture": "DecoderModel",
        "dtype": "float16",
        "vocab_size": 51866,
        "hidden_size": 1280,
        "num_hidden_layers": 4,
        "num_attention_heads": 20,
        "hidden_act": "gelu",
        "logits_dtype": "float16",
        "norm_epsilon": 1e-05,
        "position_embedding_type": "learned_absolute",
        "max_position_embeddings": 448,
        "num_key_value_heads": 20,
        "intermediate_size": 5120,
        "mapping": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cp_size": 1,
            "tp_size": 1,
            "pp_size": 1,
            "moe_tp_size": 1,
            "moe_ep_size": 1
        },
        "quantization": {
            "quant_algo": null,
            "kv_cache_quant_algo": null,
            "group_size": 128,
            "smoothquant_val": 0.5,
            "clamp_val": null,
            "has_zero_point": false,
            "pre_quant_scale": false,
            "exclude_modules": null
        },
        "use_parallel_embedding": false,
        "embedding_sharding_dim": 0,
        "share_embedding_table": false,
        "head_size": 64,
        "qk_layernorm": false,
        "use_prompt_tuning": false,
        "has_position_embedding": true,
        "layernorm_type": 0,
        "has_attention_qkvo_bias": true,
        "has_mlp_bias": true,
        "has_model_final_layernorm": true,
        "has_embedding_layernorm": false,
        "has_embedding_scale": false,
        "ffn_hidden_size": 5120,
        "q_scaling": 1.0,
        "layernorm_position": 0,
        "relative_attention": false,
        "max_distance": 0,
        "num_buckets": 0,
        "model_type": "whisper",
        "rescale_before_lm_head": false,
        "encoder_hidden_size": 1280,
        "encoder_num_heads": 20,
        "encoder_head_size": null,
        "skip_cross_kv": false,
        "type_vocab_size": null,
        "encoder_num_kv_heads": null,
        "skip_cross_qkv": false,
        "mlp_type": 0,
        "residual_scaling": 1.0,
        "has_lm_head_bias": false
    },
    "build_config": {
        "max_input_len": 14,
        "max_seq_len": 114,
        "opt_batch_size": null,
        "max_batch_size": 64,
        "max_beam_width": 4,
        "max_num_tokens": 7296,
        "opt_num_tokens": 256,
        "max_prompt_embedding_table_size": 0,
        "kv_cache_type": "PAGED",
        "gather_context_logits": false,
        "gather_generation_logits": false,
        "strongly_typed": true,
        "force_num_profiles": null,
        "profiling_verbosity": "layer_names_only",
        "enable_debug_output": false,
        "max_draft_len": 0,
        "speculative_decoding_mode": 1,
        "use_refit": false,
        "input_timing_cache": null,
        "output_timing_cache": "model.cache",
        "lora_config": {
            "lora_dir": [],
            "lora_ckpt_source": "hf",
            "max_lora_rank": 64,
            "lora_target_modules": [],
            "trtllm_modules_to_hf_modules": {}
        },
        "auto_parallel_config": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cluster_key": "H100-PCIe",
            "cluster_info": null,
            "sharding_cost_model": "alpha_beta",
            "comm_cost_model": "alpha_beta",
            "enable_pipeline_parallelism": false,
            "enable_shard_unbalanced_shape": false,
            "enable_shard_dynamic_shape": false,
            "enable_reduce_scatter": true,
            "builder_flags": null,
            "debug_mode": false,
            "infer_shape": true,
            "validation_mode": false,
            "same_buffer_io": {
                "past_key_value_(\\d+)": "present_key_value_\\1"
            },
            "same_spec_io": {},
            "sharded_io_allowlist": [
                "past_key_value_\\d+",
                "present_key_value_\\d*"
            ],
            "fill_weights": false,
            "parallel_config_cache": null,
            "profile_cache": null,
            "dump_path": null,
            "debug_outputs": []
        },
        "weight_sparsity": false,
        "weight_streaming": false,
        "plugin_config": {
            "dtype": "float16",
            "bert_attention_plugin": "float16",
            "gpt_attention_plugin": "float16",
            "gemm_plugin": "float16",
            "gemm_swiglu_plugin": null,
            "fp8_rowwise_gemm_plugin": null,
            "smooth_quant_gemm_plugin": null,
            "identity_plugin": null,
            "layernorm_quantization_plugin": null,
            "rmsnorm_quantization_plugin": null,
            "nccl_plugin": null,
            "lookup_plugin": null,
            "lora_plugin": null,
            "weight_only_groupwise_quant_matmul_plugin": null,
            "weight_only_quant_matmul_plugin": null,
            "smooth_quant_plugins": true,
            "quantize_per_token_plugin": false,
            "quantize_tensor_plugin": false,
            "moe_plugin": null,
            "mamba_conv1d_plugin": "auto",
            "low_latency_gemm_plugin": null,
            "context_fmha": true,
            "bert_context_fmha_fp32_acc": false,
            "paged_kv_cache": true,
            "remove_input_padding": true,
            "reduce_fusion": false,
            "enable_xqa": false,
            "tokens_per_block": 64,
            "use_paged_context_fmha": false,
            "use_fp8_context_fmha": false,
            "multiple_profiles": false,
            "paged_state": false,
            "streamingllm": false,
            "manage_weights": false,
            "use_fused_mlp": true,
            "pp_reduce_scatter": false
        },
        "use_strip_plan": false,
        "max_encoder_input_len": 3000,
        "use_fused_mlp": "enable",
        "monitor_memory": false
    }
}
whisper/1/whisper_large-v3-turbo/decoder/rank0.engine
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3371d5242ccff35901b2dad4ec911ca6bff53d33e021b4af3af674f1cadcf49
size 477983452
whisper/1/whisper_large-v3-turbo/encoder/config.json
ADDED
@@ -0,0 +1,147 @@
{
    "version": "0.15.0.dev2024101500",
    "pretrained_config": {
        "architecture": "WhisperEncoder",
        "dtype": "float16",
        "vocab_size": 51866,
        "hidden_size": 1280,
        "num_hidden_layers": 32,
        "num_attention_heads": 20,
        "hidden_act": "gelu",
        "logits_dtype": "float32",
        "norm_epsilon": 1e-05,
        "position_embedding_type": "learned_absolute",
        "max_position_embeddings": 1500,
        "num_key_value_heads": 20,
        "intermediate_size": 5120,
        "mapping": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cp_size": 1,
            "tp_size": 1,
            "pp_size": 1,
            "moe_tp_size": 1,
            "moe_ep_size": 1
        },
        "quantization": {
            "quant_algo": null,
            "kv_cache_quant_algo": null,
            "group_size": 128,
            "smoothquant_val": 0.5,
            "clamp_val": null,
            "has_zero_point": false,
            "pre_quant_scale": false,
            "exclude_modules": null
        },
        "use_parallel_embedding": false,
        "embedding_sharding_dim": 0,
        "share_embedding_table": false,
        "head_size": 64,
        "qk_layernorm": false,
        "has_position_embedding": true,
        "n_mels": 128,
        "num_languages": 100
    },
    "build_config": {
        "max_input_len": 3000,
        "max_seq_len": 3000,
        "opt_batch_size": null,
        "max_batch_size": 64,
        "max_beam_width": 1,
        "max_num_tokens": 8192,
        "opt_num_tokens": 64,
        "max_prompt_embedding_table_size": 0,
        "kv_cache_type": "PAGED",
        "gather_context_logits": false,
        "gather_generation_logits": false,
        "strongly_typed": true,
        "force_num_profiles": null,
        "profiling_verbosity": "layer_names_only",
        "enable_debug_output": false,
        "max_draft_len": 0,
        "speculative_decoding_mode": 1,
        "use_refit": false,
        "input_timing_cache": null,
        "output_timing_cache": "model.cache",
        "lora_config": {
            "lora_dir": [],
            "lora_ckpt_source": "hf",
            "max_lora_rank": 64,
            "lora_target_modules": [],
            "trtllm_modules_to_hf_modules": {}
        },
        "auto_parallel_config": {
            "world_size": 1,
            "gpus_per_node": 8,
            "cluster_key": "H100-PCIe",
            "cluster_info": null,
            "sharding_cost_model": "alpha_beta",
            "comm_cost_model": "alpha_beta",
            "enable_pipeline_parallelism": false,
            "enable_shard_unbalanced_shape": false,
            "enable_shard_dynamic_shape": false,
            "enable_reduce_scatter": true,
            "builder_flags": null,
            "debug_mode": false,
            "infer_shape": true,
            "validation_mode": false,
            "same_buffer_io": {
                "past_key_value_(\\d+)": "present_key_value_\\1"
            },
            "same_spec_io": {},
            "sharded_io_allowlist": [
                "past_key_value_\\d+",
                "present_key_value_\\d*"
            ],
            "fill_weights": false,
            "parallel_config_cache": null,
            "profile_cache": null,
            "dump_path": null,
            "debug_outputs": []
        },
        "weight_sparsity": false,
        "weight_streaming": false,
        "plugin_config": {
            "dtype": "float16",
            "bert_attention_plugin": "float16",
            "gpt_attention_plugin": "auto",
            "gemm_plugin": null,
            "gemm_swiglu_plugin": null,
            "fp8_rowwise_gemm_plugin": null,
            "smooth_quant_gemm_plugin": null,
            "identity_plugin": null,
            "layernorm_quantization_plugin": null,
            "rmsnorm_quantization_plugin": null,
            "nccl_plugin": null,
            "lookup_plugin": null,
            "lora_plugin": null,
            "weight_only_groupwise_quant_matmul_plugin": null,
            "weight_only_quant_matmul_plugin": null,
            "smooth_quant_plugins": true,
            "quantize_per_token_plugin": false,
            "quantize_tensor_plugin": false,
            "moe_plugin": null,
            "mamba_conv1d_plugin": "auto",
            "low_latency_gemm_plugin": null,
            "context_fmha": true,
            "bert_context_fmha_fp32_acc": false,
            "paged_kv_cache": true,
            "remove_input_padding": true,
            "reduce_fusion": false,
            "enable_xqa": false,
            "tokens_per_block": 64,
            "use_paged_context_fmha": false,
            "use_fp8_context_fmha": false,
            "multiple_profiles": false,
            "paged_state": false,
            "streamingllm": false,
            "manage_weights": false,
            "use_fused_mlp": true,
            "pp_reduce_scatter": false
        },
        "use_strip_plan": false,
        "max_encoder_input_len": 1024,
        "use_fused_mlp": "enable",
        "monitor_memory": false
    }
}
whisper/1/whisper_large-v3-turbo/encoder/rank0.engine
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d807775c5cbd033e1ee8a06845a9f89877f96039a3d570776dfcfd71eee8e59
size 1287816708
whisper/config.pbtxt
ADDED
@@ -0,0 +1,71 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: "whisper"
backend: "python"
max_batch_size: 64

dynamic_batching {
  max_queue_delay_microseconds: 100
}

parameters [
  {
    key: "engine_dir"
    value: { string_value: "/data/whisper/model_repo_whisper/whisper/1/whisper_large-v3-turbo"}
  },
  {
    key: "n_mels",
    value: {string_value:"128"} # 128 dim for large-v3, 80 dim for large-v2
  },
  {
    key: "zero_pad"
    value: {string_value: "false"}
  }
]

input [
  {
    name: "WAV"
    data_type: TYPE_FP32
    dims: [-1]
    optional: True
  },
  {
    name: "WAV_LENS"
    data_type: TYPE_INT32
    dims: [1]
    optional: True
  },
  {
    name: "DECODER_INPUT_IDS"
    data_type: TYPE_INT32
    dims: [-1]
    optional: True
  }
]
output [
  {
    name: "OUTPUT_IDS"
    data_type: TYPE_INT32
    dims: [-1]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_GPU
  }
]