otherhalf-dev committed · Commit f9a3600 · verified · Parent: fb63d75

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+whisper/1/whisper_large-v3-turbo/decoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
+whisper/1/whisper_large-v3-turbo/encoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
infer_bls/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (4.92 kB)
 
infer_bls/1/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (4.59 kB)
 
infer_bls/1/model.py ADDED
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+import triton_python_backend_utils as pb_utils
+import numpy as np
+import json
+import torch
+from torch.utils.dlpack import to_dlpack
+import re
+from .tokenizer import get_tokenizer
+from collections import OrderedDict
+from pathlib import Path
+
+def read_config(component, engine_dir):
+    config_path = engine_dir / component / 'config.json'
+    with open(config_path, 'r') as f:
+        config = json.load(f)
+    model_config = OrderedDict()
+    model_config.update(config['pretrained_config'])
+    model_config.update(config['build_config'])
+    return model_config
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        self.model_config = model_config = json.loads(args['model_config'])
+
+        # Get OUTPUT0 configuration
+        output0_config = pb_utils.get_output_config_by_name(
+            model_config, "TRANSCRIPTS")
+        # Convert Triton types to numpy types
+        self.out0_dtype = pb_utils.triton_string_to_numpy(
+            output0_config['data_type'])
+        encoder_config = read_config('encoder', Path(self.model_config['parameters']['engine_dir']["string_value"]))
+        self.tokenizer = get_tokenizer(num_languages=encoder_config['num_languages'])
+        self.blank = self.tokenizer.encode(" ", allowed_special=self.tokenizer.special_tokens_set)[0]
+        self.device = torch.device("cuda")
+
+    def process_batch(self, wav, wav_len, prompt_id):
+        wav = torch.from_numpy(wav[0]).to(self.device)
+        wav_tensor = pb_utils.Tensor.from_dlpack("WAV", to_dlpack(wav.unsqueeze(0)))
+        wav_len_tensor = pb_utils.Tensor("WAV_LENS", np.array([[wav_len]], np.int32))
+        prompt_id = torch.tensor(prompt_id).unsqueeze(0)
+
+        prompt_id = pb_utils.Tensor("DECODER_INPUT_IDS", prompt_id.numpy().astype(np.int32))
+        infer_request = pb_utils.InferenceRequest(
+            model_name="whisper",
+            requested_output_names=["OUTPUT_IDS"],
+            inputs=[wav_tensor, wav_len_tensor, prompt_id]
+        )
+        inference_response = infer_request.exec()
+        if inference_response.has_error():
+            raise pb_utils.TritonModelException(inference_response.error().message())
+        else:
+            output_ids = pb_utils.get_output_tensor_by_name(inference_response, "OUTPUT_IDS")
+            return output_ids.as_numpy()
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference is requested
+        for this model.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+        # Every Python backend must iterate through the list of requests and
+        # create an instance of pb_utils.InferenceResponse for each of them.
+        # Avoid storing any of the input Tensors in class attributes, as they
+        # will be overridden in subsequent inference requests. Make a copy of
+        # the underlying NumPy array and store it if needed.
+        responses = []
+        for request in requests:
+            # Perform inference on the request and append it to the responses list
+            in_0 = pb_utils.get_input_tensor_by_name(request, "TEXT_PREFIX")
+            text_prefix = in_0.as_numpy().tolist()[0][0].decode('utf-8')
+            if text_prefix == "":
+                text_prefix = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
+            prompt_id = self.tokenizer.encode(text_prefix, allowed_special=self.tokenizer.special_tokens_set)
+
+            wav = pb_utils.get_input_tensor_by_name(request, "WAV").as_numpy()
+            assert wav.shape[0] == 1, "Only batch size 1 is supported for now"
+            wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy()
+            wav_len = wav_len.item()
+
+            output_ids = self.process_batch(wav, wav_len, prompt_id)
+            s = self.tokenizer.decode(output_ids)
+            # Strip special tokens such as <|en|> or timestamp markers from the text
+            s = re.sub(r'<\|.*?\|>', '', s)
+            sentence = np.array([s])
+            out0 = pb_utils.Tensor("TRANSCRIPTS", sentence.astype(self.out0_dtype))
+            inference_response = pb_utils.InferenceResponse(output_tensors=[out0])
+            responses.append(inference_response)
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+        Implementing `finalize` function is optional. This function allows
+        the model to perform any necessary clean ups before exit.
+        """
+        print('Cleaning up...')
infer_bls/1/multilingual.tiktoken ADDED
The diff for this file is too large to render.
 
infer_bls/1/tokenizer.py ADDED
@@ -0,0 +1,184 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from https://github.com/openai/whisper/blob/main/whisper/tokenizer.py
+import base64
+import os
+
+import tiktoken
+
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+    "yue": "cantonese",
+}
+
+
+def get_tokenizer(name: str = "multilingual",
+                  num_languages: int = 99,
+                  tokenizer_dir: str = None):
+    if tokenizer_dir is None:
+        vocab_path = os.path.join(os.path.dirname(__file__),
+                                  f"{name}.tiktoken")
+    else:
+        vocab_path = os.path.join(tokenizer_dir, f"{name}.tiktoken")
+    ranks = {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in open(vocab_path) if line)
+    }
+    n_vocab = len(ranks)
+    special_tokens = {}
+
+    specials = [
+        "<|endoftext|>",
+        "<|startoftranscript|>",
+        *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nospeech|>",
+        "<|notimestamps|>",
+        *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
+    ]
+
+    for token in specials:
+        special_tokens[token] = n_vocab
+        n_vocab += 1
+
+    return tiktoken.Encoding(
+        name=os.path.basename(vocab_path),
+        explicit_n_vocab=n_vocab,
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        mergeable_ranks=ranks,
+        special_tokens=special_tokens,
+    )
+
+
+if __name__ == "__main__":
+    enc = get_tokenizer()
+    mytest_str = "<|startofprev|> Nvidia<|startoftranscript|><|en|><|transcribe|>"
+    encoding = enc.encode(mytest_str, allowed_special=enc.special_tokens_set)
+    mystr = enc.decode([50361, 45, 43021, 50258, 50259, 50359])
+    mystr2 = enc.decode([50361, 46284, 50258, 50259, 50359])
+    #print(encoding, mystr, mystr2)
+    print(
+        enc.encode("<|startoftranscript|>",
+                   allowed_special=enc.special_tokens_set)[0])
+    print(
+        enc.encode("<|endoftext|>",
+                   allowed_special=enc.special_tokens_set)[0])
+    my_zh_str = "好好学习"
+    encoding = enc.encode(my_zh_str, allowed_special=enc.special_tokens_set)
+    decoding = enc.decode(encoding)
+    print(type(decoding))
+    #print(encoding, decoding)
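
Note: the BLS model above calls get_tokenizer with num_languages taken from the encoder config (100 for large-v3-turbo), which shifts the special-token IDs by one relative to the 99-language default exercised in the __main__ block. A minimal sketch of building a non-default decoder prompt with this tokenizer (the module path and the "zh" language choice are illustrative, not part of the commit):

# Sketch: build a Mandarin transcription prompt; assumes tokenizer.py and
# multilingual.tiktoken are on the import path.
from tokenizer import get_tokenizer

enc = get_tokenizer(num_languages=100)  # large-v3(-turbo) has 100 language tokens
prompt = "<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>"
prompt_ids = enc.encode(prompt, allowed_special=enc.special_tokens_set)
print(prompt_ids)  # expected [50258, 50260, 50360, 50364] given the ordering above
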
infer_bls/config.pbtxt ADDED
@@ -0,0 +1,61 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "infer_bls"
+backend: "python"
+max_batch_size: 64
+
+parameters [
+  {
+    key: "engine_dir"
+    value: { string_value: "/data/whisper/model_repo_whisper/whisper/1/whisper_large-v3-turbo" }
+  }
+]
+
+input [
+  {
+    name: "TEXT_PREFIX"
+    data_type: TYPE_STRING
+    dims: [1]
+  },
+  {
+    name: "WAV"
+    data_type: TYPE_FP32
+    dims: [-1]
+  },
+  {
+    name: "WAV_LENS"
+    data_type: TYPE_INT32
+    dims: [1]
+    optional: true
+  }
+]
+
+output [
+  {
+    name: "TRANSCRIPTS"
+    data_type: TYPE_STRING
+    dims: [1]
+  }
+]
+
+dynamic_batching {
+  max_queue_delay_microseconds: 100
+}
+instance_group [
+  {
+    count: 8
+    kind: KIND_CPU
+  }
+]
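
As a usage sketch, a tritonclient call along these lines would exercise this entry point (the server URL, the silent placeholder waveform, and the empty TEXT_PREFIX default are assumptions, not part of the commit):

# Hedged client sketch for the "infer_bls" model; assumes a Triton server on
# localhost:8000 and a 16 kHz mono float32 waveform in `samples`.
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

samples = np.zeros(16000, dtype=np.float32)               # placeholder: 1 s of silence
wav = samples[np.newaxis, :]                              # shape [1, num_samples]
wav_len = np.array([[samples.shape[0]]], dtype=np.int32)  # shape [1, 1]
text_prefix = np.array([[b""]], dtype=object)             # empty -> default English prompt

inputs = [
    httpclient.InferInput("TEXT_PREFIX", text_prefix.shape, "BYTES"),
    httpclient.InferInput("WAV", wav.shape, "FP32"),
    httpclient.InferInput("WAV_LENS", wav_len.shape, "INT32"),
]
inputs[0].set_data_from_numpy(text_prefix)
inputs[1].set_data_from_numpy(wav)
inputs[2].set_data_from_numpy(wav_len)

result = client.infer("infer_bls", inputs,
                      outputs=[httpclient.InferRequestedOutput("TRANSCRIPTS")])
print(result.as_numpy("TRANSCRIPTS").flatten()[0].decode("utf-8"))
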
whisper/1/__pycache__/fbank.cpython-310.pyc ADDED
Binary file (3.22 kB)
 
whisper/1/__pycache__/model.cpython-310.pyc ADDED
Binary file (3.16 kB)
 
whisper/1/fbank.py ADDED
@@ -0,0 +1,98 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Reference: https://github.com/openai/whisper/blob/main/whisper/audio.py
+import numpy as np
+import torch
+import torch.nn.functional as F
+import os
+
+
+def mel_filters(device, n_mels: int = 128) -> torch.Tensor:
+    """
+    Load the mel filterbank matrix for projecting the STFT into a Mel spectrogram.
+    Allows decoupling the librosa dependency; saved using:
+
+        np.savez_compressed(
+            "mel_filters.npz",
+            mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
+        )
+    """
+    assert n_mels in (80, 128), f"Unsupported n_mels: {n_mels}"
+    with np.load(
+        os.path.join(os.path.dirname(__file__), "mel_filters.npz")
+    ) as f:
+        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
+
+
+def log_mel_spectrogram(
+    audio: torch.Tensor,
+    filters: torch.Tensor,
+    n_mels: int = 128,
+    n_fft: int = 400,
+    hop_length: int = 160,
+):
+    """
+    Compute the log-Mel spectrogram of an audio waveform.
+
+    Parameters
+    ----------
+    audio: torch.Tensor, shape = (*)
+        A Tensor containing the audio waveform at 16 kHz
+
+    filters: torch.Tensor
+        The mel filterbank matrix returned by `mel_filters`
+
+    n_mels: int
+        The number of Mel-frequency filters; only 80 or 128 is supported
+
+    Returns
+    -------
+    torch.Tensor, shape = (n_mels, n_frames)
+        A Tensor that contains the log-Mel spectrogram
+    """
+    window = torch.hann_window(n_fft).to(audio.device)
+    stft = torch.stft(audio, n_fft, hop_length, window=window, return_complex=True)
+    magnitudes = stft[..., :-1].abs() ** 2
+
+    mel_spec = filters @ magnitudes
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+    log_spec = (log_spec + 4.0) / 4.0
+    # cast to float16
+    log_spec = log_spec.half()
+    return log_spec
+
+
+class FeatureExtractor(torch.nn.Module):
+    """Computes log-Mel spectrogram features on the GPU."""
+
+    def __init__(self, n_mels: int = 128):
+        super().__init__()
+        self.device = torch.device("cuda")
+        self.n_mels = n_mels
+        self.filters = mel_filters(self.device, n_mels=self.n_mels)
+
+    def compute_feature(self, wav, padding_target_len: int = 3000):
+        """
+        Compute the log-Mel spectrogram of the input audio waveform.
+        mel: [1, feature_dim, seq_len]
+        """
+        mel = log_mel_spectrogram(wav, self.filters)
+        assert padding_target_len <= 3000, \
+            f"padding must be at most 3000 frames, got {padding_target_len}"
+        if mel.shape[1] < padding_target_len:
+            mel = F.pad(mel, (0, padding_target_len - mel.shape[1]), mode='constant')
+        if mel.shape[1] % 2:
+            # pad to even length for the remove_padding case, since conv1d requires even length
+            mel = torch.nn.functional.pad(mel, (0, 1))
+        mel = mel.unsqueeze(0)
+        return mel
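
A minimal usage sketch for FeatureExtractor, assuming a CUDA device is available and fbank.py sits next to mel_filters.npz on the import path (the random waveform is illustrative):

# Sketch: compute padded log-Mel features for 5 s of fake 16 kHz audio.
import torch
from fbank import FeatureExtractor

extractor = FeatureExtractor(n_mels=128)
wav = torch.randn(16000 * 5, device="cuda")  # 5 s of fake 16 kHz audio
mel = extractor.compute_feature(wav)         # padded to 3000 frames by default
print(mel.shape)                             # torch.Size([1, 128, 3000])
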
whisper/1/mel_filters.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7450ae70723a5ef9d341e3cee628c7cb0177f36ce42c44b7ed2bf3325f0f6d4c
+size 4271
whisper/1/model.py ADDED
@@ -0,0 +1,101 @@
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import json
+from pathlib import Path
+
+import torch
+from torch.utils.dlpack import from_dlpack
+
+import triton_python_backend_utils as pb_utils
+from tensorrt_llm.runtime import ModelRunnerCpp
+from tensorrt_llm.bindings import GptJsonConfig
+from .fbank import FeatureExtractor
+
+
+class TritonPythonModel:
+    def initialize(self, args):
+        parameters = json.loads(args['model_config'])['parameters']
+        for key, value in parameters.items():
+            parameters[key] = value["string_value"]
+        engine_dir = parameters["engine_dir"]
+        json_config = GptJsonConfig.parse_file(Path(engine_dir) / 'decoder' / 'config.json')
+        assert json_config.model_config.supports_inflight_batching
+        runner_kwargs = dict(engine_dir=engine_dir,
+                             is_enc_dec=True,
+                             max_batch_size=64,
+                             max_input_len=3000,
+                             max_output_len=96,
+                             max_beam_width=1,
+                             debug_mode=False,
+                             kv_cache_free_gpu_memory_fraction=0.5)
+        self.model_runner_cpp = ModelRunnerCpp.from_dir(**runner_kwargs)
+        self.feature_extractor = FeatureExtractor(n_mels=int(parameters["n_mels"]))
+        self.zero_pad = parameters["zero_pad"] == "true"
+        self.eot_id = 50257
+
+    def execute(self, requests):
+        """
+        This function receives a list of requests (`pb_utils.InferenceRequest`),
+        performs inference on every request, and appends the result to responses.
+        """
+        responses, batch_mel_list, decoder_input_ids = [], [], []
+        for request in requests:
+            wav_tensor = pb_utils.get_input_tensor_by_name(request, "WAV")
+            wav_len = pb_utils.get_input_tensor_by_name(request, "WAV_LENS").as_numpy().item()
+            prompt_ids = pb_utils.get_input_tensor_by_name(request, "DECODER_INPUT_IDS").as_numpy()
+            wav = from_dlpack(wav_tensor.to_dlpack())
+            wav = wav[:, :wav_len]
+            padding = 0 if self.zero_pad else 3000
+            mel = self.feature_extractor.compute_feature(wav[0].to('cuda'), padding_target_len=padding).transpose(1, 2)
+            batch_mel_list.append(mel.squeeze(0))
+            decoder_input_ids.append(torch.tensor(prompt_ids, dtype=torch.int32, device='cuda').squeeze(0))
+
+        decoder_input_ids = torch.nn.utils.rnn.pad_sequence(decoder_input_ids, batch_first=True, padding_value=self.eot_id)
+        mel_input_lengths = torch.tensor([mel.shape[0] for mel in batch_mel_list], dtype=torch.int32, device='cuda')
+
+        outputs = self.model_runner_cpp.generate(
+            batch_input_ids=decoder_input_ids,
+            encoder_input_features=batch_mel_list,
+            encoder_output_lengths=mel_input_lengths // 2,
+            max_new_tokens=96,
+            end_id=self.eot_id,
+            pad_id=self.eot_id,
+            num_beams=1,
+            output_sequence_lengths=True,
+            return_dict=True)
+        torch.cuda.synchronize()
+
+        output_ids = outputs['output_ids'].cpu().numpy()
+
+        for i, output_id in enumerate(output_ids):
+            response = pb_utils.InferenceResponse(output_tensors=[
+                pb_utils.Tensor("OUTPUT_IDS", output_id[0])
+            ])
+            responses.append(response)
+        assert len(responses) == len(requests)
+        return responses
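
The `mel_input_lengths // 2` above reflects the Whisper encoder front end, whose second convolution has stride 2 and halves the number of mel frames before the transformer layers. A worked check of the frame arithmetic (a sketch; the values follow from the constants used above: 16 kHz sampling, hop_length=160):

# For a full 30 s window, the numbers line up with the engine configs below.
sample_rate, hop_length = 16000, 160
samples = 30 * sample_rate            # 480000 samples
mel_frames = samples // hop_length    # 3000 mel frames (encoder max_input_len)
encoder_frames = mel_frames // 2      # 1500 positions (max_position_embeddings)
print(mel_frames, encoder_frames)     # 3000 1500
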
whisper/1/whisper_large-v3-turbo/decoder/config.json ADDED
@@ -0,0 +1,170 @@
+{
+    "version": "0.15.0.dev2024101500",
+    "pretrained_config": {
+        "architecture": "DecoderModel",
+        "dtype": "float16",
+        "vocab_size": 51866,
+        "hidden_size": 1280,
+        "num_hidden_layers": 4,
+        "num_attention_heads": 20,
+        "hidden_act": "gelu",
+        "logits_dtype": "float16",
+        "norm_epsilon": 1e-05,
+        "position_embedding_type": "learned_absolute",
+        "max_position_embeddings": 448,
+        "num_key_value_heads": 20,
+        "intermediate_size": 5120,
+        "mapping": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cp_size": 1,
+            "tp_size": 1,
+            "pp_size": 1,
+            "moe_tp_size": 1,
+            "moe_ep_size": 1
+        },
+        "quantization": {
+            "quant_algo": null,
+            "kv_cache_quant_algo": null,
+            "group_size": 128,
+            "smoothquant_val": 0.5,
+            "clamp_val": null,
+            "has_zero_point": false,
+            "pre_quant_scale": false,
+            "exclude_modules": null
+        },
+        "use_parallel_embedding": false,
+        "embedding_sharding_dim": 0,
+        "share_embedding_table": false,
+        "head_size": 64,
+        "qk_layernorm": false,
+        "use_prompt_tuning": false,
+        "has_position_embedding": true,
+        "layernorm_type": 0,
+        "has_attention_qkvo_bias": true,
+        "has_mlp_bias": true,
+        "has_model_final_layernorm": true,
+        "has_embedding_layernorm": false,
+        "has_embedding_scale": false,
+        "ffn_hidden_size": 5120,
+        "q_scaling": 1.0,
+        "layernorm_position": 0,
+        "relative_attention": false,
+        "max_distance": 0,
+        "num_buckets": 0,
+        "model_type": "whisper",
+        "rescale_before_lm_head": false,
+        "encoder_hidden_size": 1280,
+        "encoder_num_heads": 20,
+        "encoder_head_size": null,
+        "skip_cross_kv": false,
+        "type_vocab_size": null,
+        "encoder_num_kv_heads": null,
+        "skip_cross_qkv": false,
+        "mlp_type": 0,
+        "residual_scaling": 1.0,
+        "has_lm_head_bias": false
+    },
+    "build_config": {
+        "max_input_len": 14,
+        "max_seq_len": 114,
+        "opt_batch_size": null,
+        "max_batch_size": 64,
+        "max_beam_width": 4,
+        "max_num_tokens": 7296,
+        "opt_num_tokens": 256,
+        "max_prompt_embedding_table_size": 0,
+        "kv_cache_type": "PAGED",
+        "gather_context_logits": false,
+        "gather_generation_logits": false,
+        "strongly_typed": true,
+        "force_num_profiles": null,
+        "profiling_verbosity": "layer_names_only",
+        "enable_debug_output": false,
+        "max_draft_len": 0,
+        "speculative_decoding_mode": 1,
+        "use_refit": false,
+        "input_timing_cache": null,
+        "output_timing_cache": "model.cache",
+        "lora_config": {
+            "lora_dir": [],
+            "lora_ckpt_source": "hf",
+            "max_lora_rank": 64,
+            "lora_target_modules": [],
+            "trtllm_modules_to_hf_modules": {}
+        },
+        "auto_parallel_config": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cluster_key": "H100-PCIe",
+            "cluster_info": null,
+            "sharding_cost_model": "alpha_beta",
+            "comm_cost_model": "alpha_beta",
+            "enable_pipeline_parallelism": false,
+            "enable_shard_unbalanced_shape": false,
+            "enable_shard_dynamic_shape": false,
+            "enable_reduce_scatter": true,
+            "builder_flags": null,
+            "debug_mode": false,
+            "infer_shape": true,
+            "validation_mode": false,
+            "same_buffer_io": {
+                "past_key_value_(\\d+)": "present_key_value_\\1"
+            },
+            "same_spec_io": {},
+            "sharded_io_allowlist": [
+                "past_key_value_\\d+",
+                "present_key_value_\\d*"
+            ],
+            "fill_weights": false,
+            "parallel_config_cache": null,
+            "profile_cache": null,
+            "dump_path": null,
+            "debug_outputs": []
+        },
+        "weight_sparsity": false,
+        "weight_streaming": false,
+        "plugin_config": {
+            "dtype": "float16",
+            "bert_attention_plugin": "float16",
+            "gpt_attention_plugin": "float16",
+            "gemm_plugin": "float16",
+            "gemm_swiglu_plugin": null,
+            "fp8_rowwise_gemm_plugin": null,
+            "smooth_quant_gemm_plugin": null,
+            "identity_plugin": null,
+            "layernorm_quantization_plugin": null,
+            "rmsnorm_quantization_plugin": null,
+            "nccl_plugin": null,
+            "lookup_plugin": null,
+            "lora_plugin": null,
+            "weight_only_groupwise_quant_matmul_plugin": null,
+            "weight_only_quant_matmul_plugin": null,
+            "smooth_quant_plugins": true,
+            "quantize_per_token_plugin": false,
+            "quantize_tensor_plugin": false,
+            "moe_plugin": null,
+            "mamba_conv1d_plugin": "auto",
+            "low_latency_gemm_plugin": null,
+            "context_fmha": true,
+            "bert_context_fmha_fp32_acc": false,
+            "paged_kv_cache": true,
+            "remove_input_padding": true,
+            "reduce_fusion": false,
+            "enable_xqa": false,
+            "tokens_per_block": 64,
+            "use_paged_context_fmha": false,
+            "use_fp8_context_fmha": false,
+            "multiple_profiles": false,
+            "paged_state": false,
+            "streamingllm": false,
+            "manage_weights": false,
+            "use_fused_mlp": true,
+            "pp_reduce_scatter": false
+        },
+        "use_strip_plan": false,
+        "max_encoder_input_len": 3000,
+        "use_fused_mlp": "enable",
+        "monitor_memory": false
+    }
+}
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3371d5242ccff35901b2dad4ec911ca6bff53d33e021b4af3af674f1cadcf49
3
+ size 477983452
whisper/1/whisper_large-v3-turbo/encoder/config.json ADDED
@@ -0,0 +1,147 @@
+{
+    "version": "0.15.0.dev2024101500",
+    "pretrained_config": {
+        "architecture": "WhisperEncoder",
+        "dtype": "float16",
+        "vocab_size": 51866,
+        "hidden_size": 1280,
+        "num_hidden_layers": 32,
+        "num_attention_heads": 20,
+        "hidden_act": "gelu",
+        "logits_dtype": "float32",
+        "norm_epsilon": 1e-05,
+        "position_embedding_type": "learned_absolute",
+        "max_position_embeddings": 1500,
+        "num_key_value_heads": 20,
+        "intermediate_size": 5120,
+        "mapping": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cp_size": 1,
+            "tp_size": 1,
+            "pp_size": 1,
+            "moe_tp_size": 1,
+            "moe_ep_size": 1
+        },
+        "quantization": {
+            "quant_algo": null,
+            "kv_cache_quant_algo": null,
+            "group_size": 128,
+            "smoothquant_val": 0.5,
+            "clamp_val": null,
+            "has_zero_point": false,
+            "pre_quant_scale": false,
+            "exclude_modules": null
+        },
+        "use_parallel_embedding": false,
+        "embedding_sharding_dim": 0,
+        "share_embedding_table": false,
+        "head_size": 64,
+        "qk_layernorm": false,
+        "has_position_embedding": true,
+        "n_mels": 128,
+        "num_languages": 100
+    },
+    "build_config": {
+        "max_input_len": 3000,
+        "max_seq_len": 3000,
+        "opt_batch_size": null,
+        "max_batch_size": 64,
+        "max_beam_width": 1,
+        "max_num_tokens": 8192,
+        "opt_num_tokens": 64,
+        "max_prompt_embedding_table_size": 0,
+        "kv_cache_type": "PAGED",
+        "gather_context_logits": false,
+        "gather_generation_logits": false,
+        "strongly_typed": true,
+        "force_num_profiles": null,
+        "profiling_verbosity": "layer_names_only",
+        "enable_debug_output": false,
+        "max_draft_len": 0,
+        "speculative_decoding_mode": 1,
+        "use_refit": false,
+        "input_timing_cache": null,
+        "output_timing_cache": "model.cache",
+        "lora_config": {
+            "lora_dir": [],
+            "lora_ckpt_source": "hf",
+            "max_lora_rank": 64,
+            "lora_target_modules": [],
+            "trtllm_modules_to_hf_modules": {}
+        },
+        "auto_parallel_config": {
+            "world_size": 1,
+            "gpus_per_node": 8,
+            "cluster_key": "H100-PCIe",
+            "cluster_info": null,
+            "sharding_cost_model": "alpha_beta",
+            "comm_cost_model": "alpha_beta",
+            "enable_pipeline_parallelism": false,
+            "enable_shard_unbalanced_shape": false,
+            "enable_shard_dynamic_shape": false,
+            "enable_reduce_scatter": true,
+            "builder_flags": null,
+            "debug_mode": false,
+            "infer_shape": true,
+            "validation_mode": false,
+            "same_buffer_io": {
+                "past_key_value_(\\d+)": "present_key_value_\\1"
+            },
+            "same_spec_io": {},
+            "sharded_io_allowlist": [
+                "past_key_value_\\d+",
+                "present_key_value_\\d*"
+            ],
+            "fill_weights": false,
+            "parallel_config_cache": null,
+            "profile_cache": null,
+            "dump_path": null,
+            "debug_outputs": []
+        },
+        "weight_sparsity": false,
+        "weight_streaming": false,
+        "plugin_config": {
+            "dtype": "float16",
+            "bert_attention_plugin": "float16",
+            "gpt_attention_plugin": "auto",
+            "gemm_plugin": null,
+            "gemm_swiglu_plugin": null,
+            "fp8_rowwise_gemm_plugin": null,
+            "smooth_quant_gemm_plugin": null,
+            "identity_plugin": null,
+            "layernorm_quantization_plugin": null,
+            "rmsnorm_quantization_plugin": null,
+            "nccl_plugin": null,
+            "lookup_plugin": null,
+            "lora_plugin": null,
+            "weight_only_groupwise_quant_matmul_plugin": null,
+            "weight_only_quant_matmul_plugin": null,
+            "smooth_quant_plugins": true,
+            "quantize_per_token_plugin": false,
+            "quantize_tensor_plugin": false,
+            "moe_plugin": null,
+            "mamba_conv1d_plugin": "auto",
+            "low_latency_gemm_plugin": null,
+            "context_fmha": true,
+            "bert_context_fmha_fp32_acc": false,
+            "paged_kv_cache": true,
+            "remove_input_padding": true,
+            "reduce_fusion": false,
+            "enable_xqa": false,
+            "tokens_per_block": 64,
+            "use_paged_context_fmha": false,
+            "use_fp8_context_fmha": false,
+            "multiple_profiles": false,
+            "paged_state": false,
+            "streamingllm": false,
+            "manage_weights": false,
+            "use_fused_mlp": true,
+            "pp_reduce_scatter": false
+        },
+        "use_strip_plan": false,
+        "max_encoder_input_len": 1024,
+        "use_fused_mlp": "enable",
+        "monitor_memory": false
+    }
+}
whisper/1/whisper_large-v3-turbo/encoder/rank0.engine ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d807775c5cbd033e1ee8a06845a9f89877f96039a3d570776dfcfd71eee8e59
+size 1287816708
whisper/config.pbtxt ADDED
@@ -0,0 +1,71 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "whisper"
+backend: "python"
+max_batch_size: 64
+
+dynamic_batching {
+  max_queue_delay_microseconds: 100
+}
+
+parameters [
+  {
+    key: "engine_dir"
+    value: { string_value: "/data/whisper/model_repo_whisper/whisper/1/whisper_large-v3-turbo" }
+  },
+  {
+    key: "n_mels"
+    value: { string_value: "128" }  # 128 dims for large-v3, 80 dims for large-v2
+  },
+  {
+    key: "zero_pad"
+    value: { string_value: "false" }
+  }
+]
+
+input [
+  {
+    name: "WAV"
+    data_type: TYPE_FP32
+    dims: [-1]
+    optional: true
+  },
+  {
+    name: "WAV_LENS"
+    data_type: TYPE_INT32
+    dims: [1]
+    optional: true
+  },
+  {
+    name: "DECODER_INPUT_IDS"
+    data_type: TYPE_INT32
+    dims: [-1]
+    optional: true
+  }
+]
+output [
+  {
+    name: "OUTPUT_IDS"
+    data_type: TYPE_INT32
+    dims: [-1]
+  }
+]
+
+instance_group [
+  {
+    count: 1
+    kind: KIND_GPU
+  }
+]